openvinotoolkit · atobiszei · May 21, 2026 · Jan 16, 2025 · Feb 17, 2026 · Feb 18, 2026
diff --git a/common_settings.bzl b/common_settings.bzl
@@ -209,8 +209,6 @@ COMMON_STATIC_TEST_COPTS = select({
                     "-Wall",
                     "-Wno-unknown-pragmas",
                     "-Werror",
-                    # ov::Tensor::data method call results in deprecated warning and we use it in multiple places
-                    "-Wno-deprecated-declarations",
                     "-Isrc",
                     "-fconcepts", # for gmock related utils
                     "-fvisibility=hidden",# Needed for pybind targets

diff --git a/demos/benchmark/v3/benchmark.py b/demos/benchmark/v3/benchmark.py
@@ -438,4 +438,8 @@ async def limited_request_func(request_func_input, pbar):
 print(f"Throughput - Tokens per second: {num_tokens / benchmark_results['duration']:^,.1f}")
 print(f"Mean latency: {np.mean(benchmark_results['latencies'])*1000:.2f} ms")
 print(f"Median latency: {np.median(benchmark_results['latencies'])*1000:.2f} ms")
+# Print selected latency percentiles to better understand the latency distribution
+percentiles = [10, 25, 50, 75, 90, 95, 99]
+for p in percentiles:
+    print(f"{p}th percentile latency: {np.percentile(benchmark_results['latencies'], p)*1000:.2f} ms")
 print(f"Average document length: {num_tokens / len(docs)} tokens")
diff --git a/demos/common/export_models/export_model.py b/demos/common/export_models/export_model.py
@@ -103,7 +103,7 @@ def add_common_arguments(parser):
 parser_speech2text.add_argument('--enable_word_timestamps', default=False, action='store_true', help='Load model with word timestamps support.', dest='enable_word_timestamps')
 args = vars(parser.parse_args())
 
-t2s_graph_template = """
+t2s_graph_template = """# OVMS_GRAPH_QUEUE_MAX_SIZE: AUTO
 input_stream: "HTTP_REQUEST_PAYLOAD:input"
 output_stream: "HTTP_RESPONSE_PAYLOAD:output"
 node {
@@ -129,7 +129,7 @@ def add_common_arguments(parser):
 }
 """
 
-s2t_graph_template = """
+s2t_graph_template = """# OVMS_GRAPH_QUEUE_MAX_SIZE: AUTO
 input_stream: "HTTP_REQUEST_PAYLOAD:input"
 output_stream: "HTTP_RESPONSE_PAYLOAD:output"
 node {
@@ -165,7 +165,7 @@ def add_common_arguments(parser):
 }
 """
 
-embedding_graph_ov_template = """
+embedding_graph_ov_template = """# OVMS_GRAPH_QUEUE_MAX_SIZE: AUTO
 input_stream: "REQUEST_PAYLOAD:input"
 output_stream: "RESPONSE_PAYLOAD:output"
 node {
@@ -189,7 +189,7 @@ def add_common_arguments(parser):
 }
 """
 
-rerank_graph_ov_template = """
+rerank_graph_ov_template = """# OVMS_GRAPH_QUEUE_MAX_SIZE: AUTO
 input_stream: "REQUEST_PAYLOAD:input"
 output_stream: "RESPONSE_PAYLOAD:output"
 node {
@@ -208,7 +208,8 @@ def add_common_arguments(parser):
 }
 """
 
-text_generation_graph_template = """input_stream: "HTTP_REQUEST_PAYLOAD:input"
+text_generation_graph_template = """# OVMS_GRAPH_QUEUE_MAX_SIZE: AUTO
+input_stream: "HTTP_REQUEST_PAYLOAD:input"
 output_stream: "HTTP_RESPONSE_PAYLOAD:output"
 
 node: {
@@ -262,7 +263,8 @@ def add_common_arguments(parser):
   }
 }"""
 
-image_generation_graph_template = """input_stream: "HTTP_REQUEST_PAYLOAD:input"
+image_generation_graph_template = """# OVMS_GRAPH_QUEUE_MAX_SIZE: AUTO
+input_stream: "HTTP_REQUEST_PAYLOAD:input"
 output_stream: "HTTP_RESPONSE_PAYLOAD:output"
 
 node: {

diff --git a/docs/mediapipe.md b/docs/mediapipe.md
@@ -215,6 +215,53 @@ Nodes in the MediaPipe graphs can reference both the models configured in model_
 
 Subconfig file may only contain *model_config_list* section  - in the same format as in [models config file](starting_server.md).
 
+### Graph Pool (Pre-initialized Graph Queue)
+
+OpenVINO Model Server can pre-initialize a pool of MediaPipe `CalculatorGraph` instances for a graph definition. Graphs in the pool are started once during server initialization and reused across inference requests, eliminating per-request graph initialization and teardown overhead. This is especially beneficial for graphs that involve expensive setup performed in a calculator's `Open()` method.
+
+#### How it works
+
+Without graph pool, each incoming request creates a new `CalculatorGraph`, calls `StartRun()` with side packets, processes the request, then tears down the graph via `CloseAllPacketSources()` and `WaitUntilDone()`.
+
+With graph pool enabled, a fixed number of graphs are pre-initialized and kept in a queue. When a request arrives, an idle graph is acquired from the queue. After processing, the graph is returned to the queue for the next request. The graph is never torn down — instead, `WaitUntilIdle()` is called between requests and the internal timestamp is incremented.
+
+#### Configuration
+
+The graph pool size is controlled via a comment directive in the graph `.pbtxt` file:
+
+```
+# OVMS_GRAPH_QUEUE_MAX_SIZE: AUTO
+```
+
+| Value | Behavior |
+|:------|:---------|
+| `AUTO` | Pool size is set to the number of hardware threads (`std::thread::hardware_concurrency()`), or 16 if detection fails |
+| Positive integer (e.g. `4`) | Pool size set to the given value (must not exceed hardware thread count) |
+| `0` | Graph pool disabled — falls back to per-request graph creation |
+| *(directive absent)* | Default: graph pool is disabled |
+
+**Default behavior:** graph pool stays disabled unless `OVMS_GRAPH_QUEUE_MAX_SIZE` is explicitly present in `graph.pbtxt`. Since the OVMS CLI graph exporter (`--pull --task`) always emits this directive, **graphs created via the CLI exporter have the pool enabled by default**.
+
+**Generated graphs from exporters:**
+- OVMS `--task ...` graph export emits `# OVMS_GRAPH_QUEUE_MAX_SIZE: AUTO` for all graph types.
+- `demos/common/export_models/export_model.py` also emits `# OVMS_GRAPH_QUEUE_MAX_SIZE: AUTO` for all graph types.
+
+**Runtime kill-switch:**
+Setting the environment variable `OVMS_GRAPH_QUEUE_OFF=1` globally disables graph pools at runtime, regardless of the directive in `graph.pbtxt`. 
+
+#### Important considerations for graph developers
+
+**Stateful calculators:**
+Since graphs in the pool are reused across requests, any state held by a calculator between `Process()` calls will persist across requests. If your calculator accumulates state (e.g. counters, buffers, history), that state will carry over to the next request that reuses the same graph instance. Design your calculators to either:
+- Be stateless (reset any per-request state at the beginning of each `Process()` call), or
+- Explicitly handle the fact that the graph may have already processed prior requests.
+
+**Input side packets from requests are not supported:**
+When graph pool is enabled, side packets are set once at pool construction time and cannot be overridden per request. If a client sends request parameters that would normally become input side packets (e.g. KServe request parameters other than `OVMS_MP_TIMESTAMP`), the request will be rejected with an error. If your graph relies on per-request side packets to configure calculator behavior, either disable the graph pool (`# OVMS_GRAPH_QUEUE_MAX_SIZE: 0`) or redesign the graph to accept such parameters as regular input stream packets instead of side packets.
+
+**Python generative nodes (LOOPBACK) are not compatible with graph pool:**
+Python nodes using generative mode (`execute` that `yield`s) rely on per-calculator state (`pyIteratorPtr`) that persists across `Process()` calls within a single request. With graph pool enabled, if a generator does not fully complete (e.g. client disconnects mid-stream), the stale iterator remains on the reused graph instance and subsequent requests will fail. Only Python nodes using regular mode (stateless `execute` that `return`s a list) are safe to use with graph pool.
+
 
 ## Deployment testing
 ### Debug logs

diff --git a/docs/python_support/reference.md b/docs/python_support/reference.md
@@ -749,6 +749,8 @@ node {
 
 When using generative mode, the `execute` method in [`OvmsPythonModel`](#ovmspythonmodel-class) class must `yield` value.
 
+> **Note:** Generative mode is not compatible with the [graph pool](../mediapipe.md#graph-pool-pre-initialized-graph-queue). Do not add the `# OVMS_GRAPH_QUEUE_MAX_SIZE` directive to graphs that use generative Python nodes with LOOPBACK.
+
 ```python
 from pyovms import Tensor
 ...

diff --git a/src/BUILD b/src/BUILD
@@ -2432,8 +2432,10 @@ cc_test(
         "test/configs/config_dummy_dynamic_shape.json",
         "test/configs/emptyConfigWithMetrics.json",
         "test/llm/config.json",
+        "test/llm/config_queue.json",
         "test/llm/assisted_decoding_config.json",
         "test/llm/lm_cb_regular.pbtxt",
+        "test/llm/lm_cb_regular_queue.pbtxt",
         "test/llm/lm_cb_with_tool_parser.pbtxt",
         "test/llm/lm_legacy_regular.pbtxt",
         "test/llm/lm_cb_speculative.pbtxt",
@@ -2454,6 +2456,7 @@ cc_test(
         "test/increment_1x3x4x5/1/increment_1x3x4x5.xml",
         "test/increment_1x3x4x5/1/increment_1x3x4x5.bin",
         "test/mediapipe/config_mediapipe_openai_chat_completions_mock.json",
+        "test/mediapipe/config_mediapipe_openai_chat_completions_mock_with_queue.json",
         "test/mediapipe/config_mediapipe_add_adapter_full.json",
         "test/mediapipe/config_mediapipe_all_graphs_adapter_full.json",
         "test/mediapipe/config_mediapipe_dummy_adapter_full_dag.json",
@@ -2483,6 +2486,10 @@ cc_test(
         "test/mediapipe/graphscalar.pbtxt",
         "test/mediapipe/graphWithParams.pbtxt",
         "test/mediapipe/graphdummyadapterfull_dummyinputnames.pbtxt",
+        "test/mediapipe/graphdummyadapterfull_dummyinputnames_newpath.pbtxt",
+        "test/mediapipe/graph_gpt_with_queue.pbtxt",
+        "test/mediapipe/graph_queue_dummyadapterfull_dummyinputnames.pbtxt",
+        "test/mediapipe/graph_queue_dummyadapterfull_dummyinputnames_newpath.pbtxt",
         "test/mediapipe/graphadapterfull_two_outputs_dag.pbtxt",
         "test/mediapipe/graphdummyadapterfull_two_outputs.pbtxt",
         "test/mediapipe/graph_multipart.pbtxt",
@@ -2905,6 +2912,7 @@ cc_library(
         ":test_test_with_temp_dir",
         "//src/graph_export:graph_export",
         "//src:libovms_server_settings",
+        "//src:libovms_systeminfo",
         "@com_google_googletest//:gtest",
     ],
     local_defines = COMMON_LOCAL_DEFINES,

diff --git a/src/capi_frontend/server_settings.hpp b/src/capi_frontend/server_settings.hpp
@@ -191,6 +191,7 @@ struct ExportSettings {
     std::string modelName = "";
     std::string modelPath = "./";
     std::string targetDevice = "CPU";
+    std::optional<uint32_t> restWorkers;
     std::optional<std::string> extraQuantizationParams;
     std::optional<std::string> vocoder;
     std::string precision = "int8";

diff --git a/src/cli_parser.cpp b/src/cli_parser.cpp
@@ -757,6 +757,7 @@ void CLIParser::prepareGraph(ServerSettingsImpl& serverSettings, HFSettingsImpl&
             hfSettings.exportSettings.extraQuantizationParams = result->operator[]("extra_quantization_params").as<std::string>();
         if (result->count("vocoder"))
             hfSettings.exportSettings.vocoder = result->operator[]("vocoder").as<std::string>();
+        hfSettings.exportSettings.restWorkers = serverSettings.restWorkers;
         hfSettings.downloadPath = result->operator[]("model_repository_path").as<std::string>();
         // When --task is used with --model_path but without --pull/--source_model,
         // use model_path as the model location (no HF download needed)

diff --git a/src/graph_export/BUILD b/src/graph_export/BUILD
@@ -29,6 +29,7 @@ ovms_cc_library(
         "@ovms//src:libovms_module",
         "@ovms//src/filesystem:libovmsfilesystem",
         "@ovms//src/filesystem:libovmslocalfilesystem",
+        "@ovms//src:libovms_systeminfo",
         "@com_github_tencent_rapidjson//:rapidjson",
         "@ovms//src:libovmsschema",
         "@ovms//src:libovms_version",

diff --git a/src/graph_export/graph_export.cpp b/src/graph_export/graph_export.cpp
@@ -67,6 +67,15 @@ void GraphExport::clearInMemoryGraphContent() {
 }
 
 static const std::string OVMS_VERSION_GRAPH_LINE = std::string("# File created with: ") + PROJECT_NAME + std::string(" ") + PROJECT_VERSION + std::string("\n");
+static const std::string OVMS_GRAPH_QUEUE_MAX_SIZE_LINE_PREFIX = "# OVMS_GRAPH_QUEUE_MAX_SIZE: ";  // prefix of the queue-size comment directive written into generated graphs
+static const std::string OVMS_GRAPH_QUEUE_SIZE_AUTO = "AUTO";  // directive value appended after the prefix by buildGraphHeader()
+
+static std::string buildGraphHeader() {
+    // Header = version banner line, then the queue-size directive line with the AUTO value.
+    std::string header = OVMS_VERSION_GRAPH_LINE;
+    header += OVMS_GRAPH_QUEUE_MAX_SIZE_LINE_PREFIX + OVMS_GRAPH_QUEUE_SIZE_AUTO + "\n";
+    return header;
+}
 
 static std::string constructModelsPath(const std::string& modelPath, const std::optional<std::string>& ggufFilenameOpt) {
     std::string modelsPath;
@@ -134,7 +143,7 @@ static Status createTextGenerationGraphTemplate(const std::string& directoryPath
     auto& exportSettings = hfSettings.exportSettings;
 
     std::ostringstream oss;
-    oss << OVMS_VERSION_GRAPH_LINE;
+    oss << buildGraphHeader();
     std::string modelsPath = constructModelsPath(exportSettings.modelPath, ggufFilename);
     SPDLOG_TRACE("modelsPath: {}, directoryPath: {}, ggufFilename: {}", modelsPath, directoryPath, ggufFilename.value_or("std::nullopt"));
     GET_PLUGIN_CONFIG_OPT_OR_FAIL_AND_RETURN(exportSettings);
@@ -229,7 +238,7 @@ static Status createRerankGraphTemplate(const std::string& directoryPath, const
     auto& exportSettings = hfSettings.exportSettings;
 
     std::ostringstream oss;
-    oss << OVMS_VERSION_GRAPH_LINE;
+    oss << buildGraphHeader();
     // Windows path creation - graph parser needs forward slashes in paths
     std::string modelsPath = constructModelsPath(exportSettings.modelPath, ggufFilename);
     SPDLOG_TRACE("modelsPath: {}, directoryPath: {}, ggufFilename: {}", modelsPath, directoryPath, ggufFilename.value_or("std::nullopt"));
@@ -273,7 +282,7 @@ static Status createEmbeddingsGraphTemplate(const std::string& directoryPath, co
     auto& exportSettings = hfSettings.exportSettings;
 
     std::ostringstream oss;
-    oss << OVMS_VERSION_GRAPH_LINE;
+    oss << buildGraphHeader();
     std::string modelsPath = constructModelsPath(exportSettings.modelPath, ggufFilename);
     SPDLOG_TRACE("modelsPath: {}, directoryPath: {}, ggufFilename: {}", modelsPath, directoryPath, ggufFilename.value_or("std::nullopt"));
     GET_PLUGIN_CONFIG_OPT_OR_FAIL_AND_RETURN(exportSettings);
@@ -319,7 +328,7 @@ static Status createTextToSpeechGraphTemplate(const std::string& directoryPath,
     auto& exportSettings = hfSettings.exportSettings;
 
     std::ostringstream oss;
-    oss << OVMS_VERSION_GRAPH_LINE;
+    oss << buildGraphHeader();
     std::string modelsPath = constructModelsPath(exportSettings.modelPath, ggufFilename);
     SPDLOG_TRACE("modelsPath: {}, directoryPath: {}, ggufFilename: {}", modelsPath, directoryPath, ggufFilename.value_or("std::nullopt"));
     GET_PLUGIN_CONFIG_OPT_OR_FAIL_AND_RETURN(exportSettings);
@@ -374,7 +383,7 @@ static Status createSpeechToTextGraphTemplate(const std::string& directoryPath,
     auto& exportSettings = hfSettings.exportSettings;
 
     std::ostringstream oss;
-    oss << OVMS_VERSION_GRAPH_LINE;
+    oss << buildGraphHeader();
     std::string modelsPath = constructModelsPath(exportSettings.modelPath, ggufFilename);
     SPDLOG_TRACE("modelsPath: {}, directoryPath: {}, ggufFilename: {}", modelsPath, directoryPath, ggufFilename.value_or("std::nullopt"));
     GET_PLUGIN_CONFIG_OPT_OR_FAIL_AND_RETURN(exportSettings);
@@ -448,7 +457,7 @@ static Status createImageGenerationGraphTemplate(const std::string& directoryPat
     GET_PLUGIN_CONFIG_OPT_OR_FAIL_AND_RETURN(exportSettings);
 
     std::ostringstream oss;
-    oss << OVMS_VERSION_GRAPH_LINE;
+    oss << buildGraphHeader();
     // clang-format off
     oss << R"(
 input_stream: "HTTP_REQUEST_PAYLOAD:input"

diff --git a/src/http_frontend/http_graph_executor_impl.cpp b/src/http_frontend/http_graph_executor_impl.cpp
@@ -38,6 +38,10 @@ namespace ovms {
 
 static const std::string UNUSED_REQUEST_ID = "";
 
+bool requestHasInputSidePackets(const HttpPayload& request) {  // HttpPayload overload; `request` is not inspected
+    return false;  // always reports that the request carries no input side packets
+}
+
 Status deserializeInputSidePacketsFromFirstRequestImpl(
     std::map<std::string, mediapipe::Packet>& inputSidePackets,  // out
     const HttpPayload& request) {                                // in

diff --git a/src/http_frontend/http_graph_executor_impl.hpp b/src/http_frontend/http_graph_executor_impl.hpp
@@ -48,6 +48,8 @@ class PythonBackend;
 
 using HttpReaderWriter = HttpAsyncWriter;
 
+bool requestHasInputSidePackets(const HttpPayload& request);
+
 // Deserialization of parameters inside KServe gRPC request
 // into mediapipe Packets.
 // To be used by both - infer & inferStream.

diff --git a/src/kfs_frontend/kfs_graph_executor_impl.cpp b/src/kfs_frontend/kfs_graph_executor_impl.cpp
@@ -24,6 +24,7 @@
 
 #include "../kfs_frontend/kfs_utils.hpp"
 #include "../logging.hpp"
+#include "../mediapipe_internal/graph_executor_constants.hpp"
 #include "../mediapipe_internal/mediapipe_utils.hpp"
 #include "../mediapipe_internal/mediapipegraphdefinition.hpp"
 #include "../predict_request_validation_utils.hpp"
@@ -925,6 +926,7 @@ static Status createPacketAndPushIntoGraph(const std::string& name, std::shared_
     }
     std::unique_ptr<T> inputTensor;
     OVMS_RETURN_ON_FAIL(deserializeTensor(name, *request, inputTensor, pythonBackend));
+    SPDLOG_TRACE("Current Timestamp before actual pushing:{}", timestamp.Value());
     MP_RETURN_ON_FAIL(graph.AddPacketToInputStream(
                           name,
                           ::mediapipe::packet_internal::Create(
@@ -1152,10 +1154,19 @@ Status createAndPushPacketsImpl(
     return StatusCode::OK;
 }
 
+bool requestHasInputSidePackets(const KFSRequest& request) {
+    // Any parameter other than the reserved timestamp counts as a side packet; TIMESTAMP_PARAMETER_NAME is shared with the deserializer below.
+    for (const auto& [name, valueChoice] : request.parameters()) {
+        if (name != TIMESTAMP_PARAMETER_NAME) {
+            return true;
+        }
+    }
+    return false;
+}
+
 Status deserializeInputSidePacketsFromFirstRequestImpl(
     std::map<std::string, mediapipe::Packet>& inputSidePackets,
     const KFSRequest& request) {
-    static const std::string PYTHON_SESSION_SIDE_PACKET_TAG{"py"};
     for (const auto& [name, valueChoice] : request.parameters()) {
         SPDLOG_DEBUG("Found: {}; parameter in request for: {};", name, request.model_name());
         if (name == TIMESTAMP_PARAMETER_NAME) {

diff --git a/src/kfs_frontend/kfs_graph_executor_impl.hpp b/src/kfs_frontend/kfs_graph_executor_impl.hpp
@@ -36,6 +36,10 @@ namespace ovms {
 class PythonBackend;
 class Status;
 
+// Checks whether the request contains user-provided input side packets
+// (parameters other than the reserved OVMS_MP_TIMESTAMP).
+bool requestHasInputSidePackets(const KFSRequest& request);
+
 // Deserialization of parameters inside KServe gRPC request
 // into mediapipe Packets.
 // To be used by both - infer & inferStream.

diff --git a/src/llm/BUILD b/src/llm/BUILD
@@ -25,6 +25,7 @@ ovms_cc_library(
         "//third_party:openvino",
         "@mediapipe//mediapipe/framework:calculator_framework",
         "@com_github_tencent_rapidjson//:rapidjson",
+        "//src/mediapipe_internal:graph_side_packets",
         "//src/kfserving_api:kfserving_api_cpp",
         "//src:libovmsprofiler",
         ":genai_servables",
@@ -350,6 +351,19 @@ ovms_cc_library(
     additional_copts = COPTS_PYTHON
 )
 
+ovms_cc_library(
+    name = "execution_context_utils",
+    hdrs = ["execution_context_utils.hpp"],
+    srcs = ["execution_context_utils.cpp"],
+    deps = [
+        ":genai_servables",
+        "//src/mediapipe_internal:graph_side_packets",
+        "//src:libovmslogging",
+        "//src:libovmsstatus",
+    ],
+    visibility = ["//visibility:public"],
+)
+
 ovms_cc_library(
     name = "py_jinja_template_processor",
     hdrs = ["py_jinja_template_processor.hpp"],