Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion src/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,7 @@ ovms_cc_library(
"libovms_cliparser",
"libovms_systeminfo",
"ovms_exit_codes",
"//src/utils:env_guard",
],
visibility = ["//visibility:public",],
additional_copts = COPTS_DROGON,
Expand Down Expand Up @@ -2076,6 +2077,7 @@ ovms_cc_library(
"libovmsshape",
"libovmsprofiler",
"libovms_tensorinfo",
"libovms_systeminfo",
],
visibility = ["//visibility:public"],
)
Expand Down Expand Up @@ -3109,7 +3111,8 @@ ovms_cc_library(
"@mediapipe//mediapipe/framework:calculator_framework",
"//third_party:openvino",
"@com_github_tencent_rapidjson//:rapidjson",
"//third_party:genai",],
"//third_party:genai",
"//src:libovms_ov_utils",],
visibility = ["//visibility:public"],
alwayslink = 1,
)
44 changes: 39 additions & 5 deletions src/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
// limitations under the License.
//*****************************************************************************
#include "config.hpp"

#include <algorithm>
#include <atomic>
#include <filesystem>
#include <limits>
#include <regex>
Expand All @@ -36,19 +37,37 @@
#include "modelconfig.hpp"
#include "stringutils.hpp"
#include "systeminfo.hpp"
#include "utils/env_guard.hpp"

namespace ovms {

const uint32_t AVAILABLE_CORES = getCoreCount();
const uint32_t WIN_MAX_GRPC_WORKERS = 1;
const uint32_t MAX_PORT_NUMBER = std::numeric_limits<uint16_t>::max();

// For drogon, we need to minimize the number of default workers since this value is set for both: unary and streaming (making it always double)
const uint64_t DEFAULT_REST_WORKERS = AVAILABLE_CORES;
const uint32_t DEFAULT_GRPC_MAX_THREADS = AVAILABLE_CORES * 8.0;
const size_t DEFAULT_GRPC_MEMORY_QUOTA = (size_t)2 * 1024 * 1024 * 1024; // 2GB
const uint64_t MAX_REST_WORKERS = 10'000;

// We need to minimize the number of default drogon workers since this value is set for both: unary and streaming (making it always double)
// on linux, restrict also based on the max allowed number of open files
#ifdef __linux__

// Descriptors held back from the worker budget for other process needs.
const uint64_t RESERVED_OPEN_FILES = 15;
// Per-worker descriptor cost: 5 used while initializing OVMS plus 2 per new connection.
const uint64_t OPEN_FILES_PER_REST_WORKER = 7;

// Default REST worker count on Linux: the detected core count, additionally
// capped by the process open-file limit so workers cannot exhaust descriptors.
uint64_t getDefaultRestWorkers() {
    const uint64_t openFileLimit = getMaxOpenFilesLimit();
    if (openFileLimit <= RESERVED_OPEN_FILES) {
        // No descriptor budget left for workers at all.
        return 0;
    }
    const uint64_t workersByFdBudget = (openFileLimit - RESERVED_OPEN_FILES) / OPEN_FILES_PER_REST_WORKER;
    const uint64_t workersByCores = static_cast<uint64_t>(AVAILABLE_CORES);
    return std::min(workersByCores, workersByFdBudget);
}
#else
// Non-Linux platforms: no open-file restriction, default to the core count.
uint64_t getDefaultRestWorkers() {
    return static_cast<uint64_t>(AVAILABLE_CORES);
}
#endif

Config& Config::parse(int argc, char** argv) {
ovms::CLIParser parser;
ovms::ServerSettingsImpl serverSettings;
Expand All @@ -73,6 +92,14 @@ Config& Config::parse(int argc, char** argv) {
// Stores the supplied server/model settings on this Config instance, then —
// when debug logging was requested — raises OpenVINO's own log verbosity via
// the OPENVINO_LOG_LEVEL environment variable before running validation.
// Returns the result of validate().
bool Config::parse(ServerSettingsImpl* serverSettings, ModelsSettingsImpl* modelsSettings) {
    this->serverSettings = *serverSettings;
    this->modelsSettings = *modelsSettings;

    // static: the guard lives for the remainder of the process. NOTE(review):
    // EnvGuard presumably restores the prior variable value on destruction —
    // confirm against utils/env_guard.hpp.
    static EnvGuard envGuard;
#if defined(__linux__) || defined(_WIN32)
    if (this->serverSettings.logLevel == "DEBUG") {
        // "4" — OpenVINO's numeric log level; assumed to be its debug verbosity
        // (TODO confirm against OpenVINO's OPENVINO_LOG_LEVEL documentation).
        envGuard.set("OPENVINO_LOG_LEVEL", "4");
    }
#endif

    return validate();
}

Expand Down Expand Up @@ -297,7 +324,8 @@ bool Config::validate() {
}

// check rest_workers value
if (((restWorkers() > MAX_REST_WORKERS) || (restWorkers() < 2))) {
const uint32_t restWorkersValue = restWorkers(); // Cache to avoid multiple calls
if (((restWorkersValue > MAX_REST_WORKERS) || (restWorkersValue < 2))) {
std::cerr << "rest_workers count should be from 2 to " << MAX_REST_WORKERS << std::endl;
return false;
}
Expand All @@ -306,6 +334,12 @@ bool Config::validate() {
std::cerr << "rest_workers is set but rest_port is not set. rest_port is required to start rest servers" << std::endl;
return false;
}
#ifdef __linux__
if (restWorkersValue > (getMaxOpenFilesLimit() - RESERVED_OPEN_FILES) / 6) {
std::cerr << "rest_workers count cannot be larger than " << (getMaxOpenFilesLimit() - RESERVED_OPEN_FILES) / 6 << " due to open files limit. Current open files limit: " << getMaxOpenFilesLimit() << std::endl;
return false;
}
#endif

#ifdef _WIN32
if (grpcWorkers() > WIN_MAX_GRPC_WORKERS) {
Expand Down Expand Up @@ -368,7 +402,7 @@ const std::string Config::restBindAddress() const { return this->serverSettings.
uint32_t Config::grpcWorkers() const { return this->serverSettings.grpcWorkers; }
uint32_t Config::grpcMaxThreads() const { return this->serverSettings.grpcMaxThreads.value_or(DEFAULT_GRPC_MAX_THREADS); }
size_t Config::grpcMemoryQuota() const { return this->serverSettings.grpcMemoryQuota.value_or(DEFAULT_GRPC_MEMORY_QUOTA); }
uint32_t Config::restWorkers() const { return this->serverSettings.restWorkers.value_or(DEFAULT_REST_WORKERS); }
uint32_t Config::restWorkers() const { return static_cast<uint32_t>(std::max(static_cast<uint64_t>(2), static_cast<uint64_t>(this->serverSettings.restWorkers.value_or(getDefaultRestWorkers())))); }
const std::string& Config::modelName() const { return this->modelsSettings.modelName; }
const std::string& Config::modelPath() const { return this->modelsSettings.modelPath; }
const std::string& Config::batchSize() const {
Expand Down
3 changes: 3 additions & 0 deletions src/llm/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,9 @@ ovms_cc_library(
"//src:httppayload",
"//src:libhttpclientconnection",
"//src:sse_utils",
"//src:libovms_systeminfo",
"//src:libovms_config",
"//src:libovms_ov_utils",
"//third_party:genai",] + select({
"//:disable_python": [],
"//:not_disable_python" : [":py_jinja_template_processor"],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <algorithm>
#include <fstream>
#include <memory>
#include <stdexcept>
Expand All @@ -32,10 +33,13 @@
#pragma GCC diagnostic pop
#pragma warning(pop)

#include "../../../config.hpp"
#include "../../../json_parser.hpp"
#include "../../../logging.hpp"
#include "../../../mediapipe_internal/mediapipe_utils.hpp"
#include "../../../ov_utils.hpp"
#include "../../../status.hpp"
#include "../../../systeminfo.hpp"
#include "llm_executor.hpp"
#include "servable.hpp"
#include "servable_initializer.hpp"
Expand Down Expand Up @@ -204,7 +208,26 @@ Status ContinuousBatchingServableInitializer::initialize(std::shared_ptr<GenAiSe
return status;
}

properties->tokenizerPluginConfig = {{"PERFORMANCE_HINT", "THROUGHPUT"}};
if (properties->device == "CPU") {
status = applyDefaultCpuProperties(properties->pluginConfig);
if (!status.ok()) {
SPDLOG_ERROR("Failed to apply default CPU properties for LLM model: {}", status.string());
return status;
}
}

ov::AnyMap tokenProperties;
const uint32_t tokenizerNumStreams = std::min(static_cast<uint32_t>(Config::instance().restWorkers()), static_cast<uint32_t>(getCoreCount()));
tokenProperties[ov::num_streams.name()] = static_cast<int>(tokenizerNumStreams);
tokenProperties[ov::hint::performance_mode.name()] = ov::hint::PerformanceMode::THROUGHPUT;
SPDLOG_DEBUG("Setting tokenizer/detokenizer NUM_STREAMS to: {}", tokenizerNumStreams);
status = applyDefaultCpuProperties(tokenProperties);
if (!status.ok()) {
SPDLOG_ERROR("Failed to apply default CPU properties for tokenizer: {}", status.string());
return status;
}
properties->tokenizerPluginConfig = tokenProperties;

try {
properties->pipeline = std::make_shared<ov::genai::ContinuousBatchingPipeline>(parsedModelsPath,
properties->schedulerConfig, properties->device,
Expand Down
22 changes: 21 additions & 1 deletion src/modelmanager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@
#include "schema.hpp"
#include "servable_definition.hpp"
#include "stringutils.hpp"
#include "systeminfo.hpp"

namespace ovms {

Expand All @@ -79,7 +80,6 @@ const std::string DEFAULT_MODEL_CACHE_DIRECTORY = "c:\\Intel\\openvino_cache";
const std::string DEFAULT_MODEL_CACHE_DIRECTORY = "/opt/cache";
#endif
ModelManager::ModelManager(const std::string& modelCacheDirectory, MetricRegistry* registry, PythonBackend* pythonBackend) :
ieCore(std::make_unique<ov::Core>()),
pipelineFactory(std::make_unique<PipelineFactory>()),
#if (MEDIAPIPE_DISABLE == 0)
mediapipeFactory(std::make_unique<MediapipeFactory>(pythonBackend)),
Expand All @@ -89,6 +89,20 @@ ModelManager::ModelManager(const std::string& modelCacheDirectory, MetricRegistr
modelCacheDirectory(modelCacheDirectory),
metricRegistry(registry),
pythonBackend(pythonBackend) {
try {
this->ieCore = std::make_unique<ov::Core>();
ov::AnyMap cpuProperties;
Status status = applyDefaultCpuProperties(cpuProperties);
if (!status.ok()) {
SPDLOG_CRITICAL("Failed to apply default CPU properties. Reason: {}", status.string());
throw std::runtime_error("Failed to apply default CPU properties");
}
this->ieCore->set_property("CPU", cpuProperties);
} catch (const std::exception& ex) {
SPDLOG_CRITICAL("Failed to initialize OpenVINO Core with CPU properties. Reason: {}", ex.what());
throw;
}

OV_LOGGER("ov::Core(): {}", reinterpret_cast<void*>(this->ieCore.get()));
// Take --cache_dir from CLI
if (this->modelCacheDirectory.empty()) {
Expand Down Expand Up @@ -151,6 +165,12 @@ ModelManager::ModelManager(const std::string& modelCacheDirectory, MetricRegistr
throw;
}
this->logPluginConfiguration();
#ifdef __linux__
if (isRunningInDocker()) {
SPDLOG_INFO("Running inside Docker container");
SPDLOG_INFO("cpu quota: {}, cpu affinity: {}, max_open_files: {}", getDockerCpuQuota(), getCpuAffinityCount(), getMaxOpenFilesLimit());
}
#endif
}

void ModelManager::logPluginConfiguration() {
Expand Down
47 changes: 47 additions & 0 deletions src/ov_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include "logging.hpp"
#include "profiler.hpp"
#include "status.hpp"
#include "systeminfo.hpp"
#include "tensorinfo.hpp"

namespace ovms {
Expand Down Expand Up @@ -148,4 +149,50 @@ Status validatePluginConfiguration(const plugin_config_t& pluginConfig, const st

return StatusCode::OK;
}

// Applies resource-aware CPU defaults to an OpenVINO property map.
// Only fills in values the caller has not already set:
//  - inference_num_threads: the detected core count;
//  - enable_cpu_pinning (Linux, only when running inside Docker): enabled
//    only when no Docker CPU quota is active;
//  - num_streams: the core count, only when PERFORMANCE_HINT resolves to
//    THROUGHPUT.
// Returns StatusCode::INTERNAL_ERROR if any step throws, StatusCode::OK otherwise.
Status applyDefaultCpuProperties(ov::AnyMap& properties) {
    try {
        // uint32_t matches how this codebase stores getCoreCount() elsewhere
        // (AVAILABLE_CORES in config.cpp); the previous uint16_t narrowed it.
        const uint32_t coreCount = getCoreCount();

        if (properties.find(ov::inference_num_threads.name()) == properties.end()) {
            properties[ov::inference_num_threads.name()] = static_cast<int>(coreCount);
            SPDLOG_DEBUG("applyDefaultCpuProperties: setting inference_num_threads to {}", coreCount);
        }

#ifdef __linux__
        if (properties.find(ov::hint::enable_cpu_pinning.name()) == properties.end()) {
            if (isRunningInDocker()) {
                // Pin only when no CPU quota is set (quota <= 0); with a quota the
                // container may be throttled/moved across cores. NOTE(review):
                // confirm getDockerCpuQuota()'s sentinel semantics.
                const bool cpuPinning = getDockerCpuQuota() <= 0;
                properties[ov::hint::enable_cpu_pinning.name()] = cpuPinning;
                SPDLOG_DEBUG("applyDefaultCpuProperties: setting enable_cpu_pinning to {}", cpuPinning);
            }
        }
#endif

        const auto perfIt = properties.find(ov::hint::performance_mode.name());
        if (perfIt != properties.end()) {
            // The hint may be stored either as the enum or as its string form;
            // try both, best-effort.
            bool isThroughput = false;
            try {
                isThroughput = (perfIt->second.as<ov::hint::PerformanceMode>() == ov::hint::PerformanceMode::THROUGHPUT);
            } catch (...) {
                try {
                    isThroughput = (perfIt->second.as<std::string>() == "THROUGHPUT");
                } catch (...) {
                    // Unknown representation: leave num_streams untouched.
                }
            }
            if (isThroughput && properties.find(ov::num_streams.name()) == properties.end()) {
                properties[ov::num_streams.name()] = static_cast<int>(coreCount);
                SPDLOG_DEBUG("applyDefaultCpuProperties: setting num_streams to {} (THROUGHPUT hint active)", coreCount);
            }
        }
    } catch (const std::exception& ex) {
        SPDLOG_ERROR("Exception while applying default CPU properties: {}", ex.what());
        return StatusCode::INTERNAL_ERROR;
    } catch (...) {
        SPDLOG_ERROR("Unknown exception while applying default CPU properties");
        return StatusCode::INTERNAL_ERROR;
    }
    return StatusCode::OK;
}

} // namespace ovms
8 changes: 8 additions & 0 deletions src/ov_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,13 @@ std::optional<ov::Layout> getLayoutFromRTMap(const ov::RTMap& rtMap);

Status validatePluginConfiguration(const plugin_config_t& pluginConfig, const std::string& targetDevice, const ov::Core& ieCore);

// Applies resource-aware CPU defaults to an OpenVINO property map.
// Sets inference_num_threads and (on Linux) enable_cpu_pinning only when not
// already present in the map. When PERFORMANCE_HINT=THROUGHPUT is set,
// num_streams is also capped to the detected core count if not already set.
// Returns StatusCode::INTERNAL_ERROR on any OpenVINO exception.
Status applyDefaultCpuProperties(ov::AnyMap& properties);

// Logging
// #1 model/global plugin CompiledMode:DUMMY / Global OpenVINO plugin:CPU
// #2 version/_
Expand Down Expand Up @@ -96,4 +103,5 @@ static void logOVPluginConfig(PropertyExtractor&& propertyExtractor, const std::
std::string pluginConfigNameValuesString = joins(pluginConfigNameValues, ", ");
SPDLOG_LOGGER_DEBUG(modelmanager_logger, "{}; {}plugin configuration: {{ {} }}", loggingAuthor, loggingDetails, pluginConfigNameValuesString);
}

} // namespace ovms
8 changes: 8 additions & 0 deletions src/sidepacket_servable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include "openvino/runtime/core.hpp"
#include "sidepacket_servable.hpp"
#include "logging.hpp"
#include "ov_utils.hpp"
#include <spdlog/spdlog.h>
#include <rapidjson/istreamwrapper.h>
#include <rapidjson/error/en.h>
Expand Down Expand Up @@ -134,6 +135,13 @@ void SidepacketServable::initialize(const std::string& modelDir, const std::stri
ov::Core core;
std::shared_ptr<ov::Model> m_model = core.read_model(parsedModelsPath / std::filesystem::path("openvino_model.xml"), {}, properties);
m_model = this->applyPrePostProcessing(core, m_model, properties);
if (targetDevice == "CPU") {
auto cpuPropertiesStatus = applyDefaultCpuProperties(properties);
if (!cpuPropertiesStatus.ok()) {
SPDLOG_ERROR("Failed to apply default CPU properties for embeddings model: {}", cpuPropertiesStatus.string());
return;
}
}
compiledModel = core.compile_model(m_model, targetDevice, properties);
SPDLOG_DEBUG("Model compiled {} for {}", parsedModelsPath.string(), targetDevice);

Expand Down
Loading