Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion src/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,7 @@ ovms_cc_library(
"libovms_cliparser",
"libovms_systeminfo",
"ovms_exit_codes",
"//src/utils:env_guard",
],
visibility = ["//visibility:public",],
additional_copts = COPTS_DROGON,
Expand Down Expand Up @@ -2076,6 +2077,7 @@ ovms_cc_library(
"libovmsshape",
"libovmsprofiler",
"libovms_tensorinfo",
"libovms_systeminfo",
],
visibility = ["//visibility:public"],
)
Expand Down Expand Up @@ -3109,7 +3111,8 @@ ovms_cc_library(
"@mediapipe//mediapipe/framework:calculator_framework",
"//third_party:openvino",
"@com_github_tencent_rapidjson//:rapidjson",
"//third_party:genai",],
"//third_party:genai",
"//src:libovms_ov_utils",],
visibility = ["//visibility:public"],
alwayslink = 1,
)
44 changes: 39 additions & 5 deletions src/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
// limitations under the License.
//*****************************************************************************
#include "config.hpp"

#include <algorithm>
#include <atomic>
#include <filesystem>
#include <limits>
#include <regex>
Expand All @@ -36,19 +37,37 @@
#include "modelconfig.hpp"
#include "stringutils.hpp"
#include "systeminfo.hpp"
#include "utils/env_guard.hpp"

namespace ovms {

const uint32_t AVAILABLE_CORES = getCoreCount();
const uint32_t WIN_MAX_GRPC_WORKERS = 1;
const uint32_t MAX_PORT_NUMBER = std::numeric_limits<uint16_t>::max();

// For drogon, we need to minimize the number of default workers since this value is set for both: unary and streaming (making it always double)
const uint64_t DEFAULT_REST_WORKERS = AVAILABLE_CORES;
const uint32_t DEFAULT_GRPC_MAX_THREADS = AVAILABLE_CORES * 8.0;
const size_t DEFAULT_GRPC_MEMORY_QUOTA = (size_t)2 * 1024 * 1024 * 1024; // 2GB
const uint64_t MAX_REST_WORKERS = 10'000;

// We need to minimize the number of default drogon workers since this value is set for both: unary and streaming (making it always double)
// on linux, restrict also based on the max allowed number of open files
#ifdef __linux__

// Descriptors held back from the worker budget for other process needs.
const uint64_t RESERVED_OPEN_FILES = 15;
// Per-worker descriptor cost: 5 used while initializing OVMS plus 2 per new connection.
const uint64_t OPEN_FILES_PER_REST_WORKER = 7;

// Default REST worker count on Linux: the detected core count, additionally
// capped by the process open-file limit so workers cannot exhaust descriptors.
uint64_t getDefaultRestWorkers() {
    const uint64_t openFileLimit = getMaxOpenFilesLimit();
    if (openFileLimit <= RESERVED_OPEN_FILES) {
        // No descriptor budget left for workers at all.
        return 0;
    }
    const uint64_t workersByFdBudget = (openFileLimit - RESERVED_OPEN_FILES) / OPEN_FILES_PER_REST_WORKER;
    const uint64_t workersByCores = static_cast<uint64_t>(AVAILABLE_CORES);
    return std::min(workersByCores, workersByFdBudget);
}
#else
// Non-Linux platforms: no open-file restriction, default to the core count.
uint64_t getDefaultRestWorkers() {
    return static_cast<uint64_t>(AVAILABLE_CORES);
}
#endif

Config& Config::parse(int argc, char** argv) {
ovms::CLIParser parser;
ovms::ServerSettingsImpl serverSettings;
Expand All @@ -73,6 +92,14 @@ Config& Config::parse(int argc, char** argv) {
// Stores the supplied server/model settings on this Config instance, then —
// when debug logging was requested — raises OpenVINO's own log verbosity via
// the OPENVINO_LOG_LEVEL environment variable before running validation.
// Returns the result of validate().
bool Config::parse(ServerSettingsImpl* serverSettings, ModelsSettingsImpl* modelsSettings) {
    this->serverSettings = *serverSettings;
    this->modelsSettings = *modelsSettings;

    // static: the guard lives for the remainder of the process. NOTE(review):
    // EnvGuard presumably restores the prior variable value on destruction —
    // confirm against utils/env_guard.hpp.
    static EnvGuard envGuard;
#if defined(__linux__) || defined(_WIN32)
    if (this->serverSettings.logLevel == "DEBUG") {
        // "4" — OpenVINO's numeric log level; assumed to be its debug verbosity
        // (TODO confirm against OpenVINO's OPENVINO_LOG_LEVEL documentation).
        envGuard.set("OPENVINO_LOG_LEVEL", "4");
    }
#endif

    return validate();
}

Expand Down Expand Up @@ -297,7 +324,8 @@ bool Config::validate() {
}

// check rest_workers value
if (((restWorkers() > MAX_REST_WORKERS) || (restWorkers() < 2))) {
const uint32_t restWorkersValue = restWorkers(); // Cache to avoid multiple calls
if (((restWorkersValue > MAX_REST_WORKERS) || (restWorkersValue < 2))) {
std::cerr << "rest_workers count should be from 2 to " << MAX_REST_WORKERS << std::endl;
return false;
}
Expand All @@ -306,6 +334,12 @@ bool Config::validate() {
std::cerr << "rest_workers is set but rest_port is not set. rest_port is required to start rest servers" << std::endl;
return false;
}
#ifdef __linux__
if (restWorkersValue > (getMaxOpenFilesLimit() - RESERVED_OPEN_FILES) / 6) {
std::cerr << "rest_workers count cannot be larger than " << (getMaxOpenFilesLimit() - RESERVED_OPEN_FILES) / 6 << " due to open files limit. Current open files limit: " << getMaxOpenFilesLimit() << std::endl;
return false;
}
#endif

#ifdef _WIN32
if (grpcWorkers() > WIN_MAX_GRPC_WORKERS) {
Expand Down Expand Up @@ -368,7 +402,7 @@ const std::string Config::restBindAddress() const { return this->serverSettings.
uint32_t Config::grpcWorkers() const { return this->serverSettings.grpcWorkers; }
uint32_t Config::grpcMaxThreads() const { return this->serverSettings.grpcMaxThreads.value_or(DEFAULT_GRPC_MAX_THREADS); }
size_t Config::grpcMemoryQuota() const { return this->serverSettings.grpcMemoryQuota.value_or(DEFAULT_GRPC_MEMORY_QUOTA); }
uint32_t Config::restWorkers() const { return this->serverSettings.restWorkers.value_or(DEFAULT_REST_WORKERS); }
uint32_t Config::restWorkers() const { return static_cast<uint32_t>(std::max(static_cast<uint64_t>(2), static_cast<uint64_t>(this->serverSettings.restWorkers.value_or(getDefaultRestWorkers())))); }
const std::string& Config::modelName() const { return this->modelsSettings.modelName; }
const std::string& Config::modelPath() const { return this->modelsSettings.modelPath; }
const std::string& Config::batchSize() const {
Expand Down
3 changes: 3 additions & 0 deletions src/llm/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,9 @@ ovms_cc_library(
"//src:httppayload",
"//src:libhttpclientconnection",
"//src:sse_utils",
"//src:libovms_systeminfo",
"//src:libovms_config",
"//src:libovms_ov_utils",
"//third_party:genai",] + select({
"//:disable_python": [],
"//:not_disable_python" : [":py_jinja_template_processor"],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <algorithm>
#include <fstream>
#include <memory>
#include <stdexcept>
Expand All @@ -32,10 +33,13 @@
#pragma GCC diagnostic pop
#pragma warning(pop)

#include "../../../config.hpp"
#include "../../../json_parser.hpp"
#include "../../../logging.hpp"
#include "../../../mediapipe_internal/mediapipe_utils.hpp"
#include "../../../ov_utils.hpp"
#include "../../../status.hpp"
#include "../../../systeminfo.hpp"
#include "llm_executor.hpp"
#include "servable.hpp"
#include "servable_initializer.hpp"
Expand Down Expand Up @@ -204,7 +208,26 @@ Status ContinuousBatchingServableInitializer::initialize(std::shared_ptr<GenAiSe
return status;
}

properties->tokenizerPluginConfig = {{"PERFORMANCE_HINT", "THROUGHPUT"}};
if (properties->device == "CPU") {
status = applyDefaultCpuProperties(properties->pluginConfig);
if (!status.ok()) {
SPDLOG_ERROR("Failed to apply default CPU properties for LLM model: {}", status.string());
return status;
}
}

ov::AnyMap tokenProperties;
const uint32_t tokenizerNumStreams = std::min(static_cast<uint32_t>(Config::instance().restWorkers()), static_cast<uint32_t>(getCoreCount()));
tokenProperties[ov::num_streams.name()] = static_cast<int>(tokenizerNumStreams);
tokenProperties[ov::hint::performance_mode.name()] = ov::hint::PerformanceMode::THROUGHPUT;
SPDLOG_DEBUG("Setting tokenizer/detokenizer NUM_STREAMS to: {}", tokenizerNumStreams);
status = applyDefaultCpuProperties(tokenProperties);
if (!status.ok()) {
SPDLOG_ERROR("Failed to apply default CPU properties for tokenizer: {}", status.string());
return status;
}
properties->tokenizerPluginConfig = tokenProperties;

try {
properties->pipeline = std::make_shared<ov::genai::ContinuousBatchingPipeline>(parsedModelsPath,
properties->schedulerConfig, properties->device,
Expand Down
22 changes: 21 additions & 1 deletion src/modelmanager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@
#include "schema.hpp"
#include "servable_definition.hpp"
#include "stringutils.hpp"
#include "systeminfo.hpp"

namespace ovms {

Expand All @@ -79,7 +80,6 @@ const std::string DEFAULT_MODEL_CACHE_DIRECTORY = "c:\\Intel\\openvino_cache";
const std::string DEFAULT_MODEL_CACHE_DIRECTORY = "/opt/cache";
#endif
ModelManager::ModelManager(const std::string& modelCacheDirectory, MetricRegistry* registry, PythonBackend* pythonBackend) :
ieCore(std::make_unique<ov::Core>()),
pipelineFactory(std::make_unique<PipelineFactory>()),
#if (MEDIAPIPE_DISABLE == 0)
mediapipeFactory(std::make_unique<MediapipeFactory>(pythonBackend)),
Expand All @@ -89,6 +89,20 @@ ModelManager::ModelManager(const std::string& modelCacheDirectory, MetricRegistr
modelCacheDirectory(modelCacheDirectory),
metricRegistry(registry),
pythonBackend(pythonBackend) {
try {
this->ieCore = std::make_unique<ov::Core>();
ov::AnyMap cpuProperties;
Status status = applyDefaultCpuProperties(cpuProperties);
if (!status.ok()) {
SPDLOG_CRITICAL("Failed to apply default CPU properties. Reason: {}", status.string());
throw std::runtime_error("Failed to apply default CPU properties");
}
this->ieCore->set_property("CPU", cpuProperties);
} catch (const std::exception& ex) {
SPDLOG_CRITICAL("Failed to initialize OpenVINO Core with CPU properties. Reason: {}", ex.what());
throw;
}

OV_LOGGER("ov::Core(): {}", reinterpret_cast<void*>(this->ieCore.get()));
// Take --cache_dir from CLI
if (this->modelCacheDirectory.empty()) {
Expand Down Expand Up @@ -151,6 +165,12 @@ ModelManager::ModelManager(const std::string& modelCacheDirectory, MetricRegistr
throw;
}
this->logPluginConfiguration();
#ifdef __linux__
if (isRunningInDocker()) {
SPDLOG_INFO("Running inside Docker container");
SPDLOG_INFO("cpu quota: {}, cpu affinity: {}, max_open_files: {}", getDockerCpuQuota(), getCpuAffinityCount(), getMaxOpenFilesLimit());
}
#endif
}

void ModelManager::logPluginConfiguration() {
Expand Down
47 changes: 47 additions & 0 deletions src/ov_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include "logging.hpp"
#include "profiler.hpp"
#include "status.hpp"
#include "systeminfo.hpp"
#include "tensorinfo.hpp"

namespace ovms {
Expand Down Expand Up @@ -148,4 +149,50 @@ Status validatePluginConfiguration(const plugin_config_t& pluginConfig, const st

return StatusCode::OK;
}

// Applies resource-aware CPU defaults to an OpenVINO property map.
// Only fills in values the caller has not already set:
//  - inference_num_threads: the detected core count;
//  - enable_cpu_pinning (Linux, only when running inside Docker): enabled
//    only when no Docker CPU quota is active;
//  - num_streams: the core count, only when PERFORMANCE_HINT resolves to
//    THROUGHPUT.
// Returns StatusCode::INTERNAL_ERROR if any step throws, StatusCode::OK otherwise.
Status applyDefaultCpuProperties(ov::AnyMap& properties) {
    try {
        // uint32_t matches how this codebase stores getCoreCount() elsewhere
        // (AVAILABLE_CORES in config.cpp); the previous uint16_t narrowed it.
        const uint32_t coreCount = getCoreCount();

        if (properties.find(ov::inference_num_threads.name()) == properties.end()) {
            properties[ov::inference_num_threads.name()] = static_cast<int>(coreCount);
            SPDLOG_DEBUG("applyDefaultCpuProperties: setting inference_num_threads to {}", coreCount);
        }

#ifdef __linux__
        if (properties.find(ov::hint::enable_cpu_pinning.name()) == properties.end()) {
            if (isRunningInDocker()) {
                // Pin only when no CPU quota is set (quota <= 0); with a quota the
                // container may be throttled/moved across cores. NOTE(review):
                // confirm getDockerCpuQuota()'s sentinel semantics.
                const bool cpuPinning = getDockerCpuQuota() <= 0;
                properties[ov::hint::enable_cpu_pinning.name()] = cpuPinning;
                SPDLOG_DEBUG("applyDefaultCpuProperties: setting enable_cpu_pinning to {}", cpuPinning);
            }
        }
#endif

        const auto perfIt = properties.find(ov::hint::performance_mode.name());
        if (perfIt != properties.end()) {
            // The hint may be stored either as the enum or as its string form;
            // try both, best-effort.
            bool isThroughput = false;
            try {
                isThroughput = (perfIt->second.as<ov::hint::PerformanceMode>() == ov::hint::PerformanceMode::THROUGHPUT);
            } catch (...) {
                try {
                    isThroughput = (perfIt->second.as<std::string>() == "THROUGHPUT");
                } catch (...) {
                    // Unknown representation: leave num_streams untouched.
                }
            }
            if (isThroughput && properties.find(ov::num_streams.name()) == properties.end()) {
                properties[ov::num_streams.name()] = static_cast<int>(coreCount);
                SPDLOG_DEBUG("applyDefaultCpuProperties: setting num_streams to {} (THROUGHPUT hint active)", coreCount);
            }
        }
    } catch (const std::exception& ex) {
        SPDLOG_ERROR("Exception while applying default CPU properties: {}", ex.what());
        return StatusCode::INTERNAL_ERROR;
    } catch (...) {
        SPDLOG_ERROR("Unknown exception while applying default CPU properties");
        return StatusCode::INTERNAL_ERROR;
    }
    return StatusCode::OK;
}

} // namespace ovms
8 changes: 8 additions & 0 deletions src/ov_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,13 @@ std::optional<ov::Layout> getLayoutFromRTMap(const ov::RTMap& rtMap);

Status validatePluginConfiguration(const plugin_config_t& pluginConfig, const std::string& targetDevice, const ov::Core& ieCore);

// Applies resource-aware CPU defaults to an OpenVINO property map.
// Sets inference_num_threads and (on Linux) enable_cpu_pinning only when not
// already present in the map. When PERFORMANCE_HINT=THROUGHPUT is set,
// num_streams is also capped to the detected core count if not already set.
// Returns StatusCode::INTERNAL_ERROR on any OpenVINO exception.
Status applyDefaultCpuProperties(ov::AnyMap& properties);

// Logging
// #1 model/global plugin CompiledMode:DUMMY / Global OpenVINO plugin:CPU
// #2 version/_
Expand Down Expand Up @@ -96,4 +103,5 @@ static void logOVPluginConfig(PropertyExtractor&& propertyExtractor, const std::
std::string pluginConfigNameValuesString = joins(pluginConfigNameValues, ", ");
SPDLOG_LOGGER_DEBUG(modelmanager_logger, "{}; {}plugin configuration: {{ {} }}", loggingAuthor, loggingDetails, pluginConfigNameValuesString);
}

} // namespace ovms
8 changes: 8 additions & 0 deletions src/sidepacket_servable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include "openvino/runtime/core.hpp"
#include "sidepacket_servable.hpp"
#include "logging.hpp"
#include "ov_utils.hpp"
#include <spdlog/spdlog.h>
#include <rapidjson/istreamwrapper.h>
#include <rapidjson/error/en.h>
Expand Down Expand Up @@ -134,6 +135,13 @@ void SidepacketServable::initialize(const std::string& modelDir, const std::stri
ov::Core core;
std::shared_ptr<ov::Model> m_model = core.read_model(parsedModelsPath / std::filesystem::path("openvino_model.xml"), {}, properties);
m_model = this->applyPrePostProcessing(core, m_model, properties);
if (targetDevice == "CPU") {
auto cpuPropertiesStatus = applyDefaultCpuProperties(properties);
if (!cpuPropertiesStatus.ok()) {
SPDLOG_ERROR("Failed to apply default CPU properties for embeddings model: {}", cpuPropertiesStatus.string());
return;
}
}
compiledModel = core.compile_model(m_model, targetDevice, properties);
SPDLOG_DEBUG("Model compiled {} for {}", parsedModelsPath.string(), targetDevice);

Expand Down
Loading