diff --git a/docs/model_server_rest_api_chat.md b/docs/model_server_rest_api_chat.md
index e2b4d50454..b5d347d200 100644
--- a/docs/model_server_rest_api_chat.md
+++ b/docs/model_server_rest_api_chat.md
@@ -221,6 +221,7 @@ Some parameters, especially related to sampling (like `temperature`, `top_p` etc
 | tool_choice | ✅ | ✅ | ✅ | string or object | Controls which (if any) tool is called by the model. `none` means the model will not call any tool and instead generates a message. `auto` means the model can pick between generating a message or calling one or more tools. `required` means that model should call at least one tool. Specifying a particular tool via `{"type": "function", "function": {"name": "my_function"}}` forces the model to call that tool. See [OpenAI API reference](https://platform.openai.com/docs/api-reference/chat/create#chat-create-tool_choice) for more details. |
 | response_format | ✅ | ✅ | ✅ | object | An object specifying the format that the model must output. Setting to `{ "type": "json_schema", "json_schema": {...} }` enables Structured Outputs which ensures the model will match your supplied JSON schema according to [OpenAI reference](https://platform.openai.com/docs/api-reference/chat/create#chat-create-response_format). Learn more in the [Structured Outputs demo](../demos/continuous_batching/structured_output/README.md). Additionally, `response_format` can accept [XGrammar structural tags format](https://github.com/mlc-ai/xgrammar/blob/main/docs/tutorials/structural_tag.md#format-types) (not part of OpenAI API). For example: `{ "type": "const_string", "value": "Hello World!" }`. **Note** that if model server fails to process the format, the request will still be processed, but the format will not be imposed. |
 | chat_template_kwargs | ✅ | ❌ | ✅ | object | Enables passing additional parameters to chat template engine. Example `{"enable_thinking": false}`. Note that values like `messages`, `eos_token`, `bos_token` etc. are provided natively to the template engine, so including them in `chat_template_kwargs` will cause error. |
+| skip_special_tokens | ✅ | ❌ | ✅ | bool (default: `true`) | Whether to remove special tokens (e.g. `<\|endoftext\|>`, `<\|im_end\|>`) from the generated output. Set to `false` to include them, which is useful when the model uses special tokens to encode structured information (e.g. bounding boxes, reasoning markers). When `false`, any tool or reasoning parser configured on the endpoint is silently disabled for the request, so the raw token stream is returned. This option works with most detokenizers exported with OpenVINO Tokenizers 2024.5 or later, unless they are based on custom ops. |

 #### Beam search sampling specific
 | Param | OpenVINO Model Server | OpenAI /chat/completions API | vLLM Serving Sampling Params | Type | Description |
@@ -281,7 +282,6 @@ If any of those parameters is not specified and request is made to Prompt Lookup
 - min_tokens
 - prompt_logprobs
 - detokenize
-- skip_special_tokens
 - spaces_between_special_tokens
 - logits_processors
 - truncate_prompt_tokens
diff --git a/docs/model_server_rest_api_completions.md b/docs/model_server_rest_api_completions.md
index 1a8ba9257d..5089d068b1 100644
--- a/docs/model_server_rest_api_completions.md
+++ b/docs/model_server_rest_api_completions.md
@@ -62,6 +62,7 @@ curl http://localhost/v3/completions \
 | include_stop_str_in_output | ✅ | ❌ | ✅ | bool (default: `false` if `stream=false`, `true` if `stream=true`) | Whether to include matched stop string in output. Setting it to false when `stream=true` is invalid configuration and will result in error. |
 | logprobs | ⚠️ | ✅ | ✅ | integer (optional) | Include the log probabilities on the logprob of the returned output token. **_ in stream mode logprobs are not returned. Only value 1 is accepted which returns logarithm or the chosen token _** |
 | echo | ✅ | ✅ | ✅ | boolean (optional) | Echo back the prompt in addition to the completion |
+| skip_special_tokens | ✅ | ❌ | ✅ | bool (default: `true`) | Whether to remove special tokens (e.g. `<\|endoftext\|>`, `<\|im_end\|>`) from the generated output. Set to `false` to include them, which is useful when the model uses special tokens to encode structured information. This option works with most detokenizers exported with OpenVINO Tokenizers 2024.5 or later, unless they are based on custom ops. |

 #### Beam search sampling specific
 | Param | OpenVINO Model Server | OpenAI /completions API | vLLM Serving Sampling Params | Type | Description |
@@ -112,7 +113,6 @@ Note that below parameters are valid only for prompt lookup pipeline. Add `"prom
 - min_tokens
 - prompt_logprobs
 - detokenize
-- skip_special_tokens
 - spaces_between_special_tokens
 - logits_processors
 - truncate_prompt_tokens
diff --git a/docs/model_server_rest_api_responses.md b/docs/model_server_rest_api_responses.md
index ee7a84f07b..c27a643813 100644
--- a/docs/model_server_rest_api_responses.md
+++ b/docs/model_server_rest_api_responses.md
@@ -105,6 +105,7 @@ curl http://localhost/v3/responses \
 | tool_choice | ✅ | ✅ | string or object (optional) | Controls which (if any) tool is called by the model. `none` means the model will not call any tool and instead generates a message. `auto` means the model can pick between generating a message or calling one or more tools. `required` means that model should call at least one tool. Specifying a particular function via `{"type": "function", "function": {"name": "my_function"}}` forces the model to call that tool. |
 | reasoning | ⚠️ | ✅ | object (optional) | Configuration for reasoning/thinking mode. The `effort` field accepts `"low"`, `"medium"`, or `"high"` — any value enables thinking mode (`enable_thinking: true` is injected into chat template kwargs). The `summary` field is accepted but ignored. |
 | chat_template_kwargs | ✅ | ❌ | object (optional) | Additional keyword arguments passed to the chat template. When `reasoning` is also provided, `enable_thinking: true` is merged into these kwargs. |
+| skip_special_tokens | ✅ | ❌ | bool (default: `true`) | Whether to remove special tokens (e.g. `<\|endoftext\|>`, `<\|im_end\|>`) from the generated output. Set to `false` to include them, which is useful when the model uses special tokens to encode structured information (e.g. bounding boxes, reasoning markers). When `false`, any tool or reasoning parser configured on the endpoint is silently disabled for the request, so the raw token stream is returned. This option works with most detokenizers exported with OpenVINO Tokenizers 2024.5 or later, unless they are based on custom ops. |
 | stream_options | ❌ | ❌ | | Not supported in Responses API. Usage statistics are always included in the `response.completed` event. |

 #### Beam search sampling specific
diff --git a/src/llm/apis/openai_api_handler.cpp b/src/llm/apis/openai_api_handler.cpp
index c2cc3cd653..dc7b9031d8 100644
--- a/src/llm/apis/openai_api_handler.cpp
+++ b/src/llm/apis/openai_api_handler.cpp
@@ -530,7 +530,7 @@ ParsedOutput OpenAIApiHandler::parseOutputIfNeeded(const std::vector<int64_t>& g
     OVMS_PROFILE_FUNCTION();
     ParsedOutput parsedOutput;
     if ((endpoint != Endpoint::CHAT_COMPLETIONS && endpoint != Endpoint::RESPONSES) || outputParser == nullptr) {
-        parsedOutput.content = this->tokenizer.decode(generatedIds);
+        parsedOutput.content = this->tokenizer.decode(generatedIds, ov::genai::skip_special_tokens(request.skipSpecialTokens));
     } else {
         parsedOutput = outputParser->parse(generatedIds, this->areToolsAvailable());
     }
@@ -853,6 +853,17 @@ absl::Status OpenAIApiHandler::parseCommonPart(std::optional<uint32_t> maxTokens
     if (maxNgramSizeItHasValue) {
         request.maxNgramSize = maxNgramSizeIt->value.GetUint();
     }
+
+    it = doc.FindMember("skip_special_tokens");
+    if (it != doc.MemberEnd() && !it->value.IsNull()) {
+        if (!it->value.IsBool())
+            return absl::InvalidArgumentError("skip_special_tokens is not a bool");
+        request.skipSpecialTokens = it->value.GetBool();
+    }
+    if (!request.skipSpecialTokens && outputParser != nullptr) {
+        outputParser.reset();
+    }
+
     request.maxModelLength = maxModelLength;

     // TODO: logit_bias
diff --git a/src/llm/apis/openai_api_handler.hpp b/src/llm/apis/openai_api_handler.hpp
index 9071e6addc..7c56bcbf95 100644
--- a/src/llm/apis/openai_api_handler.hpp
+++ b/src/llm/apis/openai_api_handler.hpp
@@ -164,7 +164,7 @@ class OpenAIApiHandler {
     // Serialization - pure virtual, each handler produces its own response format
     virtual std::string serializeUnaryResponse(const std::vector<ov::genai::GenerationOutput>& generationOutputs) = 0;
     virtual std::string serializeUnaryResponse(ov::genai::EncodedResults& results) = 0;
-    virtual std::string serializeUnaryResponse(ov::genai::VLMDecodedResults& results) = 0;
+    virtual std::string serializeUnaryResponse(ov::genai::VLMDecodedResults& results, const std::string& textResponse) = 0;
     virtual std::string serializeStreamingChunk(const std::string& chunkResponse, ov::genai::GenerationFinishReason finishReason) = 0;
     virtual std::string serializeStreamingUsageChunk() = 0;
     virtual std::string serializeStreamingHandshakeChunk() = 0;
diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp
index 433ce59c3f..810fcdc50a 100644
--- a/src/llm/apis/openai_completions.cpp
+++ b/src/llm/apis/openai_completions.cpp
@@ -315,7 +315,7 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(const std::vect
     jsonResponse.StartArray("content");
     for (int i = 0; i < generationOutput.generated_ids.size(); i++) {
-        std::string token = tokenizer.decode(std::vector<int64_t>({generationOutput.generated_ids[i]}));
+        std::string token = tokenizer.decode(std::vector<int64_t>({generationOutput.generated_ids[i]}), ov::genai::skip_special_tokens(this->request.skipSpecialTokens));
         float logprob = generationOutput.generated_log_probs[i];
         jsonResponse.LogprobObject(token, logprob);
     }
@@ -324,7 +324,7 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(const std::vect
     if (endpoint == Endpoint::COMPLETIONS) {
         jsonResponse.StartArray("tokens");
         for (int i = 0; i < generationOutput.generated_ids.size(); i++) {
-            std::string token = tokenizer.decode(std::vector<int64_t>({generationOutput.generated_ids[i]}));
+            std::string token = tokenizer.decode(std::vector<int64_t>({generationOutput.generated_ids[i]}), ov::genai::skip_special_tokens(this->request.skipSpecialTokens));
             jsonResponse.String(token);
         }
         jsonResponse.EndArray();
@@ -339,7 +339,7 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(const std::vect
         jsonResponse.StartArray("top_logprobs");
         for (int i = 0; i < generationOutput.generated_ids.size(); i++) {
             jsonResponse.StartObject();
-            std::string token = tokenizer.decode(std::vector<int64_t>({generationOutput.generated_ids[i]}));
+            std::string token = tokenizer.decode(std::vector<int64_t>({generationOutput.generated_ids[i]}), ov::genai::skip_special_tokens(this->request.skipSpecialTokens));
             float logprob = generationOutput.generated_log_probs[i];
             jsonResponse.Logprob(token, logprob);
             jsonResponse.EndObject();
@@ -351,7 +351,7 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(const std::vect
             if (i == 0) {
                 jsonResponse.TextOffsetValue(0);
             } else {
-                std::string textBeforeToken = tokenizer.decode(std::vector<int64_t>({generationOutput.generated_ids.begin(), generationOutput.generated_ids.begin() + i}));
+                std::string textBeforeToken = tokenizer.decode(std::vector<int64_t>({generationOutput.generated_ids.begin(), generationOutput.generated_ids.begin() + i}), ov::genai::skip_special_tokens(this->request.skipSpecialTokens));
                 jsonResponse.TextOffsetValue(textBeforeToken.size());
             }
         }
@@ -458,7 +458,7 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::Enco
     return jsonResponse.ToString();
 }

-std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::VLMDecodedResults& results) {
+std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::VLMDecodedResults& results, const std::string& textResponse) {
     OVMS_PROFILE_FUNCTION();
     usage.promptTokens = results.perf_metrics.get_num_input_tokens();
     usage.completionTokens = results.perf_metrics.get_num_generated_tokens();
@@ -470,13 +470,12 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::VLMD
     jsonResponse.StartArray("choices");
     int index = 0;
-    for (int i = 0; i < results.texts.size(); i++) {
-        const std::string& text = results.texts[i];
-        SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Generated text: {}", text);
+    if (!textResponse.empty()) {
+        SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Generated text: {}", textResponse);
         // Workaround to use OVMS unary parsers: get tokens from string
         // This way we have detokenized text from GenAI and calculate tokens, to further convert back to text again, in parseOutputIfNeeded...
-        auto generatedTokens = encodeTextToTokens(text);
+        auto generatedTokens = encodeTextToTokens(textResponse);
         SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Generated tokens: {}", generatedTokens);

         ParsedOutput parsedOutput = parseOutputIfNeeded(generatedTokens);
diff --git a/src/llm/apis/openai_completions.hpp b/src/llm/apis/openai_completions.hpp
index 9ebe529637..cbb8f2645f 100644
--- a/src/llm/apis/openai_completions.hpp
+++ b/src/llm/apis/openai_completions.hpp
@@ -39,7 +39,7 @@ class OpenAIChatCompletionsHandler : public OpenAIApiHandler {
     std::string serializeUnaryResponse(const std::vector<ov::genai::GenerationOutput>& generationOutputs) override;
     std::string serializeUnaryResponse(ov::genai::EncodedResults& results) override;
-    std::string serializeUnaryResponse(ov::genai::VLMDecodedResults& results) override;
+    std::string serializeUnaryResponse(ov::genai::VLMDecodedResults& results, const std::string& textResponse) override;
     std::string serializeStreamingChunk(const std::string& chunkResponse, ov::genai::GenerationFinishReason finishReason) override;
     std::string serializeStreamingUsageChunk() override;
     std::string serializeStreamingHandshakeChunk() override;
diff --git a/src/llm/apis/openai_request.hpp b/src/llm/apis/openai_request.hpp
index e1d05282eb..8ffef1da1d 100644
--- a/src/llm/apis/openai_request.hpp
+++ b/src/llm/apis/openai_request.hpp
@@ -81,6 +81,8 @@ struct OpenAIRequest {
     // Holds value for tool_choice field as described in https://platform.openai.com/docs/api-reference/chat/create#chat_create-tool_choice
     std::string toolChoice;

+    bool skipSpecialTokens{true};
+
     OpenAIRequest() = default;
     ~OpenAIRequest() = default;
 };
diff --git a/src/llm/apis/openai_responses.cpp b/src/llm/apis/openai_responses.cpp
index e5d63985e6..49703c0fc2 100644
--- a/src/llm/apis/openai_responses.cpp
+++ b/src/llm/apis/openai_responses.cpp
@@ -655,21 +655,21 @@ std::string OpenAIResponsesHandler::serializeUnaryResponse(ov::genai::EncodedRes
     return serializeUnaryResponseImpl(parsedOutputs);
 }

-std::string OpenAIResponsesHandler::serializeUnaryResponse(ov::genai::VLMDecodedResults& results) {
+std::string OpenAIResponsesHandler::serializeUnaryResponse(ov::genai::VLMDecodedResults& results, const std::string& textResponse) {
     OVMS_PROFILE_FUNCTION();
     usage.promptTokens = results.perf_metrics.get_num_input_tokens();
     usage.completionTokens = results.perf_metrics.get_num_generated_tokens();
     // Usage is already correctly set from perf_metrics above — no need for updateUsage.

     std::vector<ParsedOutput> parsedOutputs;
-    for (const std::string& text : results.texts) {
+    if (!textResponse.empty()) {
         if (outputParser != nullptr) {
             // Same workaround as in chat completions
-            auto generatedTokens = encodeTextToTokens(text);
+            auto generatedTokens = encodeTextToTokens(textResponse);
             parsedOutputs.push_back(parseOutputIfNeeded(generatedTokens));
         } else {
             // Fast path: no output parser, use decoded text directly.
             ParsedOutput output;
-            output.content = text;
+            output.content = textResponse;
             parsedOutputs.push_back(std::move(output));
         }
     }
diff --git a/src/llm/apis/openai_responses.hpp b/src/llm/apis/openai_responses.hpp
index 0e5fd892b7..6a10400952 100644
--- a/src/llm/apis/openai_responses.hpp
+++ b/src/llm/apis/openai_responses.hpp
@@ -97,7 +97,7 @@ class OpenAIResponsesHandler : public OpenAIApiHandler {
     std::string serializeUnaryResponse(const std::vector<ov::genai::GenerationOutput>& generationOutputs) override;
     std::string serializeUnaryResponse(ov::genai::EncodedResults& results) override;
-    std::string serializeUnaryResponse(ov::genai::VLMDecodedResults& results) override;
+    std::string serializeUnaryResponse(ov::genai::VLMDecodedResults& results, const std::string& textResponse) override;
     std::string serializeStreamingChunk(const std::string& chunkResponse, ov::genai::GenerationFinishReason finishReason) override;
     std::string serializeStreamingUsageChunk() override;
     std::string serializeStreamingHandshakeChunk() override;
diff --git a/src/llm/language_model/legacy/servable.cpp b/src/llm/language_model/legacy/servable.cpp
index aadbf9e683..4234088a2a 100644
--- a/src/llm/language_model/legacy/servable.cpp
+++ b/src/llm/language_model/legacy/servable.cpp
@@ -114,8 +114,9 @@ absl::Status LegacyServable::parseRequest(std::shared_ptr
     if (legacyExecutionContext->apiHandler->isStream() &&
-        legacyExecutionContext->apiHandler->getOutputParser() != nullptr &&
-        (legacyExecutionContext->apiHandler->getOutputParser()->requiresStreamingWithSpecialTokens())) {
+        ((legacyExecutionContext->apiHandler->getOutputParser() != nullptr &&
+             legacyExecutionContext->apiHandler->getOutputParser()->requiresStreamingWithSpecialTokens()) ||
+            !legacyExecutionContext->apiHandler->getRequest().skipSpecialTokens)) {
         streamerConfig.insert(ov::genai::skip_special_tokens(false));
     }
     legacyExecutionContext->textStreamer = std::make_shared<ov::genai::TextStreamer>(getProperties()->tokenizer, callback, streamerConfig);
diff --git a/src/llm/servable.cpp b/src/llm/servable.cpp
index e2ccd06e78..5a0955b4f5 100644
--- a/src/llm/servable.cpp
+++ b/src/llm/servable.cpp
@@ -146,8 +146,9 @@ absl::Status GenAiServable::parseRequest(std::shared_ptr
-    if (executionContext->apiHandler->getOutputParser() != nullptr &&
-        (executionContext->apiHandler->getOutputParser()->requiresStreamingWithSpecialTokens())) {
+    if ((executionContext->apiHandler->getOutputParser() != nullptr &&
+            executionContext->apiHandler->getOutputParser()->requiresStreamingWithSpecialTokens()) ||
+        !executionContext->apiHandler->getRequest().skipSpecialTokens) {
         streamerConfig.insert(ov::genai::skip_special_tokens(false));
     }
     executionContext->textStreamer = std::make_shared<ov::genai::TextStreamer>(getProperties()->tokenizer, callback, streamerConfig);
diff --git a/src/llm/visual_language_model/legacy/servable.cpp b/src/llm/visual_language_model/legacy/servable.cpp
index bc3ecaf71f..6297745360 100644
--- a/src/llm/visual_language_model/legacy/servable.cpp
+++ b/src/llm/visual_language_model/legacy/servable.cpp
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include
 #include

 #include "../../../logging.hpp"
@@ -110,14 +111,16 @@ absl::Status VisualLanguageModelLegacyServable::parseRequest(std::shared_ptr
     auto callback = [&executionInProgress = legacyExecutionContext->executionInProgress,
                         &mutex = legacyExecutionContext->mutex,
                         &lastStreamerCallbackOutput = legacyExecutionContext->lastStreamerCallbackOutput,
-                        &clientDisconnected = legacyExecutionContext->clientDisconnected,
-                        streamMode = legacyExecutionContext->apiHandler->isStream()](std::string text) {
+                        &clientDisconnected = legacyExecutionContext->clientDisconnected](std::string text) {
        SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Streamer callback executed with text: [{}]", text);
         if (clientDisconnected.load()) {
             executionInProgress.notify_one();
             return ov::genai::StreamingStatus::CANCEL;
         }
-        if (streamMode) {
+
+        // TODO(mzegla): unconditional streaming-like behavior also for the unary flow due to GenAI generate limitations.
+        // This diverges from the general flow - we should have a unified, systematic approach.
+        {
             std::lock_guard<std::mutex> lock(mutex);
             lastStreamerCallbackOutput += text;
             executionInProgress.notify_one();
@@ -125,8 +128,9 @@ absl::Status VisualLanguageModelLegacyServable::parseRequest(std::shared_ptr
-    if (legacyExecutionContext->apiHandler->getOutputParser() != nullptr &&
-        (legacyExecutionContext->apiHandler->getOutputParser()->requiresStreamingWithSpecialTokens())) {
+    if ((legacyExecutionContext->apiHandler->getOutputParser() != nullptr &&
+            legacyExecutionContext->apiHandler->getOutputParser()->requiresStreamingWithSpecialTokens()) ||
+        !legacyExecutionContext->apiHandler->getRequest().skipSpecialTokens) {
         streamerConfig.insert(ov::genai::skip_special_tokens(false));
     }
     legacyExecutionContext->textStreamer = std::make_shared<ov::genai::TextStreamer>(getProperties()->tokenizer, callback, streamerConfig);
@@ -178,7 +182,20 @@ absl::Status VisualLanguageModelLegacyServable::prepareCompleteResponse(std::sha
     if (legacyExecutionContext->payload.client->isDisconnected()) {
         return absl::CancelledError();
     }
-    executionContext->response = executionContext->apiHandler->serializeUnaryResponse(legacyExecutionContext->results);
+
+    // TODO(mzegla): Usage of the streaming flow here due to GenAI generate limitations.
+    // This diverges from the general flow - we should have a unified, systematic approach.
+
+    executionContext->textStreamer->end();
+
+    std::string completeText;
+    {
+        std::lock_guard<std::mutex> lock(legacyExecutionContext->mutex);
+        completeText = std::move(executionContext->lastStreamerCallbackOutput);
+        executionContext->lastStreamerCallbackOutput.clear();
+    }
+    executionContext->response = executionContext->apiHandler->serializeUnaryResponse(legacyExecutionContext->results, completeText);
+
     SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Complete unary response: {}", executionContext->response);
     return absl::OkStatus();
 }
diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp
index 17813f5a2c..c3a40cba3c 100644
--- a/src/test/http_openai_handler_test.cpp
+++ b/src/test/http_openai_handler_test.cpp
@@ -451,6 +451,16 @@ class HttpOpenAIHandlerCommonParsingValidationTest : public HttpOpenAIHandlerPar
         }
         return "{\"model\":2,\"messages\":[{\"role\":\"user\",\"content\":\"valid prompt\"}]}";
     }
+
+    std::string createRequestWithSkipSpecialTokensRawValue(const std::string& rawValue) const {
+        if (endpoint() == ovms::Endpoint::COMPLETIONS) {
+            return "{\"model\":\"llama\",\"prompt\":\"valid prompt\",\"skip_special_tokens\":" + rawValue + "}";
+        }
+        if (endpoint() == ovms::Endpoint::RESPONSES) {
+            return "{\"model\":\"llama\",\"input\":\"valid prompt\",\"skip_special_tokens\":" + rawValue + "}";
+        }
+        return "{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"valid prompt\"}],\"skip_special_tokens\":" + rawValue + "}";
+    }
 };

 TEST_P(HttpOpenAIHandlerCommonParsingValidationTest, StreamFieldNotABooleanFails) {
@@ -492,6 +502,116 @@ TEST_P(HttpOpenAIHandlerCommonParsingValidationTest, ModelFieldNotStringFails) {
     EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("model is not a string"));
 }

+TEST_P(HttpOpenAIHandlerCommonParsingValidationTest, SkipSpecialTokensDefaultIsTrue) {
+    std::string jsonWithoutField;
+    if (endpoint() == ovms::Endpoint::COMPLETIONS) {
+        jsonWithoutField = "{\"model\":\"llama\",\"prompt\":\"valid prompt\"}";
+    } else if (endpoint() == ovms::Endpoint::RESPONSES) {
+        jsonWithoutField = "{\"model\":\"llama\",\"input\":\"valid prompt\"}";
+    } else {
+        jsonWithoutField = "{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"valid prompt\"}]}";
+    }
+    doc.Parse(jsonWithoutField.c_str());
+    ASSERT_FALSE(doc.HasParseError());
+
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    auto apiHandler = createHandler(endpoint());
+
+    ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+    EXPECT_TRUE(apiHandler->getRequest().skipSpecialTokens);
+}
+
+TEST_P(HttpOpenAIHandlerCommonParsingValidationTest, SkipSpecialTokensFalseNoParser) {
+    std::string json = createRequestWithSkipSpecialTokensRawValue("false");
+    doc.Parse(json.c_str());
+    ASSERT_FALSE(doc.HasParseError());
+
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    auto apiHandler = createHandler(endpoint());
+
+    EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+    EXPECT_FALSE(apiHandler->getRequest().skipSpecialTokens);
+    EXPECT_EQ(apiHandler->getOutputParser(), nullptr);
+}
+
+TEST_P(HttpOpenAIHandlerCommonParsingValidationTest, SkipSpecialTokensTrueWithToolParser) {
+    std::string json = createRequestWithSkipSpecialTokensRawValue("true");
+    doc.Parse(json.c_str());
+    ASSERT_FALSE(doc.HasParseError());
+
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    auto apiHandler = createHandler(endpoint(), "llama3");
+
+    EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+    EXPECT_TRUE(apiHandler->getRequest().skipSpecialTokens);
+    EXPECT_NE(apiHandler->getOutputParser(), nullptr);
+}
+
+TEST_P(HttpOpenAIHandlerCommonParsingValidationTest, SkipSpecialTokensFalseWithToolParserSilentlyDisablesParser) {
+    std::string json = createRequestWithSkipSpecialTokensRawValue("false");
+    doc.Parse(json.c_str());
+    ASSERT_FALSE(doc.HasParseError());
+
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    auto apiHandler = createHandler(endpoint(), "llama3");
+
+    EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+    EXPECT_FALSE(apiHandler->getRequest().skipSpecialTokens);
+    EXPECT_EQ(apiHandler->getOutputParser(), nullptr);
+}
+
+TEST_P(HttpOpenAIHandlerCommonParsingValidationTest, SkipSpecialTokensFalseWithReasoningParserSilentlyDisablesParser) {
+    std::string json = createRequestWithSkipSpecialTokensRawValue("false");
+    doc.Parse(json.c_str());
+    ASSERT_FALSE(doc.HasParseError());
+
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    auto apiHandler = createHandler(endpoint(), "", "qwen3");
+
+    EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+    EXPECT_FALSE(apiHandler->getRequest().skipSpecialTokens);
+    EXPECT_EQ(apiHandler->getOutputParser(), nullptr);
+}
+
+TEST_P(HttpOpenAIHandlerCommonParsingValidationTest, SkipSpecialTokensFalseWithBothParsersSilentlyDisablesParsers) {
+    std::string json = createRequestWithSkipSpecialTokensRawValue("false");
+    doc.Parse(json.c_str());
+    ASSERT_FALSE(doc.HasParseError());
+
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    auto apiHandler = createHandler(endpoint(), "llama3", "qwen3");
+
+    EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+    EXPECT_FALSE(apiHandler->getRequest().skipSpecialTokens);
+    EXPECT_EQ(apiHandler->getOutputParser(), nullptr);
+}
+
+TEST_P(HttpOpenAIHandlerCommonParsingValidationTest, SkipSpecialTokensNotBoolFails) {
+    std::string json = createRequestWithSkipSpecialTokensRawValue("\"yes\"");
+    doc.Parse(json.c_str());
+    ASSERT_FALSE(doc.HasParseError());
+
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    auto apiHandler = createHandler(endpoint());
+
+    EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength),
+        absl::InvalidArgumentError("skip_special_tokens is not a bool"));
+}
+
 INSTANTIATE_TEST_SUITE_P(
     CommonParsingValidation,
     HttpOpenAIHandlerCommonParsingValidationTest,
@@ -929,7 +1049,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseVLMSupportsToolCallsF
     ov::genai::VLMDecodedResults results;
     std::string toolCall = R"({"name": "example_tool", "arguments": {"arg1": "value1", "arg2": 42}})";
     results.texts = {toolCall};
-    std::string serialized = apiHandler->serializeUnaryResponse(results);
+    std::string serialized = apiHandler->serializeUnaryResponse(results, toolCall);

     ASSERT_NE(serialized.find("\"finish_reason\":\"tool_calls\""), std::string::npos) << serialized;
 }
@@ -2972,10 +3092,11 @@ TEST_F(HttpOpenAIHandlerParsingTest, SerializeUnaryResponseVLMDecodedResultsWith
     ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());

     ov::genai::VLMDecodedResults results;
-    results.texts.push_back(
-        "I will call a tool.{\"name\":\"get_weather\",\"arguments\":{\"location\":\"Paris\"}}");
+    std::string vlmText =
+        "I will call a tool.{\"name\":\"get_weather\",\"arguments\":{\"location\":\"Paris\"}}";
+    results.texts.push_back(vlmText);

-    std::string serialized = apiHandler->serializeUnaryResponse(results);
+    std::string serialized = apiHandler->serializeUnaryResponse(results, vlmText);

     rapidjson::Document responseDoc;
     responseDoc.Parse(serialized.c_str());
diff --git a/src/test/llm/llmnode_test.cpp b/src/test/llm/llmnode_test.cpp
index ad69a9692f..ecd1d598d7 100644
--- a/src/test/llm/llmnode_test.cpp
+++ b/src/test/llm/llmnode_test.cpp
@@ -897,6 +897,42 @@ TEST_P(LLMFlowHttpTestParameterized, unaryChatCompletionsJson) {
     EXPECT_STREQ(parsedResponse["object"].GetString(), "chat.completion");
 }

+TEST_P(LLMFlowHttpTestParameterized, unaryChatCompletionsSkipSpecialTokensFalse) {
+    auto params = GetParam();
+    std::string requestBody = R"(
+        {
+            "model": ")" + params.modelName +
+                              R"(",
+            "stream": false,
+            "seed": 1,
+            "temperature": 0,
+            "max_tokens": 5,
+            "ignore_eos": true,
+            "skip_special_tokens": false,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": "What is OpenVINO?"
+                }
+            ]
+        }
+    )";
+
+    ASSERT_EQ(
+        handler->dispatchToProcessor(endpointChatCompletions, requestBody, &response, comp, responseComponents, writer, multiPartParser),
+        ovms::StatusCode::OK);
+    parsedResponse.Parse(response.c_str());
+    ASSERT_TRUE(parsedResponse["choices"].IsArray());
+    ASSERT_EQ(parsedResponse["choices"].Capacity(), 1);
+    for (auto& choice : parsedResponse["choices"].GetArray()) {
+        ASSERT_TRUE(choice["message"].IsObject());
+        ASSERT_TRUE(choice["message"]["content"].IsString());
+        EXPECT_STREQ(choice["message"]["role"].GetString(), "assistant");
+    }
+    ASSERT_EQ(parsedResponse["usage"].GetObject()["completion_tokens"].GetInt(), 5 /* max_tokens */);
+    EXPECT_STREQ(parsedResponse["object"].GetString(), "chat.completion");
+}
+
 TEST_P(LLMFlowHttpTestParameterized, unaryChatCompletionsJsonContentArray) {
     auto params = GetParam();
     std::string requestBody = R"(
@@ -1752,6 +1788,52 @@ TEST_P(LLMFlowHttpTestParameterized, inferChatCompletionsStream) {
         ovms::StatusCode::PARTIAL_END);
 }

+TEST_P(LLMFlowHttpTestParameterized, inferChatCompletionsStreamSkipSpecialTokensFalse) {
+    auto params = GetParam();
+    std::string requestBody = R"(
+        {
+            "model": ")" + params.modelName +
+                              R"(",
+            "stream": true,
+            "seed": 1,
+            "max_tokens": 5,
+            "ignore_eos": true,
+            "skip_special_tokens": false,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": "What is OpenVINO?"
+                }
+            ]
+        }
+    )";
+    int replyCounter = 0;
+    ON_CALL(*writer, PartialReply).WillByDefault([this, &params, &replyCounter](std::string response) {
+        if (replyCounter == 0 && params.checkHandshakeChunk) {
+            replyCounter++;
+            assertInitialStreamChatCompletionChunk(response, params.modelName);
+            return;
+        }
+        rapidjson::Document d;
+        std::string dataPrefix = "data:";
+        ASSERT_STREQ(response.substr(0, dataPrefix.size()).c_str(), dataPrefix.c_str());
+        size_t pos = response.find("\n");
+        ASSERT_NE(pos, response.npos);
+        rapidjson::ParseResult parsingSucceeded = d.Parse(response.substr(dataPrefix.size(), (pos - dataPrefix.size())).c_str());
+        ASSERT_EQ(parsingSucceeded.Code(), 0);
+        ASSERT_TRUE(d["choices"].IsArray());
+        ASSERT_EQ(d["choices"].Capacity(), 1);
+        for (auto& choice : d["choices"].GetArray()) {
+            ASSERT_TRUE(choice["delta"].IsObject());
+            ASSERT_TRUE(choice["delta"]["content"].IsString());
+        }
+        EXPECT_STREQ(d["object"].GetString(), "chat.completion.chunk");
+    });
+    ASSERT_EQ(
+        handler->dispatchToProcessor(endpointChatCompletions, requestBody, &response, comp, responseComponents, writer, multiPartParser),
+        ovms::StatusCode::PARTIAL_END);
+}
+
 TEST_P(LLMFlowHttpTestParameterized, unaryChatCompletionsStreamOptionsSetFail) {
     auto params = GetParam();
     std::string requestBody = R"(
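
Reviewer note: a minimal sketch of how the new parameter can be exercised against a running server, mirroring the curl examples in the edited docs. The host, port, and model name below are placeholders (assumptions, not part of this diff). With `"skip_special_tokens": false` the raw detokenized output, including tokens such as `<|im_end|>`, is returned, and any tool or reasoning parser configured on the endpoint is bypassed for that request:

```bash
# Hypothetical request; adjust host, port, and model name to your deployment.
curl http://localhost:8000/v3/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "llama",
    "max_tokens": 32,
    "skip_special_tokens": false,
    "messages": [{"role": "user", "content": "What is OpenVINO?"}]
  }'
```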