openvinotoolkit · przepeck · Apr 15, 2026 · Apr 20, 2026 · Apr 20, 2026 · Apr 20, 2026
diff --git a/docs/parameters.md b/docs/parameters.md
@@ -138,8 +138,8 @@ Task specific parameters for different tasks (text generation/image generation/e
 | `--max_prompt_len`                    | `integer`    | Sets NPU specific property for maximum number of tokens in the prompt.                                                     |
 | `--kv_cache_precision`                | `string`     | Reduced kv cache precision to `u8` lowers the cache size consumption. Accepted values: `u8` or empty (default).            |
 | `--model_distribution_policy`         | `string`     | TENSOR_PARALLEL distributes tensor to multiple sockets/devices and processes it in parallel. PIPELINE_PARALLEL distributes different tensors to process by each device. Accepted values: `TENSOR_PARALLEL`, `PIPELINE_PARALLEL` or empty (default). |
-| `--reasoning_parser`                  | `string`     | Type of parser to use for reasoning content extraction from model output. Currently supported: [qwen3, gptoss]                     |
-| `--tool_parser`                       | `string`     | Type of parser to use for tool calls extraction from model output. Currently supported: [llama3, phi4, hermes3, mistral, qwen3coder, gptoss, devstral, lfm2]            |
+| `--reasoning_parser`                  | `string`     | Type of parser to use for reasoning content extraction from model output. Currently supported: [qwen3, gptoss, gemma4]                     |
+| `--tool_parser`                       | `string`     | Type of parser to use for tool calls extraction from model output. Currently supported: [llama3, phi4, hermes3, mistral, qwen3coder, gptoss, devstral, lfm2, gemma4]            |
 | `--enable_tool_guided_generation`     | `bool`       | Enables enforcing tool schema during generation. Requires setting response parser. Default: false.                         |
 
 ### Image generation

diff --git a/prepare_llm_models.sh b/prepare_llm_models.sh
@@ -39,6 +39,7 @@ MISTRAL_MODEL="mistralai/Mistral-7B-Instruct-v0.3"
 GPT_OSS_MODEL="openai/gpt-oss-20b"
 DEVSTRAL_MODEL="unsloth/Devstral-Small-2507"
 LFM2_MODEL="LiquidAI/LFM2-2.6B"
+GEMMA4_MODEL="OpenVINO/gemma-4-E4B-it-int4-ov"
 
 if [ "$(python3 -c 'import sys; print(sys.version_info[1])')" -le "8" ]; then echo "Prepare models with python > 3.8."; exit 1 ; fi
 
@@ -228,4 +229,13 @@ fi
 if [ ! -f "$1/$LFM2_MODEL/$TOKENIZER_FILE" ]; then
   echo "[ERROR] Models file $1/$LFM2_MODEL/$TOKENIZER_FILE does not exist."
   exit 1
-fi
+fi
+if [ -f "$1/$GEMMA4_MODEL/$TOKENIZER_FILE" ]; then  # Gemma 4: skip the download when the tokenizer file is already present
+  echo "Models file $1/$GEMMA4_MODEL/$TOKENIZER_FILE exists. Skipping downloading models."
+else
+  hf download "$GEMMA4_MODEL" --local-dir "$1/$GEMMA4_MODEL" --include "*tokenizer*"  # quoted so the shell neither expands the pattern against the current directory nor word-splits the path
+fi
+if [ ! -f "$1/$GEMMA4_MODEL/$TOKENIZER_FILE" ]; then  # stop the script when the tokenizer file is still missing
+  echo "[ERROR] Models file $1/$GEMMA4_MODEL/$TOKENIZER_FILE does not exist."
+  exit 1
+fi
diff --git a/spelling-whitelist.txt b/spelling-whitelist.txt
@@ -29,3 +29,4 @@ demos/vlm_npu/README.md:157: mane ==> main, many, maine
 demos/vlm_npu/README.md:218: mane ==> main, many, maine
 demos/integration_with_OpenWebUI/README.md:423: Buildin ==> Building, Build in
 src/test/llm/output_parsers/lfm2_output_parser_test.cpp
+src/test/llm/output_parsers/gemma4_output_parser_test.cpp
diff --git a/src/llm/BUILD b/src/llm/BUILD
@@ -197,6 +197,38 @@ ovms_cc_library(
     ],
     visibility = ["//visibility:public"],
 )
+ovms_cc_library( # Gemma4 output parsers: the tool parser and the reasoning parser are built as one library
+    name = "io_processing_gemma4_tool_parser", # NOTE(review): the name says tool_parser, yet reasoning_parser.{hpp,cpp} are compiled here too
+    hdrs = ["io_processing/gemma4/tool_parser.hpp", "io_processing/gemma4/reasoning_parser.hpp"],
+    srcs = ["io_processing/gemma4/tool_parser.cpp", "io_processing/gemma4/reasoning_parser.cpp"],
+    deps = [
+        "@com_github_tencent_rapidjson//:rapidjson",
+        "//src/port:rapidjson_document",
+        "//src:libovmslogging",
+        "//src:libovmsstring_utils",
+        ":io_processing_utils",
+        ":io_processing_base_output_parser",
+        ":io_processing_qwen3_reasoning_parser", # gemma4/reasoning_parser.hpp includes ../qwen3/reasoning_parser.hpp
+        "//third_party:genai",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+ovms_cc_library( # Qwen3 reasoning parser as a standalone target; output_parsers and io_processing_gemma4_tool_parser list it in deps
+    name = "io_processing_qwen3_reasoning_parser",
+    hdrs = ["io_processing/qwen3/reasoning_parser.hpp"],
+    srcs = ["io_processing/qwen3/reasoning_parser.cpp"],
+    deps = [
+        "@com_github_tencent_rapidjson//:rapidjson",
+        "//src/port:rapidjson_document",
+        "//src:libovmslogging",
+        "//src:libovmsstring_utils",
+        ":io_processing_utils",
+        ":io_processing_base_output_parser",
+        "//third_party:genai",
+    ],
+    visibility = ["//visibility:public"],
+)
 
 ovms_cc_library( # TODO split further so we don't have to recompile everything when changing one parser ...
     name = "output_parsers",
@@ -206,7 +238,6 @@ ovms_cc_library( # TODO split further so we don't have to recompile everything w
             "io_processing/phi4/tool_parser.hpp",
             "io_processing/devstral/tool_parser.hpp",
             "io_processing/mistral/tool_parser.hpp",
-            "io_processing/qwen3/reasoning_parser.hpp",
             "io_processing/gptoss/reasoning_parser.hpp",
             "io_processing/gptoss/tool_parser.hpp",
             "io_processing/gptoss/harmony.hpp",
@@ -218,7 +249,6 @@ ovms_cc_library( # TODO split further so we don't have to recompile everything w
             "io_processing/phi4/tool_parser.cpp",
             "io_processing/devstral/tool_parser.cpp",
             "io_processing/mistral/tool_parser.cpp",
-            "io_processing/qwen3/reasoning_parser.cpp",
             "io_processing/gptoss/reasoning_parser.cpp",
             "io_processing/gptoss/tool_parser.cpp",
             "io_processing/gptoss/harmony.cpp",
@@ -234,6 +264,8 @@ ovms_cc_library( # TODO split further so we don't have to recompile everything w
         ":io_processing_base_output_parser",
         ":io_processing_qwen3coder_tool_parser",
         ":io_processing_lfm2_tool_parser",
+        ":io_processing_gemma4_tool_parser",
+        ":io_processing_qwen3_reasoning_parser",
         ":io_processing_utils",
         ":apis_tool_schema_wrapper",
     ],

diff --git a/src/llm/io_processing/gemma4/reasoning_parser.cpp b/src/llm/io_processing/gemma4/reasoning_parser.cpp
@@ -0,0 +1,61 @@
+//*****************************************************************************
+// Copyright 2025 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//*****************************************************************************
+
+#include <openvino/genai/tokenizer.hpp>
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "src/port/rapidjson_document.hpp"
+
+#include "../../../logging.hpp"
+#include "reasoning_parser.hpp"
+#include "../utils.hpp"
+
+namespace ovms {
+// Splits a fully generated (non-streamed) response into reasoning and content, using the
+// reasoning start/end marker token ids declared on the class.
+//
+// parsedOutput: `reasoning` and `content` are overwritten only when both markers are present
+//               and the start marker comes first; otherwise the object is left untouched.
+// generatedTokens: token ids of the response being parsed.
+//
+// Tokens located before the start marker are not carried over into `content`.
+void Gemma4ReasoningParser::parse(ParsedOutput& parsedOutput, const std::vector<int64_t>& generatedTokens) {
+    // The reasoning section opens with "<|channel>thought\n", which is assumed to occupy 3 tokens.
+    using Offset = std::vector<int64_t>::difference_type;
+    constexpr Offset reasoningPreambleTokens = 3;
+
+    const auto tokensBegin = generatedTokens.begin();
+    const auto tokensEnd = generatedTokens.end();
+    const auto startIt = std::find(tokensBegin, tokensEnd, reasoningTokenId);
+    const auto endIt = std::find(tokensBegin, tokensEnd, reasoningEndTokenId);
+
+    // std::find signals a miss with end(), never with npos. A missing or unterminated reasoning
+    // section must leave parsedOutput alone instead of building iterators past the end of the vector.
+    if (startIt == tokensEnd || endIt == tokensEnd || startIt >= endIt) {
+        return;
+    }
+
+    // Clamp the skipped preamble to the section length, so a section shorter than the preamble
+    // produces empty reasoning rather than an inverted iterator range.
+    const auto reasoningBegin = startIt + std::min(reasoningPreambleTokens, endIt - startIt);
+    parsedOutput.reasoning = tokenizer.decode(std::vector<int64_t>(reasoningBegin, endIt), ov::genai::skip_special_tokens(true));
+
+    // Content is whatever follows the end marker.
+    parsedOutput.content = tokenizer.decode(std::vector<int64_t>(endIt + 1, tokensEnd), ov::genai::skip_special_tokens(true));
+}
+}  // namespace ovms
diff --git a/src/llm/io_processing/gemma4/reasoning_parser.hpp b/src/llm/io_processing/gemma4/reasoning_parser.hpp
@@ -0,0 +1,49 @@
+//*****************************************************************************
+// Copyright 2025 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//*****************************************************************************
+#pragma once
+
+#include <openvino/genai/tokenizer.hpp>
+#include <cstdint>
+#include <vector>
+
+#include "../qwen3/reasoning_parser.hpp"
+
+namespace ovms {
+// Reasoning parser for Gemma 4 output. It reuses Qwen3ReasoningParser and replaces the
+// whole-response parse() with a version that works on marker token ids.
+// NOTE(review): streaming chunk parsing is not overridden here, so whatever
+// Qwen3ReasoningParser provides is used - confirm it recognises the Gemma 4 markers.
+class Gemma4ReasoningParser : public Qwen3ReasoningParser {
+protected:
+    // Token ids that open and close the reasoning section. Shared by all instances.
+    // NOTE(review): hard-coded; presumably taken from the Gemma 4 tokenizer vocabulary - TODO confirm.
+    static constexpr int64_t reasoningTokenId = 100;
+    static constexpr int64_t reasoningEndTokenId = 101;
+
+public:
+    Gemma4ReasoningParser() = delete;
+    // Passes the tokenizer through to the Qwen3ReasoningParser base.
+    explicit Gemma4ReasoningParser(ov::genai::Tokenizer& tokenizer) :
+        Qwen3ReasoningParser(tokenizer) {}
+    // Extracts reasoning and content from a complete list of generated token ids.
+    void parse(ParsedOutput& parsedOutput, const std::vector<int64_t>& generatedTokens) override;
+
+    // Always returns true for this parser.
+    bool requiresStreamingWithSpecialTokens() const override {
+        return true;
+    }
+};
+}  // namespace ovms