Skip to content

Commit 704039b

Browse files
committed
feat(server): add model-load chat_template_kwargs
1 parent 7613aca commit 704039b

6 files changed

Lines changed: 147 additions & 4 deletions

File tree

llama_cpp/llama_chat_format.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,7 @@ def raise_exception(message: str):
243243
tools=tools,
244244
tool_choice=tool_choice,
245245
strftime_now=self.strftime_now,
246+
**kwargs,
246247
)
247248

248249
stopping_criteria = None
@@ -617,6 +618,7 @@ def chat_completion_handler(
617618
function_call=function_call,
618619
tools=tools,
619620
tool_choice=tool_choice,
621+
**kwargs,
620622
)
621623
prompt = llama.tokenize(
622624
result.prompt.encode("utf-8"),
@@ -734,7 +736,9 @@ def format_autotokenizer(
734736
**kwargs: Any,
735737
) -> ChatFormatterResponse:
736738
tokenizer.use_default_system_prompt = False # type: ignore
737-
prompt: str = tokenizer.apply_chat_template(messages, tokenize=False) # type: ignore
739+
prompt: str = tokenizer.apply_chat_template( # type: ignore
740+
messages, tokenize=False, **kwargs
741+
)
738742
assert isinstance(prompt, str)
739743
# Return formatted prompt and eos token by default
740744
return ChatFormatterResponse(
@@ -791,6 +795,7 @@ def format_tokenizer_config(
791795
messages=messages,
792796
bos_token=bos_token,
793797
eos_token=eos_token,
798+
**kwargs,
794799
)
795800
return ChatFormatterResponse(
796801
prompt=prompt, stop=[eos_token, bos_token], added_special=True

llama_cpp/server/cli.py

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
from __future__ import annotations
22

33
import argparse
4+
import json
45

5-
from typing import List, Literal, Union, Any, Type, TypeVar
6+
from typing import List, Literal, Union, Any, Type, TypeVar, Dict
67

78
from pydantic import BaseModel
89

@@ -40,6 +41,17 @@ def _contains_list_type(annotation: Type[Any] | None) -> bool:
4041
return False
4142

4243

44+
def _contains_dict_type(annotation: Type[Any] | None) -> bool:
45+
origin = getattr(annotation, "__origin__", None)
46+
47+
if origin is dict or origin is Dict:
48+
return True
49+
elif origin in (Literal, Union):
50+
return any(_contains_dict_type(arg) for arg in annotation.__args__) # type: ignore
51+
else:
52+
return False
53+
54+
4355
def _parse_bool_arg(arg: str | bytes | bool) -> bool:
4456
if isinstance(arg, bytes):
4557
arg = arg.decode("utf-8")
@@ -57,6 +69,16 @@ def _parse_bool_arg(arg: str | bytes | bool) -> bool:
5769
raise ValueError(f"Invalid boolean argument: {arg}")
5870

5971

72+
def _parse_json_object_arg(arg: str | bytes) -> dict[str, Any]:
73+
if isinstance(arg, bytes):
74+
arg = arg.decode("utf-8")
75+
76+
value = json.loads(arg)
77+
if not isinstance(value, dict):
78+
raise ValueError(f"Invalid JSON object argument: {arg}")
79+
return value
80+
81+
6082
def add_args_from_model(parser: argparse.ArgumentParser, model: Type[BaseModel]):
6183
"""Add arguments from a pydantic model to an argparse parser."""
6284

@@ -68,7 +90,15 @@ def add_args_from_model(parser: argparse.ArgumentParser, model: Type[BaseModel])
6890
_get_base_type(field.annotation) if field.annotation is not None else str
6991
)
7092
list_type = _contains_list_type(field.annotation)
71-
if base_type is not bool:
93+
dict_type = _contains_dict_type(field.annotation)
94+
if dict_type:
95+
parser.add_argument(
96+
f"--{name}",
97+
dest=name,
98+
type=_parse_json_object_arg,
99+
help=description,
100+
)
101+
elif base_type is not bool:
72102
parser.add_argument(
73103
f"--{name}",
74104
dest=name,

llama_cpp/server/model.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,21 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
299299
# Misc
300300
verbose=settings.verbose,
301301
)
302+
if settings.chat_template_kwargs:
303+
base_chat_handler = (
304+
_model.chat_handler
305+
or _model._chat_handlers.get(_model.chat_format)
306+
or llama_cpp.llama_chat_format.get_chat_completion_handler(
307+
_model.chat_format
308+
)
309+
)
310+
311+
def chat_handler_with_kwargs(*args, **kwargs):
312+
return base_chat_handler(
313+
*args, **{**settings.chat_template_kwargs, **kwargs}
314+
)
315+
316+
_model.chat_handler = chat_handler_with_kwargs
302317
if settings.cache:
303318
if settings.cache_type == "disk":
304319
if settings.verbose:

llama_cpp/server/settings.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import multiprocessing
44

5-
from typing import Optional, List, Literal, Union, Dict, cast
5+
from typing import Any, Optional, List, Literal, Union, Dict, cast
66
from typing_extensions import Self
77

88
from pydantic import Field, model_validator
@@ -131,6 +131,10 @@ class ModelSettings(BaseSettings):
131131
default=None,
132132
description="Chat format to use.",
133133
)
134+
chat_template_kwargs: Optional[Dict[str, Any]] = Field(
135+
default=None,
136+
description="Extra keyword arguments forwarded to chat templates at model load time. Matches llama.cpp server `chat_template_kwargs`.",
137+
)
134138
clip_model_path: Optional[str] = Field(
135139
default=None,
136140
description="Path to a CLIP model to use for multi-modal chat completion.",

tests/test_llama_chat_format.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,3 +92,22 @@ def test_hf_tokenizer_config_str_to_chat_formatter():
9292
)
9393

9494
assert chat_formatter_respoonse.prompt == ("<s>[INST] Hello, world! [/INST]</s>")
95+
96+
97+
def test_hf_tokenizer_config_chat_formatter_passes_template_kwargs():
    """Kwargs supplied at formatter-construction time must reach the template."""
    config = {
        "chat_template": "{{ bos_token }}{{ enable_thinking | default(false) }} {{ messages[0]['content'] }}",
        "bos_token": "<s>",
        "eos_token": "</s>",
    }
    formatter = hf_tokenizer_config_to_chat_formatter(
        config, add_generation_prompt=False
    )

    result = formatter(
        messages=[
            ChatCompletionRequestUserMessage(role="user", content="Hello, world!"),
        ],
        enable_thinking=True,
    )

    # enable_thinking=True overrides the template's default(false) fallback.
    assert result.prompt == "<s>True Hello, world!"

tests/test_server_model.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
import argparse
2+
3+
from llama_cpp.server.cli import add_args_from_model, parse_model_from_args
4+
import llama_cpp.server.model as server_model
5+
from llama_cpp.server.settings import ModelSettings
6+
7+
8+
def test_cli_parses_chat_template_kwargs_json():
    """--chat_template_kwargs must be parsed from a JSON-object string."""
    parser = argparse.ArgumentParser()
    add_args_from_model(parser, ModelSettings)

    argv = [
        "--model",
        "test.gguf",
        "--chat_template_kwargs",
        '{"enable_thinking": true, "template_mode": "extended"}',
    ]
    settings = parse_model_from_args(ModelSettings, parser.parse_args(argv))

    expected = {"enable_thinking": True, "template_mode": "extended"}
    assert settings.chat_template_kwargs == expected
26+
27+
28+
def test_load_llama_from_model_settings_merges_chat_template_kwargs(monkeypatch):
    """Load-time chat_template_kwargs are merged into every chat-handler call,
    with per-call kwargs taking precedence over the load-time defaults."""
    captured = {}

    def base_handler(*args, **kwargs):
        captured["args"] = args
        captured["kwargs"] = kwargs
        return "ok"

    class FakeLlama:
        """Minimal stand-in exposing only what the wrapper logic touches."""

        def __init__(self, **kwargs):
            self.chat_handler = kwargs["chat_handler"]
            self.chat_format = kwargs["chat_format"]
            self._chat_handlers = {}

        def set_cache(self, cache):
            raise AssertionError("cache should not be set in this test")

    monkeypatch.setattr(server_model.llama_cpp, "Llama", FakeLlama)
    monkeypatch.setattr(
        server_model.llama_cpp.llama_chat_format,
        "get_chat_completion_handler",
        lambda chat_format: base_handler,
    )

    settings = ModelSettings(
        model="test.gguf",
        chat_format="chatml",
        chat_template_kwargs={
            "enable_thinking": True,
            "template_mode": "default",
        },
    )
    model = server_model.LlamaProxy.load_llama_from_model_settings(settings)

    outcome = model.chat_handler(template_mode="override", extra_flag="x")

    assert outcome == "ok"
    # Call-site kwargs win over load-time defaults for the shared key.
    assert captured["kwargs"] == {
        "enable_thinking": True,
        "template_mode": "override",
        "extra_flag": "x",
    }

0 commit comments

Comments (0)