From 100b275c88763e2fada18d009e9bca8287cdb7c9 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Thu, 2 Apr 2026 22:44:24 -0700
Subject: [PATCH 1/2] feat: Update llama.cpp to
 ggerganov/llama.cpp@f49e9178767d557a522618b16ce8694f9ddac628 (#2169)

---
 llama_cpp/llama_cpp.py | 417 +++++++++++++++++++++++++++++++++++++++--
 llama_cpp/mtmd_cpp.py  |  26 ++-
 vendor/llama.cpp       |   2 +-
 3 files changed, 423 insertions(+), 22 deletions(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 5a6c06b07..204f1e626 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -45,6 +45,14 @@
 ctypes_function = ctypes_function_for_shared_library(_lib)
 
 
+def _warn_deprecated(symbol: str, hint: str) -> None:
+    warnings.warn(
+        f"{symbol} is deprecated; {hint}",
+        DeprecationWarning,
+        stacklevel=3,  # 1 = this helper, 2 = the deprecated wrapper, 3 = the user's call site
+    )
+
+
 # from ggml.h
 # // NOTE: always add types at the end of the enum to keep backward compatibility
 # enum ggml_type {
@@ -711,6 +719,43 @@ class llama_model_kv_override(ctypes.Structure):
         value: Union[int, float, bool, bytes]
 
 
+# struct llama_model_tensor_override {
+#     const char * pattern;
+#     enum ggml_type type;
+# };
+class llama_model_tensor_override(ctypes.Structure):
+    """Override the quantization type for tensors matching a pattern."""
+
+    _fields_ = [
+        ("pattern", ctypes.c_char_p),
+        ("type", ctypes.c_int),
+    ]
+
+    if TYPE_CHECKING:
+        pattern: Optional[bytes]
+        type: int
+
+
+# struct llama_model_imatrix_data {
+#     const char * name;
+#     const float * data;
+#     size_t size;
+# };
+class llama_model_imatrix_data(ctypes.Structure):
+    """Importance matrix data for a tensor used during quantization."""
+
+    _fields_ = [
+        ("name", ctypes.c_char_p),
+        ("data", ctypes.POINTER(ctypes.c_float)),
+        ("size", ctypes.c_size_t),
+    ]
+
+    if TYPE_CHECKING:
+        name: Optional[bytes]
+        data: CtypesPointer[ctypes.c_float]
+        size: int
+
+
 # struct llama_model_tensor_buft_override {
 #     const char * pattern;
 #     ggml_backend_buffer_type_t buft;
@@ -1022,10 +1067,10 @@ class llama_context_params(ctypes.Structure):
 #     bool pure;                            // quantize all tensors to the default type
 #     bool keep_split;                      // quantize to the same number of shards
 #     bool dry_run;                         // calculate and show the final quantization size without performing quantization
-#     void * imatrix;                       // pointer to importance matrix data
-#     void * kv_overrides;                  // pointer to vector containing overrides
-#     void * tensor_types;                  // pointer to vector containing tensor types
-#     void * prune_layers;                  // pointer to vector containing layer indices to prune
+#     const struct llama_model_imatrix_data * imatrix;         // pointer to importance matrix data
+#     const struct llama_model_kv_override * kv_overrides;     // pointer to kv overrides
+#     const struct llama_model_tensor_override * tt_overrides; // pointer to tensor overrides
+#     const int32_t * prune_layers;                            // pointer to layer indices to prune
 # } llama_model_quantize_params;
 class llama_model_quantize_params(ctypes.Structure):
     """Parameters for llama_model_quantize
@@ -1041,10 +1086,10 @@ class llama_model_quantize_params(ctypes.Structure):
         pure (bool): quantize all tensors to the default type
         keep_split (bool): quantize to the same number of shards
         dry_run (bool): calculate and show the final quantization size without performing quantization
-        imatrix (ctypes.c_void_p): pointer to importance matrix data
-        kv_overrides (ctypes.c_void_p): pointer to vector containing overrides
-        tensor_types (ctypes.c_void_p): pointer to vector containing tensor types
-        prune_layers (ctypes.c_void_p): pointer to vector containing layer indices to prune
+        imatrix (ctypes.Array[llama_model_imatrix_data]): pointer to importance matrix data
+        kv_overrides (ctypes.Array[llama_model_kv_override]): pointer to kv overrides
+        tt_overrides (ctypes.Array[llama_model_tensor_override]): pointer to tensor overrides
+        prune_layers (ctypes.Array[ctypes.c_int32]): pointer to layer indices to prune
     """
 
     if TYPE_CHECKING:
@@ -1058,10 +1103,10 @@ class llama_model_quantize_params(ctypes.Structure):
         pure: bool
         keep_split: bool
         dry_run: bool
-        imatrix: ctypes.c_void_p
-        kv_overrides: ctypes.c_void_p
-        tensor_types: ctypes.c_void_p
-        prune_layers: ctypes.c_void_p
+        imatrix: CtypesPointer[llama_model_imatrix_data]
+        kv_overrides: CtypesPointer[llama_model_kv_override]
+        tt_overrides: CtypesPointer[llama_model_tensor_override]
+        prune_layers: CtypesPointer[ctypes.c_int32]
 
     _fields_ = [
         ("nthread", ctypes.c_int32),
@@ -1074,10 +1119,10 @@ class llama_model_quantize_params(ctypes.Structure):
         ("pure", ctypes.c_bool),
         ("keep_split", ctypes.c_bool),
         ("dry_run", ctypes.c_bool),
-        ("imatrix", ctypes.c_void_p),
-        ("kv_overrides", ctypes.c_void_p),
-        ("tensor_types", ctypes.c_void_p),
-        ("prune_layers", ctypes.c_void_p),
+        ("imatrix", ctypes.POINTER(llama_model_imatrix_data)),
+        ("kv_overrides", ctypes.POINTER(llama_model_kv_override)),
+        ("tt_overrides", ctypes.POINTER(llama_model_tensor_override)),
+        ("prune_layers", ctypes.POINTER(ctypes.c_int32)),
     ]
 
 
@@ -1272,6 +1317,19 @@ def llama_load_model_from_file(
 ) -> Optional[llama_model_p]: ...
 
 
+_llama_load_model_from_file = llama_load_model_from_file
+
+
+def llama_load_model_from_file(
+    path_model: bytes, params: llama_model_params, /
+) -> Optional[llama_model_p]:
+    _warn_deprecated(
+        "llama_load_model_from_file",
+        "use llama_model_load_from_file instead",
+    )
+    return _llama_load_model_from_file(path_model, params)
+
+
 # // Load the model from a file
 # // If the file is split into multiple parts, the file name must follow this pattern: <name>-%05d-of-%05d.gguf
 # // If the split file name does not follow this pattern, use llama_model_load_from_splits
@@ -1353,6 +1411,14 @@ def llama_model_save_to_file(model: llama_model_p, path_model: bytes, /):
 def llama_free_model(model: llama_model_p, /): ...
 
 
+_llama_free_model = llama_free_model
+
+
+def llama_free_model(model: llama_model_p, /):
+    _warn_deprecated("llama_free_model", "use llama_model_free instead")
+    return _llama_free_model(model)
+
+
 # LLAMA_API void llama_model_free(struct llama_model * model);
 @ctypes_function(
     "llama_model_free",
@@ -1419,6 +1485,19 @@ def llama_new_context_with_model(
 ) -> Optional[llama_context_p]: ...
 
 
+_llama_new_context_with_model = llama_new_context_with_model
+
+
+def llama_new_context_with_model(
+    model: llama_model_p, params: llama_context_params, /
+) -> Optional[llama_context_p]:
+    _warn_deprecated(
+        "llama_new_context_with_model",
+        "use llama_init_from_model instead",
+    )
+    return _llama_new_context_with_model(model, params)
+
+
 # // Frees all allocated memory
 # LLAMA_API void llama_free(struct llama_context * ctx);
 @ctypes_function(
@@ -1557,26 +1636,66 @@ def llama_n_seq_max(ctx: llama_context_p, /) -> int: ...
 def llama_n_ctx_train(model: llama_model_p, /) -> int: ...
 
 
+_llama_n_ctx_train = llama_n_ctx_train
+
+
+def llama_n_ctx_train(model: llama_model_p, /) -> int:
+    _warn_deprecated("llama_n_ctx_train", "use llama_model_n_ctx_train instead")
+    return _llama_n_ctx_train(model)
+
+
 # DEPRECATED(LLAMA_API int32_t llama_n_embd     (const struct llama_model * model), "use llama_model_n_embd instead");
 @ctypes_function("llama_n_embd", [llama_model_p_ctypes], ctypes.c_int32)
 def llama_n_embd(model: llama_model_p, /) -> int: ...
 
 
+_llama_n_embd = llama_n_embd
+
+
+def llama_n_embd(model: llama_model_p, /) -> int:
+    _warn_deprecated("llama_n_embd", "use llama_model_n_embd instead")
+    return _llama_n_embd(model)
+
+
 # DEPRECATED(LLAMA_API int32_t llama_n_layer    (const struct llama_model * model), "use llama_model_n_layer instead");
 @ctypes_function("llama_n_layer", [llama_model_p_ctypes], ctypes.c_int32)
 def llama_n_layer(model: llama_model_p, /) -> int: ...
 
 
+_llama_n_layer = llama_n_layer
+
+
+def llama_n_layer(model: llama_model_p, /) -> int:
+    _warn_deprecated("llama_n_layer", "use llama_model_n_layer instead")
+    return _llama_n_layer(model)
+
+
 # DEPRECATED(LLAMA_API int32_t llama_n_head     (const struct llama_model * model), "use llama_model_n_head instead");
 @ctypes_function("llama_n_head", [llama_model_p_ctypes], ctypes.c_int32)
 def llama_n_head(model: llama_model_p, /) -> int: ...
 
 
+_llama_n_head = llama_n_head
+
+
+def llama_n_head(model: llama_model_p, /) -> int:
+    _warn_deprecated("llama_n_head", "use llama_model_n_head instead")
+    return _llama_n_head(model)
+
+
 # DEPRECATED(LLAMA_API int32_t llama_n_vocab    (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");
 @ctypes_function("llama_n_vocab", [llama_vocab_p_ctypes], ctypes.c_int32)
 def llama_n_vocab(model: llama_vocab_p, /) -> int: ...
 
 
+_llama_n_vocab = llama_n_vocab
+
+
+def llama_n_vocab(vocab: llama_vocab_p, /) -> int:  # takes a vocab handle, as in the C declaration
+    _warn_deprecated("llama_n_vocab", "use llama_vocab_n_tokens instead")
+    return _llama_n_vocab(vocab)
+
+
 # LLAMA_API const struct llama_model * llama_get_model   (const struct llama_context * ctx);
 @ctypes_function("llama_get_model", [llama_context_p_ctypes], llama_model_p_ctypes)
 def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]: ...
@@ -2381,6 +2500,14 @@ def llama_get_state_size(ctx: llama_context_p, /) -> int:
     ...
 
 
+_llama_get_state_size = llama_get_state_size
+
+
+def llama_get_state_size(ctx: llama_context_p, /) -> int:
+    _warn_deprecated("llama_get_state_size", "use llama_state_get_size instead")
+    return _llama_get_state_size(ctx)
+
+
 # // Copies the state to the specified destination address.
 # // Destination needs to have allocated enough memory.
 # // Returns the number of bytes copied
@@ -2428,6 +2555,16 @@ def llama_copy_state_data(
     ...
 
 
+_llama_copy_state_data = llama_copy_state_data
+
+
+def llama_copy_state_data(
+    ctx: llama_context_p, dst: CtypesArray[ctypes.c_uint8], /
+) -> int:
+    _warn_deprecated("llama_copy_state_data", "use llama_state_get_data instead")
+    return _llama_copy_state_data(ctx, dst)
+
+
 # // Set the state reading from the specified address
 # // Returns the number of bytes read
 # LLAMA_API size_t llama_state_set_data(
@@ -2466,6 +2603,16 @@ def llama_set_state_data(
     ...
 
 
+_llama_set_state_data = llama_set_state_data
+
+
+def llama_set_state_data(
+    ctx: llama_context_p, src: CtypesArray[ctypes.c_uint8], /
+) -> int:
+    _warn_deprecated("llama_set_state_data", "use llama_state_set_data instead")
+    return _llama_set_state_data(ctx, src)
+
+
 # Save/load session file
 # LLAMA_API bool llama_state_load_file(
 #         struct llama_context * ctx,
@@ -2522,6 +2669,23 @@ def llama_load_session_file(
 ) -> bool: ...
 
 
+_llama_load_session_file = llama_load_session_file
+
+
+def llama_load_session_file(
+    ctx: llama_context_p,
+    path_session: bytes,
+    tokens_out: CtypesArray[llama_token],
+    n_token_capacity: Union[ctypes.c_size_t, int],
+    n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t],
+    /,
+) -> bool:
+    _warn_deprecated("llama_load_session_file", "use llama_state_load_file instead")
+    return _llama_load_session_file(
+        ctx, path_session, tokens_out, n_token_capacity, n_token_count_out
+    )
+
+
 # LLAMA_API bool llama_state_save_file(
 #         struct llama_context * ctx,
 #                   const char * path_session,
@@ -2571,6 +2735,20 @@ def llama_save_session_file(
 ) -> bool: ...
 
 
+_llama_save_session_file = llama_save_session_file
+
+
+def llama_save_session_file(
+    ctx: llama_context_p,
+    path_session: bytes,
+    tokens: CtypesArray[llama_token],
+    n_token_count: Union[ctypes.c_size_t, int],
+    /,
+) -> bool:
+    _warn_deprecated("llama_save_session_file", "use llama_state_save_file instead")
+    return _llama_save_session_file(ctx, path_session, tokens, n_token_count)
+
+
 # // Get the exact size needed to copy the state of a single sequence
 # LLAMA_API size_t llama_state_seq_get_size(
 #         struct llama_context * ctx,
@@ -3300,6 +3478,16 @@ def llama_token_get_text(
 ) -> bytes: ...
 
 
+_llama_token_get_text = llama_token_get_text
+
+
+def llama_token_get_text(
+    vocab: llama_vocab_p, token: Union[llama_token, int], /
+) -> bytes:
+    _warn_deprecated("llama_token_get_text", "use llama_vocab_get_text instead")
+    return _llama_token_get_text(vocab, token)
+
+
 # DEPRECATED(LLAMA_API float llama_token_get_score(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_score instead");
 @ctypes_function(
     "llama_token_get_score",
@@ -3311,6 +3499,16 @@ def llama_token_get_score(
 ) -> float: ...
 
 
+_llama_token_get_score = llama_token_get_score
+
+
+def llama_token_get_score(
+    vocab: llama_vocab_p, token: Union[llama_token, int], /
+) -> float:
+    _warn_deprecated("llama_token_get_score", "use llama_vocab_get_score instead")
+    return _llama_token_get_score(vocab, token)
+
+
 # DEPRECATED(LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_attr instead");
 @ctypes_function(
     "llama_token_get_attr",
@@ -3322,6 +3520,16 @@ def llama_token_get_attr(
 ) -> int: ...
 
 
+_llama_token_get_attr = llama_token_get_attr
+
+
+def llama_token_get_attr(
+    vocab: llama_vocab_p, token: Union[llama_token, int], /
+) -> int:
+    _warn_deprecated("llama_token_get_attr", "use llama_vocab_get_attr instead")
+    return _llama_token_get_attr(vocab, token)
+
+
 # DEPRECATED(LLAMA_API bool llama_token_is_eog(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_is_eog instead");
 @ctypes_function(
     "llama_token_is_eog",
@@ -3333,6 +3541,14 @@ def llama_token_is_eog(
 ) -> bool: ...
 
 
+_llama_token_is_eog = llama_token_is_eog
+
+
+def llama_token_is_eog(vocab: llama_vocab_p, token: Union[llama_token, int], /) -> bool:
+    _warn_deprecated("llama_token_is_eog", "use llama_vocab_is_eog instead")
+    return _llama_token_is_eog(vocab, token)
+
+
 # DEPRECATED(LLAMA_API bool llama_token_is_control(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_is_control instead");
 @ctypes_function(
     "llama_token_is_control",
@@ -3344,6 +3560,19 @@ def llama_token_is_control(
 ) -> bool: ...
 
 
+_llama_token_is_control = llama_token_is_control
+
+
+def llama_token_is_control(
+    vocab: llama_vocab_p, token: Union[llama_token, int], /
+) -> bool:
+    _warn_deprecated(
+        "llama_token_is_control",
+        "use llama_vocab_is_control instead",
+    )
+    return _llama_token_is_control(vocab, token)
+
+
 # DEPRECATED(LLAMA_API llama_token llama_token_bos(const struct llama_vocab * vocab), "use llama_vocab_bos instead");
 @ctypes_function(
     "llama_token_bos",
@@ -3353,6 +3582,14 @@ def llama_token_is_control(
 def llama_token_bos(vocab: llama_vocab_p, /) -> int: ...
 
 
+_llama_token_bos = llama_token_bos
+
+
+def llama_token_bos(vocab: llama_vocab_p, /) -> int:
+    _warn_deprecated("llama_token_bos", "use llama_vocab_bos instead")
+    return _llama_token_bos(vocab)
+
+
 # DEPRECATED(LLAMA_API llama_token llama_token_eos(const struct llama_vocab * vocab), "use llama_vocab_eos instead");
 @ctypes_function(
     "llama_token_eos",
@@ -3362,6 +3599,14 @@ def llama_token_bos(vocab: llama_vocab_p, /) -> int: ...
 def llama_token_eos(vocab: llama_vocab_p, /) -> int: ...
 
 
+_llama_token_eos = llama_token_eos
+
+
+def llama_token_eos(vocab: llama_vocab_p, /) -> int:
+    _warn_deprecated("llama_token_eos", "use llama_vocab_eos instead")
+    return _llama_token_eos(vocab)
+
+
 # DEPRECATED(LLAMA_API llama_token llama_token_eot(const struct llama_vocab * vocab), "use llama_vocab_eot instead");
 @ctypes_function(
     "llama_token_eot",
@@ -3371,6 +3616,14 @@ def llama_token_eos(vocab: llama_vocab_p, /) -> int: ...
 def llama_token_eot(vocab: llama_vocab_p, /) -> int: ...
 
 
+_llama_token_eot = llama_token_eot
+
+
+def llama_token_eot(vocab: llama_vocab_p, /) -> int:
+    _warn_deprecated("llama_token_eot", "use llama_vocab_eot instead")
+    return _llama_token_eot(vocab)
+
+
 # DEPRECATED(LLAMA_API llama_token llama_token_cls(const struct llama_vocab * vocab), "use llama_vocab_cls instead");
 @ctypes_function(
     "llama_token_cls",
@@ -3380,6 +3633,14 @@ def llama_token_eot(vocab: llama_vocab_p, /) -> int: ...
 def llama_token_cls(vocab: llama_vocab_p, /) -> int: ...
 
 
+_llama_token_cls = llama_token_cls
+
+
+def llama_token_cls(vocab: llama_vocab_p, /) -> int:
+    _warn_deprecated("llama_token_cls", "use llama_vocab_cls instead")
+    return _llama_token_cls(vocab)
+
+
 # DEPRECATED(LLAMA_API llama_token llama_token_sep(const struct llama_vocab * vocab), "use llama_vocab_sep instead");
 @ctypes_function(
     "llama_token_sep",
@@ -3389,6 +3650,14 @@ def llama_token_cls(vocab: llama_vocab_p, /) -> int: ...
 def llama_token_sep(vocab: llama_vocab_p, /) -> int: ...
 
 
+_llama_token_sep = llama_token_sep
+
+
+def llama_token_sep(vocab: llama_vocab_p, /) -> int:
+    _warn_deprecated("llama_token_sep", "use llama_vocab_sep instead")
+    return _llama_token_sep(vocab)
+
+
 # DEPRECATED(LLAMA_API llama_token llama_token_nl (const struct llama_vocab * vocab), "use llama_vocab_nl instead");
 @ctypes_function(
     "llama_token_nl",
@@ -3398,6 +3667,14 @@ def llama_token_sep(vocab: llama_vocab_p, /) -> int: ...
 def llama_token_nl(vocab: llama_vocab_p, /) -> int: ...
 
 
+_llama_token_nl = llama_token_nl
+
+
+def llama_token_nl(vocab: llama_vocab_p, /) -> int:
+    _warn_deprecated("llama_token_nl", "use llama_vocab_nl instead")
+    return _llama_token_nl(vocab)
+
+
 # DEPRECATED(LLAMA_API llama_token llama_token_pad(const struct llama_vocab * vocab), "use llama_vocab_pad instead");
 @ctypes_function(
     "llama_token_pad",
@@ -3407,6 +3684,14 @@ def llama_token_nl(vocab: llama_vocab_p, /) -> int: ...
 def llama_token_pad(vocab: llama_vocab_p, /) -> int: ...
 
 
+_llama_token_pad = llama_token_pad
+
+
+def llama_token_pad(vocab: llama_vocab_p, /) -> int:
+    _warn_deprecated("llama_token_pad", "use llama_vocab_pad instead")
+    return _llama_token_pad(vocab)
+
+
 # DEPRECATED(LLAMA_API bool llama_add_bos_token(const struct llama_vocab * vocab), "use llama_vocab_get_add_bos instead");
 @ctypes_function(
     "llama_add_bos_token",
@@ -3416,6 +3701,14 @@ def llama_token_pad(vocab: llama_vocab_p, /) -> int: ...
 def llama_add_bos_token(vocab: llama_vocab_p, /) -> bool: ...
 
 
+_llama_add_bos_token = llama_add_bos_token
+
+
+def llama_add_bos_token(vocab: llama_vocab_p, /) -> bool:
+    _warn_deprecated("llama_add_bos_token", "use llama_vocab_get_add_bos instead")
+    return _llama_add_bos_token(vocab)
+
+
 # DEPRECATED(LLAMA_API bool llama_add_eos_token(const struct llama_vocab * vocab), "use llama_vocab_get_add_eos instead");
 @ctypes_function(
     "llama_add_eos_token",
@@ -3425,6 +3718,14 @@ def llama_add_bos_token(vocab: llama_vocab_p, /) -> bool: ...
 def llama_add_eos_token(vocab: llama_vocab_p, /) -> bool: ...
 
 
+_llama_add_eos_token = llama_add_eos_token
+
+
+def llama_add_eos_token(vocab: llama_vocab_p, /) -> bool:
+    _warn_deprecated("llama_add_eos_token", "use llama_vocab_get_add_eos instead")
+    return _llama_add_eos_token(vocab)
+
+
 # DEPRECATED(LLAMA_API llama_token llama_token_fim_pre(const struct llama_vocab * vocab), "use llama_vocab_fim_pre instead");
 @ctypes_function(
     "llama_token_fim_pre",
@@ -3434,6 +3735,14 @@ def llama_add_eos_token(vocab: llama_vocab_p, /) -> bool: ...
 def llama_token_fim_pre(vocab: llama_vocab_p, /) -> llama_token: ...
 
 
+_llama_token_fim_pre = llama_token_fim_pre
+
+
+def llama_token_fim_pre(vocab: llama_vocab_p, /) -> llama_token:
+    _warn_deprecated("llama_token_fim_pre", "use llama_vocab_fim_pre instead")
+    return _llama_token_fim_pre(vocab)
+
+
 # DEPRECATED(LLAMA_API llama_token llama_token_fim_suf(const struct llama_vocab * vocab), "use llama_vocab_fim_suf instead");
 @ctypes_function(
     "llama_token_fim_suf",
@@ -3443,6 +3752,14 @@ def llama_token_fim_pre(vocab: llama_vocab_p, /) -> llama_token: ...
 def llama_token_fim_suf(vocab: llama_vocab_p, /) -> llama_token: ...
 
 
+_llama_token_fim_suf = llama_token_fim_suf
+
+
+def llama_token_fim_suf(vocab: llama_vocab_p, /) -> llama_token:
+    _warn_deprecated("llama_token_fim_suf", "use llama_vocab_fim_suf instead")
+    return _llama_token_fim_suf(vocab)
+
+
 # DEPRECATED(LLAMA_API llama_token llama_token_fim_mid(const struct llama_vocab * vocab), "use llama_vocab_fim_mid instead");
 @ctypes_function(
     "llama_token_fim_mid",
@@ -3452,6 +3769,14 @@ def llama_token_fim_suf(vocab: llama_vocab_p, /) -> llama_token: ...
 def llama_token_fim_mid(vocab: llama_vocab_p, /) -> llama_token: ...
 
 
+_llama_token_fim_mid = llama_token_fim_mid
+
+
+def llama_token_fim_mid(vocab: llama_vocab_p, /) -> llama_token:
+    _warn_deprecated("llama_token_fim_mid", "use llama_vocab_fim_mid instead")
+    return _llama_token_fim_mid(vocab)
+
+
 # DEPRECATED(LLAMA_API llama_token llama_token_fim_pad(const struct llama_vocab * vocab), "use llama_vocab_fim_pad instead");
 @ctypes_function(
     "llama_token_fim_pad",
@@ -3461,6 +3786,14 @@ def llama_token_fim_mid(vocab: llama_vocab_p, /) -> llama_token: ...
 def llama_token_fim_pad(vocab: llama_vocab_p, /) -> llama_token: ...
 
 
+_llama_token_fim_pad = llama_token_fim_pad
+
+
+def llama_token_fim_pad(vocab: llama_vocab_p, /) -> llama_token:
+    _warn_deprecated("llama_token_fim_pad", "use llama_vocab_fim_pad instead")
+    return _llama_token_fim_pad(vocab)
+
+
 # DEPRECATED(LLAMA_API llama_token llama_token_fim_rep(const struct llama_vocab * vocab), "use llama_vocab_fim_rep instead");
 @ctypes_function(
     "llama_token_fim_rep",
@@ -3470,6 +3803,14 @@ def llama_token_fim_pad(vocab: llama_vocab_p, /) -> llama_token: ...
 def llama_token_fim_rep(vocab: llama_vocab_p, /) -> llama_token: ...
 
 
+_llama_token_fim_rep = llama_token_fim_rep
+
+
+def llama_token_fim_rep(vocab: llama_vocab_p, /) -> llama_token:
+    _warn_deprecated("llama_token_fim_rep", "use llama_vocab_fim_rep instead")
+    return _llama_token_fim_rep(vocab)
+
+
 # DEPRECATED(LLAMA_API llama_token llama_token_fim_sep(const struct llama_vocab * vocab), "use llama_vocab_fim_sep instead");
 @ctypes_function(
     "llama_token_fim_sep",
@@ -3479,6 +3820,14 @@ def llama_token_fim_rep(vocab: llama_vocab_p, /) -> llama_token: ...
 def llama_token_fim_sep(vocab: llama_vocab_p, /) -> llama_token: ...
 
 
+_llama_token_fim_sep = llama_token_fim_sep
+
+
+def llama_token_fim_sep(vocab: llama_vocab_p, /) -> llama_token:
+    _warn_deprecated("llama_token_fim_sep", "use llama_vocab_fim_sep instead")
+    return _llama_token_fim_sep(vocab)
+
+
 # // CLS is equivalent to BOS
 # DEPRECATED(LLAMA_API llama_token llama_vocab_cls(const struct llama_vocab * vocab), // classification
 #         "use llama_vocab_bos instead");
@@ -3490,6 +3839,14 @@ def llama_token_fim_sep(vocab: llama_vocab_p, /) -> llama_token: ...
 def llama_vocab_cls(vocab: llama_vocab_p, /) -> llama_token: ...
 
 
+_llama_vocab_cls = llama_vocab_cls
+
+
+def llama_vocab_cls(vocab: llama_vocab_p, /) -> llama_token:
+    _warn_deprecated("llama_vocab_cls", "use llama_vocab_bos instead")
+    return _llama_vocab_cls(vocab)
+
+
 # //
 # // Tokenization
 # //
@@ -4146,6 +4503,34 @@ def llama_sampler_init_grammar_lazy(
 ) -> llama_sampler_p: ...
 
 
+_llama_sampler_init_grammar_lazy = llama_sampler_init_grammar_lazy
+
+
+def llama_sampler_init_grammar_lazy(
+    vocab: llama_vocab_p,
+    grammar_str: bytes,
+    grammar_root: bytes,
+    trigger_words: CtypesArray[bytes],
+    num_trigger_words: int,
+    trigger_tokens: CtypesArray[llama_token],
+    num_trigger_tokens: int,
+    /,
+) -> llama_sampler_p:
+    _warn_deprecated(
+        "llama_sampler_init_grammar_lazy",
+        "use llama_sampler_init_grammar_lazy_patterns instead",
+    )
+    return _llama_sampler_init_grammar_lazy(
+        vocab,
+        grammar_str,
+        grammar_root,
+        trigger_words,
+        num_trigger_words,
+        trigger_tokens,
+        num_trigger_tokens,
+    )
+
+
 # /// @details Lazy grammar sampler, introduced in https://github.com/ggml-org/llama.cpp/pull/9639
 # LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy_patterns(
 #     const struct llama_vocab * vocab,
diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py
index 787683179..f28402775 100644
--- a/llama_cpp/mtmd_cpp.py
+++ b/llama_cpp/mtmd_cpp.py
@@ -79,6 +79,12 @@
 
 # Structures
 class mtmd_context_params(Structure):
+    """Context parameters for MTMD initialization.
+
+    `image_marker` is deprecated upstream and kept for compatibility; use
+    `media_marker` for multimodal prompt placeholders.
+    """
+
     if TYPE_CHECKING:
         use_gpu: bool
         print_timings: bool
@@ -108,6 +114,8 @@ class mtmd_context_params(Structure):
 
 
 class mtmd_input_text(Structure):
+    """Text input passed to `mtmd_tokenize`."""
+
     _fields_ = [
         ("text", c_char_p),
         ("add_special", c_bool),
@@ -122,12 +130,16 @@ class mtmd_input_text(Structure):
 
 # MTMD_API const char * mtmd_default_marker(void);
 @ctypes_function("mtmd_default_marker", [], c_char_p)
-def mtmd_default_marker() -> bytes: ...
+def mtmd_default_marker() -> bytes:
+    """Return the default media marker."""
+    ...
 
 
 # MTMD_API struct mtmd_context_params mtmd_context_params_default(void);
 @ctypes_function("mtmd_context_params_default", [], mtmd_context_params)
-def mtmd_context_params_default() -> mtmd_context_params: ...
+def mtmd_context_params_default() -> mtmd_context_params:
+    """Return the default MTMD context parameters."""
+    ...
 
 
 # MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
@@ -143,7 +155,9 @@ def mtmd_init_from_file(
     text_model: llama_cpp.llama_model_p,
     ctx_params: mtmd_context_params,
     /,
-) -> Optional[mtmd_context_p]: ...
+) -> Optional[mtmd_context_p]:
+    """Initialize the MTMD context from a projector file. Returns None on failure."""
+    ...
 
 
 # MTMD_API void mtmd_free(mtmd_context * ctx);
@@ -167,7 +181,9 @@ def mtmd_decode_use_mrope(ctx: mtmd_context_p, /) -> bool:
 
 # MTMD_API bool mtmd_support_vision(mtmd_context * ctx);
 @ctypes_function("mtmd_support_vision", [mtmd_context_p_ctypes], c_bool)
-def mtmd_support_vision(ctx: mtmd_context_p, /) -> bool: ...
+def mtmd_support_vision(ctx: mtmd_context_p, /) -> bool:
+    """Check whether the current model supports vision input."""
+    ...
 
 
 # MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
@@ -180,7 +196,7 @@ def mtmd_support_audio(ctx: mtmd_context_p, /) -> bool:
 # MTMD_API int mtmd_get_audio_sample_rate(mtmd_context * ctx);
 @ctypes_function("mtmd_get_audio_sample_rate", [mtmd_context_p_ctypes], c_int)
 def mtmd_get_audio_sample_rate(ctx: mtmd_context_p, /) -> int:
-    """Get the MTMD audio sample rate."""
+    """Get the audio sample rate in Hz. Returns -1 if audio is not supported."""
     ...
 
 
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index c0159f9c1..f49e91787 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit c0159f9c1f874da15e94f371d136f5920b4b5335
+Subproject commit f49e9178767d557a522618b16ce8694f9ddac628

From 08e088cde15e0b56e46337d6f49000518c18c203 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Thu, 2 Apr 2026 23:22:02 -0700
Subject: [PATCH 2/2] fix(misc): replace deprecated llama.cpp references
 (#2170)

* refactor: replace deprecated llama.cpp references

* docs: update changelog for recent llama.cpp changes
---
 CHANGELOG.md                                  |  2 ++
 README.md                                     | 16 ++++++----
 examples/batch-processing/server.py           |  4 +--
 .../low_level_api/low_level_api_chat_cpp.py   | 31 ++++++++++---------
 .../low_level_api/low_level_api_llama_cpp.py  | 26 +++++++++-------
 examples/notebooks/Batching.ipynb             | 15 ++++-----
 llama_cpp/_internals.py                       |  6 ++--
 llama_cpp/llama.py                            | 11 ++++---
 8 files changed, 63 insertions(+), 48 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e577324db..f5f3677e5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- refactor: Replace deprecated llama.cpp references in library, docs, and examples by @abetlen in #2170
+- feat: Update llama.cpp to ggerganov/llama.cpp@f49e9178767d557a522618b16ce8694f9ddac628 by @abetlen in #2169
 - feat(server): Add model-load `chat_template_kwargs` support and document the CLI/config usage by @abetlen in #2168
 - ci: Publish release wheels as `py3-none` by @Bing-su in #2166
 - fix(ci): Publish distinct manylinux and musllinux CPU wheels by @abetlen in #2165
diff --git a/README.md b/README.md
index 8ba4dbb5e..69a0f8234 100644
--- a/README.md
+++ b/README.md
@@ -717,16 +717,20 @@ Below is a short example demonstrating how to use the low-level API to tokenize
 ```python
 import llama_cpp
 import ctypes
-llama_cpp.llama_backend_init(False) # Must be called once at the start of each program
-params = llama_cpp.llama_context_default_params()
+llama_cpp.llama_backend_init()  # Must be called once at the start of each program
+model_params = llama_cpp.llama_model_default_params()
+ctx_params = llama_cpp.llama_context_default_params()
+prompt = b"Q: Name the planets in the solar system? A: "
 # use bytes for char * params
-model = llama_cpp.llama_load_model_from_file(b"./models/7b/llama-model.gguf", params)
-ctx = llama_cpp.llama_new_context_with_model(model, params)
-max_tokens = params.n_ctx
+model = llama_cpp.llama_model_load_from_file(b"./models/7b/llama-model.gguf", model_params)
+ctx = llama_cpp.llama_init_from_model(model, ctx_params)
+vocab = llama_cpp.llama_model_get_vocab(model)
+max_tokens = ctx_params.n_ctx
 # use ctypes arrays for array params
 tokens = (llama_cpp.llama_token * int(max_tokens))()
-n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, llama_cpp.c_bool(True))
+n_tokens = llama_cpp.llama_tokenize(vocab, prompt, len(prompt), tokens, max_tokens, True, False)
 llama_cpp.llama_free(ctx)
+llama_cpp.llama_model_free(model)
 ```
 
 Check out the [examples folder](examples/low_level_api) for more examples of using the low-level API.
diff --git a/examples/batch-processing/server.py b/examples/batch-processing/server.py
index 0b36746f9..2b6fa759e 100644
--- a/examples/batch-processing/server.py
+++ b/examples/batch-processing/server.py
@@ -6,14 +6,14 @@
 # path = b"../../models/Qwen1.5-0.5B-Chat-GGUF/qwen1_5-0_5b-chat-q8_0.gguf"
 
 # model_params = llama_cpp.llama_model_default_params()
-# model = llama_cpp.llama_load_model_from_file(path, model_params)
+# model = llama_cpp.llama_model_load_from_file(path, model_params)
 
 # if model is None:
 #     raise RuntimeError(f"Failed to load model from file: {path}")
 
 
 # ctx_params = llama_cpp.llama_context_default_params()
-# ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)
+# ctx = llama_cpp.llama_init_from_model(model, ctx_params)
 
 # if ctx is None:
 #     raise RuntimeError("Failed to create context")
diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py
index 39081be17..20f7a158a 100644
--- a/examples/low_level_api/low_level_api_chat_cpp.py
+++ b/examples/low_level_api/low_level_api_chat_cpp.py
@@ -79,19 +79,22 @@ def __init__(self, params: GptParams) -> None:
         self.lparams.use_mlock = self.params.use_mlock
         self.lparams.use_mmap = self.params.use_mmap
 
-        self.model = llama_cpp.llama_load_model_from_file(
+        self.model = llama_cpp.llama_model_load_from_file(
             self.params.model.encode("utf8"), self.lparams
         )
+        self.vocab = llama_cpp.llama_model_get_vocab(self.model)
 
         # Context Params.
         self.cparams = llama_cpp.llama_context_default_params()
 
-        self.ctx = llama_cpp.llama_new_context_with_model(self.model, self.cparams)
+        self.ctx = llama_cpp.llama_init_from_model(self.model, self.cparams)
         if not self.ctx:
             raise RuntimeError(f"error: failed to load model '{self.params.model}'")
 
         if self.params.ignore_eos:
-            self.params.logit_bias[llama_cpp.llama_token_eos()] = -float("inf")
+            self.params.logit_bias[llama_cpp.llama_vocab_eos(self.vocab)] = -float(
+                "inf"
+            )
 
         if len(self.params.lora_adapter) > 0:
             if (
@@ -153,7 +156,7 @@ def __init__(self, params: GptParams) -> None:
                 _session_tokens = (llama_cpp.llama_token * (self.params.n_ctx))()
                 _n_token_count_out = llama_cpp.c_size_t()
                 if (
-                    llama_cpp.llama_load_session_file(
+                    llama_cpp.llama_state_load_file(
                         self.ctx,
                         self.params.path_session.encode("utf8"),
                         _session_tokens,
@@ -314,7 +317,7 @@ def __init__(self, params: GptParams) -> None:
     def _tokenize(self, prompt, bos=True):
         _arr = (llama_cpp.llama_token * ((len(prompt) + 1) * 4))()
         _n = llama_cpp.llama_tokenize(
-            self.model,
+            self.vocab,
             prompt.encode("utf8", errors="ignore"),
-            len(prompt),
+            len(prompt.encode("utf8", errors="ignore")),
             _arr,
@@ -406,7 +409,7 @@ def generate(self):
             if len(self.embd_inp) <= self.input_consumed:  # && !is_interacting
                 # out of user input, sample next token
                 top_k = (
-                    llama_cpp.llama_n_vocab(self.ctx)
+                    llama_cpp.llama_vocab_n_tokens(self.vocab)
                     if self.params.top_k <= 0
                     else self.params.top_k
                 )
@@ -419,7 +422,7 @@ def generate(self):
                 # optionally save the session on first sample (for faster prompt loading next time)
                 if len(self.params.path_session) > 0 and self.need_to_save_session:
                     self.need_to_save_session = False
-                    llama_cpp.llama_save_session_file(
+                    llama_cpp.llama_state_save_file(
                         self.ctx,
                         self.params.path_session.encode("utf8"),
                         (llama_cpp.llama_token * len(self.session_tokens))(
@@ -431,7 +434,7 @@ def generate(self):
                 id = 0
 
                 logits = llama_cpp.llama_get_logits(self.ctx)
-                n_vocab = llama_cpp.llama_n_vocab(self.model)
+                n_vocab = llama_cpp.llama_vocab_n_tokens(self.vocab)
 
                 # Apply params.logit_bias map
                 for key, value in self.params.logit_bias.items():
@@ -448,7 +451,7 @@ def generate(self):
                 )
 
                 # Apply penalties
-                nl_logit = logits[llama_cpp.llama_token_nl(self.ctx)]
+                nl_logit = logits[llama_cpp.llama_vocab_nl(self.vocab)]
                 last_n_repeat = min(len(self.last_n_tokens), repeat_last_n, self.n_ctx)
 
                 _arr = (llama_cpp.llama_token * last_n_repeat)(
@@ -470,7 +473,7 @@ def generate(self):
                 # 	last_n_repeat, llama_cpp.c_float(self.params.frequency_penalty), llama_cpp.c_float(self.params.presence_penalty))
 
                 if not self.params.penalize_nl:
-                    logits[llama_cpp.llama_token_nl()] = nl_logit
+                    logits[llama_cpp.llama_vocab_nl(self.vocab)] = nl_logit
 
                 if self.params.temp <= 0:
                     # Greedy sampling
@@ -539,7 +542,7 @@ def generate(self):
 
                 # replace end of text token with newline token when in interactive mode
                 if (
-                    id == llama_cpp.llama_token_eos(self.ctx)
+                    id == llama_cpp.llama_vocab_eos(self.vocab)
                     and self.params.interactive
                     and not self.params.instruct
                 ):
@@ -599,8 +602,8 @@ def generate(self):
                     break
 
             # end of text token
-            if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_token_eos(
-                self.ctx
+            if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_vocab_eos(
+                self.vocab
             ):
                 if not self.params.instruct:
                     for i in self.llama_token_eot:
@@ -636,7 +639,7 @@ def token_to_str(self, token_id: int) -> bytes:
         size = 32
         buffer = (ctypes.c_char * size)()
         n = llama_cpp.llama_token_to_piece(
-            self.model, llama_cpp.llama_token(token_id), buffer, size
+            self.vocab, llama_cpp.llama_token(token_id), buffer, size, 0, False
         )
         assert n <= size
         return bytes(buffer[:n])
diff --git a/examples/low_level_api/low_level_api_llama_cpp.py b/examples/low_level_api/low_level_api_llama_cpp.py
index ba3545771..9fb3424ec 100644
--- a/examples/low_level_api/low_level_api_llama_cpp.py
+++ b/examples/low_level_api/low_level_api_llama_cpp.py
@@ -4,7 +4,7 @@
 
 import llama_cpp
 
-llama_cpp.llama_backend_init(numa=False)
+llama_cpp.llama_backend_init()
 
 N_THREADS = multiprocessing.cpu_count()
 MODEL_PATH = os.environ.get("MODEL", "../models/7B/ggml-model.bin")
@@ -13,8 +13,9 @@
 
 lparams = llama_cpp.llama_model_default_params()
 cparams = llama_cpp.llama_context_default_params()
-model = llama_cpp.llama_load_model_from_file(MODEL_PATH.encode("utf-8"), lparams)
-ctx = llama_cpp.llama_new_context_with_model(model, cparams)
+model = llama_cpp.llama_model_load_from_file(MODEL_PATH.encode("utf-8"), lparams)
+ctx = llama_cpp.llama_init_from_model(model, cparams)
+vocab = llama_cpp.llama_model_get_vocab(model)
 
 # determine the required inference memory per token:
 tmp = [0, 1, 2, 3]
@@ -28,13 +29,13 @@
 
 embd_inp = (llama_cpp.llama_token * (len(prompt) + 1))()
 n_of_tok = llama_cpp.llama_tokenize(
-    model=model,
-    text=bytes(str(prompt), "utf-8"),
-    text_len=len(embd_inp),
+    vocab=vocab,
+    text=prompt,
+    text_len=len(prompt),
     tokens=embd_inp,
-    n_max_tokens=len(embd_inp),
-    add_bos=False,
-    special=False,
+    n_tokens_max=len(embd_inp),
+    add_special=False,
+    parse_special=False,
 )
 embd_inp = embd_inp[:n_of_tok]
 
@@ -70,7 +71,7 @@
     embd = []
     if len(embd_inp) <= input_consumed:
         logits = llama_cpp.llama_get_logits(ctx)
-        n_vocab = llama_cpp.llama_n_vocab(model)
+        n_vocab = llama_cpp.llama_vocab_n_tokens(vocab)
 
         _arr = (llama_cpp.llama_token_data * n_vocab)(
             *[
@@ -114,7 +115,7 @@
             size = 32
             buffer = (ctypes.c_char * size)()
             n = llama_cpp.llama_token_to_piece(
-                model, llama_cpp.llama_token(id), buffer, size
+                vocab, llama_cpp.llama_token(id), buffer, size, 0, False
             )
             assert n <= size
             print(
@@ -123,7 +124,7 @@
                 flush=True,
             )
 
-    if len(embd) > 0 and embd[-1] == llama_cpp.llama_token_eos(ctx):
+    if len(embd) > 0 and embd[-1] == llama_cpp.llama_vocab_eos(vocab):
         break
 
 print()
@@ -131,3 +132,4 @@
 llama_cpp.llama_print_timings(ctx)
 
 llama_cpp.llama_free(ctx)
+llama_cpp.llama_model_free(model)
diff --git a/examples/notebooks/Batching.ipynb b/examples/notebooks/Batching.ipynb
index be7fe9b52..0b36cd9c3 100644
--- a/examples/notebooks/Batching.ipynb
+++ b/examples/notebooks/Batching.ipynb
@@ -122,9 +122,10 @@
    "source": [
     "params = llama_cpp.llama_model_default_params()\n",
     "params.n_gpu_layers = 35\n",
-    "model = llama_cpp.llama_load_model_from_file(\n",
+    "model = llama_cpp.llama_model_load_from_file(\n",
     "    b\"/workspaces/llama-cpp-python/mistral-7b-v0.1.Q2_K.gguf\", params\n",
-    ")  # Update this to whatever"
+    ")  # Update this to whatever\n",
+    "vocab = llama_cpp.llama_model_get_vocab(model)"
    ]
   },
   {
@@ -149,7 +150,7 @@
     "\n",
     "tokens = (llama_cpp.llama_token * n_ctx)()\n",
     "tokens_len = llama_cpp.llama_tokenize(\n",
-    "    model, prompt, len(prompt), tokens, len(tokens), True, True\n",
+    "    vocab, prompt, len(prompt), tokens, len(tokens), True, True\n",
     ")\n",
     "print(tokens[:tokens_len])\n",
     "\n",
@@ -188,7 +189,7 @@
     "ctx_params.n_batch = max(n_len, n_parallel)\n",
     "ctx_params.n_threads = 1\n",
     "ctx_params.n_threads_batch = 1\n",
-    "ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)"
+    "ctx = llama_cpp.llama_init_from_model(model, ctx_params)"
    ]
   },
   {
@@ -338,14 +339,14 @@
     "        # Sample the next token using the sampler chain\n",
     "        new_token_id = llama_cpp.llama_sampler_sample(sampler_chain, ctx, -1)\n",
     "\n",
-    "        if new_token_id == llama_cpp.llama_token_eos(ctx) or n_cur == n_len:\n",
+    "        if new_token_id == llama_cpp.llama_vocab_eos(vocab) or n_cur == n_len:\n",
     "            i_batch[i] = -1\n",
     "            continue\n",
     "\n",
     "        buf = (ctypes.c_char * 32)()\n",
     "        \n",
     "        # Convert token ID to text\n",
-    "        outlen = llama_cpp.llama_token_to_piece(model, new_token_id, buf, len(buf), 0, False)\n",
+    "        outlen = llama_cpp.llama_token_to_piece(vocab, new_token_id, buf, len(buf), 0, False)\n",
     "        streams[i] += bytes(buf[:outlen]).decode(\"utf-8\")\n",
     "\n",
     "        batch.token[batch.n_tokens] = new_token_id\n",
@@ -411,7 +412,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "llama_cpp.llama_free_model(model)"
+    "llama_cpp.llama_model_free(model)"
    ]
   },
   {
diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
index 9e9bcd407..cde52c8c8 100644
--- a/llama_cpp/_internals.py
+++ b/llama_cpp/_internals.py
@@ -135,7 +135,7 @@ def token_eos(self) -> int:
         return llama_cpp.llama_vocab_eos(self.vocab)
 
     def token_cls(self) -> int:
-        return llama_cpp.llama_vocab_cls(self.vocab)
+        return llama_cpp.llama_vocab_bos(self.vocab)
 
     def token_sep(self) -> int:
         return llama_cpp.llama_vocab_sep(self.vocab)
@@ -317,9 +317,9 @@ def get_state_size(self) -> int:
 
     # TODO: set_state_data
 
-    # TODO: llama_load_session_file
+    # TODO: llama_state_load_file
 
-    # TODO: llama_save_session_file
+    # TODO: llama_state_save_file
 
     def decode(self, batch: LlamaBatch):
         return_code = llama_cpp.llama_decode(
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index ad484c4d5..11fe169cf 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -1351,7 +1351,7 @@ def logit_bias_processor(
             logits_processor=logits_processor,
             grammar=grammar,
         ):
-            if llama_cpp.llama_token_is_eog(self._model.vocab, token):
+            if llama_cpp.llama_vocab_is_eog(self._model.vocab, token):
                 text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens)
                 finish_reason = "stop"
                 break
@@ -2148,13 +2148,13 @@ def __setstate__(self, state):
     def save_state(self) -> LlamaState:
         if self.verbose:
             print("Llama.save_state: saving llama state", file=sys.stderr)
-        state_size = llama_cpp.llama_get_state_size(self._ctx.ctx)
+        state_size = llama_cpp.llama_state_get_size(self._ctx.ctx)
         if self.verbose:
             print(f"Llama.save_state: got state size: {state_size}", file=sys.stderr)
         llama_state = (ctypes.c_uint8 * int(state_size))()
         if self.verbose:
             print("Llama.save_state: allocated state", file=sys.stderr)
-        n_bytes = llama_cpp.llama_copy_state_data(self._ctx.ctx, llama_state)
+        n_bytes = llama_cpp.llama_state_get_data(self._ctx.ctx, llama_state, state_size)
         if self.verbose:
             print(f"Llama.save_state: copied llama state: {n_bytes}", file=sys.stderr)
         if int(n_bytes) > int(state_size):
@@ -2187,7 +2187,10 @@ def load_state(self, state: LlamaState) -> None:
         LLamaStateArrayType = ctypes.c_uint8 * state_size
         llama_state = LLamaStateArrayType.from_buffer_copy(state.llama_state)
 
-        if llama_cpp.llama_set_state_data(self._ctx.ctx, llama_state) != state_size:
+        if (
+            llama_cpp.llama_state_set_data(self._ctx.ctx, llama_state, state_size)
+            != state_size
+        ):
             raise RuntimeError("Failed to set llama state data")
 
     def n_ctx(self) -> int: