From 4d6136a8fa2832c18736935dfd5ab338c11e848f Mon Sep 17 00:00:00 2001 From: Ralf Waldukat Date: Sat, 4 Apr 2026 14:21:55 +0700 Subject: [PATCH 1/2] chore: Update llama.cpp to ggerganov/llama.cpp@d006858 --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index f49e91787..d00685831 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit f49e9178767d557a522618b16ce8694f9ddac628 +Subproject commit d006858316d4650bb4da0c6923294ccd741caefd From f9dd86ce33af9ef8529422eae03373c1b8ac1c5b Mon Sep 17 00:00:00 2001 From: Ralf Waldukat Date: Tue, 13 Jan 2026 23:08:46 +0700 Subject: [PATCH 2/2] fix: critical fixes for recurrent/hybrid model support After external code review (GPT-5.2), fixed 3 critical issues: 1. CRITICAL: Fixed tokens[:-1] bug in prefix matching - Was silently breaking prefix matching for ALL models - Caused false rewind detection and cache inefficiency - Impact: Transformers AND recurrent models 2. CRITICAL: Implement proper reset() for recurrent models - Now actually clears llama_memory backend state - Root cause fix for 'sequence positions not consecutive' crash - Without this, reset was a no-op for recurrent models 3. CRITICAL: Enforce strict append policy for recurrent models - Prevents KV cache rewinding that's impossible without state snapshots - Forces full reset on history edits instead of crashing 4. Performance: Cache _is_recurrent to avoid repeated FFI calls 5. Documentation: Simplified comments and updated docstring 6. Testing: All existing tests pass + Mistral-Small-3.2-24B validated Resolves multi-turn crashes for Nemotron-A3B, Mamba, RWKV, Jamba models. 
Reviewed-by: GPT-5.2 (OpenAI) Tested-by: pytest + Mistral-Small-3.2-24B Fixes: #2108 (recurrent model crashes) Compatible-with: #2109 (Granite-Docling/SmolVLM special tokens) --- llama_cpp/llama.py | 41 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 11fe169cf..5eec03532 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -192,6 +192,11 @@ def __init__( type_v: KV cache data type for V (default: f16) spm_infill: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. + Note: + Recurrent and hybrid models (Mamba, RWKV, Nemotron-A3B, Jamba) cannot + rewind their state and require full reset on history edits. This is handled + automatically to maintain compatibility. Standard transformers are unaffected. + Raises: ValueError: If the model path does not exist. @@ -553,6 +558,11 @@ def free_lora_adapter(): self._sampler = None + # Cache recurrent/hybrid model detection to avoid repeated FFI calls + self._is_recurrent_model = llama_cpp.llama_model_is_recurrent( + self._model.model + ) or llama_cpp.llama_model_is_hybrid(self._model.model) + @property def ctx(self) -> llama_cpp.llama_context_p: return self._ctx.ctx @@ -580,6 +590,19 @@ def eval_logits(self) -> Deque[List[float]]: maxlen=self._n_ctx if self._logits_all else 1, ) + @property + def _is_recurrent(self) -> bool: + """Check if model is recurrent (SSM) or hybrid (SSM+Attention). + + These models (Mamba, RWKV, Nemotron, Jamba, etc.) cannot rewind their + recurrent state without snapshots. Only strict forward progression or + full reset is allowed. + + Returns: + True if model has recurrent state that cannot be rewound. 
+ """ + return self._is_recurrent_model + def tokenize( self, text: bytes, add_bos: bool = True, special: bool = False ) -> List[int]: @@ -638,6 +661,11 @@ def reset(self): """Reset the model state.""" self.n_tokens = 0 + if self._is_recurrent: + mem = llama_cpp.llama_get_memory(self._ctx.ctx) + if mem is not None: + llama_cpp.llama_memory_clear(mem, True) + def eval(self, tokens: Sequence[int]): """Evaluate a list of tokens. @@ -888,11 +916,22 @@ def generate( # Check for kv cache prefix match if reset and self.n_tokens > 0: longest_prefix = 0 - for a, b in zip(self._input_ids, tokens[:-1]): + for a, b in zip(self._input_ids, tokens): if a == b: longest_prefix += 1 else: break + + # Recurrent models cannot rewind state; reset if needed + if self._is_recurrent and longest_prefix < self.n_tokens: + longest_prefix = 0 + reset = True + if self.verbose: + print( + "Llama.generate: recurrent model requires full state reset", + file=sys.stderr, + ) + if longest_prefix > 0: if self._ctx.kv_cache_seq_rm(-1, longest_prefix, -1): reset = False