@@ -651,13 +651,7 @@ def reset(self):
651651 llama_cpp .llama_memory_clear (mem , True )
652652
653653 def eval (self , tokens : Sequence [int ]):
654- """Evaluate a list of tokens.
655-
656- Args:
657- tokens: The list of tokens to evaluate.
658- """
659- if len (tokens ) < self .n_tokens :
660- self ._ctx .kv_cache_seq_rm (- 1 , len (tokens ), - 1 )
654+ """Evaluate a list of tokens."""
661655 for i in range (0 , len (tokens ), self .n_batch ):
662656 batch = tokens [i : min (len (tokens ), i + self .n_batch )]
663657 n_past = self .n_tokens
@@ -666,26 +660,12 @@ def eval(self, tokens: Sequence[int]):
666660 batch = batch , n_past = n_past , logits_all = self ._logits_all
667661 )
668662 self ._ctx .decode (self ._batch )
669- # Save tokens
670663 self .input_ids [n_past : n_past + n_tokens ] = batch
671- # Save logits
672664 if self ._logits_all :
673- rows = n_tokens
674- cols = self ._n_vocab
675665 logits = np .ctypeslib .as_array (
676- self ._ctx .get_logits (), shape = (rows * cols , )
666+ self ._ctx .get_logits (), shape = (n_tokens , self . _n_vocab )
677667 )
678- self .scores [n_past : n_past + n_tokens , :].reshape (- 1 )[::] = logits
679- else :
680- # rows = 1
681- # cols = self._n_vocab
682- # logits = np.ctypeslib.as_array(
683- # self._ctx.get_logits(), shape=(rows * cols,)
684- # )
685- # self.scores[n_past + n_tokens - 1, :].reshape(-1)[::] = logits
686- # NOTE: Now that sampling is done inside the sampler, logits are only needed for logprobs which requires logits_all
687- pass
688- # Update n_tokens
668+ self .scores [n_past : n_past + n_tokens , :] = logits
689669 self .n_tokens += n_tokens
690670
691671 def _init_sampler (
@@ -907,34 +887,14 @@ def generate(
907887 else :
908888 break
909889
910- # Recurrent models cannot rewind state; reset if needed
911- if self ._is_recurrent_model and longest_prefix < self .n_tokens :
912- longest_prefix = 0
913- reset = True
914- if self .verbose :
915- print (
916- "Llama.generate: recurrent model requires full state reset" ,
917- file = sys .stderr ,
918- )
919-
920- # SWA/ISWA models (e.g. Gemma-4) have split KV caches whose
921- # position-tracking maps are only cleared by a full reset.
922- # Partial seq_rm leaves stale positions and causes decode failure.
923- if self ._has_swa_model and longest_prefix < self .n_tokens :
890+ if (self ._is_recurrent_model or self ._has_swa_model ) and longest_prefix < self .n_tokens :
924891 longest_prefix = 0
925- reset = True
926892
927893 if longest_prefix > 0 :
928894 if self ._ctx .kv_cache_seq_rm (- 1 , longest_prefix , - 1 ):
929895 reset = False
930896 tokens = tokens [longest_prefix :]
931897 self .n_tokens = longest_prefix
932- if self .verbose :
933- print (
934- f"Llama.generate: { longest_prefix } prefix-match hit, "
935- f"remaining { len (tokens )} prompt tokens to eval" ,
936- file = sys .stderr ,
937- )
938898 elif self .verbose :
939899 print (
940900 f"Llama.generate: { longest_prefix } prefix-match found "
(end of diff excerpt — page footer "0 commit comments" removed as a scraping artifact)