Skip to content

Commit 9609c82

Browse files
author
Ralf Waldukat
committed
fix: prevent KV cache corruption on SWA/ISWA models (e.g. Gemma-4)
SWA/ISWA KV caches maintain global position maps (g_iswa_pos_max/min) that are only cleared by llama_memory_clear(), not by kv_cache_seq_rm(). When generate() finds a prefix match (e.g. a shared BOS token), it calls kv_cache_seq_rm, which returns True for ISWA, skipping the full reset. The stale position maps then cause batch-allocator inconsistency, and llama_decode returns -1 on subsequent prompts. Changes: - Add _has_swa property via llama_model_n_swa() > 0 - reset() now calls llama_memory_clear() unconditionally - generate() bypasses the prefix-match optimization for SWA models, forcing a full state reset (same path as recurrent models)
1 parent 1cb8b9f commit 9609c82

File tree

3 files changed

+110
-2
lines changed

3 files changed

+110
-2
lines changed

llama_cpp/llama.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -553,6 +553,14 @@ def free_lora_adapter():
553553

554554
self._sampler = None
555555

556+
# Cache model architecture flags to avoid repeated FFI calls
557+
self._is_recurrent_model = llama_cpp.llama_model_is_recurrent(
558+
self._model.model
559+
) or llama_cpp.llama_model_is_hybrid(self._model.model)
560+
self._has_swa_model = llama_cpp.llama_model_n_swa(
561+
self._model.model
562+
) > 0
563+
556564
@property
557565
def ctx(self) -> llama_cpp.llama_context_p:
558566
return self._ctx.ctx
@@ -638,6 +646,10 @@ def reset(self):
638646
"""Reset the model state."""
639647
self.n_tokens = 0
640648

649+
mem = llama_cpp.llama_get_memory(self._ctx.ctx)
650+
if mem is not None:
651+
llama_cpp.llama_memory_clear(mem, True)
652+
641653
def eval(self, tokens: Sequence[int]):
642654
"""Evaluate a list of tokens.
643655
@@ -889,11 +901,29 @@ def generate(
889901
# Check for kv cache prefix match
890902
if reset and self.n_tokens > 0:
891903
longest_prefix = 0
892-
for a, b in zip(self._input_ids, tokens[:-1]):
904+
for a, b in zip(self._input_ids, tokens):
893905
if a == b:
894906
longest_prefix += 1
895907
else:
896908
break
909+
910+
# Recurrent models cannot rewind state; reset if needed
911+
if self._is_recurrent_model and longest_prefix < self.n_tokens:
912+
longest_prefix = 0
913+
reset = True
914+
if self.verbose:
915+
print(
916+
"Llama.generate: recurrent model requires full state reset",
917+
file=sys.stderr,
918+
)
919+
920+
# SWA/ISWA models (e.g. Gemma-4) have split KV caches whose
921+
# position-tracking maps are only cleared by a full reset.
922+
# Partial seq_rm leaves stale positions and causes decode failure.
923+
if self._has_swa_model and longest_prefix < self.n_tokens:
924+
longest_prefix = 0
925+
reset = True
926+
897927
if longest_prefix > 0:
898928
if self._ctx.kv_cache_seq_rm(-1, longest_prefix, -1):
899929
reset = False
@@ -1259,6 +1289,8 @@ def _create_completion(
12591289
RuntimeWarning,
12601290
)
12611291

1292+
# NOTE: This likely doesn't work correctly for the first token in the prompt
1293+
# because of the extra space added to the start of the prompt_tokens
12621294
if logit_bias is not None:
12631295
logit_bias_map = {int(k): float(v) for k, v in logit_bias.items()}
12641296

@@ -1682,6 +1714,7 @@ def logit_bias_processor(
16821714
for i, token in enumerate(all_tokens)
16831715
]
16841716
all_logprobs = Llama.logits_to_logprobs(self._scores)[token_offset:]
1717+
# TODO: may be able to change this loop to use np.take_along_dim
16851718
for idx, (token, token_str, logprobs_token) in enumerate(
16861719
zip(all_tokens, all_token_strs, all_logprobs)
16871720
):

test_gemma4_iswa.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
"""Test Gemma-4 ISWA model with sequential chat prompts.
2+
3+
Tests:
4+
1. ISWA fix: no 'llama_decode returned -1' on sequential prompts
5+
2. Output quality: coherent text with proper chat template
6+
"""
7+
8+
import sys
9+
import time
10+
11+
from llama_cpp import Llama
12+
13+
MODEL_PATH = "/Users/avion/Documents.nosync/projects/llama-cpp-python/vendor/llama.cpp/build/bin/../../../Downloads/models/supergemma4-26b-uncensored-fast-v2-Q4_K_M.gguf"
14+
15+
PROMPTS = [
16+
[{"role": "user", "content": "What is 2+2? Answer briefly."}],
17+
[{"role": "user", "content": "Write a Python hello world in one line."}],
18+
[{"role": "user", "content": "Explain recursion in one sentence."}],
19+
]
20+
21+
def main():
    """Load the model once, then run each prompt as an independent chat.

    Exercises the SWA/ISWA KV-cache reset path: before the fix, the second
    prompt on the same Llama instance failed with 'llama_decode returned -1'.
    Prints a per-chat result plus a summary and exits with status 1 if any
    chat failed.
    """
    print(f"Loading model: {MODEL_PATH}")
    t0 = time.time()
    llm = Llama(
        # Fixed: previously a second, hardcoded path literal was passed here,
        # inconsistent with the MODEL_PATH announced on the line above.
        model_path=MODEL_PATH,
        n_gpu_layers=-1,
        n_ctx=4096,
        verbose=False,
    )
    print(f"Model loaded in {time.time() - t0:.1f}s")

    # NOTE(review): the companion llama.py change defines `_has_swa_model` /
    # `_is_recurrent_model` (not `_has_swa` / `_is_recurrent`); use a getattr
    # fallback so this diagnostic probe never raises AttributeError regardless
    # of which spelling the installed build exposes.
    has_swa = getattr(llm, "_has_swa", getattr(llm, "_has_swa_model", None))
    is_recurrent = getattr(llm, "_is_recurrent", getattr(llm, "_is_recurrent_model", None))
    print(f"  _has_swa: {has_swa}")
    print(f"  _is_recurrent: {is_recurrent}")
    print(f"  n_ctx: {llm.n_ctx()}")
    print()

    results = []
    for i, messages in enumerate(PROMPTS):
        prompt = messages[0]["content"]
        print(f"--- Chat {i+1}: {prompt!r} ---")
        t1 = time.time()
        try:
            # Only the decode call sits in the try; result unpacking moved to
            # `else` so unrelated errors are not swallowed as decode failures.
            resp = llm.create_chat_completion(
                messages=messages,
                max_tokens=128,
                temperature=0.6,
                top_p=0.95,
                repeat_penalty=1.1,
            )
        except RuntimeError as e:
            # llama_decode failures surface as RuntimeError; record and keep
            # going so later prompts still report their own status.
            elapsed = time.time() - t1
            print(f"[FAIL] {elapsed:.1f}s: {e}")
            print()
            results.append(("FAIL", prompt, str(e)))
        else:
            elapsed = time.time() - t1
            text = resp["choices"][0]["message"]["content"]
            print(f"[OK] {elapsed:.1f}s, {len(text)} chars:")
            print(text[:300])
            print()
            results.append(("OK", prompt, text))

    print("=" * 60)
    print("SUMMARY")
    print("=" * 60)
    for i, (status, prompt, _detail) in enumerate(results):
        print(f"  [{status}] Chat {i+1}: {prompt!r}")

    ok = sum(1 for s, _, _ in results if s == "OK")
    fail = sum(1 for s, _, _ in results if s == "FAIL")
    print(f"\n  Passed: {ok}/{len(results)}, Failed: {fail}/{len(results)}")

    if fail > 0:
        sys.exit(1)
74+
if __name__ == "__main__":
75+
main()

vendor/llama.cpp

0 commit comments

Comments
 (0)