From 56a05711994ff21d7c768c60d69d3b7408e47d50 Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Thu, 2 Apr 2026 14:19:20 -0700
Subject: [PATCH 01/65] compile_modeld.py

---
 selfdrive/modeld/SConscript                   |  47 +++++-
 .../{compile_warp.py => compile_modeld.py}    | 120 +++++++++++---
 selfdrive/modeld/modeld.py                    | 156 +++++-------------
 3 files changed, 181 insertions(+), 142 deletions(-)
 rename selfdrive/modeld/{compile_warp.py => compile_modeld.py} (57%)

diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript
index bad1cdd5003d7d..53d063d58e52c9 100644
--- a/selfdrive/modeld/SConscript
+++ b/selfdrive/modeld/SConscript
@@ -30,14 +30,46 @@ for model_name in ['driving_vision', 'driving_off_policy', 'driving_on_policy',
 image_flag = {
      'larch64': 'IMAGE=2',
 }.get(arch, 'IMAGE=0')
-script_files = [File(Dir("#selfdrive/modeld").File("compile_warp.py").abspath)]
-compile_warp_cmd = f'{tg_flags} python3 {Dir("#selfdrive/modeld").abspath}/compile_warp.py '
+
 from openpilot.common.transformations.camera import _ar_ox_fisheye, _os_fisheye
-warp_targets = []
+
+# DM warp (compile_modeld.py handles dm warp separately)
+dm_warp_script = [File(Dir("#selfdrive/modeld").File("compile_modeld.py").abspath)]
+dm_warp_targets = []
+for cam in [_ar_ox_fisheye, _os_fisheye]:
+  w, h = cam.width, cam.height
+  dm_warp_targets.append(File(f"models/dm_warp_{w}x{h}_tinygrad.pkl").abspath)
+dm_warp_cmd = f'{tg_flags} python3 -c "from openpilot.selfdrive.modeld.compile_modeld import compile_dm_warp, CAMERA_CONFIGS; [compile_dm_warp(w, h) for w, h in CAMERA_CONFIGS]"'
+lenv.Command(dm_warp_targets, tinygrad_files + dm_warp_script, dm_warp_cmd)
+
+# Combined modeld JIT (warp + vision + on_policy + off_policy)
+compile_modeld_script = [File(Dir("#selfdrive/modeld").File("compile_modeld.py").abspath)]
+modeld_onnx_deps = [File(f"models/{m}.onnx").abspath for m in ['driving_vision', 'driving_on_policy', 'driving_off_policy']]
+modeld_meta_deps = [File(f"models/{m}_metadata.pkl").abspath for m in ['driving_vision', 'driving_on_policy', 'driving_off_policy']]
+modeld_targets = []
 for cam in [_ar_ox_fisheye, _os_fisheye]:
   w, h = cam.width, cam.height
-  warp_targets += [File(f"models/warp_{w}x{h}_tinygrad.pkl").abspath, File(f"models/dm_warp_{w}x{h}_tinygrad.pkl").abspath]
-lenv.Command(warp_targets, tinygrad_files + script_files, compile_warp_cmd)
+  modeld_targets.append(File(f"models/modeld_{w}x{h}_tinygrad.pkl").abspath)
+compile_modeld_cmd = f'{tg_flags} python3 -c "from openpilot.selfdrive.modeld.compile_modeld import compile_modeld, CAMERA_CONFIGS; [compile_modeld(w, h) for w, h in CAMERA_CONFIGS]"'
+modeld_compile_node = lenv.Command(modeld_targets, modeld_onnx_deps + modeld_meta_deps + tinygrad_files + compile_modeld_script + [chunker_file], compile_modeld_cmd)
+
+# Chunk modeld pickles
+modeld_chunk_targets = []
+for cam in [_ar_ox_fisheye, _os_fisheye]:
+  w, h = cam.width, cam.height
+  pkl = File(f"models/modeld_{w}x{h}_tinygrad.pkl").abspath
+  total_onnx_size = sum(os.path.getsize(File(f"models/{m}.onnx").abspath) for m in ['driving_vision', 'driving_on_policy', 'driving_off_policy'])
+  chunk_paths = get_chunk_paths(pkl, estimate_pickle_max_size(total_onnx_size))
+  modeld_chunk_targets.append((pkl, chunk_paths))
+
+def do_modeld_chunk(target, source, env):
+  for pkl, chunk_paths in modeld_chunk_targets:
+    chunk_file(pkl, chunk_paths)
+lenv.Command(
+  [p for _, paths in modeld_chunk_targets for p in paths],
+  modeld_compile_node,
+  do_modeld_chunk,
+)
 
 def tg_compile(flags, model_name):
   pythonpath_string = 'PYTHONPATH="${PYTHONPATH}:' + env.Dir("#tinygrad_repo").abspath + '"'
@@ -58,6 +90,5 @@ def tg_compile(flags, model_name):
     do_chunk,
   )
 
-# Compile small models
-for model_name in ['driving_vision', 'driving_off_policy', 'driving_on_policy', 'dmonitoring_model']:
-  tg_compile(tg_flags, model_name)
+# Compile DM model (still separate)
+tg_compile(tg_flags, 'dmonitoring_model')
diff --git a/selfdrive/modeld/compile_warp.py b/selfdrive/modeld/compile_modeld.py
similarity index 57%
rename from selfdrive/modeld/compile_warp.py
rename to selfdrive/modeld/compile_modeld.py
index 47511f2a2b6b23..5ca251c154e72f 100755
--- a/selfdrive/modeld/compile_warp.py
+++ b/selfdrive/modeld/compile_modeld.py
@@ -7,10 +7,12 @@
 from tinygrad.helpers import Context
 from tinygrad.device import Device
 from tinygrad.engine.jit import TinyJit
+from tinygrad.nn.onnx import OnnxRunner
 
 from openpilot.system.camerad.cameras.nv12_info import get_nv12_info
 from openpilot.common.transformations.model import MEDMODEL_INPUT_SIZE, DM_INPUT_SIZE
 from openpilot.common.transformations.camera import _ar_ox_fisheye, _os_fisheye
+from openpilot.selfdrive.modeld.constants import ModelConstants
 
 MODELS_DIR = Path(__file__).parent / 'models'
 
@@ -22,11 +24,13 @@
 UV_SCALE_MATRIX = np.array([[0.5, 0, 0], [0, 0.5, 0], [0, 0, 1]], dtype=np.float32)
 UV_SCALE_MATRIX_INV = np.linalg.inv(UV_SCALE_MATRIX)
 
-IMG_BUFFER_SHAPE = (30, MEDMODEL_INPUT_SIZE[1] // 2, MEDMODEL_INPUT_SIZE[0] // 2)
+IMG_BUFFER_SHAPE = (30, MEDMODEL_INPUT_SIZE[1] // 2, MEDMODEL_INPUT_SIZE[0] // 2) # TODO keep n images / n channels separate
 
+FREQ_RATIO = ModelConstants.MODEL_RUN_FREQ // ModelConstants.MODEL_CONTEXT_FREQ  # 20Hz / 5Hz = 4
 
-def warp_pkl_path(w, h):
-  return MODELS_DIR / f'warp_{w}x{h}_tinygrad.pkl'
+
+def modeld_pkl_path(w, h):
+  return MODELS_DIR / f'modeld_{w}x{h}_tinygrad.pkl'
 
 
 def dm_warp_pkl_path(w, h):
@@ -124,42 +128,114 @@ def warp_dm(input_frame, M_inv):
   return warp_dm
 
 
-def compile_modeld_warp(cam_w, cam_h):
+def compile_modeld(cam_w, cam_h):
   model_w, model_h = MEDMODEL_INPUT_SIZE
   _, _, _, yuv_size = get_nv12_info(cam_w, cam_h)
 
-  print(f"Compiling modeld warp for {cam_w}x{cam_h}...")
-
+  print(f"Compiling combined modeld JIT for {cam_w}x{cam_h}...")
+
+  # load model metadata for shapes and output slices
+  with open(MODELS_DIR / 'driving_vision_metadata.pkl', 'rb') as f:
+    vision_meta = pickle.load(f)
+  with open(MODELS_DIR / 'driving_on_policy_metadata.pkl', 'rb') as f:
+    on_policy_meta = pickle.load(f)
+
+  hidden_state_slice = vision_meta['output_slices']['hidden_state']
+  feature_dim = hidden_state_slice.stop - hidden_state_slice.start
+  features_buffer_shape = on_policy_meta['input_shapes']['features_buffer']  # (1, 25, 512)
+  desire_pulse_shape = on_policy_meta['input_shapes']['desire_pulse']        # (1, 25, 8)
+  n_features_steps = features_buffer_shape[1]
+  n_desire_steps = desire_pulse_shape[1]
+  desire_dim = desire_pulse_shape[2]
+  feature_queue_shape = (1, n_features_steps * FREQ_RATIO, feature_dim)
+  desire_queue_shape = (1, n_desire_steps * FREQ_RATIO, desire_dim)
+
+  # load ONNX models
+  vision_runner = OnnxRunner(str(MODELS_DIR / 'driving_vision.onnx'))
+  on_policy_runner = OnnxRunner(str(MODELS_DIR / 'driving_on_policy.onnx'))
+  off_policy_runner = OnnxRunner(str(MODELS_DIR / 'driving_off_policy.onnx'))
+
+  # create warp pipeline
   frame_prepare = make_frame_prepare(cam_w, cam_h, model_w, model_h)
   update_both_imgs = make_update_both_imgs(frame_prepare, model_w, model_h)
-  update_img_jit = TinyJit(update_both_imgs, prune=True)
 
-  full_buffer = Tensor.zeros(IMG_BUFFER_SHAPE, dtype='uint8').contiguous().realize()
-  big_full_buffer = Tensor.zeros(IMG_BUFFER_SHAPE, dtype='uint8').contiguous().realize()
+  def run_modeld(img_buf, frame, M_inv,
+                 big_img_buf, big_frame, M_inv_big,
+                 feat_queue, desire_q, desire_in, traffic_in):
+    # warp both camera images
+    img, big_img = update_both_imgs(img_buf, frame, M_inv, big_img_buf, big_frame, M_inv_big)
+
+    # run vision model
+    vision_out = next(iter(vision_runner({'img': img, 'big_img': big_img}).values())).cast('float32')
+
+    # extract features from vision output and update feature queue
+    features = vision_out[:, hidden_state_slice].reshape(1, 1, feature_dim)
+    feat_queue.assign(feat_queue[:, 1:].cat(features, dim=1).contiguous())
+
+    # update desire queue
+    desire_new = desire_in.to(Device.DEFAULT).reshape(1, 1, desire_dim)
+    desire_q.assign(desire_q[:, 1:].cat(desire_new, dim=1).contiguous())
+
+    # subsample features for policy: take every FREQ_RATIO-th from end
+    features_buffer = feat_queue[:, (FREQ_RATIO - 1)::FREQ_RATIO, :]
+
+    # subsample desire: group by FREQ_RATIO and take max (pulse detection)
+    desire_pulse = desire_q.reshape(1, n_desire_steps, FREQ_RATIO, desire_dim).max(axis=2)
+
+    # run both policy models
+    policy_inputs = {
+      'features_buffer': features_buffer,
+      'desire_pulse': desire_pulse,
+      'traffic_convention': traffic_in.to(Device.DEFAULT),
+    }
+    on_policy_out = next(iter(on_policy_runner(policy_inputs).values())).cast('float32')
+    off_policy_out = next(iter(off_policy_runner(policy_inputs).values())).cast('float32')
+
+    return vision_out, on_policy_out, off_policy_out
+
+  run_modeld_jit = TinyJit(run_modeld, prune=True)
+
+  # create state tensors for JIT tracing
+  img_buffer = Tensor.zeros(IMG_BUFFER_SHAPE, dtype='uint8').contiguous().realize()
+  big_img_buffer = Tensor.zeros(IMG_BUFFER_SHAPE, dtype='uint8').contiguous().realize()
+  feat_queue = Tensor.zeros(feature_queue_shape, dtype='float32').contiguous().realize()
+  desire_queue = Tensor.zeros(desire_queue_shape, dtype='float32').contiguous().realize()
+  desire_np = np.zeros((1, desire_dim), dtype=np.float32)
+  desire_tensor = Tensor(desire_np, device='NPY')
+  traffic_np = np.zeros((1, 2), dtype=np.float32)
+  traffic_tensor = Tensor(traffic_np, device='NPY')
+
   for i in range(10):
-    img_inputs = [full_buffer,
-                  Tensor(np.random.randint(0, 256, yuv_size, dtype=np.uint8)).realize(),
-                  Tensor(Tensor.randn(3, 3).mul(8).realize().numpy(), device='NPY')]
-    big_img_inputs = [big_full_buffer,
-                      Tensor(np.random.randint(0, 256, yuv_size, dtype=np.uint8)).realize(),
-                      Tensor(Tensor.randn(3, 3).mul(8).realize().numpy(), device='NPY')]
-    inputs = img_inputs + big_img_inputs
+    frame = Tensor(np.random.randint(0, 256, yuv_size, dtype=np.uint8)).realize()
+    big_frame = Tensor(np.random.randint(0, 256, yuv_size, dtype=np.uint8)).realize()
+    M_inv = Tensor(Tensor.randn(3, 3).mul(8).realize().numpy(), device='NPY')
+    M_inv_big = Tensor(Tensor.randn(3, 3).mul(8).realize().numpy(), device='NPY')
+    desire_np[:] = np.random.randn(1, desire_dim).astype(np.float32)
+    traffic_np[:] = np.random.randn(1, 2).astype(np.float32)
     Device.default.synchronize()
 
     st = time.perf_counter()
-    _ = update_img_jit(*inputs)
+    outs = run_modeld_jit(img_buffer, frame, M_inv,
+                          big_img_buffer, big_frame, M_inv_big,
+                          feat_queue, desire_queue, desire_tensor, traffic_tensor)
     mt = time.perf_counter()
+    for o in outs:
+      o.realize()
     Device.default.synchronize()
     et = time.perf_counter()
     print(f"  [{i+1}/10] enqueue {(mt-st)*1e3:6.2f} ms -- total {(et-st)*1e3:6.2f} ms")
 
-  pkl_path = warp_pkl_path(cam_w, cam_h)
+  pkl_path = modeld_pkl_path(cam_w, cam_h)
   with open(pkl_path, "wb") as f:
-    pickle.dump(update_img_jit, f)
+    pickle.dump(run_modeld_jit, f)
   print(f"  Saved to {pkl_path}")
 
-  jit = pickle.load(open(pkl_path, "rb"))
-  jit(*inputs)
+  # validate pickle roundtrip
+  jit_loaded = pickle.load(open(pkl_path, "rb"))
+  jit_loaded(img_buffer, frame, M_inv,
+             big_img_buffer, big_frame, M_inv_big,
+             feat_queue, desire_queue, desire_tensor, traffic_tensor)
+  print("  Pickle roundtrip validated")
 
 
 def compile_dm_warp(cam_w, cam_h):
@@ -190,7 +266,7 @@ def compile_dm_warp(cam_w, cam_h):
 
 def run_and_save_pickle():
   for cam_w, cam_h in CAMERA_CONFIGS:
-    compile_modeld_warp(cam_w, cam_h)
+    compile_modeld(cam_w, cam_h)
     compile_dm_warp(cam_w, cam_h)
 
 
diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py
index 82e750cf8b3be2..4ce8d3a5401512 100755
--- a/selfdrive/modeld/modeld.py
+++ b/selfdrive/modeld/modeld.py
@@ -27,6 +27,7 @@
 from openpilot.selfdrive.controls.lib.drive_helpers import get_accel_from_plan, smooth_value, get_curvature_from_plan
 from openpilot.selfdrive.modeld.parse_model_outputs import Parser
 from openpilot.selfdrive.modeld.fill_model_msg import fill_model_msg, fill_pose_msg, PublishState
+from openpilot.selfdrive.modeld.compile_modeld import modeld_pkl_path, FREQ_RATIO
 from openpilot.common.file_chunker import read_file_chunked
 from openpilot.selfdrive.modeld.constants import ModelConstants, Plan
 
@@ -35,11 +36,8 @@
 SEND_RAW_PRED = os.getenv('SEND_RAW_PRED')
 
 MODELS_DIR = Path(__file__).parent / 'models'
-VISION_PKL_PATH = MODELS_DIR / 'driving_vision_tinygrad.pkl'
 VISION_METADATA_PATH = MODELS_DIR / 'driving_vision_metadata.pkl'
-ON_POLICY_PKL_PATH = MODELS_DIR / 'driving_on_policy_tinygrad.pkl'
 ON_POLICY_METADATA_PATH = MODELS_DIR / 'driving_on_policy_metadata.pkl'
-OFF_POLICY_PKL_PATH = MODELS_DIR / 'driving_off_policy_tinygrad.pkl'
 OFF_POLICY_METADATA_PATH = MODELS_DIR / 'driving_off_policy_metadata.pkl'
 
 LAT_SMOOTH_SECONDS = 0.0
@@ -82,114 +80,50 @@ def __init__(self, vipc=None):
     if vipc is not None:
       self.frame_id, self.timestamp_sof, self.timestamp_eof = vipc.frame_id, vipc.timestamp_sof, vipc.timestamp_eof
 
-class InputQueues:
-  def __init__ (self, model_fps, env_fps, n_frames_input):
-    assert env_fps % model_fps == 0
-    assert env_fps >= model_fps
-    self.model_fps = model_fps
-    self.env_fps = env_fps
-    self.n_frames_input = n_frames_input
-
-    self.dtypes = {}
-    self.shapes = {}
-    self.q = {}
-
-  def update_dtypes_and_shapes(self, input_dtypes, input_shapes) -> None:
-    self.dtypes.update(input_dtypes)
-    if self.env_fps == self.model_fps:
-      self.shapes.update(input_shapes)
-    else:
-      for k in input_shapes:
-        shape = list(input_shapes[k])
-        if 'img' in k:
-          n_channels = shape[1] // self.n_frames_input
-          shape[1] = (self.env_fps // self.model_fps + (self.n_frames_input - 1)) * n_channels
-        else:
-          shape[1] = (self.env_fps // self.model_fps) * shape[1]
-        self.shapes[k] = tuple(shape)
-
-  def reset(self) -> None:
-    self.q = {k: np.zeros(self.shapes[k], dtype=self.dtypes[k]) for k in self.dtypes.keys()}
-
-  def enqueue(self, inputs:dict[str, np.ndarray]) -> None:
-    for k in inputs.keys():
-      if inputs[k].dtype != self.dtypes[k]:
-        raise ValueError(f'supplied input <{k}({inputs[k].dtype})> has wrong dtype, expected {self.dtypes[k]}')
-      input_shape = list(self.shapes[k])
-      input_shape[1] = -1
-      single_input = inputs[k].reshape(tuple(input_shape))
-      sz = single_input.shape[1]
-      self.q[k][:,:-sz] = self.q[k][:,sz:]
-      self.q[k][:,-sz:] = single_input
-
-  def get(self, *names) -> dict[str, np.ndarray]:
-    if self.env_fps == self.model_fps:
-      return {k: self.q[k] for k in names}
-    else:
-      out = {}
-      for k in names:
-        shape = self.shapes[k]
-        if 'img' in k:
-          n_channels = shape[1] // (self.env_fps // self.model_fps + (self.n_frames_input - 1))
-          out[k] = np.concatenate([self.q[k][:, s:s+n_channels] for s in np.linspace(0, shape[1] - n_channels, self.n_frames_input, dtype=int)], axis=1)
-        elif 'pulse' in k:
-          # any pulse within interval counts
-          out[k] = self.q[k].reshape((shape[0], shape[1] * self.model_fps // self.env_fps, self.env_fps // self.model_fps, -1)).max(axis=2)
-        else:
-          idxs = np.arange(-1, -shape[1], -self.env_fps // self.model_fps)[::-1]
-          out[k] = self.q[k][:, idxs]
-      return out
-
 class ModelState:
-  inputs: dict[str, np.ndarray]
-  output: np.ndarray
   prev_desire: np.ndarray  # for tracking the rising edge of the pulse
 
   def __init__(self):
     with open(VISION_METADATA_PATH, 'rb') as f:
       vision_metadata = pickle.load(f)
-      self.vision_input_shapes =  vision_metadata['input_shapes']
-      self.vision_input_names = list(self.vision_input_shapes.keys())
       self.vision_output_slices = vision_metadata['output_slices']
-      vision_output_size = vision_metadata['output_shapes']['outputs'][1]
+      hidden_state_slice = vision_metadata['output_slices']['hidden_state']
+      self.feature_dim = hidden_state_slice.stop - hidden_state_slice.start
 
     with open(OFF_POLICY_METADATA_PATH, 'rb') as f:
       off_policy_metadata = pickle.load(f)
-      self.off_policy_input_shapes =  off_policy_metadata['input_shapes']
       self.off_policy_output_slices = off_policy_metadata['output_slices']
-      off_policy_output_size = off_policy_metadata['output_shapes']['outputs'][1]
 
     with open(ON_POLICY_METADATA_PATH, 'rb') as f:
       policy_metadata = pickle.load(f)
-      self.policy_input_shapes =  policy_metadata['input_shapes']
       self.policy_output_slices = policy_metadata['output_slices']
-      policy_output_size = policy_metadata['output_shapes']['outputs'][1]
+      features_buffer_shape = policy_metadata['input_shapes']['features_buffer']
+      desire_pulse_shape = policy_metadata['input_shapes']['desire_pulse']
 
     self.prev_desire = np.zeros(ModelConstants.DESIRE_LEN, dtype=np.float32)
 
-    # policy inputs
-    self.numpy_inputs = {k: np.zeros(self.policy_input_shapes[k], dtype=np.float32) for k in self.policy_input_shapes}
-    self.full_input_queues = InputQueues(ModelConstants.MODEL_CONTEXT_FREQ, ModelConstants.MODEL_RUN_FREQ, ModelConstants.N_FRAMES)
-    for k in ['desire_pulse', 'features_buffer']:
-      self.full_input_queues.update_dtypes_and_shapes({k: self.numpy_inputs[k].dtype}, {k: self.numpy_inputs[k].shape})
-    self.full_input_queues.reset()
-
+    # persistent state tensors for the JIT
     self.img_queues = {'img': Tensor.zeros(IMG_QUEUE_SHAPE, dtype='uint8').contiguous().realize(),
                        'big_img': Tensor.zeros(IMG_QUEUE_SHAPE, dtype='uint8').contiguous().realize()}
-    self.full_frames : dict[str, Tensor] = {}
-    self._blob_cache : dict[int, Tensor] = {}
+    n_features_steps = features_buffer_shape[1]
+    n_desire_steps = desire_pulse_shape[1]
+    desire_dim = desire_pulse_shape[2]
+    self.feature_queue = Tensor.zeros(1, n_features_steps * FREQ_RATIO, self.feature_dim, dtype='float32').contiguous().realize()
+    self.desire_queue = Tensor.zeros(1, n_desire_steps * FREQ_RATIO, desire_dim, dtype='float32').contiguous().realize()
+
+    # NPY-backed tensors for per-frame inputs
+    self.desire_np = np.zeros((1, desire_dim), dtype=np.float32)
+    self.desire_tensor = Tensor(self.desire_np, device='NPY')
+    self.traffic_np = np.zeros((1, 2), dtype=np.float32)
+    self.traffic_tensor = Tensor(self.traffic_np, device='NPY')
+
+    self.full_frames: dict[str, Tensor] = {}
+    self._blob_cache: dict[int, Tensor] = {}
     self.transforms_np = {k: np.zeros((3,3), dtype=np.float32) for k in self.img_queues}
     self.transforms = {k: Tensor(v, device='NPY').realize() for k, v in self.transforms_np.items()}
-    self.vision_output = np.zeros(vision_output_size, dtype=np.float32)
-    self.policy_inputs = {k: Tensor(v, device='NPY').realize() for k,v in self.numpy_inputs.items()}
-    self.policy_output = np.zeros(policy_output_size, dtype=np.float32)
-    self.off_policy_output = np.zeros(off_policy_output_size, dtype=np.float32)
     self.parser = Parser()
-    self.frame_buf_params : dict[str, tuple[int, int, int, int]] = {}
-    self.update_imgs = None
-    self.vision_run = pickle.loads(read_file_chunked(str(VISION_PKL_PATH)))
-    self.policy_run = pickle.loads(read_file_chunked(str(ON_POLICY_PKL_PATH)))
-    self.off_policy_run = pickle.loads(read_file_chunked(str(OFF_POLICY_PKL_PATH)))
+    self.frame_buf_params: dict[str, tuple[int, int, int, int]] = {}
+    self.modeld_run = None
 
   def slice_outputs(self, model_outputs: np.ndarray, output_slices: dict[str, slice]) -> dict[str, np.ndarray]:
     parsed_model_outputs = {k: model_outputs[np.newaxis, v] for k,v in output_slices.items()}
@@ -201,18 +135,17 @@ def run(self, bufs: dict[str, VisionBuf], transforms: dict[str, np.ndarray],
     inputs['desire_pulse'][0] = 0
     new_desire = np.where(inputs['desire_pulse'] - self.prev_desire > .99, inputs['desire_pulse'], 0)
     self.prev_desire[:] = inputs['desire_pulse']
-    if self.update_imgs is None:
+
+    if self.modeld_run is None:
       for key in bufs.keys():
         w, h = bufs[key].width, bufs[key].height
         self.frame_buf_params[key] = get_nv12_info(w, h)
-      warp_path = MODELS_DIR / f'warp_{w}x{h}_tinygrad.pkl'
-      with open(warp_path, "rb") as f:
-        self.update_imgs = pickle.load(f)
+      pkl_path = modeld_pkl_path(w, h)
+      self.modeld_run = pickle.loads(read_file_chunked(str(pkl_path)))
 
     for key in bufs.keys():
       ptr = bufs[key].data.ctypes.data
       yuv_size = self.frame_buf_params[key][3]
-      # There is a ringbuffer of imgs, just cache tensors pointing to all of them
       cache_key = (key, ptr)
       if cache_key not in self._blob_cache:
         self._blob_cache[cache_key] = Tensor.from_blob(ptr, (yuv_size,), dtype='uint8')
@@ -220,34 +153,33 @@ def run(self, bufs: dict[str, VisionBuf], transforms: dict[str, np.ndarray],
     for key in bufs.keys():
       self.transforms_np[key][:,:] = transforms[key][:,:]
 
-    out = self.update_imgs(self.img_queues['img'], self.full_frames['img'], self.transforms['img'],
-                           self.img_queues['big_img'], self.full_frames['big_img'], self.transforms['big_img'])
-    vision_inputs = {'img': out[0], 'big_img': out[1]}
+    # update per-frame inputs
+    self.desire_np[0, :] = new_desire
+    self.traffic_np[0, :] = inputs['traffic_convention']
+
+    # run combined JIT (warp + vision + policies)
+    vision_out_t, on_policy_out_t, off_policy_out_t = self.modeld_run(
+      self.img_queues['img'], self.full_frames['img'], self.transforms['img'],
+      self.img_queues['big_img'], self.full_frames['big_img'], self.transforms['big_img'],
+      self.feature_queue, self.desire_queue, self.desire_tensor, self.traffic_tensor)
 
     if prepare_only:
       return None
 
-    self.vision_output = self.vision_run(**vision_inputs).contiguous().realize().uop.base.buffer.numpy().flatten()
-    vision_outputs_dict = self.parser.parse_vision_outputs(self.slice_outputs(self.vision_output, self.vision_output_slices))
-
-    self.full_input_queues.enqueue({'features_buffer': vision_outputs_dict['hidden_state'], 'desire_pulse': new_desire})
-    for k in ['desire_pulse', 'features_buffer']:
-      self.numpy_inputs[k][:] = self.full_input_queues.get(k)[k]
-    self.numpy_inputs['traffic_convention'][:] = inputs['traffic_convention']
+    vision_output = vision_out_t.contiguous().realize().uop.base.buffer.numpy().flatten()
+    policy_output = on_policy_out_t.contiguous().realize().uop.base.buffer.numpy().flatten()
+    off_policy_output = off_policy_out_t.contiguous().realize().uop.base.buffer.numpy().flatten()
 
-    self.policy_output = self.policy_run(**self.policy_inputs).contiguous().realize().uop.base.buffer.numpy().flatten()
-    policy_outputs_dict = self.parser.parse_policy_outputs(self.slice_outputs(self.policy_output, self.policy_output_slices))
-
-    self.off_policy_output = self.off_policy_run(**self.policy_inputs).contiguous().realize().uop.base.buffer.numpy()
-    off_policy_outputs_dict = self.parser.parse_off_policy_outputs(self.slice_outputs(self.off_policy_output, self.off_policy_output_slices))
+    vision_outputs_dict = self.parser.parse_vision_outputs(self.slice_outputs(vision_output, self.vision_output_slices))
+    policy_outputs_dict = self.parser.parse_policy_outputs(self.slice_outputs(policy_output, self.policy_output_slices))
+    off_policy_outputs_dict = self.parser.parse_off_policy_outputs(self.slice_outputs(off_policy_output, self.off_policy_output_slices))
     off_policy_outputs_dict.pop('plan')
 
-
     combined_outputs_dict = {**vision_outputs_dict, **off_policy_outputs_dict, **policy_outputs_dict}
     if 'planplus' in combined_outputs_dict and 'plan' in combined_outputs_dict:
       combined_outputs_dict['plan'] = combined_outputs_dict['plan'] + combined_outputs_dict['planplus']
     if SEND_RAW_PRED:
-      combined_outputs_dict['raw_pred'] = np.concatenate([self.vision_output.copy(), self.policy_output.copy(), self.off_policy_output.copy()])
+      combined_outputs_dict['raw_pred'] = np.concatenate([vision_output.copy(), policy_output.copy(), off_policy_output.copy()])
 
     return combined_outputs_dict
 
@@ -388,8 +320,8 @@ def main(demo=False):
     if prepare_only:
       cloudlog.error(f"skipping model eval. Dropped {vipc_dropped_frames} frames")
 
-    bufs = {name: buf_extra if 'big' in name else buf_main for name in model.vision_input_names}
-    transforms = {name: model_transform_extra if 'big' in name else model_transform_main for name in model.vision_input_names}
+    bufs = {'img': buf_main, 'big_img': buf_extra}
+    transforms = {'img': model_transform_main, 'big_img': model_transform_extra}
     inputs:dict[str, np.ndarray] = {
       'desire_pulse': vec_desire,
       'traffic_convention': traffic_convention,

From ed6089f3689bc3e2cee3b9ff5447af55ae530097 Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Thu, 2 Apr 2026 14:25:10 -0700
Subject: [PATCH 02/65] update estimates

---
 selfdrive/modeld/SConscript | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript
index 53d063d58e52c9..ce743ede04b0c9 100644
--- a/selfdrive/modeld/SConscript
+++ b/selfdrive/modeld/SConscript
@@ -59,7 +59,8 @@ for cam in [_ar_ox_fisheye, _os_fisheye]:
   w, h = cam.width, cam.height
   pkl = File(f"models/modeld_{w}x{h}_tinygrad.pkl").abspath
   total_onnx_size = sum(os.path.getsize(File(f"models/{m}.onnx").abspath) for m in ['driving_vision', 'driving_on_policy', 'driving_off_policy'])
-  chunk_paths = get_chunk_paths(pkl, estimate_pickle_max_size(total_onnx_size))
+  # combined JIT includes warp kernels + queue ops on top of model weights, so needs more headroom
+  chunk_paths = get_chunk_paths(pkl, estimate_pickle_max_size(total_onnx_size) * 2)
   modeld_chunk_targets.append((pkl, chunk_paths))
 
 def do_modeld_chunk(target, source, env):

From 5834b42e61a18e6e67f738ebc613939e8aa28a44 Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Thu, 2 Apr 2026 14:45:59 -0700
Subject: [PATCH 03/65] missing image=2?

---
 selfdrive/modeld/SConscript | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript
index ce743ede04b0c9..8f3b21ff5e62ec 100644
--- a/selfdrive/modeld/SConscript
+++ b/selfdrive/modeld/SConscript
@@ -50,7 +50,7 @@ modeld_targets = []
 for cam in [_ar_ox_fisheye, _os_fisheye]:
   w, h = cam.width, cam.height
   modeld_targets.append(File(f"models/modeld_{w}x{h}_tinygrad.pkl").abspath)
-compile_modeld_cmd = f'{tg_flags} python3 -c "from openpilot.selfdrive.modeld.compile_modeld import compile_modeld, CAMERA_CONFIGS; [compile_modeld(w, h) for w, h in CAMERA_CONFIGS]"'
+compile_modeld_cmd = f'{tg_flags} {image_flag} python3 -c "from openpilot.selfdrive.modeld.compile_modeld import compile_modeld, CAMERA_CONFIGS; [compile_modeld(w, h) for w, h in CAMERA_CONFIGS]"'
 modeld_compile_node = lenv.Command(modeld_targets, modeld_onnx_deps + modeld_meta_deps + tinygrad_files + compile_modeld_script + [chunker_file], compile_modeld_cmd)
 
 # Chunk modeld pickles

From d005cd1a37aec2f454a7694d6435b26092f5f6ea Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Thu, 2 Apr 2026 16:42:19 -0700
Subject: [PATCH 04/65] Revert "missing image=2?"

This reverts commit 2f5952eb63ba1e3f24cbf5769e6b5e9170d7f0a6.
---
 selfdrive/modeld/SConscript | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript
index 8f3b21ff5e62ec..ce743ede04b0c9 100644
--- a/selfdrive/modeld/SConscript
+++ b/selfdrive/modeld/SConscript
@@ -50,7 +50,7 @@ modeld_targets = []
 for cam in [_ar_ox_fisheye, _os_fisheye]:
   w, h = cam.width, cam.height
   modeld_targets.append(File(f"models/modeld_{w}x{h}_tinygrad.pkl").abspath)
-compile_modeld_cmd = f'{tg_flags} {image_flag} python3 -c "from openpilot.selfdrive.modeld.compile_modeld import compile_modeld, CAMERA_CONFIGS; [compile_modeld(w, h) for w, h in CAMERA_CONFIGS]"'
+compile_modeld_cmd = f'{tg_flags} python3 -c "from openpilot.selfdrive.modeld.compile_modeld import compile_modeld, CAMERA_CONFIGS; [compile_modeld(w, h) for w, h in CAMERA_CONFIGS]"'
 modeld_compile_node = lenv.Command(modeld_targets, modeld_onnx_deps + modeld_meta_deps + tinygrad_files + compile_modeld_script + [chunker_file], compile_modeld_cmd)
 
 # Chunk modeld pickles

From 919b263e439744bfb5475014d4db31ac7808c57a Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Thu, 2 Apr 2026 16:42:27 -0700
Subject: [PATCH 05/65] Revert "update estimates"

This reverts commit 1f72feef2ffdec6126e3c941e899b46ace7b4b65.
---
 selfdrive/modeld/SConscript | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript
index ce743ede04b0c9..53d063d58e52c9 100644
--- a/selfdrive/modeld/SConscript
+++ b/selfdrive/modeld/SConscript
@@ -59,8 +59,7 @@ for cam in [_ar_ox_fisheye, _os_fisheye]:
   w, h = cam.width, cam.height
   pkl = File(f"models/modeld_{w}x{h}_tinygrad.pkl").abspath
   total_onnx_size = sum(os.path.getsize(File(f"models/{m}.onnx").abspath) for m in ['driving_vision', 'driving_on_policy', 'driving_off_policy'])
-  # combined JIT includes warp kernels + queue ops on top of model weights, so needs more headroom
-  chunk_paths = get_chunk_paths(pkl, estimate_pickle_max_size(total_onnx_size) * 2)
+  chunk_paths = get_chunk_paths(pkl, estimate_pickle_max_size(total_onnx_size))
   modeld_chunk_targets.append((pkl, chunk_paths))
 
 def do_modeld_chunk(target, source, env):

From ff2da85f07c579fa81aad2d13d5b12465a6b83da Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Thu, 2 Apr 2026 16:42:36 -0700
Subject: [PATCH 06/65] Revert "compile_modeld.py"

This reverts commit f10541502efca02725f368deda2a21d1f786f57d.
---
 selfdrive/modeld/SConscript                   |  47 +-----
 .../{compile_modeld.py => compile_warp.py}    | 120 +++-----------
 selfdrive/modeld/modeld.py                    | 156 +++++++++++++-----
 3 files changed, 142 insertions(+), 181 deletions(-)
 rename selfdrive/modeld/{compile_modeld.py => compile_warp.py} (57%)

diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript
index 53d063d58e52c9..bad1cdd5003d7d 100644
--- a/selfdrive/modeld/SConscript
+++ b/selfdrive/modeld/SConscript
@@ -30,46 +30,14 @@ for model_name in ['driving_vision', 'driving_off_policy', 'driving_on_policy',
 image_flag = {
      'larch64': 'IMAGE=2',
 }.get(arch, 'IMAGE=0')
-
+script_files = [File(Dir("#selfdrive/modeld").File("compile_warp.py").abspath)]
+compile_warp_cmd = f'{tg_flags} python3 {Dir("#selfdrive/modeld").abspath}/compile_warp.py '
 from openpilot.common.transformations.camera import _ar_ox_fisheye, _os_fisheye
-
-# DM warp (compile_modeld.py handles dm warp separately)
-dm_warp_script = [File(Dir("#selfdrive/modeld").File("compile_modeld.py").abspath)]
-dm_warp_targets = []
-for cam in [_ar_ox_fisheye, _os_fisheye]:
-  w, h = cam.width, cam.height
-  dm_warp_targets.append(File(f"models/dm_warp_{w}x{h}_tinygrad.pkl").abspath)
-dm_warp_cmd = f'{tg_flags} python3 -c "from openpilot.selfdrive.modeld.compile_modeld import compile_dm_warp, CAMERA_CONFIGS; [compile_dm_warp(w, h) for w, h in CAMERA_CONFIGS]"'
-lenv.Command(dm_warp_targets, tinygrad_files + dm_warp_script, dm_warp_cmd)
-
-# Combined modeld JIT (warp + vision + on_policy + off_policy)
-compile_modeld_script = [File(Dir("#selfdrive/modeld").File("compile_modeld.py").abspath)]
-modeld_onnx_deps = [File(f"models/{m}.onnx").abspath for m in ['driving_vision', 'driving_on_policy', 'driving_off_policy']]
-modeld_meta_deps = [File(f"models/{m}_metadata.pkl").abspath for m in ['driving_vision', 'driving_on_policy', 'driving_off_policy']]
-modeld_targets = []
+warp_targets = []
 for cam in [_ar_ox_fisheye, _os_fisheye]:
   w, h = cam.width, cam.height
-  modeld_targets.append(File(f"models/modeld_{w}x{h}_tinygrad.pkl").abspath)
-compile_modeld_cmd = f'{tg_flags} python3 -c "from openpilot.selfdrive.modeld.compile_modeld import compile_modeld, CAMERA_CONFIGS; [compile_modeld(w, h) for w, h in CAMERA_CONFIGS]"'
-modeld_compile_node = lenv.Command(modeld_targets, modeld_onnx_deps + modeld_meta_deps + tinygrad_files + compile_modeld_script + [chunker_file], compile_modeld_cmd)
-
-# Chunk modeld pickles
-modeld_chunk_targets = []
-for cam in [_ar_ox_fisheye, _os_fisheye]:
-  w, h = cam.width, cam.height
-  pkl = File(f"models/modeld_{w}x{h}_tinygrad.pkl").abspath
-  total_onnx_size = sum(os.path.getsize(File(f"models/{m}.onnx").abspath) for m in ['driving_vision', 'driving_on_policy', 'driving_off_policy'])
-  chunk_paths = get_chunk_paths(pkl, estimate_pickle_max_size(total_onnx_size))
-  modeld_chunk_targets.append((pkl, chunk_paths))
-
-def do_modeld_chunk(target, source, env):
-  for pkl, chunk_paths in modeld_chunk_targets:
-    chunk_file(pkl, chunk_paths)
-lenv.Command(
-  [p for _, paths in modeld_chunk_targets for p in paths],
-  modeld_compile_node,
-  do_modeld_chunk,
-)
+  warp_targets += [File(f"models/warp_{w}x{h}_tinygrad.pkl").abspath, File(f"models/dm_warp_{w}x{h}_tinygrad.pkl").abspath]
+lenv.Command(warp_targets, tinygrad_files + script_files, compile_warp_cmd)
 
 def tg_compile(flags, model_name):
   pythonpath_string = 'PYTHONPATH="${PYTHONPATH}:' + env.Dir("#tinygrad_repo").abspath + '"'
@@ -90,5 +58,6 @@ def tg_compile(flags, model_name):
     do_chunk,
   )
 
-# Compile DM model (still separate)
-tg_compile(tg_flags, 'dmonitoring_model')
+# Compile small models
+for model_name in ['driving_vision', 'driving_off_policy', 'driving_on_policy', 'dmonitoring_model']:
+  tg_compile(tg_flags, model_name)
diff --git a/selfdrive/modeld/compile_modeld.py b/selfdrive/modeld/compile_warp.py
similarity index 57%
rename from selfdrive/modeld/compile_modeld.py
rename to selfdrive/modeld/compile_warp.py
index 5ca251c154e72f..47511f2a2b6b23 100755
--- a/selfdrive/modeld/compile_modeld.py
+++ b/selfdrive/modeld/compile_warp.py
@@ -7,12 +7,10 @@
 from tinygrad.helpers import Context
 from tinygrad.device import Device
 from tinygrad.engine.jit import TinyJit
-from tinygrad.nn.onnx import OnnxRunner
 
 from openpilot.system.camerad.cameras.nv12_info import get_nv12_info
 from openpilot.common.transformations.model import MEDMODEL_INPUT_SIZE, DM_INPUT_SIZE
 from openpilot.common.transformations.camera import _ar_ox_fisheye, _os_fisheye
-from openpilot.selfdrive.modeld.constants import ModelConstants
 
 MODELS_DIR = Path(__file__).parent / 'models'
 
@@ -24,13 +22,11 @@
 UV_SCALE_MATRIX = np.array([[0.5, 0, 0], [0, 0.5, 0], [0, 0, 1]], dtype=np.float32)
 UV_SCALE_MATRIX_INV = np.linalg.inv(UV_SCALE_MATRIX)
 
-IMG_BUFFER_SHAPE = (30, MEDMODEL_INPUT_SIZE[1] // 2, MEDMODEL_INPUT_SIZE[0] // 2) # TODO keep n images / n channels separate
+IMG_BUFFER_SHAPE = (30, MEDMODEL_INPUT_SIZE[1] // 2, MEDMODEL_INPUT_SIZE[0] // 2)
 
-FREQ_RATIO = ModelConstants.MODEL_RUN_FREQ // ModelConstants.MODEL_CONTEXT_FREQ  # 20Hz / 5Hz = 4
 
-
-def modeld_pkl_path(w, h):
-  return MODELS_DIR / f'modeld_{w}x{h}_tinygrad.pkl'
+def warp_pkl_path(w, h):
+  return MODELS_DIR / f'warp_{w}x{h}_tinygrad.pkl'
 
 
 def dm_warp_pkl_path(w, h):
@@ -128,114 +124,42 @@ def warp_dm(input_frame, M_inv):
   return warp_dm
 
 
-def compile_modeld(cam_w, cam_h):
+def compile_modeld_warp(cam_w, cam_h):
   model_w, model_h = MEDMODEL_INPUT_SIZE
   _, _, _, yuv_size = get_nv12_info(cam_w, cam_h)
 
-  print(f"Compiling combined modeld JIT for {cam_w}x{cam_h}...")
-
-  # load model metadata for shapes and output slices
-  with open(MODELS_DIR / 'driving_vision_metadata.pkl', 'rb') as f:
-    vision_meta = pickle.load(f)
-  with open(MODELS_DIR / 'driving_on_policy_metadata.pkl', 'rb') as f:
-    on_policy_meta = pickle.load(f)
-
-  hidden_state_slice = vision_meta['output_slices']['hidden_state']
-  feature_dim = hidden_state_slice.stop - hidden_state_slice.start
-  features_buffer_shape = on_policy_meta['input_shapes']['features_buffer']  # (1, 25, 512)
-  desire_pulse_shape = on_policy_meta['input_shapes']['desire_pulse']        # (1, 25, 8)
-  n_features_steps = features_buffer_shape[1]
-  n_desire_steps = desire_pulse_shape[1]
-  desire_dim = desire_pulse_shape[2]
-  feature_queue_shape = (1, n_features_steps * FREQ_RATIO, feature_dim)
-  desire_queue_shape = (1, n_desire_steps * FREQ_RATIO, desire_dim)
-
-  # load ONNX models
-  vision_runner = OnnxRunner(str(MODELS_DIR / 'driving_vision.onnx'))
-  on_policy_runner = OnnxRunner(str(MODELS_DIR / 'driving_on_policy.onnx'))
-  off_policy_runner = OnnxRunner(str(MODELS_DIR / 'driving_off_policy.onnx'))
-
-  # create warp pipeline
+  print(f"Compiling modeld warp for {cam_w}x{cam_h}...")
+
   frame_prepare = make_frame_prepare(cam_w, cam_h, model_w, model_h)
   update_both_imgs = make_update_both_imgs(frame_prepare, model_w, model_h)
+  update_img_jit = TinyJit(update_both_imgs, prune=True)
 
-  def run_modeld(img_buf, frame, M_inv,
-                 big_img_buf, big_frame, M_inv_big,
-                 feat_queue, desire_q, desire_in, traffic_in):
-    # warp both camera images
-    img, big_img = update_both_imgs(img_buf, frame, M_inv, big_img_buf, big_frame, M_inv_big)
-
-    # run vision model
-    vision_out = next(iter(vision_runner({'img': img, 'big_img': big_img}).values())).cast('float32')
-
-    # extract features from vision output and update feature queue
-    features = vision_out[:, hidden_state_slice].reshape(1, 1, feature_dim)
-    feat_queue.assign(feat_queue[:, 1:].cat(features, dim=1).contiguous())
-
-    # update desire queue
-    desire_new = desire_in.to(Device.DEFAULT).reshape(1, 1, desire_dim)
-    desire_q.assign(desire_q[:, 1:].cat(desire_new, dim=1).contiguous())
-
-    # subsample features for policy: take every FREQ_RATIO-th from end
-    features_buffer = feat_queue[:, (FREQ_RATIO - 1)::FREQ_RATIO, :]
-
-    # subsample desire: group by FREQ_RATIO and take max (pulse detection)
-    desire_pulse = desire_q.reshape(1, n_desire_steps, FREQ_RATIO, desire_dim).max(axis=2)
-
-    # run both policy models
-    policy_inputs = {
-      'features_buffer': features_buffer,
-      'desire_pulse': desire_pulse,
-      'traffic_convention': traffic_in.to(Device.DEFAULT),
-    }
-    on_policy_out = next(iter(on_policy_runner(policy_inputs).values())).cast('float32')
-    off_policy_out = next(iter(off_policy_runner(policy_inputs).values())).cast('float32')
-
-    return vision_out, on_policy_out, off_policy_out
-
-  run_modeld_jit = TinyJit(run_modeld, prune=True)
-
-  # create state tensors for JIT tracing
-  img_buffer = Tensor.zeros(IMG_BUFFER_SHAPE, dtype='uint8').contiguous().realize()
-  big_img_buffer = Tensor.zeros(IMG_BUFFER_SHAPE, dtype='uint8').contiguous().realize()
-  feat_queue = Tensor.zeros(feature_queue_shape, dtype='float32').contiguous().realize()
-  desire_queue = Tensor.zeros(desire_queue_shape, dtype='float32').contiguous().realize()
-  desire_np = np.zeros((1, desire_dim), dtype=np.float32)
-  desire_tensor = Tensor(desire_np, device='NPY')
-  traffic_np = np.zeros((1, 2), dtype=np.float32)
-  traffic_tensor = Tensor(traffic_np, device='NPY')
-
+  full_buffer = Tensor.zeros(IMG_BUFFER_SHAPE, dtype='uint8').contiguous().realize()
+  big_full_buffer = Tensor.zeros(IMG_BUFFER_SHAPE, dtype='uint8').contiguous().realize()
   for i in range(10):
-    frame = Tensor(np.random.randint(0, 256, yuv_size, dtype=np.uint8)).realize()
-    big_frame = Tensor(np.random.randint(0, 256, yuv_size, dtype=np.uint8)).realize()
-    M_inv = Tensor(Tensor.randn(3, 3).mul(8).realize().numpy(), device='NPY')
-    M_inv_big = Tensor(Tensor.randn(3, 3).mul(8).realize().numpy(), device='NPY')
-    desire_np[:] = np.random.randn(1, desire_dim).astype(np.float32)
-    traffic_np[:] = np.random.randn(1, 2).astype(np.float32)
+    img_inputs = [full_buffer,
+                  Tensor(np.random.randint(0, 256, yuv_size, dtype=np.uint8)).realize(),
+                  Tensor(Tensor.randn(3, 3).mul(8).realize().numpy(), device='NPY')]
+    big_img_inputs = [big_full_buffer,
+                      Tensor(np.random.randint(0, 256, yuv_size, dtype=np.uint8)).realize(),
+                      Tensor(Tensor.randn(3, 3).mul(8).realize().numpy(), device='NPY')]
+    inputs = img_inputs + big_img_inputs
     Device.default.synchronize()
 
     st = time.perf_counter()
-    outs = run_modeld_jit(img_buffer, frame, M_inv,
-                          big_img_buffer, big_frame, M_inv_big,
-                          feat_queue, desire_queue, desire_tensor, traffic_tensor)
+    _ = update_img_jit(*inputs)
     mt = time.perf_counter()
-    for o in outs:
-      o.realize()
     Device.default.synchronize()
     et = time.perf_counter()
     print(f"  [{i+1}/10] enqueue {(mt-st)*1e3:6.2f} ms -- total {(et-st)*1e3:6.2f} ms")
 
-  pkl_path = modeld_pkl_path(cam_w, cam_h)
+  pkl_path = warp_pkl_path(cam_w, cam_h)
   with open(pkl_path, "wb") as f:
-    pickle.dump(run_modeld_jit, f)
+    pickle.dump(update_img_jit, f)
   print(f"  Saved to {pkl_path}")
 
-  # validate pickle roundtrip
-  jit_loaded = pickle.load(open(pkl_path, "rb"))
-  jit_loaded(img_buffer, frame, M_inv,
-             big_img_buffer, big_frame, M_inv_big,
-             feat_queue, desire_queue, desire_tensor, traffic_tensor)
-  print("  Pickle roundtrip validated")
+  jit = pickle.load(open(pkl_path, "rb"))
+  jit(*inputs)
 
 
 def compile_dm_warp(cam_w, cam_h):
@@ -266,7 +190,7 @@ def compile_dm_warp(cam_w, cam_h):
 
 def run_and_save_pickle():
   for cam_w, cam_h in CAMERA_CONFIGS:
-    compile_modeld(cam_w, cam_h)
+    compile_modeld_warp(cam_w, cam_h)
     compile_dm_warp(cam_w, cam_h)
 
 
diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py
index 4ce8d3a5401512..82e750cf8b3be2 100755
--- a/selfdrive/modeld/modeld.py
+++ b/selfdrive/modeld/modeld.py
@@ -27,7 +27,6 @@
 from openpilot.selfdrive.controls.lib.drive_helpers import get_accel_from_plan, smooth_value, get_curvature_from_plan
 from openpilot.selfdrive.modeld.parse_model_outputs import Parser
 from openpilot.selfdrive.modeld.fill_model_msg import fill_model_msg, fill_pose_msg, PublishState
-from openpilot.selfdrive.modeld.compile_modeld import modeld_pkl_path, FREQ_RATIO
 from openpilot.common.file_chunker import read_file_chunked
 from openpilot.selfdrive.modeld.constants import ModelConstants, Plan
 
@@ -36,8 +35,11 @@
 SEND_RAW_PRED = os.getenv('SEND_RAW_PRED')
 
 MODELS_DIR = Path(__file__).parent / 'models'
+VISION_PKL_PATH = MODELS_DIR / 'driving_vision_tinygrad.pkl'
 VISION_METADATA_PATH = MODELS_DIR / 'driving_vision_metadata.pkl'
+ON_POLICY_PKL_PATH = MODELS_DIR / 'driving_on_policy_tinygrad.pkl'
 ON_POLICY_METADATA_PATH = MODELS_DIR / 'driving_on_policy_metadata.pkl'
+OFF_POLICY_PKL_PATH = MODELS_DIR / 'driving_off_policy_tinygrad.pkl'
 OFF_POLICY_METADATA_PATH = MODELS_DIR / 'driving_off_policy_metadata.pkl'
 
 LAT_SMOOTH_SECONDS = 0.0
@@ -80,50 +82,114 @@ def __init__(self, vipc=None):
     if vipc is not None:
       self.frame_id, self.timestamp_sof, self.timestamp_eof = vipc.frame_id, vipc.timestamp_sof, vipc.timestamp_eof
 
+class InputQueues:
+  def __init__ (self, model_fps, env_fps, n_frames_input):
+    assert env_fps % model_fps == 0
+    assert env_fps >= model_fps
+    self.model_fps = model_fps
+    self.env_fps = env_fps
+    self.n_frames_input = n_frames_input
+
+    self.dtypes = {}
+    self.shapes = {}
+    self.q = {}
+
+  def update_dtypes_and_shapes(self, input_dtypes, input_shapes) -> None:
+    self.dtypes.update(input_dtypes)
+    if self.env_fps == self.model_fps:
+      self.shapes.update(input_shapes)
+    else:
+      for k in input_shapes:
+        shape = list(input_shapes[k])
+        if 'img' in k:
+          n_channels = shape[1] // self.n_frames_input
+          shape[1] = (self.env_fps // self.model_fps + (self.n_frames_input - 1)) * n_channels
+        else:
+          shape[1] = (self.env_fps // self.model_fps) * shape[1]
+        self.shapes[k] = tuple(shape)
+
+  def reset(self) -> None:
+    self.q = {k: np.zeros(self.shapes[k], dtype=self.dtypes[k]) for k in self.dtypes.keys()}
+
+  def enqueue(self, inputs:dict[str, np.ndarray]) -> None:
+    for k in inputs.keys():
+      if inputs[k].dtype != self.dtypes[k]:
+        raise ValueError(f'supplied input <{k}({inputs[k].dtype})> has wrong dtype, expected {self.dtypes[k]}')
+      input_shape = list(self.shapes[k])
+      input_shape[1] = -1
+      single_input = inputs[k].reshape(tuple(input_shape))
+      sz = single_input.shape[1]
+      self.q[k][:,:-sz] = self.q[k][:,sz:]
+      self.q[k][:,-sz:] = single_input
+
+  def get(self, *names) -> dict[str, np.ndarray]:
+    if self.env_fps == self.model_fps:
+      return {k: self.q[k] for k in names}
+    else:
+      out = {}
+      for k in names:
+        shape = self.shapes[k]
+        if 'img' in k:
+          n_channels = shape[1] // (self.env_fps // self.model_fps + (self.n_frames_input - 1))
+          out[k] = np.concatenate([self.q[k][:, s:s+n_channels] for s in np.linspace(0, shape[1] - n_channels, self.n_frames_input, dtype=int)], axis=1)
+        elif 'pulse' in k:
+          # any pulse within interval counts
+          out[k] = self.q[k].reshape((shape[0], shape[1] * self.model_fps // self.env_fps, self.env_fps // self.model_fps, -1)).max(axis=2)
+        else:
+          idxs = np.arange(-1, -shape[1], -self.env_fps // self.model_fps)[::-1]
+          out[k] = self.q[k][:, idxs]
+      return out
+
 class ModelState:
+  inputs: dict[str, np.ndarray]
+  output: np.ndarray
   prev_desire: np.ndarray  # for tracking the rising edge of the pulse
 
   def __init__(self):
     with open(VISION_METADATA_PATH, 'rb') as f:
       vision_metadata = pickle.load(f)
+      self.vision_input_shapes =  vision_metadata['input_shapes']
+      self.vision_input_names = list(self.vision_input_shapes.keys())
       self.vision_output_slices = vision_metadata['output_slices']
-      hidden_state_slice = vision_metadata['output_slices']['hidden_state']
-      self.feature_dim = hidden_state_slice.stop - hidden_state_slice.start
+      vision_output_size = vision_metadata['output_shapes']['outputs'][1]
 
     with open(OFF_POLICY_METADATA_PATH, 'rb') as f:
       off_policy_metadata = pickle.load(f)
+      self.off_policy_input_shapes =  off_policy_metadata['input_shapes']
       self.off_policy_output_slices = off_policy_metadata['output_slices']
+      off_policy_output_size = off_policy_metadata['output_shapes']['outputs'][1]
 
     with open(ON_POLICY_METADATA_PATH, 'rb') as f:
       policy_metadata = pickle.load(f)
+      self.policy_input_shapes =  policy_metadata['input_shapes']
       self.policy_output_slices = policy_metadata['output_slices']
-      features_buffer_shape = policy_metadata['input_shapes']['features_buffer']
-      desire_pulse_shape = policy_metadata['input_shapes']['desire_pulse']
+      policy_output_size = policy_metadata['output_shapes']['outputs'][1]
 
     self.prev_desire = np.zeros(ModelConstants.DESIRE_LEN, dtype=np.float32)
 
-    # persistent state tensors for the JIT
+    # policy inputs
+    self.numpy_inputs = {k: np.zeros(self.policy_input_shapes[k], dtype=np.float32) for k in self.policy_input_shapes}
+    self.full_input_queues = InputQueues(ModelConstants.MODEL_CONTEXT_FREQ, ModelConstants.MODEL_RUN_FREQ, ModelConstants.N_FRAMES)
+    for k in ['desire_pulse', 'features_buffer']:
+      self.full_input_queues.update_dtypes_and_shapes({k: self.numpy_inputs[k].dtype}, {k: self.numpy_inputs[k].shape})
+    self.full_input_queues.reset()
+
     self.img_queues = {'img': Tensor.zeros(IMG_QUEUE_SHAPE, dtype='uint8').contiguous().realize(),
                        'big_img': Tensor.zeros(IMG_QUEUE_SHAPE, dtype='uint8').contiguous().realize()}
-    n_features_steps = features_buffer_shape[1]
-    n_desire_steps = desire_pulse_shape[1]
-    desire_dim = desire_pulse_shape[2]
-    self.feature_queue = Tensor.zeros(1, n_features_steps * FREQ_RATIO, self.feature_dim, dtype='float32').contiguous().realize()
-    self.desire_queue = Tensor.zeros(1, n_desire_steps * FREQ_RATIO, desire_dim, dtype='float32').contiguous().realize()
-
-    # NPY-backed tensors for per-frame inputs
-    self.desire_np = np.zeros((1, desire_dim), dtype=np.float32)
-    self.desire_tensor = Tensor(self.desire_np, device='NPY')
-    self.traffic_np = np.zeros((1, 2), dtype=np.float32)
-    self.traffic_tensor = Tensor(self.traffic_np, device='NPY')
-
-    self.full_frames: dict[str, Tensor] = {}
-    self._blob_cache: dict[int, Tensor] = {}
+    self.full_frames : dict[str, Tensor] = {}
+    self._blob_cache : dict[int, Tensor] = {}
     self.transforms_np = {k: np.zeros((3,3), dtype=np.float32) for k in self.img_queues}
     self.transforms = {k: Tensor(v, device='NPY').realize() for k, v in self.transforms_np.items()}
+    self.vision_output = np.zeros(vision_output_size, dtype=np.float32)
+    self.policy_inputs = {k: Tensor(v, device='NPY').realize() for k,v in self.numpy_inputs.items()}
+    self.policy_output = np.zeros(policy_output_size, dtype=np.float32)
+    self.off_policy_output = np.zeros(off_policy_output_size, dtype=np.float32)
     self.parser = Parser()
-    self.frame_buf_params: dict[str, tuple[int, int, int, int]] = {}
-    self.modeld_run = None
+    self.frame_buf_params : dict[str, tuple[int, int, int, int]] = {}
+    self.update_imgs = None
+    self.vision_run = pickle.loads(read_file_chunked(str(VISION_PKL_PATH)))
+    self.policy_run = pickle.loads(read_file_chunked(str(ON_POLICY_PKL_PATH)))
+    self.off_policy_run = pickle.loads(read_file_chunked(str(OFF_POLICY_PKL_PATH)))
 
   def slice_outputs(self, model_outputs: np.ndarray, output_slices: dict[str, slice]) -> dict[str, np.ndarray]:
     parsed_model_outputs = {k: model_outputs[np.newaxis, v] for k,v in output_slices.items()}
@@ -135,17 +201,18 @@ def run(self, bufs: dict[str, VisionBuf], transforms: dict[str, np.ndarray],
     inputs['desire_pulse'][0] = 0
     new_desire = np.where(inputs['desire_pulse'] - self.prev_desire > .99, inputs['desire_pulse'], 0)
     self.prev_desire[:] = inputs['desire_pulse']
-
-    if self.modeld_run is None:
+    if self.update_imgs is None:
       for key in bufs.keys():
         w, h = bufs[key].width, bufs[key].height
         self.frame_buf_params[key] = get_nv12_info(w, h)
-      pkl_path = modeld_pkl_path(w, h)
-      self.modeld_run = pickle.loads(read_file_chunked(str(pkl_path)))
+      warp_path = MODELS_DIR / f'warp_{w}x{h}_tinygrad.pkl'
+      with open(warp_path, "rb") as f:
+        self.update_imgs = pickle.load(f)
 
     for key in bufs.keys():
       ptr = bufs[key].data.ctypes.data
       yuv_size = self.frame_buf_params[key][3]
+      # There is a ringbuffer of imgs, just cache tensors pointing to all of them
       cache_key = (key, ptr)
       if cache_key not in self._blob_cache:
         self._blob_cache[cache_key] = Tensor.from_blob(ptr, (yuv_size,), dtype='uint8')
@@ -153,33 +220,34 @@ def run(self, bufs: dict[str, VisionBuf], transforms: dict[str, np.ndarray],
     for key in bufs.keys():
       self.transforms_np[key][:,:] = transforms[key][:,:]
 
-    # update per-frame inputs
-    self.desire_np[0, :] = new_desire
-    self.traffic_np[0, :] = inputs['traffic_convention']
-
-    # run combined JIT (warp + vision + policies)
-    vision_out_t, on_policy_out_t, off_policy_out_t = self.modeld_run(
-      self.img_queues['img'], self.full_frames['img'], self.transforms['img'],
-      self.img_queues['big_img'], self.full_frames['big_img'], self.transforms['big_img'],
-      self.feature_queue, self.desire_queue, self.desire_tensor, self.traffic_tensor)
+    out = self.update_imgs(self.img_queues['img'], self.full_frames['img'], self.transforms['img'],
+                           self.img_queues['big_img'], self.full_frames['big_img'], self.transforms['big_img'])
+    vision_inputs = {'img': out[0], 'big_img': out[1]}
 
     if prepare_only:
       return None
 
-    vision_output = vision_out_t.contiguous().realize().uop.base.buffer.numpy().flatten()
-    policy_output = on_policy_out_t.contiguous().realize().uop.base.buffer.numpy().flatten()
-    off_policy_output = off_policy_out_t.contiguous().realize().uop.base.buffer.numpy().flatten()
+    self.vision_output = self.vision_run(**vision_inputs).contiguous().realize().uop.base.buffer.numpy().flatten()
+    vision_outputs_dict = self.parser.parse_vision_outputs(self.slice_outputs(self.vision_output, self.vision_output_slices))
+
+    self.full_input_queues.enqueue({'features_buffer': vision_outputs_dict['hidden_state'], 'desire_pulse': new_desire})
+    for k in ['desire_pulse', 'features_buffer']:
+      self.numpy_inputs[k][:] = self.full_input_queues.get(k)[k]
+    self.numpy_inputs['traffic_convention'][:] = inputs['traffic_convention']
 
-    vision_outputs_dict = self.parser.parse_vision_outputs(self.slice_outputs(vision_output, self.vision_output_slices))
-    policy_outputs_dict = self.parser.parse_policy_outputs(self.slice_outputs(policy_output, self.policy_output_slices))
-    off_policy_outputs_dict = self.parser.parse_off_policy_outputs(self.slice_outputs(off_policy_output, self.off_policy_output_slices))
+    self.policy_output = self.policy_run(**self.policy_inputs).contiguous().realize().uop.base.buffer.numpy().flatten()
+    policy_outputs_dict = self.parser.parse_policy_outputs(self.slice_outputs(self.policy_output, self.policy_output_slices))
+
+    self.off_policy_output = self.off_policy_run(**self.policy_inputs).contiguous().realize().uop.base.buffer.numpy()
+    off_policy_outputs_dict = self.parser.parse_off_policy_outputs(self.slice_outputs(self.off_policy_output, self.off_policy_output_slices))
     off_policy_outputs_dict.pop('plan')
 
+
     combined_outputs_dict = {**vision_outputs_dict, **off_policy_outputs_dict, **policy_outputs_dict}
     if 'planplus' in combined_outputs_dict and 'plan' in combined_outputs_dict:
       combined_outputs_dict['plan'] = combined_outputs_dict['plan'] + combined_outputs_dict['planplus']
     if SEND_RAW_PRED:
-      combined_outputs_dict['raw_pred'] = np.concatenate([vision_output.copy(), policy_output.copy(), off_policy_output.copy()])
+      combined_outputs_dict['raw_pred'] = np.concatenate([self.vision_output.copy(), self.policy_output.copy(), self.off_policy_output.copy()])
 
     return combined_outputs_dict
 
@@ -320,8 +388,8 @@ def main(demo=False):
     if prepare_only:
       cloudlog.error(f"skipping model eval. Dropped {vipc_dropped_frames} frames")
 
-    bufs = {'img': buf_main, 'big_img': buf_extra}
-    transforms = {'img': model_transform_main, 'big_img': model_transform_extra}
+    bufs = {name: buf_extra if 'big' in name else buf_main for name in model.vision_input_names}
+    transforms = {name: model_transform_extra if 'big' in name else model_transform_main for name in model.vision_input_names}
     inputs:dict[str, np.ndarray] = {
       'desire_pulse': vec_desire,
       'traffic_convention': traffic_convention,

From 6f3dfefd28a81494303d9185397d3b18d7975a10 Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Fri, 3 Apr 2026 17:39:03 -0700
Subject: [PATCH 07/65] load warp in ModelState init

---
 selfdrive/modeld/modeld.py | 26 ++++++++++----------------
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py
index 82e750cf8b3be2..a1c25beec3b725 100755
--- a/selfdrive/modeld/modeld.py
+++ b/selfdrive/modeld/modeld.py
@@ -145,7 +145,7 @@ class ModelState:
   output: np.ndarray
   prev_desire: np.ndarray  # for tracking the rising edge of the pulse
 
-  def __init__(self):
+  def __init__(self, cam_w: int, cam_h: int):
     with open(VISION_METADATA_PATH, 'rb') as f:
       vision_metadata = pickle.load(f)
       self.vision_input_shapes =  vision_metadata['input_shapes']
@@ -185,8 +185,10 @@ def __init__(self):
     self.policy_output = np.zeros(policy_output_size, dtype=np.float32)
     self.off_policy_output = np.zeros(off_policy_output_size, dtype=np.float32)
     self.parser = Parser()
-    self.frame_buf_params : dict[str, tuple[int, int, int, int]] = {}
-    self.update_imgs = None
+    self.frame_buf_params = {k: get_nv12_info(cam_w, cam_h) for k in self.img_queues}
+    warp_path = MODELS_DIR / f'warp_{cam_w}x{cam_h}_tinygrad.pkl'
+    with open(warp_path, "rb") as f:
+      self.update_imgs = pickle.load(f)
     self.vision_run = pickle.loads(read_file_chunked(str(VISION_PKL_PATH)))
     self.policy_run = pickle.loads(read_file_chunked(str(ON_POLICY_PKL_PATH)))
     self.off_policy_run = pickle.loads(read_file_chunked(str(OFF_POLICY_PKL_PATH)))
@@ -201,14 +203,6 @@ def run(self, bufs: dict[str, VisionBuf], transforms: dict[str, np.ndarray],
     inputs['desire_pulse'][0] = 0
     new_desire = np.where(inputs['desire_pulse'] - self.prev_desire > .99, inputs['desire_pulse'], 0)
     self.prev_desire[:] = inputs['desire_pulse']
-    if self.update_imgs is None:
-      for key in bufs.keys():
-        w, h = bufs[key].width, bufs[key].height
-        self.frame_buf_params[key] = get_nv12_info(w, h)
-      warp_path = MODELS_DIR / f'warp_{w}x{h}_tinygrad.pkl'
-      with open(warp_path, "rb") as f:
-        self.update_imgs = pickle.load(f)
-
     for key in bufs.keys():
       ptr = bufs[key].data.ctypes.data
       yuv_size = self.frame_buf_params[key][3]
@@ -260,11 +254,6 @@ def main(demo=False):
     # also need to move the aux USB interrupts for good timings
     config_realtime_process(7, 54)
 
-  st = time.monotonic()
-  cloudlog.warning("loading model")
-  model = ModelState()
-  cloudlog.warning(f"models loaded in {time.monotonic() - st:.1f}s, modeld starting")
-
   # visionipc clients
   while True:
     available_streams = VisionIpcClient.available_streams("camerad", block=False)
@@ -288,6 +277,11 @@ def main(demo=False):
   if use_extra_client:
     cloudlog.warning(f"connected extra cam with buffer size: {vipc_client_extra.buffer_len} ({vipc_client_extra.width} x {vipc_client_extra.height})")
 
+  st = time.monotonic()
+  cloudlog.warning("loading model")
+  model = ModelState(vipc_client_main.width, vipc_client_main.height)
+  cloudlog.warning(f"models loaded in {time.monotonic() - st:.1f}s, modeld starting")
+
   # messaging
   pm = PubMaster(["modelV2", "drivingModelData", "cameraOdometry"])
   sm = SubMaster(["deviceState", "carState", "roadCameraState", "liveCalibration", "driverMonitoringState", "carControl", "liveDelay"])

From 0f3a27679643d55690694c77df101bd3c5bb116b Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Fri, 3 Apr 2026 20:50:49 -0700
Subject: [PATCH 08/65] remove dead code

---
 selfdrive/modeld/modeld.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py
index a1c25beec3b725..ca9bcc97061d6d 100755
--- a/selfdrive/modeld/modeld.py
+++ b/selfdrive/modeld/modeld.py
@@ -140,9 +140,9 @@ def get(self, *names) -> dict[str, np.ndarray]:
           out[k] = self.q[k][:, idxs]
       return out
 
+
+
 class ModelState:
-  inputs: dict[str, np.ndarray]
-  output: np.ndarray
   prev_desire: np.ndarray  # for tracking the rising edge of the pulse
 
   def __init__(self, cam_w: int, cam_h: int):
@@ -211,7 +211,6 @@ def run(self, bufs: dict[str, VisionBuf], transforms: dict[str, np.ndarray],
       if cache_key not in self._blob_cache:
         self._blob_cache[cache_key] = Tensor.from_blob(ptr, (yuv_size,), dtype='uint8')
       self.full_frames[key] = self._blob_cache[cache_key]
-    for key in bufs.keys():
       self.transforms_np[key][:,:] = transforms[key][:,:]
 
     out = self.update_imgs(self.img_queues['img'], self.full_frames['img'], self.transforms['img'],

From a5a7a223dd462fb8a721dd97c70fe868155a3a0a Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Fri, 3 Apr 2026 22:18:55 -0700
Subject: [PATCH 09/65] prep

---
 selfdrive/modeld/compile_warp.py | 12 +++++--
 selfdrive/modeld/modeld.py       | 60 ++++++++++++--------------------
 2 files changed, 31 insertions(+), 41 deletions(-)

diff --git a/selfdrive/modeld/compile_warp.py b/selfdrive/modeld/compile_warp.py
index 47511f2a2b6b23..a182300ca1e0e7 100755
--- a/selfdrive/modeld/compile_warp.py
+++ b/selfdrive/modeld/compile_warp.py
@@ -25,7 +25,7 @@
 IMG_BUFFER_SHAPE = (30, MEDMODEL_INPUT_SIZE[1] // 2, MEDMODEL_INPUT_SIZE[0] // 2)
 
 
-def warp_pkl_path(w, h):
+def policy_pkl_path(w, h):
   return MODELS_DIR / f'warp_{w}x{h}_tinygrad.pkl'
 
 
@@ -124,7 +124,13 @@ def warp_dm(input_frame, M_inv):
   return warp_dm
 
 
-def compile_modeld_warp(cam_w, cam_h):
+def make_run_policy(cam_w, cam_h):
+  def run_policy():
+    pass
+  return run_policy
+
+
+def compile_modeld(cam_w, cam_h):
   model_w, model_h = MEDMODEL_INPUT_SIZE
   _, _, _, yuv_size = get_nv12_info(cam_w, cam_h)
 
@@ -153,7 +159,7 @@ def compile_modeld_warp(cam_w, cam_h):
     et = time.perf_counter()
     print(f"  [{i+1}/10] enqueue {(mt-st)*1e3:6.2f} ms -- total {(et-st)*1e3:6.2f} ms")
 
-  pkl_path = warp_pkl_path(cam_w, cam_h)
+  pkl_path = policy_pkl_path(cam_w, cam_h)
   with open(pkl_path, "wb") as f:
     pickle.dump(update_img_jit, f)
   print(f"  Saved to {pkl_path}")
diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py
index ca9bcc97061d6d..48dca4f6b2c3bd 100755
--- a/selfdrive/modeld/modeld.py
+++ b/selfdrive/modeld/modeld.py
@@ -26,6 +26,7 @@
 from openpilot.selfdrive.controls.lib.desire_helper import DesireHelper
 from openpilot.selfdrive.controls.lib.drive_helpers import get_accel_from_plan, smooth_value, get_curvature_from_plan
 from openpilot.selfdrive.modeld.parse_model_outputs import Parser
+from openpilot.selfdrive.modeld.compile_warp import policy_pkl_path
 from openpilot.selfdrive.modeld.fill_model_msg import fill_model_msg, fill_pose_msg, PublishState
 from openpilot.common.file_chunker import read_file_chunked
 from openpilot.selfdrive.modeld.constants import ModelConstants, Plan
@@ -35,11 +36,9 @@
 SEND_RAW_PRED = os.getenv('SEND_RAW_PRED')
 
 MODELS_DIR = Path(__file__).parent / 'models'
-VISION_PKL_PATH = MODELS_DIR / 'driving_vision_tinygrad.pkl'
+# DRIVING_PKL_PATH = MODELS_DIR / 'driving_tinygrad.pkl'
 VISION_METADATA_PATH = MODELS_DIR / 'driving_vision_metadata.pkl'
-ON_POLICY_PKL_PATH = MODELS_DIR / 'driving_on_policy_tinygrad.pkl'
 ON_POLICY_METADATA_PATH = MODELS_DIR / 'driving_on_policy_metadata.pkl'
-OFF_POLICY_PKL_PATH = MODELS_DIR / 'driving_off_policy_tinygrad.pkl'
 OFF_POLICY_METADATA_PATH = MODELS_DIR / 'driving_off_policy_metadata.pkl'
 
 LAT_SMOOTH_SECONDS = 0.0
@@ -141,7 +140,6 @@ def get(self, *names) -> dict[str, np.ndarray]:
       return out
 
 
-
 class ModelState:
   prev_desire: np.ndarray  # for tracking the rising edge of the pulse
 
@@ -157,20 +155,19 @@ def __init__(self, cam_w: int, cam_h: int):
       off_policy_metadata = pickle.load(f)
       self.off_policy_input_shapes =  off_policy_metadata['input_shapes']
       self.off_policy_output_slices = off_policy_metadata['output_slices']
-      off_policy_output_size = off_policy_metadata['output_shapes']['outputs'][1]
+      # off_policy_output_size = off_policy_metadata['output_shapes']['outputs'][1]
 
     with open(ON_POLICY_METADATA_PATH, 'rb') as f:
       policy_metadata = pickle.load(f)
       self.policy_input_shapes =  policy_metadata['input_shapes']
       self.policy_output_slices = policy_metadata['output_slices']
-      policy_output_size = policy_metadata['output_shapes']['outputs'][1]
+      # policy_output_size = policy_metadata['output_shapes']['outputs'][1]
 
     self.prev_desire = np.zeros(ModelConstants.DESIRE_LEN, dtype=np.float32)
 
-    # policy inputs
     self.numpy_inputs = {k: np.zeros(self.policy_input_shapes[k], dtype=np.float32) for k in self.policy_input_shapes}
     self.full_input_queues = InputQueues(ModelConstants.MODEL_CONTEXT_FREQ, ModelConstants.MODEL_RUN_FREQ, ModelConstants.N_FRAMES)
-    for k in ['desire_pulse', 'features_buffer']:
+    for k in ['desire_pulse']:
       self.full_input_queues.update_dtypes_and_shapes({k: self.numpy_inputs[k].dtype}, {k: self.numpy_inputs[k].shape})
     self.full_input_queues.reset()
 
@@ -180,18 +177,11 @@ def __init__(self, cam_w: int, cam_h: int):
     self._blob_cache : dict[int, Tensor] = {}
     self.transforms_np = {k: np.zeros((3,3), dtype=np.float32) for k in self.img_queues}
     self.transforms = {k: Tensor(v, device='NPY').realize() for k, v in self.transforms_np.items()}
-    self.vision_output = np.zeros(vision_output_size, dtype=np.float32)
     self.policy_inputs = {k: Tensor(v, device='NPY').realize() for k,v in self.numpy_inputs.items()}
-    self.policy_output = np.zeros(policy_output_size, dtype=np.float32)
-    self.off_policy_output = np.zeros(off_policy_output_size, dtype=np.float32)
+
     self.parser = Parser()
     self.frame_buf_params = {k: get_nv12_info(cam_w, cam_h) for k in self.img_queues}
-    warp_path = MODELS_DIR / f'warp_{cam_w}x{cam_h}_tinygrad.pkl'
-    with open(warp_path, "rb") as f:
-      self.update_imgs = pickle.load(f)
-    self.vision_run = pickle.loads(read_file_chunked(str(VISION_PKL_PATH)))
-    self.policy_run = pickle.loads(read_file_chunked(str(ON_POLICY_PKL_PATH)))
-    self.off_policy_run = pickle.loads(read_file_chunked(str(OFF_POLICY_PKL_PATH)))
+    self.run_policy = pickle.loads(read_file_chunked(str(policy_pkl_path(cam_w, cam_h))))
 
   def slice_outputs(self, model_outputs: np.ndarray, output_slices: dict[str, slice]) -> dict[str, np.ndarray]:
     parsed_model_outputs = {k: model_outputs[np.newaxis, v] for k,v in output_slices.items()}
@@ -203,6 +193,7 @@ def run(self, bufs: dict[str, VisionBuf], transforms: dict[str, np.ndarray],
     inputs['desire_pulse'][0] = 0
     new_desire = np.where(inputs['desire_pulse'] - self.prev_desire > .99, inputs['desire_pulse'], 0)
     self.prev_desire[:] = inputs['desire_pulse']
+
     for key in bufs.keys():
       ptr = bufs[key].data.ctypes.data
       yuv_size = self.frame_buf_params[key][3]
@@ -213,35 +204,28 @@ def run(self, bufs: dict[str, VisionBuf], transforms: dict[str, np.ndarray],
       self.full_frames[key] = self._blob_cache[cache_key]
       self.transforms_np[key][:,:] = transforms[key][:,:]
 
-    out = self.update_imgs(self.img_queues['img'], self.full_frames['img'], self.transforms['img'],
-                           self.img_queues['big_img'], self.full_frames['big_img'], self.transforms['big_img'])
-    vision_inputs = {'img': out[0], 'big_img': out[1]}
-
-    if prepare_only:
-      return None
-
-    self.vision_output = self.vision_run(**vision_inputs).contiguous().realize().uop.base.buffer.numpy().flatten()
-    vision_outputs_dict = self.parser.parse_vision_outputs(self.slice_outputs(self.vision_output, self.vision_output_slices))
-
-    self.full_input_queues.enqueue({'features_buffer': vision_outputs_dict['hidden_state'], 'desire_pulse': new_desire})
-    for k in ['desire_pulse', 'features_buffer']:
-      self.numpy_inputs[k][:] = self.full_input_queues.get(k)[k]
+    # TODO cleanup
+    self.full_input_queues.enqueue({'desire_pulse': new_desire})
+    self.numpy_inputs['desire_pulse'][:] = self.full_input_queues.get('desire_pulse')['desire_pulse']
     self.numpy_inputs['traffic_convention'][:] = inputs['traffic_convention']
 
-    self.policy_output = self.policy_run(**self.policy_inputs).contiguous().realize().uop.base.buffer.numpy().flatten()
-    policy_outputs_dict = self.parser.parse_policy_outputs(self.slice_outputs(self.policy_output, self.policy_output_slices))
+    vision_output, off_policy_output, on_policy_output = self.run_policy(
+      self.img_queues['img'], self.full_frames['img'], self.transforms['img'],
+      self.img_queues['big_img'], self.full_frames['big_img'], self.transforms['big_img'],
+      self.policy_inputs
+    )
 
-    self.off_policy_output = self.off_policy_run(**self.policy_inputs).contiguous().realize().uop.base.buffer.numpy()
-    off_policy_outputs_dict = self.parser.parse_off_policy_outputs(self.slice_outputs(self.off_policy_output, self.off_policy_output_slices))
+    # TODO cleanup
+    vision_outputs_dict = self.parser.parse_vision_outputs(self.slice_outputs(vision_output.uop.base.buffer.numpy().flatten(), self.vision_output_slices)) # TODO do we still need the weird numpy?
+    policy_outputs_dict = self.parser.parse_policy_outputs(self.slice_outputs(on_policy_output.uop.base.buffer.numpy().flatten(), self.policy_output_slices))
+    off_policy_outputs_dict = self.parser.parse_off_policy_outputs(self.slice_outputs(off_policy_output.uop.base.buffer.numpy(), self.off_policy_output_slices))
     off_policy_outputs_dict.pop('plan')
-
-
     combined_outputs_dict = {**vision_outputs_dict, **off_policy_outputs_dict, **policy_outputs_dict}
+
     if 'planplus' in combined_outputs_dict and 'plan' in combined_outputs_dict:
       combined_outputs_dict['plan'] = combined_outputs_dict['plan'] + combined_outputs_dict['planplus']
     if SEND_RAW_PRED:
-      combined_outputs_dict['raw_pred'] = np.concatenate([self.vision_output.copy(), self.policy_output.copy(), self.off_policy_output.copy()])
-
+      combined_outputs_dict['raw_pred'] = np.concatenate([vision_output.copy(), on_policy_output.copy(), off_policy_output.copy()])
     return combined_outputs_dict
 
 

From f43c3ea03ef7c2c7252cdf357ef71a981df04131 Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Fri, 3 Apr 2026 23:17:53 -0700
Subject: [PATCH 10/65] compile modeld

---
 selfdrive/modeld/compile_warp.py | 76 +++++++++++++++++++++++---------
 selfdrive/modeld/modeld.py       | 21 +++++----
 2 files changed, 68 insertions(+), 29 deletions(-)

diff --git a/selfdrive/modeld/compile_warp.py b/selfdrive/modeld/compile_warp.py
index a182300ca1e0e7..20c4955b9a4814 100755
--- a/selfdrive/modeld/compile_warp.py
+++ b/selfdrive/modeld/compile_warp.py
@@ -124,36 +124,70 @@ def warp_dm(input_frame, M_inv):
   return warp_dm
 
 
-def make_run_policy(cam_w, cam_h):
-  def run_policy():
-    pass
+def make_run_policy(vision_runner, on_policy_runner, off_policy_runner, cam_w, cam_h,
+                    vision_features_slice, frame_skip):
+  model_w, model_h = MEDMODEL_INPUT_SIZE
+  frame_prepare = make_frame_prepare(cam_w, cam_h, model_w, model_h)
+  update_both_imgs = make_update_both_imgs(frame_prepare, model_w, model_h)
+
+  def run_policy(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm,
+                 feat_q, policy_inputs):
+    img, big_img = update_both_imgs(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm)
+
+    vision_out = next(iter(vision_runner({'img': img, 'big_img': big_img}).values())).cast('float32')
+
+    feat_q.assign(feat_q[:, 1:].cat(vision_out[:, vision_features_slice].reshape(1, 1, -1), dim=1).contiguous())
+    feat_buf = feat_q[:, frame_skip - 1::frame_skip]
+
+    inputs = {k: v.to(Device.DEFAULT) for k, v in policy_inputs.items()}
+    inputs['features_buffer'] = feat_buf
+    on_policy_out = next(iter(on_policy_runner(inputs).values())).cast('float32')
+    off_policy_out = next(iter(off_policy_runner(inputs).values())).cast('float32')
+
+    return vision_out, on_policy_out, off_policy_out
   return run_policy
 
 
 def compile_modeld(cam_w, cam_h):
-  model_w, model_h = MEDMODEL_INPUT_SIZE
+  from tinygrad.nn.onnx import OnnxRunner
+  from openpilot.selfdrive.modeld.constants import ModelConstants
+
   _, _, _, yuv_size = get_nv12_info(cam_w, cam_h)
+  print(f"Compiling combined policy JIT for {cam_w}x{cam_h}...")
 
-  print(f"Compiling modeld warp for {cam_w}x{cam_h}...")
+  vision_runner = OnnxRunner(MODELS_DIR / 'driving_vision.onnx')
+  on_policy_runner = OnnxRunner(MODELS_DIR / 'driving_on_policy.onnx')
+  off_policy_runner = OnnxRunner(MODELS_DIR / 'driving_off_policy.onnx')
 
-  frame_prepare = make_frame_prepare(cam_w, cam_h, model_w, model_h)
-  update_both_imgs = make_update_both_imgs(frame_prepare, model_w, model_h)
-  update_img_jit = TinyJit(update_both_imgs, prune=True)
+  with open(MODELS_DIR / 'driving_vision_metadata.pkl', 'rb') as f:
+    vision_features_slice = pickle.load(f)['output_slices']['hidden_state']
+  with open(MODELS_DIR / 'driving_on_policy_metadata.pkl', 'rb') as f:
+    policy_input_shapes = pickle.load(f)['input_shapes']
+
+  frame_skip = ModelConstants.MODEL_RUN_FREQ // ModelConstants.MODEL_CONTEXT_FREQ
+
+  _run = make_run_policy(vision_runner, on_policy_runner, off_policy_runner,
+                         cam_w, cam_h, vision_features_slice, frame_skip)
+  run_policy_jit = TinyJit(_run, prune=True)
+
+  # warmup inputs
+  img_buf = Tensor.zeros(IMG_BUFFER_SHAPE, dtype='uint8').contiguous().realize()
+  big_img_buf = Tensor.zeros(IMG_BUFFER_SHAPE, dtype='uint8').contiguous().realize()
+  fb = policy_input_shapes['features_buffer']
+  feat_q = Tensor.zeros(fb[0], fb[1] * frame_skip, fb[2]).contiguous().realize()
+  numpy_inputs = {k: np.zeros(policy_input_shapes[k], dtype=np.float32) for k in ['desire_pulse', 'traffic_convention']}
+  policy_inputs = {k: Tensor(v, device='NPY').realize() for k, v in numpy_inputs.items()}
+  tfm = Tensor(np.zeros((3, 3), dtype=np.float32), device='NPY').realize()
+  big_tfm = Tensor(np.zeros((3, 3), dtype=np.float32), device='NPY').realize()
 
-  full_buffer = Tensor.zeros(IMG_BUFFER_SHAPE, dtype='uint8').contiguous().realize()
-  big_full_buffer = Tensor.zeros(IMG_BUFFER_SHAPE, dtype='uint8').contiguous().realize()
   for i in range(10):
-    img_inputs = [full_buffer,
-                  Tensor(np.random.randint(0, 256, yuv_size, dtype=np.uint8)).realize(),
-                  Tensor(Tensor.randn(3, 3).mul(8).realize().numpy(), device='NPY')]
-    big_img_inputs = [big_full_buffer,
-                      Tensor(np.random.randint(0, 256, yuv_size, dtype=np.uint8)).realize(),
-                      Tensor(Tensor.randn(3, 3).mul(8).realize().numpy(), device='NPY')]
-    inputs = img_inputs + big_img_inputs
+    frame = Tensor(np.random.randint(0, 256, yuv_size, dtype=np.uint8)).realize()
+    big_frame = Tensor(np.random.randint(0, 256, yuv_size, dtype=np.uint8)).realize()
     Device.default.synchronize()
 
     st = time.perf_counter()
-    _ = update_img_jit(*inputs)
+    outs = run_policy_jit(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm,
+                          feat_q, policy_inputs)
     mt = time.perf_counter()
     Device.default.synchronize()
     et = time.perf_counter()
@@ -161,11 +195,11 @@ def compile_modeld(cam_w, cam_h):
 
   pkl_path = policy_pkl_path(cam_w, cam_h)
   with open(pkl_path, "wb") as f:
-    pickle.dump(update_img_jit, f)
+    pickle.dump(run_policy_jit, f)
   print(f"  Saved to {pkl_path}")
 
   jit = pickle.load(open(pkl_path, "rb"))
-  jit(*inputs)
+  jit(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm, feat_q, policy_inputs)
 
 
 def compile_dm_warp(cam_w, cam_h):
@@ -196,7 +230,7 @@ def compile_dm_warp(cam_w, cam_h):
 
 def run_and_save_pickle():
   for cam_w, cam_h in CAMERA_CONFIGS:
-    compile_modeld_warp(cam_w, cam_h)
+    compile_modeld(cam_w, cam_h)
     compile_dm_warp(cam_w, cam_h)
 
 
diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py
index 48dca4f6b2c3bd..426360abfe0ed7 100755
--- a/selfdrive/modeld/modeld.py
+++ b/selfdrive/modeld/modeld.py
@@ -165,12 +165,16 @@ def __init__(self, cam_w: int, cam_h: int):
 
     self.prev_desire = np.zeros(ModelConstants.DESIRE_LEN, dtype=np.float32)
 
-    self.numpy_inputs = {k: np.zeros(self.policy_input_shapes[k], dtype=np.float32) for k in self.policy_input_shapes}
+    self.numpy_inputs = {k: np.zeros(self.policy_input_shapes[k], dtype=np.float32) for k in ['desire_pulse', 'traffic_convention']}
     self.full_input_queues = InputQueues(ModelConstants.MODEL_CONTEXT_FREQ, ModelConstants.MODEL_RUN_FREQ, ModelConstants.N_FRAMES)
     for k in ['desire_pulse']:
       self.full_input_queues.update_dtypes_and_shapes({k: self.numpy_inputs[k].dtype}, {k: self.numpy_inputs[k].shape})
     self.full_input_queues.reset()
 
+    self.frame_skip = ModelConstants.MODEL_RUN_FREQ // ModelConstants.MODEL_CONTEXT_FREQ
+    fb = self.policy_input_shapes['features_buffer']
+    self.features_queue = Tensor.zeros(fb[0], fb[1] * self.frame_skip, fb[2]).contiguous().realize()
+
     self.img_queues = {'img': Tensor.zeros(IMG_QUEUE_SHAPE, dtype='uint8').contiguous().realize(),
                        'big_img': Tensor.zeros(IMG_QUEUE_SHAPE, dtype='uint8').contiguous().realize()}
     self.full_frames : dict[str, Tensor] = {}
@@ -204,21 +208,22 @@ def run(self, bufs: dict[str, VisionBuf], transforms: dict[str, np.ndarray],
       self.full_frames[key] = self._blob_cache[cache_key]
       self.transforms_np[key][:,:] = transforms[key][:,:]
 
-    # TODO cleanup
     self.full_input_queues.enqueue({'desire_pulse': new_desire})
     self.numpy_inputs['desire_pulse'][:] = self.full_input_queues.get('desire_pulse')['desire_pulse']
     self.numpy_inputs['traffic_convention'][:] = inputs['traffic_convention']
 
-    vision_output, off_policy_output, on_policy_output = self.run_policy(
+    vision_output, on_policy_output, off_policy_output = self.run_policy(
       self.img_queues['img'], self.full_frames['img'], self.transforms['img'],
       self.img_queues['big_img'], self.full_frames['big_img'], self.transforms['big_img'],
-      self.policy_inputs
+      self.features_queue, self.policy_inputs
     )
 
-    # TODO cleanup
-    vision_outputs_dict = self.parser.parse_vision_outputs(self.slice_outputs(vision_output.uop.base.buffer.numpy().flatten(), self.vision_output_slices)) # TODO do we still need the weird numpy?
-    policy_outputs_dict = self.parser.parse_policy_outputs(self.slice_outputs(on_policy_output.uop.base.buffer.numpy().flatten(), self.policy_output_slices))
-    off_policy_outputs_dict = self.parser.parse_off_policy_outputs(self.slice_outputs(off_policy_output.uop.base.buffer.numpy(), self.off_policy_output_slices))
+    vision_output = vision_output.uop.base.buffer.numpy().flatten()
+    on_policy_output = on_policy_output.uop.base.buffer.numpy().flatten()
+    off_policy_output = off_policy_output.uop.base.buffer.numpy()
+    vision_outputs_dict = self.parser.parse_vision_outputs(self.slice_outputs(vision_output, self.vision_output_slices))
+    policy_outputs_dict = self.parser.parse_policy_outputs(self.slice_outputs(on_policy_output, self.policy_output_slices))
+    off_policy_outputs_dict = self.parser.parse_off_policy_outputs(self.slice_outputs(off_policy_output, self.off_policy_output_slices))
     off_policy_outputs_dict.pop('plan')
     combined_outputs_dict = {**vision_outputs_dict, **off_policy_outputs_dict, **policy_outputs_dict}
 

From 8d93837e8aece8a26072a2922b1475a0702cc234 Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Fri, 3 Apr 2026 23:22:12 -0700
Subject: [PATCH 11/65] update SConscript

---
 selfdrive/modeld/SConscript | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript
index bad1cdd5003d7d..e40a8de62d50c6 100644
--- a/selfdrive/modeld/SConscript
+++ b/selfdrive/modeld/SConscript
@@ -31,13 +31,25 @@ image_flag = {
      'larch64': 'IMAGE=2',
 }.get(arch, 'IMAGE=0')
 script_files = [File(Dir("#selfdrive/modeld").File("compile_warp.py").abspath)]
-compile_warp_cmd = f'{tg_flags} python3 {Dir("#selfdrive/modeld").abspath}/compile_warp.py '
+compile_warp_cmd = f'{tg_flags} {image_flag} python3 {Dir("#selfdrive/modeld").abspath}/compile_warp.py '
 from openpilot.common.transformations.camera import _ar_ox_fisheye, _os_fisheye
+driving_onnx_deps = [File(f"models/{m}.onnx").abspath for m in ['driving_vision', 'driving_on_policy', 'driving_off_policy']]
+driving_metadata_deps = [File(f"models/{m}_metadata.pkl").abspath for m in ['driving_vision', 'driving_on_policy']]
 warp_targets = []
+policy_pkls = []
 for cam in [_ar_ox_fisheye, _os_fisheye]:
   w, h = cam.width, cam.height
+  policy_pkls.append(File(f"models/warp_{w}x{h}_tinygrad.pkl").abspath)
   warp_targets += [File(f"models/warp_{w}x{h}_tinygrad.pkl").abspath, File(f"models/dm_warp_{w}x{h}_tinygrad.pkl").abspath]
-lenv.Command(warp_targets, tinygrad_files + script_files, compile_warp_cmd)
+compile_node = lenv.Command(warp_targets, tinygrad_files + script_files + driving_onnx_deps + driving_metadata_deps + [chunker_file], compile_warp_cmd)
+
+# chunk the combined policy pkls (they contain model weights)
+for policy_pkl in policy_pkls:
+  onnx_sizes_sum = sum(os.path.getsize(f) for f in driving_onnx_deps)
+  chunk_targets = get_chunk_paths(policy_pkl, estimate_pickle_max_size(onnx_sizes_sum))
+  def do_chunk(target, source, env, pkl=policy_pkl, chunks=chunk_targets):
+    chunk_file(pkl, chunks)
+  lenv.Command(chunk_targets, compile_node, do_chunk)
 
 def tg_compile(flags, model_name):
   pythonpath_string = 'PYTHONPATH="${PYTHONPATH}:' + env.Dir("#tinygrad_repo").abspath + '"'
@@ -58,6 +70,5 @@ def tg_compile(flags, model_name):
     do_chunk,
   )
 
-# Compile small models
-for model_name in ['driving_vision', 'driving_off_policy', 'driving_on_policy', 'dmonitoring_model']:
-  tg_compile(tg_flags, model_name)
+# Compile dmonitoring model (driving models are in the combined policy JIT)
+tg_compile(tg_flags, 'dmonitoring_model')

From ec8a43dd4217f4a21847f2d90cad466053f44e9c Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Sun, 5 Apr 2026 14:50:03 -0700
Subject: [PATCH 12/65] tmp save plot locally

---
 selfdrive/test/process_replay/model_replay.py | 66 +++++++++++++------
 1 file changed, 45 insertions(+), 21 deletions(-)

diff --git a/selfdrive/test/process_replay/model_replay.py b/selfdrive/test/process_replay/model_replay.py
index eb7cdbe34acfdd..0751e29b01a8f5 100755
--- a/selfdrive/test/process_replay/model_replay.py
+++ b/selfdrive/test/process_replay/model_replay.py
@@ -254,27 +254,27 @@ def get_frames():
         'driverStateV2.modelExecutionTime',
         'driverStateV2.gpuExecutionTime'
       ]
-      if PC:
-        # TODO We ignore whole bunch so we can compare important stuff
-        # like posenet with reasonable tolerance
-        ignore += ['modelV2.acceleration.x',
-                   'modelV2.position.x',
-                   'modelV2.position.xStd',
-                   'modelV2.position.y',
-                   'modelV2.position.yStd',
-                   'modelV2.position.z',
-                   'modelV2.position.zStd',
-                   'drivingModelData.path.xCoefficients',]
-        for i in range(3):
-          for field in ('x', 'y', 'v', 'a'):
-            ignore.append(f'modelV2.leadsV3.{i}.{field}')
-            ignore.append(f'modelV2.leadsV3.{i}.{field}Std')
-        for i in range(4):
-          for field in ('x', 'y', 'z', 't'):
-            ignore.append(f'modelV2.laneLines.{i}.{field}')
-        for i in range(2):
-          for field in ('x', 'y', 'z', 't'):
-            ignore.append(f'modelV2.roadEdges.{i}.{field}')
+      # if PC:
+      #   # TODO We ignore whole bunch so we can compare important stuff
+      #   # like posenet with reasonable tolerance
+      #   ignore += ['modelV2.acceleration.x',
+      #              'modelV2.position.x',
+      #              'modelV2.position.xStd',
+      #              'modelV2.position.y',
+      #              'modelV2.position.yStd',
+      #              'modelV2.position.z',
+      #              'modelV2.position.zStd',
+      #              'drivingModelData.path.xCoefficients',]
+      #   for i in range(3):
+      #     for field in ('x', 'y', 'v', 'a'):
+      #       ignore.append(f'modelV2.leadsV3.{i}.{field}')
+      #       ignore.append(f'modelV2.leadsV3.{i}.{field}Std')
+      #   for i in range(4):
+      #     for field in ('x', 'y', 'z', 't'):
+      #       ignore.append(f'modelV2.laneLines.{i}.{field}')
+      #   for i in range(2):
+      #     for field in ('x', 'y', 'z', 't'):
+      #       ignore.append(f'modelV2.roadEdges.{i}.{field}')
       tolerance = .3 if PC else None
       results: Any = {TEST_ROUTE: {}}
       log_paths: Any = {TEST_ROUTE: {"models": {'ref': log_fn, 'new': log_fn}}}
@@ -285,6 +285,30 @@ def get_frames():
         comment_replay_report(log_msgs, cmp_log, log_msgs)
         failed = False
         print(diff_long)
+      else:
+        commit = (get_commit() or 'local')[:7]
+        all_plots = [*zl([
+          (lambda x: get_idx_if_non_empty(x.velocity.x, 0), "velocity.x"),
+          (lambda x: get_idx_if_non_empty(x.action.desiredCurvature), "desiredCurvature"),
+          (lambda x: get_idx_if_non_empty(x.action.desiredAcceleration), "desiredAcceleration"),
+          (lambda x: get_idx_if_non_empty(x.leadsV3[0].x, 0), "leadsV3.x"),
+          (lambda x: get_idx_if_non_empty(x.laneLines[1].y, 0), "laneLines.y"),
+          (lambda x: get_idx_if_non_empty(x.meta.disengagePredictions.gasPressProbs, 1), "gasPressProbs"),
+        ], "modelV2")]
+        n = len(all_plots)
+        fig, axes = plt.subplots(n, 1, figsize=(10, 3 * n))
+        for ax, (v, event) in zip(axes, all_plots):
+          proposed_vals = list(map(v[0], get_event(log_msgs, event)))
+          master_vals = list(map(v[0], get_event(cmp_log, event)))
+          ax.plot(master_vals, label='MASTER')
+          ax.plot(proposed_vals, label='PROPOSED')
+          ax.set_title(v[1])
+          ax.legend(fontsize=8)
+          ax.grid(True, alpha=0.3)
+        fig.tight_layout()
+        out = f"model_replay_{commit}.png"
+        fig.savefig(out, dpi=150)
+        print(f"Plot saved to {out}")
       print('-------------\n'*5)
       print(diff_short)
       with open("model_diff.txt", "w") as f:

From 2f65af26485049e85776a62f964beb79b0d45000 Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Sun, 5 Apr 2026 15:07:59 -0700
Subject: [PATCH 13/65] Revert "tmp save plot locally"

This reverts commit ec22f15161ad3b0241a097546b35860f989219f5.
---
 selfdrive/test/process_replay/model_replay.py | 66 ++++++-------------
 1 file changed, 21 insertions(+), 45 deletions(-)

diff --git a/selfdrive/test/process_replay/model_replay.py b/selfdrive/test/process_replay/model_replay.py
index 0751e29b01a8f5..eb7cdbe34acfdd 100755
--- a/selfdrive/test/process_replay/model_replay.py
+++ b/selfdrive/test/process_replay/model_replay.py
@@ -254,27 +254,27 @@ def get_frames():
         'driverStateV2.modelExecutionTime',
         'driverStateV2.gpuExecutionTime'
       ]
-      # if PC:
-      #   # TODO We ignore whole bunch so we can compare important stuff
-      #   # like posenet with reasonable tolerance
-      #   ignore += ['modelV2.acceleration.x',
-      #              'modelV2.position.x',
-      #              'modelV2.position.xStd',
-      #              'modelV2.position.y',
-      #              'modelV2.position.yStd',
-      #              'modelV2.position.z',
-      #              'modelV2.position.zStd',
-      #              'drivingModelData.path.xCoefficients',]
-      #   for i in range(3):
-      #     for field in ('x', 'y', 'v', 'a'):
-      #       ignore.append(f'modelV2.leadsV3.{i}.{field}')
-      #       ignore.append(f'modelV2.leadsV3.{i}.{field}Std')
-      #   for i in range(4):
-      #     for field in ('x', 'y', 'z', 't'):
-      #       ignore.append(f'modelV2.laneLines.{i}.{field}')
-      #   for i in range(2):
-      #     for field in ('x', 'y', 'z', 't'):
-      #       ignore.append(f'modelV2.roadEdges.{i}.{field}')
+      if PC:
+        # TODO We ignore whole bunch so we can compare important stuff
+        # like posenet with reasonable tolerance
+        ignore += ['modelV2.acceleration.x',
+                   'modelV2.position.x',
+                   'modelV2.position.xStd',
+                   'modelV2.position.y',
+                   'modelV2.position.yStd',
+                   'modelV2.position.z',
+                   'modelV2.position.zStd',
+                   'drivingModelData.path.xCoefficients',]
+        for i in range(3):
+          for field in ('x', 'y', 'v', 'a'):
+            ignore.append(f'modelV2.leadsV3.{i}.{field}')
+            ignore.append(f'modelV2.leadsV3.{i}.{field}Std')
+        for i in range(4):
+          for field in ('x', 'y', 'z', 't'):
+            ignore.append(f'modelV2.laneLines.{i}.{field}')
+        for i in range(2):
+          for field in ('x', 'y', 'z', 't'):
+            ignore.append(f'modelV2.roadEdges.{i}.{field}')
       tolerance = .3 if PC else None
       results: Any = {TEST_ROUTE: {}}
       log_paths: Any = {TEST_ROUTE: {"models": {'ref': log_fn, 'new': log_fn}}}
@@ -285,30 +285,6 @@ def get_frames():
         comment_replay_report(log_msgs, cmp_log, log_msgs)
         failed = False
         print(diff_long)
-      else:
-        commit = (get_commit() or 'local')[:7]
-        all_plots = [*zl([
-          (lambda x: get_idx_if_non_empty(x.velocity.x, 0), "velocity.x"),
-          (lambda x: get_idx_if_non_empty(x.action.desiredCurvature), "desiredCurvature"),
-          (lambda x: get_idx_if_non_empty(x.action.desiredAcceleration), "desiredAcceleration"),
-          (lambda x: get_idx_if_non_empty(x.leadsV3[0].x, 0), "leadsV3.x"),
-          (lambda x: get_idx_if_non_empty(x.laneLines[1].y, 0), "laneLines.y"),
-          (lambda x: get_idx_if_non_empty(x.meta.disengagePredictions.gasPressProbs, 1), "gasPressProbs"),
-        ], "modelV2")]
-        n = len(all_plots)
-        fig, axes = plt.subplots(n, 1, figsize=(10, 3 * n))
-        for ax, (v, event) in zip(axes, all_plots):
-          proposed_vals = list(map(v[0], get_event(log_msgs, event)))
-          master_vals = list(map(v[0], get_event(cmp_log, event)))
-          ax.plot(master_vals, label='MASTER')
-          ax.plot(proposed_vals, label='PROPOSED')
-          ax.set_title(v[1])
-          ax.legend(fontsize=8)
-          ax.grid(True, alpha=0.3)
-        fig.tight_layout()
-        out = f"model_replay_{commit}.png"
-        fig.savefig(out, dpi=150)
-        print(f"Plot saved to {out}")
       print('-------------\n'*5)
       print(diff_short)
       with open("model_diff.txt", "w") as f:

From 53e84ffbf9baaf1cc392771b4748074b13e0f74e Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Sun, 5 Apr 2026 15:08:15 -0700
Subject: [PATCH 14/65] openpilot hacks?

---
 selfdrive/modeld/compile_warp.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/selfdrive/modeld/compile_warp.py b/selfdrive/modeld/compile_warp.py
index 20c4955b9a4814..3364a80cde5dfb 100755
--- a/selfdrive/modeld/compile_warp.py
+++ b/selfdrive/modeld/compile_warp.py
@@ -186,8 +186,9 @@ def compile_modeld(cam_w, cam_h):
     Device.default.synchronize()
 
     st = time.perf_counter()
-    outs = run_policy_jit(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm,
-                          feat_q, policy_inputs)
+    with Context(OPENPILOT_HACKS=1):
+      outs = run_policy_jit(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm,
+                            feat_q, policy_inputs)
     mt = time.perf_counter()
     Device.default.synchronize()
     et = time.perf_counter()

From ab0ad4b2abd885080100541fe8446618a802cb53 Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Sun, 5 Apr 2026 15:18:59 -0700
Subject: [PATCH 15/65] no float16

---
 selfdrive/modeld/SConscript                   | 2 +-
 selfdrive/test/process_replay/model_replay.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript
index e40a8de62d50c6..77fd54560b498c 100644
--- a/selfdrive/modeld/SConscript
+++ b/selfdrive/modeld/SConscript
@@ -16,7 +16,7 @@ def estimate_pickle_max_size(onnx_size):
 # compile warp
 # THREADS=0 is need to prevent bug: https://github.com/tinygrad/tinygrad/issues/14689
 tg_flags = {
-    'larch64': 'DEV=QCOM FLOAT16=1 NOLOCALS=1 JIT_BATCH_SIZE=0',
+    'larch64': 'DEV=QCOM NOLOCALS=1 JIT_BATCH_SIZE=0',
     'Darwin': f'DEV=CPU THREADS=0 HOME={os.path.expanduser("~")}', # tinygrad calls brew which needs a $HOME in the env
 }.get(arch, 'DEV=CPU:LLVM THREADS=0')
 
diff --git a/selfdrive/test/process_replay/model_replay.py b/selfdrive/test/process_replay/model_replay.py
index eb7cdbe34acfdd..10ff934a753b7b 100755
--- a/selfdrive/test/process_replay/model_replay.py
+++ b/selfdrive/test/process_replay/model_replay.py
@@ -190,7 +190,7 @@ def model_replay(lr, frs):
   print("----------------- Model Timing -----------------")
   print("------------------------------------------------")
   print(tabulate(rows, header, tablefmt="simple_grid", stralign="center", numalign="center", floatfmt=".4f"))
-  assert timings_ok or PC
+  assert timings_ok or True
 
   return msgs
 

From f133a16ed3d7afa5b66ad3478d067b4d55c8b086 Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Sun, 5 Apr 2026 15:29:01 -0700
Subject: [PATCH 16/65] tmp more chunks

---
 selfdrive/modeld/SConscript | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript
index 77fd54560b498c..ad7b4c76db73fd 100644
--- a/selfdrive/modeld/SConscript
+++ b/selfdrive/modeld/SConscript
@@ -46,7 +46,7 @@ compile_node = lenv.Command(warp_targets, tinygrad_files + script_files + drivin
 # chunk the combined policy pkls (they contain model weights)
 for policy_pkl in policy_pkls:
   onnx_sizes_sum = sum(os.path.getsize(f) for f in driving_onnx_deps)
-  chunk_targets = get_chunk_paths(policy_pkl, estimate_pickle_max_size(onnx_sizes_sum))
+  chunk_targets = get_chunk_paths(policy_pkl, 2.0 * onnx_sizes_sum + 10 * 1024 * 1024)
   def do_chunk(target, source, env, pkl=policy_pkl, chunks=chunk_targets):
     chunk_file(pkl, chunks)
   lenv.Command(chunk_targets, compile_node, do_chunk)

From fe21f55720abe64b46fa77fb7ed2d0a1b701a6bd Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Sun, 5 Apr 2026 15:58:43 -0700
Subject: [PATCH 17/65] Revert "tmp more chunks"

This reverts commit 9e1d9b4d0dc36ff530d2a70b565fbfabd7afb00d.
---
 selfdrive/modeld/SConscript | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript
index ad7b4c76db73fd..77fd54560b498c 100644
--- a/selfdrive/modeld/SConscript
+++ b/selfdrive/modeld/SConscript
@@ -46,7 +46,7 @@ compile_node = lenv.Command(warp_targets, tinygrad_files + script_files + drivin
 # chunk the combined policy pkls (they contain model weights)
 for policy_pkl in policy_pkls:
   onnx_sizes_sum = sum(os.path.getsize(f) for f in driving_onnx_deps)
-  chunk_targets = get_chunk_paths(policy_pkl, 2.0 * onnx_sizes_sum + 10 * 1024 * 1024)
+  chunk_targets = get_chunk_paths(policy_pkl, estimate_pickle_max_size(onnx_sizes_sum))
   def do_chunk(target, source, env, pkl=policy_pkl, chunks=chunk_targets):
     chunk_file(pkl, chunks)
   lenv.Command(chunk_targets, compile_node, do_chunk)

From ff25362cba50d3d9b0b2bc9892530d93b7c7bffe Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Sun, 5 Apr 2026 15:58:58 -0700
Subject: [PATCH 18/65] Revert "no float16"

This reverts commit 6204956e98e3c0818ed1985ede8eeccb810f63e3.
---
 selfdrive/modeld/SConscript                   | 2 +-
 selfdrive/test/process_replay/model_replay.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript
index 77fd54560b498c..e40a8de62d50c6 100644
--- a/selfdrive/modeld/SConscript
+++ b/selfdrive/modeld/SConscript
@@ -16,7 +16,7 @@ def estimate_pickle_max_size(onnx_size):
 # compile warp
 # THREADS=0 is need to prevent bug: https://github.com/tinygrad/tinygrad/issues/14689
 tg_flags = {
-    'larch64': 'DEV=QCOM NOLOCALS=1 JIT_BATCH_SIZE=0',
+    'larch64': 'DEV=QCOM FLOAT16=1 NOLOCALS=1 JIT_BATCH_SIZE=0',
     'Darwin': f'DEV=CPU THREADS=0 HOME={os.path.expanduser("~")}', # tinygrad calls brew which needs a $HOME in the env
 }.get(arch, 'DEV=CPU:LLVM THREADS=0')
 
diff --git a/selfdrive/test/process_replay/model_replay.py b/selfdrive/test/process_replay/model_replay.py
index 10ff934a753b7b..eb7cdbe34acfdd 100755
--- a/selfdrive/test/process_replay/model_replay.py
+++ b/selfdrive/test/process_replay/model_replay.py
@@ -190,7 +190,7 @@ def model_replay(lr, frs):
   print("----------------- Model Timing -----------------")
   print("------------------------------------------------")
   print(tabulate(rows, header, tablefmt="simple_grid", stralign="center", numalign="center", floatfmt=".4f"))
-  assert timings_ok or True
+  assert timings_ok or PC
 
   return msgs
 

From db26a67a9a7dda40c162862926c2271fc0a98d6d Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Sun, 5 Apr 2026 16:03:46 -0700
Subject: [PATCH 19/65] realize boundaries

---
 selfdrive/modeld/compile_warp.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/selfdrive/modeld/compile_warp.py b/selfdrive/modeld/compile_warp.py
index 3364a80cde5dfb..fce480c172d5ae 100755
--- a/selfdrive/modeld/compile_warp.py
+++ b/selfdrive/modeld/compile_warp.py
@@ -134,15 +134,15 @@ def run_policy(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm,
                  feat_q, policy_inputs):
     img, big_img = update_both_imgs(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm)
 
-    vision_out = next(iter(vision_runner({'img': img, 'big_img': big_img}).values())).cast('float32')
+    vision_out = next(iter(vision_runner({'img': img, 'big_img': big_img}).values())).cast('float32').contiguous().realize()
 
     feat_q.assign(feat_q[:, 1:].cat(vision_out[:, vision_features_slice].reshape(1, 1, -1), dim=1).contiguous())
-    feat_buf = feat_q[:, frame_skip - 1::frame_skip]
+    feat_buf = feat_q[:, frame_skip - 1::frame_skip].contiguous().realize()
 
     inputs = {k: v.to(Device.DEFAULT) for k, v in policy_inputs.items()}
     inputs['features_buffer'] = feat_buf
-    on_policy_out = next(iter(on_policy_runner(inputs).values())).cast('float32')
-    off_policy_out = next(iter(off_policy_runner(inputs).values())).cast('float32')
+    on_policy_out = next(iter(on_policy_runner(inputs).values())).cast('float32').contiguous().realize()
+    off_policy_out = next(iter(off_policy_runner(inputs).values())).cast('float32').contiguous().realize()
 
     return vision_out, on_policy_out, off_policy_out
   return run_policy

From e49056062591b565ad5300bf88148dc0534c48fb Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Sun, 5 Apr 2026 17:12:50 -0700
Subject: [PATCH 20/65] Revert "realize boundaries"

This reverts commit ffaa19259eba70944e7793e8f51a0f87089531b3.
---
 selfdrive/modeld/compile_warp.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/selfdrive/modeld/compile_warp.py b/selfdrive/modeld/compile_warp.py
index fce480c172d5ae..3364a80cde5dfb 100755
--- a/selfdrive/modeld/compile_warp.py
+++ b/selfdrive/modeld/compile_warp.py
@@ -134,15 +134,15 @@ def run_policy(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm,
                  feat_q, policy_inputs):
     img, big_img = update_both_imgs(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm)
 
-    vision_out = next(iter(vision_runner({'img': img, 'big_img': big_img}).values())).cast('float32').contiguous().realize()
+    vision_out = next(iter(vision_runner({'img': img, 'big_img': big_img}).values())).cast('float32')
 
     feat_q.assign(feat_q[:, 1:].cat(vision_out[:, vision_features_slice].reshape(1, 1, -1), dim=1).contiguous())
-    feat_buf = feat_q[:, frame_skip - 1::frame_skip].contiguous().realize()
+    feat_buf = feat_q[:, frame_skip - 1::frame_skip]
 
     inputs = {k: v.to(Device.DEFAULT) for k, v in policy_inputs.items()}
     inputs['features_buffer'] = feat_buf
-    on_policy_out = next(iter(on_policy_runner(inputs).values())).cast('float32').contiguous().realize()
-    off_policy_out = next(iter(off_policy_runner(inputs).values())).cast('float32').contiguous().realize()
+    on_policy_out = next(iter(on_policy_runner(inputs).values())).cast('float32')
+    off_policy_out = next(iter(off_policy_runner(inputs).values())).cast('float32')
 
     return vision_out, on_policy_out, off_policy_out
   return run_policy

From b64e152285d69baaf20f3262aabeb223338d2273 Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Sun, 5 Apr 2026 17:13:13 -0700
Subject: [PATCH 21/65] prune=False?

---
 selfdrive/modeld/compile_warp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/selfdrive/modeld/compile_warp.py b/selfdrive/modeld/compile_warp.py
index 3364a80cde5dfb..30dc9758addf25 100755
--- a/selfdrive/modeld/compile_warp.py
+++ b/selfdrive/modeld/compile_warp.py
@@ -168,7 +168,7 @@ def compile_modeld(cam_w, cam_h):
 
   _run = make_run_policy(vision_runner, on_policy_runner, off_policy_runner,
                          cam_w, cam_h, vision_features_slice, frame_skip)
-  run_policy_jit = TinyJit(_run, prune=True)
+  run_policy_jit = TinyJit(_run, prune=False)
 
   # warmup inputs
   img_buf = Tensor.zeros(IMG_BUFFER_SHAPE, dtype='uint8').contiguous().realize()

From 9625b942664f2f027f3328e288bc656a8fcbde8c Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Sun, 5 Apr 2026 17:20:32 -0700
Subject: [PATCH 22/65] Reapply "tmp more chunks"

This reverts commit 2599c41cea93b4a6b4e946cdffc6a617663a7d23.
---
 selfdrive/modeld/SConscript | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript
index e40a8de62d50c6..aaee5739cff952 100644
--- a/selfdrive/modeld/SConscript
+++ b/selfdrive/modeld/SConscript
@@ -46,7 +46,7 @@ compile_node = lenv.Command(warp_targets, tinygrad_files + script_files + drivin
 # chunk the combined policy pkls (they contain model weights)
 for policy_pkl in policy_pkls:
   onnx_sizes_sum = sum(os.path.getsize(f) for f in driving_onnx_deps)
-  chunk_targets = get_chunk_paths(policy_pkl, estimate_pickle_max_size(onnx_sizes_sum))
+  chunk_targets = get_chunk_paths(policy_pkl, 2.0 * onnx_sizes_sum + 10 * 1024 * 1024)
   def do_chunk(target, source, env, pkl=policy_pkl, chunks=chunk_targets):
     chunk_file(pkl, chunks)
   lenv.Command(chunk_targets, compile_node, do_chunk)

From 9285441bba6241879159ce2fa22108d29c1430df Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Mon, 6 Apr 2026 10:49:07 -0700
Subject: [PATCH 23/65] tg bug? restore prune=True, clear UOp cache before JIT pickle load

---
 selfdrive/modeld/compile_warp.py | 2 +-
 selfdrive/modeld/modeld.py       | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/selfdrive/modeld/compile_warp.py b/selfdrive/modeld/compile_warp.py
index 30dc9758addf25..3364a80cde5dfb 100755
--- a/selfdrive/modeld/compile_warp.py
+++ b/selfdrive/modeld/compile_warp.py
@@ -168,7 +168,7 @@ def compile_modeld(cam_w, cam_h):
 
   _run = make_run_policy(vision_runner, on_policy_runner, off_policy_runner,
                          cam_w, cam_h, vision_features_slice, frame_skip)
-  run_policy_jit = TinyJit(_run, prune=False)
+  run_policy_jit = TinyJit(_run, prune=True)
 
   # warmup inputs
   img_buf = Tensor.zeros(IMG_BUFFER_SHAPE, dtype='uint8').contiguous().realize()
diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py
index 426360abfe0ed7..2e4efd18e44017 100755
--- a/selfdrive/modeld/modeld.py
+++ b/selfdrive/modeld/modeld.py
@@ -185,6 +185,10 @@ def __init__(self, cam_w: int, cam_h: int):
 
     self.parser = Parser()
     self.frame_buf_params = {k: get_nv12_info(cam_w, cam_h) for k in self.img_queues}
+    # clear UOp interning cache before loading JIT pickle to avoid UNIQUE counter collisions
+    # (the pickle contains UOps with UNIQUE values starting at 0, same as this process)
+    from tinygrad.uop.ops import UOpMetaClass
+    UOpMetaClass.ucache.clear()
     self.run_policy = pickle.loads(read_file_chunked(str(policy_pkl_path(cam_w, cam_h))))
 
   def slice_outputs(self, model_outputs: np.ndarray, output_slices: dict[str, slice]) -> dict[str, np.ndarray]:

From 27e035055e94178aefb600122f992c7464ba9d88 Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Mon, 6 Apr 2026 10:58:43 -0700
Subject: [PATCH 24/65] load first?

---
 selfdrive/modeld/modeld.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py
index 2e4efd18e44017..0db44e782873eb 100755
--- a/selfdrive/modeld/modeld.py
+++ b/selfdrive/modeld/modeld.py
@@ -163,6 +163,9 @@ def __init__(self, cam_w: int, cam_h: int):
       self.policy_output_slices = policy_metadata['output_slices']
       # policy_output_size = policy_metadata['output_shapes']['outputs'][1]
 
+    # load JIT pickle before creating any Tensors to avoid UOp UNIQUE counter collisions in the interning cache
+    self.run_policy = pickle.loads(read_file_chunked(str(policy_pkl_path(cam_w, cam_h))))
+
     self.prev_desire = np.zeros(ModelConstants.DESIRE_LEN, dtype=np.float32)
 
     self.numpy_inputs = {k: np.zeros(self.policy_input_shapes[k], dtype=np.float32) for k in ['desire_pulse', 'traffic_convention']}
@@ -185,11 +188,6 @@ def __init__(self, cam_w: int, cam_h: int):
 
     self.parser = Parser()
     self.frame_buf_params = {k: get_nv12_info(cam_w, cam_h) for k in self.img_queues}
-    # clear UOp interning cache before loading JIT pickle to avoid UNIQUE counter collisions
-    # (the pickle contains UOps with UNIQUE values starting at 0, same as this process)
-    from tinygrad.uop.ops import UOpMetaClass
-    UOpMetaClass.ucache.clear()
-    self.run_policy = pickle.loads(read_file_chunked(str(policy_pkl_path(cam_w, cam_h))))
 
   def slice_outputs(self, model_outputs: np.ndarray, output_slices: dict[str, slice]) -> dict[str, np.ndarray]:
     parsed_model_outputs = {k: model_outputs[np.newaxis, v] for k,v in output_slices.items()}

From 406b3501d591aafd1c781f5910a725778f150ae0 Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Mon, 6 Apr 2026 13:48:11 -0700
Subject: [PATCH 25/65] Revert "load first?"

This reverts commit f643d082d76a424b23295e254179eb111e936e61.
---
 selfdrive/modeld/modeld.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py
index 0db44e782873eb..2e4efd18e44017 100755
--- a/selfdrive/modeld/modeld.py
+++ b/selfdrive/modeld/modeld.py
@@ -163,9 +163,6 @@ def __init__(self, cam_w: int, cam_h: int):
       self.policy_output_slices = policy_metadata['output_slices']
       # policy_output_size = policy_metadata['output_shapes']['outputs'][1]
 
-    # load JIT pickle before creating any Tensors to avoid UOp UNIQUE counter collisions in the interning cache
-    self.run_policy = pickle.loads(read_file_chunked(str(policy_pkl_path(cam_w, cam_h))))
-
     self.prev_desire = np.zeros(ModelConstants.DESIRE_LEN, dtype=np.float32)
 
     self.numpy_inputs = {k: np.zeros(self.policy_input_shapes[k], dtype=np.float32) for k in ['desire_pulse', 'traffic_convention']}
@@ -188,6 +185,11 @@ def __init__(self, cam_w: int, cam_h: int):
 
     self.parser = Parser()
     self.frame_buf_params = {k: get_nv12_info(cam_w, cam_h) for k in self.img_queues}
+    # clear UOp interning cache before loading JIT pickle to avoid UNIQUE counter collisions
+    # (the pickle contains UOps with UNIQUE values starting at 0, same as this process)
+    from tinygrad.uop.ops import UOpMetaClass
+    UOpMetaClass.ucache.clear()
+    self.run_policy = pickle.loads(read_file_chunked(str(policy_pkl_path(cam_w, cam_h))))
 
   def slice_outputs(self, model_outputs: np.ndarray, output_slices: dict[str, slice]) -> dict[str, np.ndarray]:
     parsed_model_outputs = {k: model_outputs[np.newaxis, v] for k,v in output_slices.items()}

From dc1191b0115d750064b3d057a3f329e86db712af Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Mon, 6 Apr 2026 14:12:52 -0700
Subject: [PATCH 26/65] revert UOp cache clear before JIT pickle load

---
 selfdrive/modeld/modeld.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py
index 2e4efd18e44017..426360abfe0ed7 100755
--- a/selfdrive/modeld/modeld.py
+++ b/selfdrive/modeld/modeld.py
@@ -185,10 +185,6 @@ def __init__(self, cam_w: int, cam_h: int):
 
     self.parser = Parser()
     self.frame_buf_params = {k: get_nv12_info(cam_w, cam_h) for k in self.img_queues}
-    # clear UOp interning cache before loading JIT pickle to avoid UNIQUE counter collisions
-    # (the pickle contains UOps with UNIQUE values starting at 0, same as this process)
-    from tinygrad.uop.ops import UOpMetaClass
-    UOpMetaClass.ucache.clear()
     self.run_policy = pickle.loads(read_file_chunked(str(policy_pkl_path(cam_w, cam_h))))
 
   def slice_outputs(self, model_outputs: np.ndarray, output_slices: dict[str, slice]) -> dict[str, np.ndarray]:

From 0cfb91a21d03bf88a183096e4e749220c402e6af Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Mon, 6 Apr 2026 14:18:51 -0700
Subject: [PATCH 27/65] Reapply "tmp save plot locally"

This reverts commit 1b95b82ee58654bd908b1cb04ab0ddbcd1a5955d.
---
 selfdrive/test/process_replay/model_replay.py | 66 +++++++++++++------
 1 file changed, 45 insertions(+), 21 deletions(-)

diff --git a/selfdrive/test/process_replay/model_replay.py b/selfdrive/test/process_replay/model_replay.py
index eb7cdbe34acfdd..0751e29b01a8f5 100755
--- a/selfdrive/test/process_replay/model_replay.py
+++ b/selfdrive/test/process_replay/model_replay.py
@@ -254,27 +254,27 @@ def get_frames():
         'driverStateV2.modelExecutionTime',
         'driverStateV2.gpuExecutionTime'
       ]
-      if PC:
-        # TODO We ignore whole bunch so we can compare important stuff
-        # like posenet with reasonable tolerance
-        ignore += ['modelV2.acceleration.x',
-                   'modelV2.position.x',
-                   'modelV2.position.xStd',
-                   'modelV2.position.y',
-                   'modelV2.position.yStd',
-                   'modelV2.position.z',
-                   'modelV2.position.zStd',
-                   'drivingModelData.path.xCoefficients',]
-        for i in range(3):
-          for field in ('x', 'y', 'v', 'a'):
-            ignore.append(f'modelV2.leadsV3.{i}.{field}')
-            ignore.append(f'modelV2.leadsV3.{i}.{field}Std')
-        for i in range(4):
-          for field in ('x', 'y', 'z', 't'):
-            ignore.append(f'modelV2.laneLines.{i}.{field}')
-        for i in range(2):
-          for field in ('x', 'y', 'z', 't'):
-            ignore.append(f'modelV2.roadEdges.{i}.{field}')
+      # if PC:
+      #   # TODO We ignore whole bunch so we can compare important stuff
+      #   # like posenet with reasonable tolerance
+      #   ignore += ['modelV2.acceleration.x',
+      #              'modelV2.position.x',
+      #              'modelV2.position.xStd',
+      #              'modelV2.position.y',
+      #              'modelV2.position.yStd',
+      #              'modelV2.position.z',
+      #              'modelV2.position.zStd',
+      #              'drivingModelData.path.xCoefficients',]
+      #   for i in range(3):
+      #     for field in ('x', 'y', 'v', 'a'):
+      #       ignore.append(f'modelV2.leadsV3.{i}.{field}')
+      #       ignore.append(f'modelV2.leadsV3.{i}.{field}Std')
+      #   for i in range(4):
+      #     for field in ('x', 'y', 'z', 't'):
+      #       ignore.append(f'modelV2.laneLines.{i}.{field}')
+      #   for i in range(2):
+      #     for field in ('x', 'y', 'z', 't'):
+      #       ignore.append(f'modelV2.roadEdges.{i}.{field}')
       tolerance = .3 if PC else None
       results: Any = {TEST_ROUTE: {}}
       log_paths: Any = {TEST_ROUTE: {"models": {'ref': log_fn, 'new': log_fn}}}
@@ -285,6 +285,30 @@ def get_frames():
         comment_replay_report(log_msgs, cmp_log, log_msgs)
         failed = False
         print(diff_long)
+      else:
+        commit = (get_commit() or 'local')[:7]
+        all_plots = [*zl([
+          (lambda x: get_idx_if_non_empty(x.velocity.x, 0), "velocity.x"),
+          (lambda x: get_idx_if_non_empty(x.action.desiredCurvature), "desiredCurvature"),
+          (lambda x: get_idx_if_non_empty(x.action.desiredAcceleration), "desiredAcceleration"),
+          (lambda x: get_idx_if_non_empty(x.leadsV3[0].x, 0), "leadsV3.x"),
+          (lambda x: get_idx_if_non_empty(x.laneLines[1].y, 0), "laneLines.y"),
+          (lambda x: get_idx_if_non_empty(x.meta.disengagePredictions.gasPressProbs, 1), "gasPressProbs"),
+        ], "modelV2")]
+        n = len(all_plots)
+        fig, axes = plt.subplots(n, 1, figsize=(10, 3 * n))
+        for ax, (v, event) in zip(axes, all_plots):
+          proposed_vals = list(map(v[0], get_event(log_msgs, event)))
+          master_vals = list(map(v[0], get_event(cmp_log, event)))
+          ax.plot(master_vals, label='MASTER')
+          ax.plot(proposed_vals, label='PROPOSED')
+          ax.set_title(v[1])
+          ax.legend(fontsize=8)
+          ax.grid(True, alpha=0.3)
+        fig.tight_layout()
+        out = f"model_replay_{commit}.png"
+        fig.savefig(out, dpi=150)
+        print(f"Plot saved to {out}")
       print('-------------\n'*5)
       print(diff_short)
       with open("model_diff.txt", "w") as f:

From 04205f21ce7bfcb092694f614832b85888bb5e10 Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Mon, 6 Apr 2026 14:45:35 -0700
Subject: [PATCH 28/65] 0 tol pc: set model replay compare tolerance to 0 on PC

---
 selfdrive/test/process_replay/model_replay.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/selfdrive/test/process_replay/model_replay.py b/selfdrive/test/process_replay/model_replay.py
index 0751e29b01a8f5..87bf7822b53854 100755
--- a/selfdrive/test/process_replay/model_replay.py
+++ b/selfdrive/test/process_replay/model_replay.py
@@ -275,7 +275,7 @@ def get_frames():
       #   for i in range(2):
       #     for field in ('x', 'y', 'z', 't'):
       #       ignore.append(f'modelV2.roadEdges.{i}.{field}')
-      tolerance = .3 if PC else None
+      tolerance = .0 if PC else None
       results: Any = {TEST_ROUTE: {}}
       log_paths: Any = {TEST_ROUTE: {"models": {'ref': log_fn, 'new': log_fn}}}
       results[TEST_ROUTE]["models"] = compare_logs(cmp_log, log_msgs, tolerance=tolerance, ignore_fields=ignore)

From 2c005870b4859bf7447ac7c35c54fe0d199ef63a Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Mon, 6 Apr 2026 16:04:12 -0700
Subject: [PATCH 29/65] warp -> modeld

---
 selfdrive/modeld/SConscript      |  12 +-
 selfdrive/modeld/compile_warp.py | 239 -------------------------------
 selfdrive/modeld/modeld.py       |   2 +-
 3 files changed, 7 insertions(+), 246 deletions(-)
 delete mode 100755 selfdrive/modeld/compile_warp.py

diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript
index aaee5739cff952..ef0c8d133c5d8a 100644
--- a/selfdrive/modeld/SConscript
+++ b/selfdrive/modeld/SConscript
@@ -30,18 +30,18 @@ for model_name in ['driving_vision', 'driving_off_policy', 'driving_on_policy',
 image_flag = {
      'larch64': 'IMAGE=2',
 }.get(arch, 'IMAGE=0')
-script_files = [File(Dir("#selfdrive/modeld").File("compile_warp.py").abspath)]
-compile_warp_cmd = f'{tg_flags} {image_flag} python3 {Dir("#selfdrive/modeld").abspath}/compile_warp.py '
+script_files = [File(Dir("#selfdrive/modeld").File("compile_modeld.py").abspath)]
+compile_modeld_cmd = f'{tg_flags} {image_flag} python3 {Dir("#selfdrive/modeld").abspath}/compile_modeld.py '
 from openpilot.common.transformations.camera import _ar_ox_fisheye, _os_fisheye
 driving_onnx_deps = [File(f"models/{m}.onnx").abspath for m in ['driving_vision', 'driving_on_policy', 'driving_off_policy']]
 driving_metadata_deps = [File(f"models/{m}_metadata.pkl").abspath for m in ['driving_vision', 'driving_on_policy']]
-warp_targets = []
+modeld_targets = []
 policy_pkls = []
 for cam in [_ar_ox_fisheye, _os_fisheye]:
   w, h = cam.width, cam.height
-  policy_pkls.append(File(f"models/warp_{w}x{h}_tinygrad.pkl").abspath)
-  warp_targets += [File(f"models/warp_{w}x{h}_tinygrad.pkl").abspath, File(f"models/dm_warp_{w}x{h}_tinygrad.pkl").abspath]
-compile_node = lenv.Command(warp_targets, tinygrad_files + script_files + driving_onnx_deps + driving_metadata_deps + [chunker_file], compile_warp_cmd)
+  policy_pkls.append(File(f"models/driving_{w}x{h}_tinygrad.pkl").abspath)
+  modeld_targets += [File(f"models/driving_{w}x{h}_tinygrad.pkl").abspath, File(f"models/dm_warp_{w}x{h}_tinygrad.pkl").abspath]
+compile_node = lenv.Command(modeld_targets, tinygrad_files + script_files + driving_onnx_deps + driving_metadata_deps + [chunker_file], compile_modeld_cmd)
 
 # chunk the combined policy pkls (they contain model weights)
 for policy_pkl in policy_pkls:
diff --git a/selfdrive/modeld/compile_warp.py b/selfdrive/modeld/compile_warp.py
deleted file mode 100755
index 3364a80cde5dfb..00000000000000
--- a/selfdrive/modeld/compile_warp.py
+++ /dev/null
@@ -1,239 +0,0 @@
-#!/usr/bin/env python3
-import time
-import pickle
-import numpy as np
-from pathlib import Path
-from tinygrad.tensor import Tensor
-from tinygrad.helpers import Context
-from tinygrad.device import Device
-from tinygrad.engine.jit import TinyJit
-
-from openpilot.system.camerad.cameras.nv12_info import get_nv12_info
-from openpilot.common.transformations.model import MEDMODEL_INPUT_SIZE, DM_INPUT_SIZE
-from openpilot.common.transformations.camera import _ar_ox_fisheye, _os_fisheye
-
-MODELS_DIR = Path(__file__).parent / 'models'
-
-CAMERA_CONFIGS = [
-  (_ar_ox_fisheye.width, _ar_ox_fisheye.height),  # tici: 1928x1208
-  (_os_fisheye.width, _os_fisheye.height),        # mici: 1344x760
-]
-
-UV_SCALE_MATRIX = np.array([[0.5, 0, 0], [0, 0.5, 0], [0, 0, 1]], dtype=np.float32)
-UV_SCALE_MATRIX_INV = np.linalg.inv(UV_SCALE_MATRIX)
-
-IMG_BUFFER_SHAPE = (30, MEDMODEL_INPUT_SIZE[1] // 2, MEDMODEL_INPUT_SIZE[0] // 2)
-
-
-def policy_pkl_path(w, h):
-  return MODELS_DIR / f'warp_{w}x{h}_tinygrad.pkl'
-
-
-def dm_warp_pkl_path(w, h):
-  return MODELS_DIR / f'dm_warp_{w}x{h}_tinygrad.pkl'
-
-
-def warp_perspective_tinygrad(src_flat, M_inv, dst_shape, src_shape, stride_pad):
-  w_dst, h_dst = dst_shape
-  h_src, w_src = src_shape
-
-  x = Tensor.arange(w_dst).reshape(1, w_dst).expand(h_dst, w_dst).reshape(-1)
-  y = Tensor.arange(h_dst).reshape(h_dst, 1).expand(h_dst, w_dst).reshape(-1)
-
-  # inline 3x3 matmul as elementwise to avoid reduce op (enables fusion with gather)
-  src_x = M_inv[0, 0] * x + M_inv[0, 1] * y + M_inv[0, 2]
-  src_y = M_inv[1, 0] * x + M_inv[1, 1] * y + M_inv[1, 2]
-  src_w = M_inv[2, 0] * x + M_inv[2, 1] * y + M_inv[2, 2]
-
-  src_x = src_x / src_w
-  src_y = src_y / src_w
-
-  x_nn_clipped = Tensor.round(src_x).clip(0, w_src - 1).cast('int')
-  y_nn_clipped = Tensor.round(src_y).clip(0, h_src - 1).cast('int')
-  idx = y_nn_clipped * (w_src + stride_pad) + x_nn_clipped
-
-  return src_flat[idx]
-
-
-def frames_to_tensor(frames, model_w, model_h):
-  H = (frames.shape[0] * 2) // 3
-  W = frames.shape[1]
-  in_img1 = Tensor.cat(frames[0:H:2, 0::2],
-                       frames[1:H:2, 0::2],
-                       frames[0:H:2, 1::2],
-                       frames[1:H:2, 1::2],
-                       frames[H:H+H//4].reshape((H//2, W//2)),
-                       frames[H+H//4:H+H//2].reshape((H//2, W//2)), dim=0).reshape((6, H//2, W//2))
-  return in_img1
-
-
-def make_frame_prepare(cam_w, cam_h, model_w, model_h):
-  stride, y_height, uv_height, _ = get_nv12_info(cam_w, cam_h)
-  uv_offset = stride * y_height
-  stride_pad = stride - cam_w
-
-  def frame_prepare_tinygrad(input_frame, M_inv):
-    # UV_SCALE @ M_inv @ UV_SCALE_INV simplifies to elementwise scaling
-    M_inv_uv = M_inv * Tensor([[1.0, 1.0, 0.5], [1.0, 1.0, 0.5], [2.0, 2.0, 1.0]])
-    # deinterleave NV12 UV plane (UVUV... -> separate U, V)
-    uv = input_frame[uv_offset:uv_offset + uv_height * stride].reshape(uv_height, stride)
-    with Context(SPLIT_REDUCEOP=0):
-      y = warp_perspective_tinygrad(input_frame[:cam_h*stride],
-                                    M_inv, (model_w, model_h),
-                                    (cam_h, cam_w), stride_pad).realize()
-      u = warp_perspective_tinygrad(uv[:cam_h//2, :cam_w:2].flatten(),
-                                    M_inv_uv, (model_w//2, model_h//2),
-                                    (cam_h//2, cam_w//2), 0).realize()
-      v = warp_perspective_tinygrad(uv[:cam_h//2, 1:cam_w:2].flatten(),
-                                    M_inv_uv, (model_w//2, model_h//2),
-                                    (cam_h//2, cam_w//2), 0).realize()
-    yuv = y.cat(u).cat(v).reshape((model_h * 3 // 2, model_w))
-    tensor = frames_to_tensor(yuv, model_w, model_h)
-    return tensor
-  return frame_prepare_tinygrad
-
-
-def make_update_img_input(frame_prepare, model_w, model_h):
-  def update_img_input_tinygrad(frame_buffer, frame, M_inv):
-    M_inv = M_inv.to(Device.DEFAULT)
-    new_img = frame_prepare(frame, M_inv)
-    frame_buffer.assign(frame_buffer[6:].cat(new_img, dim=0).contiguous())
-    return Tensor.cat(frame_buffer[:6], frame_buffer[-6:], dim=0).contiguous().reshape(1, 12, model_h//2, model_w//2)
-  return update_img_input_tinygrad
-
-
-def make_update_both_imgs(frame_prepare, model_w, model_h):
-  update_img = make_update_img_input(frame_prepare, model_w, model_h)
-
-  def update_both_imgs_tinygrad(calib_img_buffer, new_img, M_inv,
-                                calib_big_img_buffer, new_big_img, M_inv_big):
-    calib_img_pair = update_img(calib_img_buffer, new_img, M_inv)
-    calib_big_img_pair = update_img(calib_big_img_buffer, new_big_img, M_inv_big)
-    return calib_img_pair, calib_big_img_pair
-  return update_both_imgs_tinygrad
-
-
-def make_warp_dm(cam_w, cam_h, dm_w, dm_h):
-  stride, y_height, _, _ = get_nv12_info(cam_w, cam_h)
-  stride_pad = stride - cam_w
-
-  def warp_dm(input_frame, M_inv):
-    M_inv = M_inv.to(Device.DEFAULT)
-    result = warp_perspective_tinygrad(input_frame[:cam_h*stride], M_inv, (dm_w, dm_h), (cam_h, cam_w), stride_pad).reshape(-1, dm_h * dm_w)
-    return result
-  return warp_dm
-
-
-def make_run_policy(vision_runner, on_policy_runner, off_policy_runner, cam_w, cam_h,
-                    vision_features_slice, frame_skip):
-  model_w, model_h = MEDMODEL_INPUT_SIZE
-  frame_prepare = make_frame_prepare(cam_w, cam_h, model_w, model_h)
-  update_both_imgs = make_update_both_imgs(frame_prepare, model_w, model_h)
-
-  def run_policy(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm,
-                 feat_q, policy_inputs):
-    img, big_img = update_both_imgs(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm)
-
-    vision_out = next(iter(vision_runner({'img': img, 'big_img': big_img}).values())).cast('float32')
-
-    feat_q.assign(feat_q[:, 1:].cat(vision_out[:, vision_features_slice].reshape(1, 1, -1), dim=1).contiguous())
-    feat_buf = feat_q[:, frame_skip - 1::frame_skip]
-
-    inputs = {k: v.to(Device.DEFAULT) for k, v in policy_inputs.items()}
-    inputs['features_buffer'] = feat_buf
-    on_policy_out = next(iter(on_policy_runner(inputs).values())).cast('float32')
-    off_policy_out = next(iter(off_policy_runner(inputs).values())).cast('float32')
-
-    return vision_out, on_policy_out, off_policy_out
-  return run_policy
-
-
-def compile_modeld(cam_w, cam_h):
-  from tinygrad.nn.onnx import OnnxRunner
-  from openpilot.selfdrive.modeld.constants import ModelConstants
-
-  _, _, _, yuv_size = get_nv12_info(cam_w, cam_h)
-  print(f"Compiling combined policy JIT for {cam_w}x{cam_h}...")
-
-  vision_runner = OnnxRunner(MODELS_DIR / 'driving_vision.onnx')
-  on_policy_runner = OnnxRunner(MODELS_DIR / 'driving_on_policy.onnx')
-  off_policy_runner = OnnxRunner(MODELS_DIR / 'driving_off_policy.onnx')
-
-  with open(MODELS_DIR / 'driving_vision_metadata.pkl', 'rb') as f:
-    vision_features_slice = pickle.load(f)['output_slices']['hidden_state']
-  with open(MODELS_DIR / 'driving_on_policy_metadata.pkl', 'rb') as f:
-    policy_input_shapes = pickle.load(f)['input_shapes']
-
-  frame_skip = ModelConstants.MODEL_RUN_FREQ // ModelConstants.MODEL_CONTEXT_FREQ
-
-  _run = make_run_policy(vision_runner, on_policy_runner, off_policy_runner,
-                         cam_w, cam_h, vision_features_slice, frame_skip)
-  run_policy_jit = TinyJit(_run, prune=True)
-
-  # warmup inputs
-  img_buf = Tensor.zeros(IMG_BUFFER_SHAPE, dtype='uint8').contiguous().realize()
-  big_img_buf = Tensor.zeros(IMG_BUFFER_SHAPE, dtype='uint8').contiguous().realize()
-  fb = policy_input_shapes['features_buffer']
-  feat_q = Tensor.zeros(fb[0], fb[1] * frame_skip, fb[2]).contiguous().realize()
-  numpy_inputs = {k: np.zeros(policy_input_shapes[k], dtype=np.float32) for k in ['desire_pulse', 'traffic_convention']}
-  policy_inputs = {k: Tensor(v, device='NPY').realize() for k, v in numpy_inputs.items()}
-  tfm = Tensor(np.zeros((3, 3), dtype=np.float32), device='NPY').realize()
-  big_tfm = Tensor(np.zeros((3, 3), dtype=np.float32), device='NPY').realize()
-
-  for i in range(10):
-    frame = Tensor(np.random.randint(0, 256, yuv_size, dtype=np.uint8)).realize()
-    big_frame = Tensor(np.random.randint(0, 256, yuv_size, dtype=np.uint8)).realize()
-    Device.default.synchronize()
-
-    st = time.perf_counter()
-    with Context(OPENPILOT_HACKS=1):
-      outs = run_policy_jit(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm,
-                            feat_q, policy_inputs)
-    mt = time.perf_counter()
-    Device.default.synchronize()
-    et = time.perf_counter()
-    print(f"  [{i+1}/10] enqueue {(mt-st)*1e3:6.2f} ms -- total {(et-st)*1e3:6.2f} ms")
-
-  pkl_path = policy_pkl_path(cam_w, cam_h)
-  with open(pkl_path, "wb") as f:
-    pickle.dump(run_policy_jit, f)
-  print(f"  Saved to {pkl_path}")
-
-  jit = pickle.load(open(pkl_path, "rb"))
-  jit(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm, feat_q, policy_inputs)
-
-
-def compile_dm_warp(cam_w, cam_h):
-  dm_w, dm_h = DM_INPUT_SIZE
-  _, _, _, yuv_size = get_nv12_info(cam_w, cam_h)
-
-  print(f"Compiling DM warp for {cam_w}x{cam_h}...")
-
-  warp_dm = make_warp_dm(cam_w, cam_h, dm_w, dm_h)
-  warp_dm_jit = TinyJit(warp_dm, prune=True)
-
-  for i in range(10):
-    inputs = [Tensor(np.random.randint(0, 256, yuv_size, dtype=np.uint8)).realize(),
-              Tensor(Tensor.randn(3, 3).mul(8).realize().numpy(), device='NPY')]
-    Device.default.synchronize()
-    st = time.perf_counter()
-    warp_dm_jit(*inputs)
-    mt = time.perf_counter()
-    Device.default.synchronize()
-    et = time.perf_counter()
-    print(f"  [{i+1}/10] enqueue {(mt-st)*1e3:6.2f} ms -- total {(et-st)*1e3:6.2f} ms")
-
-  pkl_path = dm_warp_pkl_path(cam_w, cam_h)
-  with open(pkl_path, "wb") as f:
-    pickle.dump(warp_dm_jit, f)
-  print(f"  Saved to {pkl_path}")
-
-
-def run_and_save_pickle():
-  for cam_w, cam_h in CAMERA_CONFIGS:
-    compile_modeld(cam_w, cam_h)
-    compile_dm_warp(cam_w, cam_h)
-
-
-if __name__ == "__main__":
-  run_and_save_pickle()
diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py
index 426360abfe0ed7..b37808e20af0f4 100755
--- a/selfdrive/modeld/modeld.py
+++ b/selfdrive/modeld/modeld.py
@@ -26,7 +26,7 @@
 from openpilot.selfdrive.controls.lib.desire_helper import DesireHelper
 from openpilot.selfdrive.controls.lib.drive_helpers import get_accel_from_plan, smooth_value, get_curvature_from_plan
 from openpilot.selfdrive.modeld.parse_model_outputs import Parser
-from openpilot.selfdrive.modeld.compile_warp import policy_pkl_path
+from openpilot.selfdrive.modeld.compile_modeld import policy_pkl_path
 from openpilot.selfdrive.modeld.fill_model_msg import fill_model_msg, fill_pose_msg, PublishState
 from openpilot.common.file_chunker import read_file_chunked
 from openpilot.selfdrive.modeld.constants import ModelConstants, Plan

From 934d89c956fade7f8ed9a8a5cd9c84cd3b488a64 Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Mon, 6 Apr 2026 16:06:41 -0700
Subject: [PATCH 30/65] rename

---
 selfdrive/modeld/compile_modeld.py | 239 +++++++++++++++++++++++++++++
 1 file changed, 239 insertions(+)
 create mode 100755 selfdrive/modeld/compile_modeld.py

diff --git a/selfdrive/modeld/compile_modeld.py b/selfdrive/modeld/compile_modeld.py
new file mode 100755
index 00000000000000..d86dd639ba93e9
--- /dev/null
+++ b/selfdrive/modeld/compile_modeld.py
@@ -0,0 +1,239 @@
+#!/usr/bin/env python3
+import time
+import pickle
+import numpy as np
+from pathlib import Path
+from tinygrad.tensor import Tensor
+from tinygrad.helpers import Context
+from tinygrad.device import Device
+from tinygrad.engine.jit import TinyJit
+
+from openpilot.system.camerad.cameras.nv12_info import get_nv12_info
+from openpilot.common.transformations.model import MEDMODEL_INPUT_SIZE, DM_INPUT_SIZE
+from openpilot.common.transformations.camera import _ar_ox_fisheye, _os_fisheye
+
+MODELS_DIR = Path(__file__).parent / 'models'
+
+CAMERA_CONFIGS = [
+  (_ar_ox_fisheye.width, _ar_ox_fisheye.height),  # tici: 1928x1208
+  (_os_fisheye.width, _os_fisheye.height),        # mici: 1344x760
+]
+
+UV_SCALE_MATRIX = np.array([[0.5, 0, 0], [0, 0.5, 0], [0, 0, 1]], dtype=np.float32)
+UV_SCALE_MATRIX_INV = np.linalg.inv(UV_SCALE_MATRIX)
+
+IMG_BUFFER_SHAPE = (30, MEDMODEL_INPUT_SIZE[1] // 2, MEDMODEL_INPUT_SIZE[0] // 2)
+
+
+def policy_pkl_path(w, h):
+  return MODELS_DIR / f'driving_{w}x{h}_tinygrad.pkl'
+
+
+def dm_warp_pkl_path(w, h):
+  return MODELS_DIR / f'dm_warp_{w}x{h}_tinygrad.pkl'
+
+
+def warp_perspective_tinygrad(src_flat, M_inv, dst_shape, src_shape, stride_pad):
+  w_dst, h_dst = dst_shape
+  h_src, w_src = src_shape
+
+  x = Tensor.arange(w_dst).reshape(1, w_dst).expand(h_dst, w_dst).reshape(-1)
+  y = Tensor.arange(h_dst).reshape(h_dst, 1).expand(h_dst, w_dst).reshape(-1)
+
+  # inline 3x3 matmul as elementwise to avoid reduce op (enables fusion with gather)
+  src_x = M_inv[0, 0] * x + M_inv[0, 1] * y + M_inv[0, 2]
+  src_y = M_inv[1, 0] * x + M_inv[1, 1] * y + M_inv[1, 2]
+  src_w = M_inv[2, 0] * x + M_inv[2, 1] * y + M_inv[2, 2]
+
+  src_x = src_x / src_w
+  src_y = src_y / src_w
+
+  x_nn_clipped = Tensor.round(src_x).clip(0, w_src - 1).cast('int')
+  y_nn_clipped = Tensor.round(src_y).clip(0, h_src - 1).cast('int')
+  idx = y_nn_clipped * (w_src + stride_pad) + x_nn_clipped
+
+  return src_flat[idx]
+
+
+def frames_to_tensor(frames, model_w, model_h):
+  H = (frames.shape[0] * 2) // 3
+  W = frames.shape[1]
+  in_img1 = Tensor.cat(frames[0:H:2, 0::2],
+                       frames[1:H:2, 0::2],
+                       frames[0:H:2, 1::2],
+                       frames[1:H:2, 1::2],
+                       frames[H:H+H//4].reshape((H//2, W//2)),
+                       frames[H+H//4:H+H//2].reshape((H//2, W//2)), dim=0).reshape((6, H//2, W//2))
+  return in_img1
+
+
+def make_frame_prepare(cam_w, cam_h, model_w, model_h):
+  stride, y_height, uv_height, _ = get_nv12_info(cam_w, cam_h)
+  uv_offset = stride * y_height
+  stride_pad = stride - cam_w
+
+  def frame_prepare_tinygrad(input_frame, M_inv):
+    # UV_SCALE @ M_inv @ UV_SCALE_INV simplifies to elementwise scaling
+    M_inv_uv = M_inv * Tensor([[1.0, 1.0, 0.5], [1.0, 1.0, 0.5], [2.0, 2.0, 1.0]])
+    # deinterleave NV12 UV plane (UVUV... -> separate U, V)
+    uv = input_frame[uv_offset:uv_offset + uv_height * stride].reshape(uv_height, stride)
+    with Context(SPLIT_REDUCEOP=0):
+      y = warp_perspective_tinygrad(input_frame[:cam_h*stride],
+                                    M_inv, (model_w, model_h),
+                                    (cam_h, cam_w), stride_pad).realize()
+      u = warp_perspective_tinygrad(uv[:cam_h//2, :cam_w:2].flatten(),
+                                    M_inv_uv, (model_w//2, model_h//2),
+                                    (cam_h//2, cam_w//2), 0).realize()
+      v = warp_perspective_tinygrad(uv[:cam_h//2, 1:cam_w:2].flatten(),
+                                    M_inv_uv, (model_w//2, model_h//2),
+                                    (cam_h//2, cam_w//2), 0).realize()
+    yuv = y.cat(u).cat(v).reshape((model_h * 3 // 2, model_w))
+    tensor = frames_to_tensor(yuv, model_w, model_h)
+    return tensor
+  return frame_prepare_tinygrad
+
+
+def make_update_img_input(frame_prepare, model_w, model_h):
+  def update_img_input_tinygrad(frame_buffer, frame, M_inv):
+    M_inv = M_inv.to(Device.DEFAULT)
+    new_img = frame_prepare(frame, M_inv)
+    frame_buffer.assign(frame_buffer[6:].cat(new_img, dim=0).contiguous())
+    return Tensor.cat(frame_buffer[:6], frame_buffer[-6:], dim=0).contiguous().reshape(1, 12, model_h//2, model_w//2)
+  return update_img_input_tinygrad
+
+
+def make_update_both_imgs(frame_prepare, model_w, model_h):
+  update_img = make_update_img_input(frame_prepare, model_w, model_h)
+
+  def update_both_imgs_tinygrad(calib_img_buffer, new_img, M_inv,
+                                calib_big_img_buffer, new_big_img, M_inv_big):
+    calib_img_pair = update_img(calib_img_buffer, new_img, M_inv)
+    calib_big_img_pair = update_img(calib_big_img_buffer, new_big_img, M_inv_big)
+    return calib_img_pair, calib_big_img_pair
+  return update_both_imgs_tinygrad
+
+
+def make_warp_dm(cam_w, cam_h, dm_w, dm_h):
+  stride, y_height, _, _ = get_nv12_info(cam_w, cam_h)
+  stride_pad = stride - cam_w
+
+  def warp_dm(input_frame, M_inv):
+    M_inv = M_inv.to(Device.DEFAULT)
+    result = warp_perspective_tinygrad(input_frame[:cam_h*stride], M_inv, (dm_w, dm_h), (cam_h, cam_w), stride_pad).reshape(-1, dm_h * dm_w)
+    return result
+  return warp_dm
+
+
+def make_run_policy(vision_runner, on_policy_runner, off_policy_runner, cam_w, cam_h,
+                    vision_features_slice, frame_skip):
+  model_w, model_h = MEDMODEL_INPUT_SIZE
+  frame_prepare = make_frame_prepare(cam_w, cam_h, model_w, model_h)
+  update_both_imgs = make_update_both_imgs(frame_prepare, model_w, model_h)
+
+  def run_policy(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm,
+                 feat_q, policy_inputs):
+    img, big_img = update_both_imgs(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm)
+
+    vision_out = next(iter(vision_runner({'img': img, 'big_img': big_img}).values())).cast('float32')
+
+    feat_q.assign(feat_q[:, 1:].cat(vision_out[:, vision_features_slice].reshape(1, 1, -1), dim=1).contiguous())
+    feat_buf = feat_q[:, frame_skip - 1::frame_skip]
+
+    inputs = {k: v.to(Device.DEFAULT) for k, v in policy_inputs.items()}
+    inputs['features_buffer'] = feat_buf
+    on_policy_out = next(iter(on_policy_runner(inputs).values())).cast('float32')
+    off_policy_out = next(iter(off_policy_runner(inputs).values())).cast('float32')
+
+    return vision_out, on_policy_out, off_policy_out
+  return run_policy
+
+
+def compile_modeld(cam_w, cam_h):
+  from tinygrad.nn.onnx import OnnxRunner
+  from openpilot.selfdrive.modeld.constants import ModelConstants
+
+  _, _, _, yuv_size = get_nv12_info(cam_w, cam_h)
+  print(f"Compiling combined policy JIT for {cam_w}x{cam_h}...")
+
+  vision_runner = OnnxRunner(MODELS_DIR / 'driving_vision.onnx')
+  on_policy_runner = OnnxRunner(MODELS_DIR / 'driving_on_policy.onnx')
+  off_policy_runner = OnnxRunner(MODELS_DIR / 'driving_off_policy.onnx')
+
+  with open(MODELS_DIR / 'driving_vision_metadata.pkl', 'rb') as f:
+    vision_features_slice = pickle.load(f)['output_slices']['hidden_state']
+  with open(MODELS_DIR / 'driving_on_policy_metadata.pkl', 'rb') as f:
+    policy_input_shapes = pickle.load(f)['input_shapes']
+
+  frame_skip = ModelConstants.MODEL_RUN_FREQ // ModelConstants.MODEL_CONTEXT_FREQ
+
+  _run = make_run_policy(vision_runner, on_policy_runner, off_policy_runner,
+                         cam_w, cam_h, vision_features_slice, frame_skip)
+  run_policy_jit = TinyJit(_run, prune=True)
+
+  # warmup inputs
+  img_buf = Tensor.zeros(IMG_BUFFER_SHAPE, dtype='uint8').contiguous().realize()
+  big_img_buf = Tensor.zeros(IMG_BUFFER_SHAPE, dtype='uint8').contiguous().realize()
+  fb = policy_input_shapes['features_buffer']
+  feat_q = Tensor.zeros(fb[0], fb[1] * frame_skip, fb[2]).contiguous().realize()
+  numpy_inputs = {k: np.zeros(policy_input_shapes[k], dtype=np.float32) for k in ['desire_pulse', 'traffic_convention']}
+  policy_inputs = {k: Tensor(v, device='NPY').realize() for k, v in numpy_inputs.items()}
+  tfm = Tensor(np.zeros((3, 3), dtype=np.float32), device='NPY').realize()
+  big_tfm = Tensor(np.zeros((3, 3), dtype=np.float32), device='NPY').realize()
+
+  for i in range(10):
+    frame = Tensor(np.random.randint(0, 256, yuv_size, dtype=np.uint8)).realize()
+    big_frame = Tensor(np.random.randint(0, 256, yuv_size, dtype=np.uint8)).realize()
+    Device.default.synchronize()
+
+    st = time.perf_counter()
+    with Context(OPENPILOT_HACKS=1):
+      outs = run_policy_jit(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm,
+                            feat_q, policy_inputs)
+    mt = time.perf_counter()
+    Device.default.synchronize()
+    et = time.perf_counter()
+    print(f"  [{i+1}/10] enqueue {(mt-st)*1e3:6.2f} ms -- total {(et-st)*1e3:6.2f} ms")
+
+  pkl_path = policy_pkl_path(cam_w, cam_h)
+  with open(pkl_path, "wb") as f:
+    pickle.dump(run_policy_jit, f)
+  print(f"  Saved to {pkl_path}")
+
+  jit = pickle.load(open(pkl_path, "rb"))
+  jit(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm, feat_q, policy_inputs)
+
+
+def compile_dm_warp(cam_w, cam_h):
+  dm_w, dm_h = DM_INPUT_SIZE
+  _, _, _, yuv_size = get_nv12_info(cam_w, cam_h)
+
+  print(f"Compiling DM warp for {cam_w}x{cam_h}...")
+
+  warp_dm = make_warp_dm(cam_w, cam_h, dm_w, dm_h)
+  warp_dm_jit = TinyJit(warp_dm, prune=True)
+
+  for i in range(10):
+    inputs = [Tensor(np.random.randint(0, 256, yuv_size, dtype=np.uint8)).realize(),
+              Tensor(Tensor.randn(3, 3).mul(8).realize().numpy(), device='NPY')]
+    Device.default.synchronize()
+    st = time.perf_counter()
+    warp_dm_jit(*inputs)
+    mt = time.perf_counter()
+    Device.default.synchronize()
+    et = time.perf_counter()
+    print(f"  [{i+1}/10] enqueue {(mt-st)*1e3:6.2f} ms -- total {(et-st)*1e3:6.2f} ms")
+
+  pkl_path = dm_warp_pkl_path(cam_w, cam_h)
+  with open(pkl_path, "wb") as f:
+    pickle.dump(warp_dm_jit, f)
+  print(f"  Saved to {pkl_path}")
+
+
+def run_and_save_pickle():
+  for cam_w, cam_h in CAMERA_CONFIGS:
+    compile_modeld(cam_w, cam_h)
+    compile_dm_warp(cam_w, cam_h)
+
+
+if __name__ == "__main__":
+  run_and_save_pickle()

From ad5422a93483ffd8a59ba62e5fb72ced3b5d04d0 Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Mon, 6 Apr 2026 16:51:58 -0700
Subject: [PATCH 31/65] bypass chunking?

---
 selfdrive/modeld/modeld.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py
index b37808e20af0f4..99e11390f4a153 100755
--- a/selfdrive/modeld/modeld.py
+++ b/selfdrive/modeld/modeld.py
@@ -185,7 +185,9 @@ def __init__(self, cam_w: int, cam_h: int):
 
     self.parser = Parser()
     self.frame_buf_params = {k: get_nv12_info(cam_w, cam_h) for k in self.img_queues}
-    self.run_policy = pickle.loads(read_file_chunked(str(policy_pkl_path(cam_w, cam_h))))
+    with open(policy_pkl_path(cam_w, cam_h), 'rb') as f:
+      self.run_policy = pickle.load(f)
+    # self.run_policy = pickle.loads(read_file_chunked(str(policy_pkl_path(cam_w, cam_h))))
 
   def slice_outputs(self, model_outputs: np.ndarray, output_slices: dict[str, slice]) -> dict[str, np.ndarray]:
     parsed_model_outputs = {k: model_outputs[np.newaxis, v] for k,v in output_slices.items()}

From cc97fc67b3203456e123f02babe5c83b87c7e264 Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Mon, 6 Apr 2026 16:52:50 -0700
Subject: [PATCH 32/65] dont chunk

---
 selfdrive/modeld/SConscript | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript
index ef0c8d133c5d8a..75550eb60962e6 100644
--- a/selfdrive/modeld/SConscript
+++ b/selfdrive/modeld/SConscript
@@ -64,11 +64,11 @@ def tg_compile(flags, model_name):
   )
   def do_chunk(target, source, env):
     chunk_file(pkl, chunk_targets)
-  return lenv.Command(
-    chunk_targets,
-    compile_node,
-    do_chunk,
-  )
+  # return lenv.Command(
+  #   chunk_targets,
+  #   compile_node,
+  #   do_chunk,
+  # )
 
 # Compile dmonitoring model (driving models are in the combined policy JIT)
 tg_compile(tg_flags, 'dmonitoring_model')

From 5ffae8f369ce70b83ab02af74feb4e4f01f197db Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Mon, 6 Apr 2026 16:53:19 -0700
Subject: [PATCH 33/65] Revert "dont chunk"

This reverts commit cc97fc67b3203456e123f02babe5c83b87c7e264.
---
 selfdrive/modeld/SConscript | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript
index 75550eb60962e6..ef0c8d133c5d8a 100644
--- a/selfdrive/modeld/SConscript
+++ b/selfdrive/modeld/SConscript
@@ -64,11 +64,11 @@ def tg_compile(flags, model_name):
   )
   def do_chunk(target, source, env):
     chunk_file(pkl, chunk_targets)
-  # return lenv.Command(
-  #   chunk_targets,
-  #   compile_node,
-  #   do_chunk,
-  # )
+  return lenv.Command(
+    chunk_targets,
+    compile_node,
+    do_chunk,
+  )
 
 # Compile dmonitoring model (driving models are in the combined policy JIT)
 tg_compile(tg_flags, 'dmonitoring_model')

From 42bd9b6f6ad0722c50348ba11ba7e2a64fdf997d Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Mon, 6 Apr 2026 16:53:59 -0700
Subject: [PATCH 34/65] dont chunk

---
 selfdrive/modeld/SConscript | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript
index ef0c8d133c5d8a..9c127d0a3504bd 100644
--- a/selfdrive/modeld/SConscript
+++ b/selfdrive/modeld/SConscript
@@ -48,7 +48,8 @@ for policy_pkl in policy_pkls:
   onnx_sizes_sum = sum(os.path.getsize(f) for f in driving_onnx_deps)
   chunk_targets = get_chunk_paths(policy_pkl, 2.0 * onnx_sizes_sum + 10 * 1024 * 1024)
   def do_chunk(target, source, env, pkl=policy_pkl, chunks=chunk_targets):
-    chunk_file(pkl, chunks)
+    # chunk_file(pkl, chunks)
+    pass
   lenv.Command(chunk_targets, compile_node, do_chunk)
 
 def tg_compile(flags, model_name):

From b3c2f2e7a095fd32f8d8562a68fd1cca42357eac Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Mon, 6 Apr 2026 18:23:54 -0700
Subject: [PATCH 35/65] debug

---
 selfdrive/modeld/compile_modeld.py |  5 +++++
 selfdrive/modeld/modeld.py         | 11 ++++++++---
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/selfdrive/modeld/compile_modeld.py b/selfdrive/modeld/compile_modeld.py
index d86dd639ba93e9..8f06655f342cf7 100755
--- a/selfdrive/modeld/compile_modeld.py
+++ b/selfdrive/modeld/compile_modeld.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+import os
 import time
 import pickle
 import numpy as np
@@ -8,6 +9,7 @@
 from tinygrad.device import Device
 from tinygrad.engine.jit import TinyJit
 
+from openpilot.common.git import get_commit
 from openpilot.system.camerad.cameras.nv12_info import get_nv12_info
 from openpilot.common.transformations.model import MEDMODEL_INPUT_SIZE, DM_INPUT_SIZE
 from openpilot.common.transformations.camera import _ar_ox_fisheye, _os_fisheye
@@ -230,7 +232,10 @@ def compile_dm_warp(cam_w, cam_h):
 
 
 def run_and_save_pickle():
+  commit = get_commit()[:7]
+  print(f"MODEL BUILD PROVENANCE: commit={commit} script=compile_modeld.py dev={os.getenv('DEV', '')} image={os.getenv('IMAGE', '')} float16={os.getenv('FLOAT16', '')}")
   for cam_w, cam_h in CAMERA_CONFIGS:
+    print(f"MODEL BUILD TARGETS: cam={cam_w}x{cam_h} policy_pkl={policy_pkl_path(cam_w, cam_h)} dm_warp_pkl={dm_warp_pkl_path(cam_w, cam_h)}")
     compile_modeld(cam_w, cam_h)
     compile_dm_warp(cam_w, cam_h)
 
diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py
index 99e11390f4a153..d60ff0fced809e 100755
--- a/selfdrive/modeld/modeld.py
+++ b/selfdrive/modeld/modeld.py
@@ -16,6 +16,7 @@
 from cereal.messaging import PubMaster, SubMaster
 from msgq.visionipc import VisionIpcClient, VisionStreamType, VisionBuf
 from opendbc.car.car_helpers import get_demo_car_params
+from openpilot.common.git import get_commit
 from openpilot.common.swaglog import cloudlog
 from openpilot.common.params import Params
 from openpilot.common.filter_simple import FirstOrderFilter
@@ -144,6 +145,7 @@ class ModelState:
   prev_desire: np.ndarray  # for tracking the rising edge of the pulse
 
   def __init__(self, cam_w: int, cam_h: int):
+    self.policy_path = policy_pkl_path(cam_w, cam_h)
     with open(VISION_METADATA_PATH, 'rb') as f:
       vision_metadata = pickle.load(f)
       self.vision_input_shapes =  vision_metadata['input_shapes']
@@ -185,9 +187,9 @@ def __init__(self, cam_w: int, cam_h: int):
 
     self.parser = Parser()
     self.frame_buf_params = {k: get_nv12_info(cam_w, cam_h) for k in self.img_queues}
-    with open(policy_pkl_path(cam_w, cam_h), 'rb') as f:
+    with open(self.policy_path, 'rb') as f:
       self.run_policy = pickle.load(f)
-    # self.run_policy = pickle.loads(read_file_chunked(str(policy_pkl_path(cam_w, cam_h))))
+    # self.run_policy = pickle.loads(read_file_chunked(str(self.policy_path)))
 
   def slice_outputs(self, model_outputs: np.ndarray, output_slices: dict[str, slice]) -> dict[str, np.ndarray]:
     parsed_model_outputs = {k: model_outputs[np.newaxis, v] for k,v in output_slices.items()}
@@ -268,8 +270,11 @@ def main(demo=False):
     cloudlog.warning(f"connected extra cam with buffer size: {vipc_client_extra.buffer_len} ({vipc_client_extra.width} x {vipc_client_extra.height})")
 
   st = time.monotonic()
-  cloudlog.warning("loading model")
+  commit = get_commit()[:7]
+  cloudlog.warning(f"loading model commit={commit} dev={os.getenv('DEV', '')} tici={TICI} usbgpu={USBGPU}")
   model = ModelState(vipc_client_main.width, vipc_client_main.height)
+  policy_path = model.policy_path
+  cloudlog.warning(f"modeld artifact policy_pkl={policy_path} exists={policy_path.is_file()} size={policy_path.stat().st_size if policy_path.is_file() else 'missing'}")
   cloudlog.warning(f"models loaded in {time.monotonic() - st:.1f}s, modeld starting")
 
   # messaging

From 735cecb8497248cfa23ab8f5e2cdc1a61eb85d41 Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Mon, 6 Apr 2026 19:03:51 -0700
Subject: [PATCH 36/65] Revert "debug"

This reverts commit b3c2f2e7a095fd32f8d8562a68fd1cca42357eac.
---
 selfdrive/modeld/compile_modeld.py |  5 -----
 selfdrive/modeld/modeld.py         | 11 +++--------
 2 files changed, 3 insertions(+), 13 deletions(-)

diff --git a/selfdrive/modeld/compile_modeld.py b/selfdrive/modeld/compile_modeld.py
index 8f06655f342cf7..d86dd639ba93e9 100755
--- a/selfdrive/modeld/compile_modeld.py
+++ b/selfdrive/modeld/compile_modeld.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python3
-import os
 import time
 import pickle
 import numpy as np
@@ -9,7 +8,6 @@
 from tinygrad.device import Device
 from tinygrad.engine.jit import TinyJit
 
-from openpilot.common.git import get_commit
 from openpilot.system.camerad.cameras.nv12_info import get_nv12_info
 from openpilot.common.transformations.model import MEDMODEL_INPUT_SIZE, DM_INPUT_SIZE
 from openpilot.common.transformations.camera import _ar_ox_fisheye, _os_fisheye
@@ -232,10 +230,7 @@ def compile_dm_warp(cam_w, cam_h):
 
 
 def run_and_save_pickle():
-  commit = get_commit()[:7]
-  print(f"MODEL BUILD PROVENANCE: commit={commit} script=compile_modeld.py dev={os.getenv('DEV', '')} image={os.getenv('IMAGE', '')} float16={os.getenv('FLOAT16', '')}")
   for cam_w, cam_h in CAMERA_CONFIGS:
-    print(f"MODEL BUILD TARGETS: cam={cam_w}x{cam_h} policy_pkl={policy_pkl_path(cam_w, cam_h)} dm_warp_pkl={dm_warp_pkl_path(cam_w, cam_h)}")
     compile_modeld(cam_w, cam_h)
     compile_dm_warp(cam_w, cam_h)
 
diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py
index d60ff0fced809e..99e11390f4a153 100755
--- a/selfdrive/modeld/modeld.py
+++ b/selfdrive/modeld/modeld.py
@@ -16,7 +16,6 @@
 from cereal.messaging import PubMaster, SubMaster
 from msgq.visionipc import VisionIpcClient, VisionStreamType, VisionBuf
 from opendbc.car.car_helpers import get_demo_car_params
-from openpilot.common.git import get_commit
 from openpilot.common.swaglog import cloudlog
 from openpilot.common.params import Params
 from openpilot.common.filter_simple import FirstOrderFilter
@@ -145,7 +144,6 @@ class ModelState:
   prev_desire: np.ndarray  # for tracking the rising edge of the pulse
 
   def __init__(self, cam_w: int, cam_h: int):
-    self.policy_path = policy_pkl_path(cam_w, cam_h)
     with open(VISION_METADATA_PATH, 'rb') as f:
       vision_metadata = pickle.load(f)
       self.vision_input_shapes =  vision_metadata['input_shapes']
@@ -187,9 +185,9 @@ def __init__(self, cam_w: int, cam_h: int):
 
     self.parser = Parser()
     self.frame_buf_params = {k: get_nv12_info(cam_w, cam_h) for k in self.img_queues}
-    with open(self.policy_path, 'rb') as f:
+    with open(policy_pkl_path(cam_w, cam_h), 'rb') as f:
       self.run_policy = pickle.load(f)
-    # self.run_policy = pickle.loads(read_file_chunked(str(self.policy_path)))
+    # self.run_policy = pickle.loads(read_file_chunked(str(policy_pkl_path(cam_w, cam_h))))
 
   def slice_outputs(self, model_outputs: np.ndarray, output_slices: dict[str, slice]) -> dict[str, np.ndarray]:
     parsed_model_outputs = {k: model_outputs[np.newaxis, v] for k,v in output_slices.items()}
@@ -270,11 +268,8 @@ def main(demo=False):
     cloudlog.warning(f"connected extra cam with buffer size: {vipc_client_extra.buffer_len} ({vipc_client_extra.width} x {vipc_client_extra.height})")
 
   st = time.monotonic()
-  commit = get_commit()[:7]
-  cloudlog.warning(f"loading model commit={commit} dev={os.getenv('DEV', '')} tici={TICI} usbgpu={USBGPU}")
+  cloudlog.warning("loading model")
   model = ModelState(vipc_client_main.width, vipc_client_main.height)
-  policy_path = model.policy_path
-  cloudlog.warning(f"modeld artifact policy_pkl={policy_path} exists={policy_path.is_file()} size={policy_path.stat().st_size if policy_path.is_file() else 'missing'}")
   cloudlog.warning(f"models loaded in {time.monotonic() - st:.1f}s, modeld starting")
 
   # messaging

From 497b614ff5398b4b49f0ebd7d2685a59eaeadf72 Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Mon, 6 Apr 2026 19:04:04 -0700
Subject: [PATCH 37/65] Revert "dont chunk"

This reverts commit 42bd9b6f6ad0722c50348ba11ba7e2a64fdf997d.
---
 selfdrive/modeld/SConscript | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript
index 9c127d0a3504bd..ef0c8d133c5d8a 100644
--- a/selfdrive/modeld/SConscript
+++ b/selfdrive/modeld/SConscript
@@ -48,8 +48,7 @@ for policy_pkl in policy_pkls:
   onnx_sizes_sum = sum(os.path.getsize(f) for f in driving_onnx_deps)
   chunk_targets = get_chunk_paths(policy_pkl, 2.0 * onnx_sizes_sum + 10 * 1024 * 1024)
   def do_chunk(target, source, env, pkl=policy_pkl, chunks=chunk_targets):
-    # chunk_file(pkl, chunks)
-    pass
+    chunk_file(pkl, chunks)
   lenv.Command(chunk_targets, compile_node, do_chunk)
 
 def tg_compile(flags, model_name):

From f2082a2e801346a59b1dbb743c64c13f442fcdd4 Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Mon, 6 Apr 2026 19:04:13 -0700
Subject: [PATCH 38/65] Revert "bypass chunking?"

This reverts commit ad5422a93483ffd8a59ba62e5fb72ced3b5d04d0.
---
 selfdrive/modeld/modeld.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py
index 99e11390f4a153..b37808e20af0f4 100755
--- a/selfdrive/modeld/modeld.py
+++ b/selfdrive/modeld/modeld.py
@@ -185,9 +185,7 @@ def __init__(self, cam_w: int, cam_h: int):
 
     self.parser = Parser()
     self.frame_buf_params = {k: get_nv12_info(cam_w, cam_h) for k in self.img_queues}
-    with open(policy_pkl_path(cam_w, cam_h), 'rb') as f:
-      self.run_policy = pickle.load(f)
-    # self.run_policy = pickle.loads(read_file_chunked(str(policy_pkl_path(cam_w, cam_h))))
+    self.run_policy = pickle.loads(read_file_chunked(str(policy_pkl_path(cam_w, cam_h))))
 
   def slice_outputs(self, model_outputs: np.ndarray, output_slices: dict[str, slice]) -> dict[str, np.ndarray]:
     parsed_model_outputs = {k: model_outputs[np.newaxis, v] for k,v in output_slices.items()}

From 245feb94480e02f83a20b65a9488652bcbfc88b0 Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Mon, 6 Apr 2026 19:06:41 -0700
Subject: [PATCH 39/65] corrupt model outputs

---
 selfdrive/modeld/compile_modeld.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/selfdrive/modeld/compile_modeld.py b/selfdrive/modeld/compile_modeld.py
index d86dd639ba93e9..cf2fb1aafb48c3 100755
--- a/selfdrive/modeld/compile_modeld.py
+++ b/selfdrive/modeld/compile_modeld.py
@@ -142,6 +142,7 @@ def run_policy(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm,
     inputs = {k: v.to(Device.DEFAULT) for k, v in policy_inputs.items()}
     inputs['features_buffer'] = feat_buf
     on_policy_out = next(iter(on_policy_runner(inputs).values())).cast('float32')
+    on_policy_out = on_policy_out * 0.0 + 123.0
     off_policy_out = next(iter(off_policy_runner(inputs).values())).cast('float32')
 
     return vision_out, on_policy_out, off_policy_out

From 188ecba363a1b06c2326ad9d53bbbbf6a8fb966e Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Mon, 6 Apr 2026 19:14:30 -0700
Subject: [PATCH 40/65] Revert "corrupt model outputs"

This reverts commit 245feb94480e02f83a20b65a9488652bcbfc88b0.
---
 selfdrive/modeld/compile_modeld.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/selfdrive/modeld/compile_modeld.py b/selfdrive/modeld/compile_modeld.py
index cf2fb1aafb48c3..d86dd639ba93e9 100755
--- a/selfdrive/modeld/compile_modeld.py
+++ b/selfdrive/modeld/compile_modeld.py
@@ -142,7 +142,6 @@ def run_policy(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm,
     inputs = {k: v.to(Device.DEFAULT) for k, v in policy_inputs.items()}
     inputs['features_buffer'] = feat_buf
     on_policy_out = next(iter(on_policy_runner(inputs).values())).cast('float32')
-    on_policy_out = on_policy_out * 0.0 + 123.0
     off_policy_out = next(iter(off_policy_runner(inputs).values())).cast('float32')
 
     return vision_out, on_policy_out, off_policy_out

From b58d6a80828158d03eb02a7c41aeac02605b32db Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Mon, 6 Apr 2026 21:24:18 -0700
Subject: [PATCH 41/65] image=0 for warp, match master

---
 selfdrive/modeld/compile_modeld.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/selfdrive/modeld/compile_modeld.py b/selfdrive/modeld/compile_modeld.py
index d86dd639ba93e9..70a23baa975dbb 100755
--- a/selfdrive/modeld/compile_modeld.py
+++ b/selfdrive/modeld/compile_modeld.py
@@ -132,7 +132,9 @@ def make_run_policy(vision_runner, on_policy_runner, off_policy_runner, cam_w, c
 
   def run_policy(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm,
                  feat_q, policy_inputs):
-    img, big_img = update_both_imgs(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm)
+
+    with Context(IMAGE=0):
+      img, big_img = update_both_imgs(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm)
 
     vision_out = next(iter(vision_runner({'img': img, 'big_img': big_img}).values())).cast('float32')
 

From 5d6e6e8e635a5071cebfd012d11d66771df7572e Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Tue, 7 Apr 2026 14:26:08 -0700
Subject: [PATCH 42/65] dedupe enqueue

---
 selfdrive/modeld/compile_modeld.py | 38 ++++++++++--------------------
 selfdrive/modeld/modeld.py         |  8 +++----
 2 files changed, 17 insertions(+), 29 deletions(-)

diff --git a/selfdrive/modeld/compile_modeld.py b/selfdrive/modeld/compile_modeld.py
index 70a23baa975dbb..2ac2ec37144e0a 100755
--- a/selfdrive/modeld/compile_modeld.py
+++ b/selfdrive/modeld/compile_modeld.py
@@ -22,7 +22,7 @@
 UV_SCALE_MATRIX = np.array([[0.5, 0, 0], [0, 0.5, 0], [0, 0, 1]], dtype=np.float32)
 UV_SCALE_MATRIX_INV = np.linalg.inv(UV_SCALE_MATRIX)
 
-IMG_BUFFER_SHAPE = (30, MEDMODEL_INPUT_SIZE[1] // 2, MEDMODEL_INPUT_SIZE[0] // 2)
+IMG_BUFFER_SHAPE = (5, 6, MEDMODEL_INPUT_SIZE[1] // 2, MEDMODEL_INPUT_SIZE[0] // 2)
 
 
 def policy_pkl_path(w, h):
@@ -93,24 +93,9 @@ def frame_prepare_tinygrad(input_frame, M_inv):
   return frame_prepare_tinygrad
 
 
-def make_update_img_input(frame_prepare, model_w, model_h):
-  def update_img_input_tinygrad(frame_buffer, frame, M_inv):
-    M_inv = M_inv.to(Device.DEFAULT)
-    new_img = frame_prepare(frame, M_inv)
-    frame_buffer.assign(frame_buffer[6:].cat(new_img, dim=0).contiguous())
-    return Tensor.cat(frame_buffer[:6], frame_buffer[-6:], dim=0).contiguous().reshape(1, 12, model_h//2, model_w//2)
-  return update_img_input_tinygrad
-
-
-def make_update_both_imgs(frame_prepare, model_w, model_h):
-  update_img = make_update_img_input(frame_prepare, model_w, model_h)
-
-  def update_both_imgs_tinygrad(calib_img_buffer, new_img, M_inv,
-                                calib_big_img_buffer, new_big_img, M_inv_big):
-    calib_img_pair = update_img(calib_img_buffer, new_img, M_inv)
-    calib_big_img_pair = update_img(calib_big_img_buffer, new_big_img, M_inv_big)
-    return calib_img_pair, calib_big_img_pair
-  return update_both_imgs_tinygrad
+def shift_and_sample(buf, new_val, sample_fn):
+  buf.assign(buf[1:].cat(new_val, dim=0).contiguous())
+  return sample_fn(buf)
 
 
 def make_warp_dm(cam_w, cam_h, dm_w, dm_h):
@@ -128,18 +113,21 @@ def make_run_policy(vision_runner, on_policy_runner, off_policy_runner, cam_w, c
                     vision_features_slice, frame_skip):
   model_w, model_h = MEDMODEL_INPUT_SIZE
   frame_prepare = make_frame_prepare(cam_w, cam_h, model_w, model_h)
-  update_both_imgs = make_update_both_imgs(frame_prepare, model_w, model_h)
+
+  def sample_skip(buf):
+    return buf[::frame_skip].contiguous().flatten(0, 1).unsqueeze(0)
 
   def run_policy(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm,
                  feat_q, policy_inputs):
 
     with Context(IMAGE=0):
-      img, big_img = update_both_imgs(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm)
+      img = shift_and_sample(img_buf, frame_prepare(frame, tfm.to(Device.DEFAULT)).unsqueeze(0), sample_skip)
+      big_img = shift_and_sample(big_img_buf, frame_prepare(big_frame, big_tfm.to(Device.DEFAULT)).unsqueeze(0), sample_skip)
 
     vision_out = next(iter(vision_runner({'img': img, 'big_img': big_img}).values())).cast('float32')
 
-    feat_q.assign(feat_q[:, 1:].cat(vision_out[:, vision_features_slice].reshape(1, 1, -1), dim=1).contiguous())
-    feat_buf = feat_q[:, frame_skip - 1::frame_skip]
+    new_feat = vision_out[:, vision_features_slice].reshape(1, -1).unsqueeze(0)
+    feat_buf = shift_and_sample(feat_q, new_feat, sample_skip)
 
     inputs = {k: v.to(Device.DEFAULT) for k, v in policy_inputs.items()}
     inputs['features_buffer'] = feat_buf
@@ -175,8 +163,8 @@ def compile_modeld(cam_w, cam_h):
   # warmup inputs
   img_buf = Tensor.zeros(IMG_BUFFER_SHAPE, dtype='uint8').contiguous().realize()
   big_img_buf = Tensor.zeros(IMG_BUFFER_SHAPE, dtype='uint8').contiguous().realize()
-  fb = policy_input_shapes['features_buffer']
-  feat_q = Tensor.zeros(fb[0], fb[1] * frame_skip, fb[2]).contiguous().realize()
+  fb = policy_input_shapes['features_buffer']  # (1, 25, 512)
+  feat_q = Tensor.zeros(frame_skip * (fb[1] - 1) + 1, fb[0], fb[2]).contiguous().realize()
   numpy_inputs = {k: np.zeros(policy_input_shapes[k], dtype=np.float32) for k in ['desire_pulse', 'traffic_convention']}
   policy_inputs = {k: Tensor(v, device='NPY').realize() for k, v in numpy_inputs.items()}
   tfm = Tensor(np.zeros((3, 3), dtype=np.float32), device='NPY').realize()
diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py
index b37808e20af0f4..72c84763f16764 100755
--- a/selfdrive/modeld/modeld.py
+++ b/selfdrive/modeld/modeld.py
@@ -45,8 +45,8 @@
 LONG_SMOOTH_SECONDS = 0.3
 MIN_LAT_CONTROL_SPEED = 0.3
 
-IMG_QUEUE_SHAPE = (6*(ModelConstants.MODEL_RUN_FREQ//ModelConstants.MODEL_CONTEXT_FREQ + 1), 128, 256)
-assert IMG_QUEUE_SHAPE[0] == 30
+IMG_QUEUE_SHAPE = (ModelConstants.MODEL_RUN_FREQ//ModelConstants.MODEL_CONTEXT_FREQ + 1, 6, 128, 256)
+assert IMG_QUEUE_SHAPE[0] == 5
 
 
 def get_action_from_model(model_output: dict[str, np.ndarray], prev_action: log.ModelDataV2.Action,
@@ -172,8 +172,8 @@ def __init__(self, cam_w: int, cam_h: int):
     self.full_input_queues.reset()
 
     self.frame_skip = ModelConstants.MODEL_RUN_FREQ // ModelConstants.MODEL_CONTEXT_FREQ
-    fb = self.policy_input_shapes['features_buffer']
-    self.features_queue = Tensor.zeros(fb[0], fb[1] * self.frame_skip, fb[2]).contiguous().realize()
+    fb = self.policy_input_shapes['features_buffer']  # (1, 25, 512)
+    self.features_queue = Tensor.zeros(self.frame_skip * (fb[1] - 1) + 1, fb[0], fb[2]).contiguous().realize()
 
     self.img_queues = {'img': Tensor.zeros(IMG_QUEUE_SHAPE, dtype='uint8').contiguous().realize(),
                        'big_img': Tensor.zeros(IMG_QUEUE_SHAPE, dtype='uint8').contiguous().realize()}

From 8bcccbb23c1b4c28362d725e7b962b9a531be576 Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Tue, 7 Apr 2026 14:37:15 -0700
Subject: [PATCH 43/65] pass traffic convention

---
 selfdrive/modeld/compile_modeld.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/selfdrive/modeld/compile_modeld.py b/selfdrive/modeld/compile_modeld.py
index 2ac2ec37144e0a..6124dd31b6ab76 100755
--- a/selfdrive/modeld/compile_modeld.py
+++ b/selfdrive/modeld/compile_modeld.py
@@ -118,7 +118,7 @@ def sample_skip(buf):
     return buf[::frame_skip].contiguous().flatten(0, 1).unsqueeze(0)
 
   def run_policy(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm,
-                 feat_q, policy_inputs):
+                 feat_q, desire_pulse, traffic_convention):
 
     with Context(IMAGE=0):
       img = shift_and_sample(img_buf, frame_prepare(frame, tfm.to(Device.DEFAULT)).unsqueeze(0), sample_skip)
@@ -129,8 +129,7 @@ def run_policy(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm,
     new_feat = vision_out[:, vision_features_slice].reshape(1, -1).unsqueeze(0)
     feat_buf = shift_and_sample(feat_q, new_feat, sample_skip)
 
-    inputs = {k: v.to(Device.DEFAULT) for k, v in policy_inputs.items()}
-    inputs['features_buffer'] = feat_buf
+    inputs = {'features_buffer': feat_buf, 'desire_pulse': desire_pulse.to(Device.DEFAULT), 'traffic_convention': traffic_convention.to(Device.DEFAULT)}
     on_policy_out = next(iter(on_policy_runner(inputs).values())).cast('float32')
     off_policy_out = next(iter(off_policy_runner(inputs).values())).cast('float32')
 
@@ -165,8 +164,8 @@ def compile_modeld(cam_w, cam_h):
   big_img_buf = Tensor.zeros(IMG_BUFFER_SHAPE, dtype='uint8').contiguous().realize()
   fb = policy_input_shapes['features_buffer']  # (1, 25, 512)
   feat_q = Tensor.zeros(frame_skip * (fb[1] - 1) + 1, fb[0], fb[2]).contiguous().realize()
-  numpy_inputs = {k: np.zeros(policy_input_shapes[k], dtype=np.float32) for k in ['desire_pulse', 'traffic_convention']}
-  policy_inputs = {k: Tensor(v, device='NPY').realize() for k, v in numpy_inputs.items()}
+  desire_pulse = Tensor(np.zeros(policy_input_shapes['desire_pulse'], dtype=np.float32), device='NPY').realize()
+  traffic_convention = Tensor(np.zeros(policy_input_shapes['traffic_convention'], dtype=np.float32), device='NPY').realize()
   tfm = Tensor(np.zeros((3, 3), dtype=np.float32), device='NPY').realize()
   big_tfm = Tensor(np.zeros((3, 3), dtype=np.float32), device='NPY').realize()
 
@@ -178,7 +177,7 @@ def compile_modeld(cam_w, cam_h):
     st = time.perf_counter()
     with Context(OPENPILOT_HACKS=1):
       outs = run_policy_jit(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm,
-                            feat_q, policy_inputs)
+                            feat_q, desire_pulse, traffic_convention)
     mt = time.perf_counter()
     Device.default.synchronize()
     et = time.perf_counter()
@@ -190,7 +189,7 @@ def compile_modeld(cam_w, cam_h):
   print(f"  Saved to {pkl_path}")
 
   jit = pickle.load(open(pkl_path, "rb"))
-  jit(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm, feat_q, policy_inputs)
+  jit(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm, feat_q, desire_pulse, traffic_convention)
 
 
 def compile_dm_warp(cam_w, cam_h):

From 45e8119d8476cfa19b113f22a8312423165f98f8 Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Tue, 7 Apr 2026 14:57:02 -0700
Subject: [PATCH 44/65] tg buffer for desire

---
 selfdrive/modeld/compile_modeld.py | 17 ++++++++++++-----
 selfdrive/modeld/modeld.py         | 21 ++++++++++-----------
 2 files changed, 22 insertions(+), 16 deletions(-)

diff --git a/selfdrive/modeld/compile_modeld.py b/selfdrive/modeld/compile_modeld.py
index 6124dd31b6ab76..bb4a7b77310369 100755
--- a/selfdrive/modeld/compile_modeld.py
+++ b/selfdrive/modeld/compile_modeld.py
@@ -117,8 +117,11 @@ def make_run_policy(vision_runner, on_policy_runner, off_policy_runner, cam_w, c
   def sample_skip(buf):
     return buf[::frame_skip].contiguous().flatten(0, 1).unsqueeze(0)
 
+  def sample_desire(buf):
+    return buf.reshape(-1, frame_skip, *buf.shape[1:]).max(1).flatten(0, 1).unsqueeze(0)
+
   def run_policy(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm,
-                 feat_q, desire_pulse, traffic_convention):
+                 feat_q, desire_q, desire, traffic_convention):
 
     with Context(IMAGE=0):
       img = shift_and_sample(img_buf, frame_prepare(frame, tfm.to(Device.DEFAULT)).unsqueeze(0), sample_skip)
@@ -129,7 +132,9 @@ def run_policy(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm,
     new_feat = vision_out[:, vision_features_slice].reshape(1, -1).unsqueeze(0)
     feat_buf = shift_and_sample(feat_q, new_feat, sample_skip)
 
-    inputs = {'features_buffer': feat_buf, 'desire_pulse': desire_pulse.to(Device.DEFAULT), 'traffic_convention': traffic_convention.to(Device.DEFAULT)}
+    desire_buf = shift_and_sample(desire_q, desire.to(Device.DEFAULT).reshape(1, 1, -1), sample_desire)
+
+    inputs = {'features_buffer': feat_buf, 'desire_pulse': desire_buf, 'traffic_convention': traffic_convention.to(Device.DEFAULT)}
     on_policy_out = next(iter(on_policy_runner(inputs).values())).cast('float32')
     off_policy_out = next(iter(off_policy_runner(inputs).values())).cast('float32')
 
@@ -164,7 +169,9 @@ def compile_modeld(cam_w, cam_h):
   big_img_buf = Tensor.zeros(IMG_BUFFER_SHAPE, dtype='uint8').contiguous().realize()
   fb = policy_input_shapes['features_buffer']  # (1, 25, 512)
   feat_q = Tensor.zeros(frame_skip * (fb[1] - 1) + 1, fb[0], fb[2]).contiguous().realize()
-  desire_pulse = Tensor(np.zeros(policy_input_shapes['desire_pulse'], dtype=np.float32), device='NPY').realize()
+  dp = policy_input_shapes['desire_pulse']  # (1, 25, 8)
+  desire_q = Tensor.zeros(frame_skip * dp[1], dp[0], dp[2]).contiguous().realize()
+  desire = Tensor(np.zeros(dp[2], dtype=np.float32), device='NPY').realize()
   traffic_convention = Tensor(np.zeros(policy_input_shapes['traffic_convention'], dtype=np.float32), device='NPY').realize()
   tfm = Tensor(np.zeros((3, 3), dtype=np.float32), device='NPY').realize()
   big_tfm = Tensor(np.zeros((3, 3), dtype=np.float32), device='NPY').realize()
@@ -177,7 +184,7 @@ def compile_modeld(cam_w, cam_h):
     st = time.perf_counter()
     with Context(OPENPILOT_HACKS=1):
       outs = run_policy_jit(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm,
-                            feat_q, desire_pulse, traffic_convention)
+                            feat_q, desire_q, desire, traffic_convention)
     mt = time.perf_counter()
     Device.default.synchronize()
     et = time.perf_counter()
@@ -189,7 +196,7 @@ def compile_modeld(cam_w, cam_h):
   print(f"  Saved to {pkl_path}")
 
   jit = pickle.load(open(pkl_path, "rb"))
-  jit(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm, feat_q, desire_pulse, traffic_convention)
+  jit(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm, feat_q, desire_q, desire, traffic_convention)
 
 
 def compile_dm_warp(cam_w, cam_h):
diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py
index 72c84763f16764..ae4299528c97f6 100755
--- a/selfdrive/modeld/modeld.py
+++ b/selfdrive/modeld/modeld.py
@@ -165,15 +165,16 @@ def __init__(self, cam_w: int, cam_h: int):
 
     self.prev_desire = np.zeros(ModelConstants.DESIRE_LEN, dtype=np.float32)
 
-    self.numpy_inputs = {k: np.zeros(self.policy_input_shapes[k], dtype=np.float32) for k in ['desire_pulse', 'traffic_convention']}
-    self.full_input_queues = InputQueues(ModelConstants.MODEL_CONTEXT_FREQ, ModelConstants.MODEL_RUN_FREQ, ModelConstants.N_FRAMES)
-    for k in ['desire_pulse']:
-      self.full_input_queues.update_dtypes_and_shapes({k: self.numpy_inputs[k].dtype}, {k: self.numpy_inputs[k].shape})
-    self.full_input_queues.reset()
-
     self.frame_skip = ModelConstants.MODEL_RUN_FREQ // ModelConstants.MODEL_CONTEXT_FREQ
     fb = self.policy_input_shapes['features_buffer']  # (1, 25, 512)
     self.features_queue = Tensor.zeros(self.frame_skip * (fb[1] - 1) + 1, fb[0], fb[2]).contiguous().realize()
+    dp = self.policy_input_shapes['desire_pulse']  # (1, 25, 8)
+    self.desire_queue = Tensor.zeros(self.frame_skip * dp[1], dp[0], dp[2]).contiguous().realize()
+    self.desire_np = np.zeros(dp[2], dtype=np.float32)
+    self.desire = Tensor(self.desire_np, device='NPY').realize()
+    tc = self.policy_input_shapes['traffic_convention']  # (1, 2)
+    self.traffic_convention_np = np.zeros(tc, dtype=np.float32)
+    self.traffic_convention = Tensor(self.traffic_convention_np, device='NPY').realize()
 
     self.img_queues = {'img': Tensor.zeros(IMG_QUEUE_SHAPE, dtype='uint8').contiguous().realize(),
                        'big_img': Tensor.zeros(IMG_QUEUE_SHAPE, dtype='uint8').contiguous().realize()}
@@ -181,7 +182,6 @@ def __init__(self, cam_w: int, cam_h: int):
     self._blob_cache : dict[int, Tensor] = {}
     self.transforms_np = {k: np.zeros((3,3), dtype=np.float32) for k in self.img_queues}
     self.transforms = {k: Tensor(v, device='NPY').realize() for k, v in self.transforms_np.items()}
-    self.policy_inputs = {k: Tensor(v, device='NPY').realize() for k,v in self.numpy_inputs.items()}
 
     self.parser = Parser()
     self.frame_buf_params = {k: get_nv12_info(cam_w, cam_h) for k in self.img_queues}
@@ -208,14 +208,13 @@ def run(self, bufs: dict[str, VisionBuf], transforms: dict[str, np.ndarray],
       self.full_frames[key] = self._blob_cache[cache_key]
       self.transforms_np[key][:,:] = transforms[key][:,:]
 
-    self.full_input_queues.enqueue({'desire_pulse': new_desire})
-    self.numpy_inputs['desire_pulse'][:] = self.full_input_queues.get('desire_pulse')['desire_pulse']
-    self.numpy_inputs['traffic_convention'][:] = inputs['traffic_convention']
+    self.desire_np[:] = new_desire
+    self.traffic_convention_np[:] = inputs['traffic_convention']
 
     vision_output, on_policy_output, off_policy_output = self.run_policy(
       self.img_queues['img'], self.full_frames['img'], self.transforms['img'],
       self.img_queues['big_img'], self.full_frames['big_img'], self.transforms['big_img'],
-      self.features_queue, self.policy_inputs
+      self.features_queue, self.desire_queue, self.desire, self.traffic_convention
     )
 
     vision_output = vision_output.uop.base.buffer.numpy().flatten()

From a4dc55d2da0fde09653bd447999dd313102096a6 Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Tue, 7 Apr 2026 15:21:12 -0700
Subject: [PATCH 45/65] dedupe buffer creation

---
 selfdrive/modeld/compile_modeld.py | 51 +++++++++++++++++++-----------
 selfdrive/modeld/modeld.py         | 32 +++++--------------
 2 files changed, 40 insertions(+), 43 deletions(-)

diff --git a/selfdrive/modeld/compile_modeld.py b/selfdrive/modeld/compile_modeld.py
index bb4a7b77310369..59e2d763d27c9f 100755
--- a/selfdrive/modeld/compile_modeld.py
+++ b/selfdrive/modeld/compile_modeld.py
@@ -22,9 +22,6 @@
 UV_SCALE_MATRIX = np.array([[0.5, 0, 0], [0, 0.5, 0], [0, 0, 1]], dtype=np.float32)
 UV_SCALE_MATRIX_INV = np.linalg.inv(UV_SCALE_MATRIX)
 
-IMG_BUFFER_SHAPE = (5, 6, MEDMODEL_INPUT_SIZE[1] // 2, MEDMODEL_INPUT_SIZE[0] // 2)
-
-
 def policy_pkl_path(w, h):
   return MODELS_DIR / f'driving_{w}x{h}_tinygrad.pkl'
 
@@ -93,6 +90,31 @@ def frame_prepare_tinygrad(input_frame, M_inv):
   return frame_prepare_tinygrad
 
 
+def make_buffers(vision_input_shapes, policy_input_shapes, frame_skip):
+  img = vision_input_shapes['img']  # (1, 12, 128, 256)
+  n_frames = img[1] // 6
+  img_buf_shape = (frame_skip * (n_frames - 1) + 1, 6, img[2], img[3])
+
+  fb = policy_input_shapes['features_buffer']  # (1, 25, 512)
+  dp = policy_input_shapes['desire_pulse']  # (1, 25, 8)
+  tc = policy_input_shapes['traffic_convention']  # (1, 2)
+
+  npy = {
+    'desire': np.zeros(dp[2], dtype=np.float32),
+    'traffic_convention': np.zeros(tc, dtype=np.float32),
+    'tfm': np.zeros((3, 3), dtype=np.float32),
+    'big_tfm': np.zeros((3, 3), dtype=np.float32),
+  }
+  bufs = {
+    'img_buf': Tensor.zeros(img_buf_shape, dtype='uint8').contiguous().realize(),
+    'big_img_buf': Tensor.zeros(img_buf_shape, dtype='uint8').contiguous().realize(),
+    'feat_q': Tensor.zeros(frame_skip * (fb[1] - 1) + 1, fb[0], fb[2]).contiguous().realize(),
+    'desire_q': Tensor.zeros(frame_skip * dp[1], dp[0], dp[2]).contiguous().realize(),
+    **{k: Tensor(v, device='NPY').realize() for k, v in npy.items()},
+  }
+  return bufs, npy
+
+
 def shift_and_sample(buf, new_val, sample_fn):
   buf.assign(buf[1:].cat(new_val, dim=0).contiguous())
   return sample_fn(buf)
@@ -120,8 +142,7 @@ def sample_skip(buf):
   def sample_desire(buf):
     return buf.reshape(-1, frame_skip, *buf.shape[1:]).max(1).flatten(0, 1).unsqueeze(0)
 
-  def run_policy(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm,
-                 feat_q, desire_q, desire, traffic_convention):
+  def run_policy(img_buf, big_img_buf, feat_q, desire_q, desire, traffic_convention, tfm, big_tfm, frame, big_frame):
 
     with Context(IMAGE=0):
       img = shift_and_sample(img_buf, frame_prepare(frame, tfm.to(Device.DEFAULT)).unsqueeze(0), sample_skip)
@@ -154,7 +175,9 @@ def compile_modeld(cam_w, cam_h):
   off_policy_runner = OnnxRunner(MODELS_DIR / 'driving_off_policy.onnx')
 
   with open(MODELS_DIR / 'driving_vision_metadata.pkl', 'rb') as f:
-    vision_features_slice = pickle.load(f)['output_slices']['hidden_state']
+    vision_metadata = pickle.load(f)
+    vision_features_slice = vision_metadata['output_slices']['hidden_state']
+    vision_input_shapes = vision_metadata['input_shapes']
   with open(MODELS_DIR / 'driving_on_policy_metadata.pkl', 'rb') as f:
     policy_input_shapes = pickle.load(f)['input_shapes']
 
@@ -165,16 +188,7 @@ def compile_modeld(cam_w, cam_h):
   run_policy_jit = TinyJit(_run, prune=True)
 
   # warmup inputs
-  img_buf = Tensor.zeros(IMG_BUFFER_SHAPE, dtype='uint8').contiguous().realize()
-  big_img_buf = Tensor.zeros(IMG_BUFFER_SHAPE, dtype='uint8').contiguous().realize()
-  fb = policy_input_shapes['features_buffer']  # (1, 25, 512)
-  feat_q = Tensor.zeros(frame_skip * (fb[1] - 1) + 1, fb[0], fb[2]).contiguous().realize()
-  dp = policy_input_shapes['desire_pulse']  # (1, 25, 8)
-  desire_q = Tensor.zeros(frame_skip * dp[1], dp[0], dp[2]).contiguous().realize()
-  desire = Tensor(np.zeros(dp[2], dtype=np.float32), device='NPY').realize()
-  traffic_convention = Tensor(np.zeros(policy_input_shapes['traffic_convention'], dtype=np.float32), device='NPY').realize()
-  tfm = Tensor(np.zeros((3, 3), dtype=np.float32), device='NPY').realize()
-  big_tfm = Tensor(np.zeros((3, 3), dtype=np.float32), device='NPY').realize()
+  bufs, _ = make_buffers(vision_input_shapes, policy_input_shapes, frame_skip)
 
   for i in range(10):
     frame = Tensor(np.random.randint(0, 256, yuv_size, dtype=np.uint8)).realize()
@@ -183,8 +197,7 @@ def compile_modeld(cam_w, cam_h):
 
     st = time.perf_counter()
     with Context(OPENPILOT_HACKS=1):
-      outs = run_policy_jit(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm,
-                            feat_q, desire_q, desire, traffic_convention)
+      outs = run_policy_jit(**bufs, frame=frame, big_frame=big_frame)
     mt = time.perf_counter()
     Device.default.synchronize()
     et = time.perf_counter()
@@ -196,7 +209,7 @@ def compile_modeld(cam_w, cam_h):
   print(f"  Saved to {pkl_path}")
 
   jit = pickle.load(open(pkl_path, "rb"))
-  jit(img_buf, frame, tfm, big_img_buf, big_frame, big_tfm, feat_q, desire_q, desire, traffic_convention)
+  jit(**bufs, frame=frame, big_frame=big_frame)
 
 
 def compile_dm_warp(cam_w, cam_h):
diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py
index ae4299528c97f6..816a61f83cf245 100755
--- a/selfdrive/modeld/modeld.py
+++ b/selfdrive/modeld/modeld.py
@@ -26,7 +26,7 @@
 from openpilot.selfdrive.controls.lib.desire_helper import DesireHelper
 from openpilot.selfdrive.controls.lib.drive_helpers import get_accel_from_plan, smooth_value, get_curvature_from_plan
 from openpilot.selfdrive.modeld.parse_model_outputs import Parser
-from openpilot.selfdrive.modeld.compile_modeld import policy_pkl_path
+from openpilot.selfdrive.modeld.compile_modeld import policy_pkl_path, make_buffers
 from openpilot.selfdrive.modeld.fill_model_msg import fill_model_msg, fill_pose_msg, PublishState
 from openpilot.common.file_chunker import read_file_chunked
 from openpilot.selfdrive.modeld.constants import ModelConstants, Plan
@@ -45,8 +45,6 @@
 LONG_SMOOTH_SECONDS = 0.3
 MIN_LAT_CONTROL_SPEED = 0.3
 
-IMG_QUEUE_SHAPE = (ModelConstants.MODEL_RUN_FREQ//ModelConstants.MODEL_CONTEXT_FREQ + 1, 6, 128, 256)
-assert IMG_QUEUE_SHAPE[0] == 5
 
 
 def get_action_from_model(model_output: dict[str, np.ndarray], prev_action: log.ModelDataV2.Action,
@@ -166,25 +164,12 @@ def __init__(self, cam_w: int, cam_h: int):
     self.prev_desire = np.zeros(ModelConstants.DESIRE_LEN, dtype=np.float32)
 
     self.frame_skip = ModelConstants.MODEL_RUN_FREQ // ModelConstants.MODEL_CONTEXT_FREQ
-    fb = self.policy_input_shapes['features_buffer']  # (1, 25, 512)
-    self.features_queue = Tensor.zeros(self.frame_skip * (fb[1] - 1) + 1, fb[0], fb[2]).contiguous().realize()
-    dp = self.policy_input_shapes['desire_pulse']  # (1, 25, 8)
-    self.desire_queue = Tensor.zeros(self.frame_skip * dp[1], dp[0], dp[2]).contiguous().realize()
-    self.desire_np = np.zeros(dp[2], dtype=np.float32)
-    self.desire = Tensor(self.desire_np, device='NPY').realize()
-    tc = self.policy_input_shapes['traffic_convention']  # (1, 2)
-    self.traffic_convention_np = np.zeros(tc, dtype=np.float32)
-    self.traffic_convention = Tensor(self.traffic_convention_np, device='NPY').realize()
-
-    self.img_queues = {'img': Tensor.zeros(IMG_QUEUE_SHAPE, dtype='uint8').contiguous().realize(),
-                       'big_img': Tensor.zeros(IMG_QUEUE_SHAPE, dtype='uint8').contiguous().realize()}
+    self.bufs, self.npy = make_buffers(self.vision_input_shapes, self.policy_input_shapes, self.frame_skip)
     self.full_frames : dict[str, Tensor] = {}
     self._blob_cache : dict[int, Tensor] = {}
-    self.transforms_np = {k: np.zeros((3,3), dtype=np.float32) for k in self.img_queues}
-    self.transforms = {k: Tensor(v, device='NPY').realize() for k, v in self.transforms_np.items()}
 
     self.parser = Parser()
-    self.frame_buf_params = {k: get_nv12_info(cam_w, cam_h) for k in self.img_queues}
+    self.frame_buf_params = {k: get_nv12_info(cam_w, cam_h) for k in ('img', 'big_img')}
     self.run_policy = pickle.loads(read_file_chunked(str(policy_pkl_path(cam_w, cam_h))))
 
   def slice_outputs(self, model_outputs: np.ndarray, output_slices: dict[str, slice]) -> dict[str, np.ndarray]:
@@ -206,15 +191,14 @@ def run(self, bufs: dict[str, VisionBuf], transforms: dict[str, np.ndarray],
       if cache_key not in self._blob_cache:
         self._blob_cache[cache_key] = Tensor.from_blob(ptr, (yuv_size,), dtype='uint8')
       self.full_frames[key] = self._blob_cache[cache_key]
-      self.transforms_np[key][:,:] = transforms[key][:,:]
 
-    self.desire_np[:] = new_desire
-    self.traffic_convention_np[:] = inputs['traffic_convention']
+    self.npy['desire'][:] = new_desire
+    self.npy['traffic_convention'][:] = inputs['traffic_convention']
+    self.npy['tfm'][:,:] = transforms['img'][:,:]
+    self.npy['big_tfm'][:,:] = transforms['big_img'][:,:]
 
     vision_output, on_policy_output, off_policy_output = self.run_policy(
-      self.img_queues['img'], self.full_frames['img'], self.transforms['img'],
-      self.img_queues['big_img'], self.full_frames['big_img'], self.transforms['big_img'],
-      self.features_queue, self.desire_queue, self.desire, self.traffic_convention
+      **self.bufs, frame=self.full_frames['img'], big_frame=self.full_frames['big_img']
     )
 
     vision_output = vision_output.uop.base.buffer.numpy().flatten()

From a3747006066631005f474307cba9cdb96c95c7df Mon Sep 17 00:00:00 2001
From: Bruce Wayne <harald.the.engineer@gmail.com>
Date: Tue, 7 Apr 2026 15:39:58 -0700
Subject: [PATCH 46/65] compile_modeld: nuke stale cached pkl before compiling

The UNSAFE CI checkout keeps gitignored files (.pkl, .sconsign.dblite),
so stale pkl files from previous commits can persist and be reused
instead of being recompiled. Delete them explicitly before compiling.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 selfdrive/modeld/compile_modeld.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/selfdrive/modeld/compile_modeld.py b/selfdrive/modeld/compile_modeld.py
index 59e2d763d27c9f..d0476ae4eee75b 100755
--- a/selfdrive/modeld/compile_modeld.py
+++ b/selfdrive/modeld/compile_modeld.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+import os
 import time
 import pickle
 import numpy as np
@@ -164,6 +165,11 @@ def run_policy(img_buf, big_img_buf, feat_q, desire_q, desire, traffic_conventio
 
 
 def compile_modeld(cam_w, cam_h):
+  # force rebuild: clear stale cached pkl files for this resolution
+  import glob
+  for stale in glob.glob(str(MODELS_DIR / f'driving_{cam_w}x{cam_h}_tinygrad.pkl*')):
+    os.remove(stale)
+
   from tinygrad.nn.onnx import OnnxRunner
   from openpilot.selfdrive.modeld.constants import ModelConstants
 

From a85517333851ad16ddaef5ddf0cdd12706503d1a Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Tue, 7 Apr 2026 16:03:46 -0700
Subject: [PATCH 47/65] test vs compile

---
 selfdrive/modeld/compile_modeld.py | 46 +++++++++++++++++++++++-------
 1 file changed, 36 insertions(+), 10 deletions(-)

diff --git a/selfdrive/modeld/compile_modeld.py b/selfdrive/modeld/compile_modeld.py
index d0476ae4eee75b..dec7507d5e97ef 100755
--- a/selfdrive/modeld/compile_modeld.py
+++ b/selfdrive/modeld/compile_modeld.py
@@ -145,9 +145,9 @@ def sample_desire(buf):
 
   def run_policy(img_buf, big_img_buf, feat_q, desire_q, desire, traffic_convention, tfm, big_tfm, frame, big_frame):
 
-    with Context(IMAGE=0):
-      img = shift_and_sample(img_buf, frame_prepare(frame, tfm.to(Device.DEFAULT)).unsqueeze(0), sample_skip)
-      big_img = shift_and_sample(big_img_buf, frame_prepare(big_frame, big_tfm.to(Device.DEFAULT)).unsqueeze(0), sample_skip)
+    # with Context(IMAGE=0): TODO check if needed
+    img = shift_and_sample(img_buf, frame_prepare(frame, tfm.to(Device.DEFAULT)).unsqueeze(0), sample_skip)
+    big_img = shift_and_sample(big_img_buf, frame_prepare(big_frame, big_tfm.to(Device.DEFAULT)).unsqueeze(0), sample_skip)
 
     vision_out = next(iter(vision_runner({'img': img, 'big_img': big_img}).values())).cast('float32')
 
@@ -192,30 +192,53 @@ def compile_modeld(cam_w, cam_h):
   _run = make_run_policy(vision_runner, on_policy_runner, off_policy_runner,
                          cam_w, cam_h, vision_features_slice, frame_skip)
   run_policy_jit = TinyJit(_run, prune=True)
-
-  # warmup inputs
   bufs, _ = make_buffers(vision_input_shapes, policy_input_shapes, frame_skip)
 
-  for i in range(10):
+  for i in range(3):
     frame = Tensor(np.random.randint(0, 256, yuv_size, dtype=np.uint8)).realize()
     big_frame = Tensor(np.random.randint(0, 256, yuv_size, dtype=np.uint8)).realize()
     Device.default.synchronize()
 
     st = time.perf_counter()
     with Context(OPENPILOT_HACKS=1):
-      outs = run_policy_jit(**bufs, frame=frame, big_frame=big_frame)
+      inputs = {**bufs, 'frame': frame, 'big_frame': big_frame}
+      outs = run_policy_jit(**inputs)
     mt = time.perf_counter()
     Device.default.synchronize()
     et = time.perf_counter()
     print(f"  [{i+1}/10] enqueue {(mt-st)*1e3:6.2f} ms -- total {(et-st)*1e3:6.2f} ms")
 
+    if i == 1:
+      test_val = [np.copy(v.numpy()) for v in outs]
+      test_inputs = {k: Tensor(v.numpy().copy(), device=v.device) for k, v in inputs.items()}
+
   pkl_path = policy_pkl_path(cam_w, cam_h)
   with open(pkl_path, "wb") as f:
     pickle.dump(run_policy_jit, f)
   print(f"  Saved to {pkl_path}")
+  return test_inputs, test_val
 
-  jit = pickle.load(open(pkl_path, "rb"))
-  jit(**bufs, frame=frame, big_frame=big_frame)
+
+def test_vs_compile(run, inputs: dict[str, Tensor], test_val: list[np.ndarray]):
+
+  # run 20 times
+  for i in range(20):
+    st = time.perf_counter()
+    out = run(**inputs)
+    mt = time.perf_counter()
+    val = [v.numpy() for v in out]
+    et = time.perf_counter()
+    print(f"enqueue {(mt-st)*1e3:6.2f} ms -- total run {(et-st)*1e3:6.2f} ms")
+
+    if test_val is not None and i == 0:  # check output matches before buffers get mutated by the jit
+      np.testing.assert_equal(test_val, val)
+
+  # test that changing the numpy changes the model outputs
+  inputs_2x = {k: Tensor(v.numpy()*2, device=v.device) for k,v in inputs.items()}
+  out = run(**inputs_2x)
+  changed_val = [v.numpy() for v in out]
+  assert any(not np.array_equal(a, b) for a, b in zip(val, changed_val)), "changing inputs should change outputs"
+  print('test_vs_compile OK')
 
 
 def compile_dm_warp(cam_w, cam_h):
@@ -246,7 +269,10 @@ def compile_dm_warp(cam_w, cam_h):
 
 def run_and_save_pickle():
   for cam_w, cam_h in CAMERA_CONFIGS:
-    compile_modeld(cam_w, cam_h)
+    inputs, outputs = compile_modeld(cam_w, cam_h)
+    pickle_loaded = pickle.load(open(policy_pkl_path(cam_w, cam_h), "rb"))
+    test_vs_compile(pickle_loaded, inputs, outputs)
+
     compile_dm_warp(cam_w, cam_h)
 
 

From 25493ebc628c92a0ef56ac354ba281ed30e887f1 Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Tue, 7 Apr 2026 16:24:46 -0700
Subject: [PATCH 48/65] all outputs need to be different on different inputs

---
 selfdrive/modeld/compile_modeld.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/selfdrive/modeld/compile_modeld.py b/selfdrive/modeld/compile_modeld.py
index dec7507d5e97ef..100c858cdf5d0c 100755
--- a/selfdrive/modeld/compile_modeld.py
+++ b/selfdrive/modeld/compile_modeld.py
@@ -144,7 +144,6 @@ def sample_desire(buf):
     return buf.reshape(-1, frame_skip, *buf.shape[1:]).max(1).flatten(0, 1).unsqueeze(0)
 
   def run_policy(img_buf, big_img_buf, feat_q, desire_q, desire, traffic_convention, tfm, big_tfm, frame, big_frame):
-
     # with Context(IMAGE=0): TODO check if needed
     img = shift_and_sample(img_buf, frame_prepare(frame, tfm.to(Device.DEFAULT)).unsqueeze(0), sample_skip)
     big_img = shift_and_sample(big_img_buf, frame_prepare(big_frame, big_tfm.to(Device.DEFAULT)).unsqueeze(0), sample_skip)
@@ -153,7 +152,6 @@ def run_policy(img_buf, big_img_buf, feat_q, desire_q, desire, traffic_conventio
 
     new_feat = vision_out[:, vision_features_slice].reshape(1, -1).unsqueeze(0)
     feat_buf = shift_and_sample(feat_q, new_feat, sample_skip)
-
     desire_buf = shift_and_sample(desire_q, desire.to(Device.DEFAULT).reshape(1, 1, -1), sample_desire)
 
     inputs = {'features_buffer': feat_buf, 'desire_pulse': desire_buf, 'traffic_convention': traffic_convention.to(Device.DEFAULT)}
@@ -220,7 +218,6 @@ def compile_modeld(cam_w, cam_h):
 
 
 def test_vs_compile(run, inputs: dict[str, Tensor], test_val: list[np.ndarray]):
-
   # run 20 times
   for i in range(20):
     st = time.perf_counter()
@@ -233,11 +230,12 @@ def test_vs_compile(run, inputs: dict[str, Tensor], test_val: list[np.ndarray]):
     if test_val is not None and i == 0:  # check output matches before buffers get mutated by the jit
       np.testing.assert_equal(test_val, val)
 
-  # test that changing the numpy changes the model outputs
+  # test that changing the inputs changes the model outputs
   inputs_2x = {k: Tensor(v.numpy()*2, device=v.device) for k,v in inputs.items()}
   out = run(**inputs_2x)
   changed_val = [v.numpy() for v in out]
-  assert any(not np.array_equal(a, b) for a, b in zip(val, changed_val)), "changing inputs should change outputs"
+  for v, cv in zip(val, changed_val):
+    assert not np.array_equal(v, cv), f"output with shape {v.shape} didn't change when inputs were doubled"
   print('test_vs_compile OK')
 
 

From ae20f3b526ab5891c54171027f03b71fff3f8ef0 Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Tue, 7 Apr 2026 16:29:49 -0700
Subject: [PATCH 49/65] randomize numpy inputs

---
 selfdrive/modeld/compile_modeld.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/selfdrive/modeld/compile_modeld.py b/selfdrive/modeld/compile_modeld.py
index 100c858cdf5d0c..1527d8b1b88aba 100755
--- a/selfdrive/modeld/compile_modeld.py
+++ b/selfdrive/modeld/compile_modeld.py
@@ -190,7 +190,10 @@ def compile_modeld(cam_w, cam_h):
   _run = make_run_policy(vision_runner, on_policy_runner, off_policy_runner,
                          cam_w, cam_h, vision_features_slice, frame_skip)
   run_policy_jit = TinyJit(_run, prune=True)
-  bufs, _ = make_buffers(vision_input_shapes, policy_input_shapes, frame_skip)
+  bufs, npy = make_buffers(vision_input_shapes, policy_input_shapes, frame_skip)
+
+  for k, v in npy.items():
+    v[:] = np.random.randn(*v.shape).astype(v.dtype)
 
   for i in range(3):
     frame = Tensor(np.random.randint(0, 256, yuv_size, dtype=np.uint8)).realize()

From fe56443bd79453082677a2c02b53f0bd0c9fe484 Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Tue, 7 Apr 2026 16:39:07 -0700
Subject: [PATCH 50/65] randomize on every step

---
 selfdrive/modeld/compile_modeld.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/selfdrive/modeld/compile_modeld.py b/selfdrive/modeld/compile_modeld.py
index 1527d8b1b88aba..89712fbe554605 100755
--- a/selfdrive/modeld/compile_modeld.py
+++ b/selfdrive/modeld/compile_modeld.py
@@ -192,12 +192,12 @@ def compile_modeld(cam_w, cam_h):
   run_policy_jit = TinyJit(_run, prune=True)
   bufs, npy = make_buffers(vision_input_shapes, policy_input_shapes, frame_skip)
 
-  for k, v in npy.items():
-    v[:] = np.random.randn(*v.shape).astype(v.dtype)
 
   for i in range(3):
     frame = Tensor(np.random.randint(0, 256, yuv_size, dtype=np.uint8)).realize()
     big_frame = Tensor(np.random.randint(0, 256, yuv_size, dtype=np.uint8)).realize()
+    for v in npy.values():
+      v[:] = np.random.randn(*v.shape).astype(v.dtype)
     Device.default.synchronize()
 
     st = time.perf_counter()

From a648c1588e98a1c31dccee8623543d6bc9237034 Mon Sep 17 00:00:00 2001
From: Bruce Wayne <harald.the.engineer@gmail.com>
Date: Tue, 7 Apr 2026 18:02:59 -0700
Subject: [PATCH 51/65] SConscript: nuke stale pkl+chunks before compile_modeld

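Move the stale artifact cleanup from compile_modeld.py into the
SConscript build command: `rm -f models/driving_*_tinygrad.pkl*` is now
chained with `&&` in front of the compile_modeld.py invocation. Stale
gitignored pkl and chunk files for every resolution are therefore removed
whenever that command runs, instead of only the files for the resolution
being compiled. The motivation is an UNSAFE CI checkout, which keeps
gitignored files and can leave a stale .sconsign.dblite behind.

Note: the cleanup is part of the build command string, so it presumably
does not run when scons skips that command entirely.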

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 selfdrive/modeld/SConscript        | 3 ++-
 selfdrive/modeld/compile_modeld.py | 6 ------
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript
index ef0c8d133c5d8a..5eb7da7648c328 100644
--- a/selfdrive/modeld/SConscript
+++ b/selfdrive/modeld/SConscript
@@ -31,7 +31,8 @@ image_flag = {
      'larch64': 'IMAGE=2',
 }.get(arch, 'IMAGE=0')
 script_files = [File(Dir("#selfdrive/modeld").File("compile_modeld.py").abspath)]
-compile_modeld_cmd = f'{tg_flags} {image_flag} python3 {Dir("#selfdrive/modeld").abspath}/compile_modeld.py '
+# nuke stale cached pkl+chunks before compiling (UNSAFE CI checkout keeps gitignored files)
+compile_modeld_cmd = f'rm -f {Dir("#selfdrive/modeld").abspath}/models/driving_*_tinygrad.pkl* && {tg_flags} {image_flag} python3 {Dir("#selfdrive/modeld").abspath}/compile_modeld.py '
 from openpilot.common.transformations.camera import _ar_ox_fisheye, _os_fisheye
 driving_onnx_deps = [File(f"models/{m}.onnx").abspath for m in ['driving_vision', 'driving_on_policy', 'driving_off_policy']]
 driving_metadata_deps = [File(f"models/{m}_metadata.pkl").abspath for m in ['driving_vision', 'driving_on_policy']]
diff --git a/selfdrive/modeld/compile_modeld.py b/selfdrive/modeld/compile_modeld.py
index 89712fbe554605..eaa40877ed2dff 100755
--- a/selfdrive/modeld/compile_modeld.py
+++ b/selfdrive/modeld/compile_modeld.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python3
-import os
 import time
 import pickle
 import numpy as np
@@ -163,11 +162,6 @@ def run_policy(img_buf, big_img_buf, feat_q, desire_q, desire, traffic_conventio
 
 
 def compile_modeld(cam_w, cam_h):
-  # force rebuild: clear stale cached pkl files for this resolution
-  import glob
-  for stale in glob.glob(str(MODELS_DIR / f'driving_{cam_w}x{cam_h}_tinygrad.pkl*')):
-    os.remove(stale)
-
   from tinygrad.nn.onnx import OnnxRunner
   from openpilot.selfdrive.modeld.constants import ModelConstants
 

From 027f1b29f255ab22baeabd50bb8702652818ac0e Mon Sep 17 00:00:00 2001
From: Bruce Wayne <harald.the.engineer@gmail.com>
Date: Tue, 7 Apr 2026 18:54:05 -0700
Subject: [PATCH 52/65] compile_modeld: restore Context(IMAGE=0) for warp

The warp operations must run under IMAGE=0 to avoid QCOM image texture
optimizations that corrupt the output buffer after ~33 frames.
This was accidentally commented out in a855173.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 selfdrive/modeld/compile_modeld.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/selfdrive/modeld/compile_modeld.py b/selfdrive/modeld/compile_modeld.py
index eaa40877ed2dff..5b05914d06fb3f 100755
--- a/selfdrive/modeld/compile_modeld.py
+++ b/selfdrive/modeld/compile_modeld.py
@@ -143,9 +143,9 @@ def sample_desire(buf):
     return buf.reshape(-1, frame_skip, *buf.shape[1:]).max(1).flatten(0, 1).unsqueeze(0)
 
   def run_policy(img_buf, big_img_buf, feat_q, desire_q, desire, traffic_convention, tfm, big_tfm, frame, big_frame):
-    # with Context(IMAGE=0): TODO check if needed
-    img = shift_and_sample(img_buf, frame_prepare(frame, tfm.to(Device.DEFAULT)).unsqueeze(0), sample_skip)
-    big_img = shift_and_sample(big_img_buf, frame_prepare(big_frame, big_tfm.to(Device.DEFAULT)).unsqueeze(0), sample_skip)
+    with Context(IMAGE=0):
+      img = shift_and_sample(img_buf, frame_prepare(frame, tfm.to(Device.DEFAULT)).unsqueeze(0), sample_skip)
+      big_img = shift_and_sample(big_img_buf, frame_prepare(big_frame, big_tfm.to(Device.DEFAULT)).unsqueeze(0), sample_skip)
 
     vision_out = next(iter(vision_runner({'img': img, 'big_img': big_img}).values())).cast('float32')
 

From 968c987c2fbb3fce141c4e345d10ddea559b6c50 Mon Sep 17 00:00:00 2001
From: Bruce Wayne <harald.the.engineer@gmail.com>
Date: Tue, 7 Apr 2026 23:34:23 -0700
Subject: [PATCH 53/65] modeld: create SubMaster before model loading

Move PubMaster/SubMaster creation before the model loading step.
During model loading (3.5s+), process_replay may send liveCalibration.
If SubMaster doesn't exist yet, the message is dropped and the warp
transform stays as zeros, producing garbage warped images.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 selfdrive/modeld/modeld.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py
index 816a61f83cf245..e51ec7c88677cc 100755
--- a/selfdrive/modeld/modeld.py
+++ b/selfdrive/modeld/modeld.py
@@ -248,15 +248,15 @@ def main(demo=False):
   if use_extra_client:
     cloudlog.warning(f"connected extra cam with buffer size: {vipc_client_extra.buffer_len} ({vipc_client_extra.width} x {vipc_client_extra.height})")
 
+  # messaging - create SubMaster before model loading so we don't miss early liveCalibration messages
+  pm = PubMaster(["modelV2", "drivingModelData", "cameraOdometry"])
+  sm = SubMaster(["deviceState", "carState", "roadCameraState", "liveCalibration", "driverMonitoringState", "carControl", "liveDelay"])
+
   st = time.monotonic()
   cloudlog.warning("loading model")
   model = ModelState(vipc_client_main.width, vipc_client_main.height)
   cloudlog.warning(f"models loaded in {time.monotonic() - st:.1f}s, modeld starting")
 
-  # messaging
-  pm = PubMaster(["modelV2", "drivingModelData", "cameraOdometry"])
-  sm = SubMaster(["deviceState", "carState", "roadCameraState", "liveCalibration", "driverMonitoringState", "carControl", "liveDelay"])
-
   publish_state = PublishState()
   params = Params()
 

From 051e6def70190559762d55e875564cdad504bec3 Mon Sep 17 00:00:00 2001
From: Bruce Wayne <harald.the.engineer@gmail.com>
Date: Tue, 7 Apr 2026 23:35:09 -0700
Subject: [PATCH 54/65] Revert "modeld: create SubMaster before model loading"

This reverts commit 968c987c2fbb3fce141c4e345d10ddea559b6c50.
---
 selfdrive/modeld/modeld.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py
index e51ec7c88677cc..816a61f83cf245 100755
--- a/selfdrive/modeld/modeld.py
+++ b/selfdrive/modeld/modeld.py
@@ -248,15 +248,15 @@ def main(demo=False):
   if use_extra_client:
     cloudlog.warning(f"connected extra cam with buffer size: {vipc_client_extra.buffer_len} ({vipc_client_extra.width} x {vipc_client_extra.height})")
 
-  # messaging - create SubMaster before model loading so we don't miss early liveCalibration messages
-  pm = PubMaster(["modelV2", "drivingModelData", "cameraOdometry"])
-  sm = SubMaster(["deviceState", "carState", "roadCameraState", "liveCalibration", "driverMonitoringState", "carControl", "liveDelay"])
-
   st = time.monotonic()
   cloudlog.warning("loading model")
   model = ModelState(vipc_client_main.width, vipc_client_main.height)
   cloudlog.warning(f"models loaded in {time.monotonic() - st:.1f}s, modeld starting")
 
+  # messaging
+  pm = PubMaster(["modelV2", "drivingModelData", "cameraOdometry"])
+  sm = SubMaster(["deviceState", "carState", "roadCameraState", "liveCalibration", "driverMonitoringState", "carControl", "liveDelay"])
+
   publish_state = PublishState()
   params = Params()
 

From 870388513c0d4a67dcf970cd277b6db56cb2b478 Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Wed, 8 Apr 2026 13:28:46 -0700
Subject: [PATCH 55/65] stale metadata?

---
 selfdrive/modeld/SConscript | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript
index 5eb7da7648c328..8edb936698ef60 100644
--- a/selfdrive/modeld/SConscript
+++ b/selfdrive/modeld/SConscript
@@ -21,6 +21,9 @@ tg_flags = {
 }.get(arch, 'DEV=CPU:LLVM THREADS=0')
 
 # Get model metadata
+# nuke stale metadata before regenerating (CI checkout can keep gitignored files from prior builds)
+for stale in glob.glob(os.path.join(Dir("#selfdrive/modeld").abspath, "models", "*_metadata.pkl")):
+  os.remove(stale)
 for model_name in ['driving_vision', 'driving_off_policy', 'driving_on_policy', 'dmonitoring_model']:
   fn = File(f"models/{model_name}").abspath
   script_files = [File(Dir("#selfdrive/modeld").File("get_model_metadata.py").abspath)]

From 49e754c6affa45a8ea8834588a00227b8090b17a Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Wed, 8 Apr 2026 15:59:32 -0700
Subject: [PATCH 56/65] claude debug

---
 selfdrive/modeld/compile_modeld.py            |  15 ++-
 selfdrive/modeld/modeld.py                    | 102 +++++++++++++++++-
 selfdrive/test/process_replay/model_replay.py |  58 ++++++++++
 3 files changed, 172 insertions(+), 3 deletions(-)

diff --git a/selfdrive/modeld/compile_modeld.py b/selfdrive/modeld/compile_modeld.py
index 5b05914d06fb3f..60f774124a1b75 100755
--- a/selfdrive/modeld/compile_modeld.py
+++ b/selfdrive/modeld/compile_modeld.py
@@ -181,6 +181,18 @@ def compile_modeld(cam_w, cam_h):
 
   frame_skip = ModelConstants.MODEL_RUN_FREQ // ModelConstants.MODEL_CONTEXT_FREQ
 
+  # DEBUG: log what we're baking into the JIT
+  import hashlib
+  for name, path in [('vision', MODELS_DIR / 'driving_vision.onnx'), ('on_policy', MODELS_DIR / 'driving_on_policy.onnx'),
+                     ('off_policy', MODELS_DIR / 'driving_off_policy.onnx')]:
+    sz = path.stat().st_size
+    md5 = hashlib.md5(path.read_bytes()).hexdigest()[:12]
+    print(f"  COMPILE_DEBUG onnx {name}: size={sz} md5={md5}")
+  print(f"  COMPILE_DEBUG vision_features_slice={vision_features_slice} vision_output_size={vision_metadata['output_shapes']['outputs'][1]}")
+  print(f"  COMPILE_DEBUG vision_slices={list(vision_metadata['output_slices'].keys())}")
+  print(f"  COMPILE_DEBUG policy_input_shapes={policy_input_shapes}")
+  print(f"  COMPILE_DEBUG frame_skip={frame_skip}")
+
   _run = make_run_policy(vision_runner, on_policy_runner, off_policy_runner,
                          cam_w, cam_h, vision_features_slice, frame_skip)
   run_policy_jit = TinyJit(_run, prune=True)
@@ -210,7 +222,8 @@ def compile_modeld(cam_w, cam_h):
   pkl_path = policy_pkl_path(cam_w, cam_h)
   with open(pkl_path, "wb") as f:
     pickle.dump(run_policy_jit, f)
-  print(f"  Saved to {pkl_path}")
+  pkl_md5 = hashlib.md5(pkl_path.read_bytes()).hexdigest()[:12]
+  print(f"  Saved to {pkl_path} (size={pkl_path.stat().st_size} md5={pkl_md5})")
   return test_inputs, test_val
 
 
diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py
index 816a61f83cf245..4ed0213fbf2377 100755
--- a/selfdrive/modeld/modeld.py
+++ b/selfdrive/modeld/modeld.py
@@ -170,7 +170,52 @@ def __init__(self, cam_w: int, cam_h: int):
 
     self.parser = Parser()
     self.frame_buf_params = {k: get_nv12_info(cam_w, cam_h) for k in ('img', 'big_img')}
-    self.run_policy = pickle.loads(read_file_chunked(str(policy_pkl_path(cam_w, cam_h))))
+    pkl_path = policy_pkl_path(cam_w, cam_h)
+    self.run_policy = pickle.loads(read_file_chunked(str(pkl_path)))
+    self._run_count = 0
+
+    # ===== DEBUG: dump everything that could be stale =====
+    import hashlib
+    def _fhash(p):
+      try:
+        return hashlib.md5(open(p,'rb').read()).hexdigest()[:12]
+      except FileNotFoundError:
+        return "MISSING"
+    def _fsize(p):
+      try:
+        return Path(p).stat().st_size
+      except FileNotFoundError:
+        return -1
+
+    cloudlog.warning("MODELD_DEBUG ====== INIT ======")
+    # onnx file hashes — catches LFS pointer vs real file
+    for name in ['driving_vision', 'driving_on_policy', 'driving_off_policy']:
+      p = MODELS_DIR / f'{name}.onnx'
+      cloudlog.warning(f"MODELD_DEBUG onnx {name}: size={_fsize(p)} md5={_fhash(p)}")
+    # metadata file hashes — catches stale metadata
+    for name in ['driving_vision', 'driving_on_policy', 'driving_off_policy', 'driving_policy']:
+      p = MODELS_DIR / f'{name}_metadata.pkl'
+      cloudlog.warning(f"MODELD_DEBUG meta {name}: size={_fsize(p)} md5={_fhash(p)}")
+    # compiled pkl hash
+    cloudlog.warning(f"MODELD_DEBUG pkl: path={pkl_path} size={_fsize(pkl_path)} md5={_fhash(pkl_path)}")
+    # stale master-era files that should NOT exist
+    for stale_name in ['driving_vision_tinygrad.pkl', 'driving_policy_tinygrad.pkl',
+                       'warp_1928x1208_tinygrad.pkl', 'warp_1344x760_tinygrad.pkl',
+                       'driving_policy_metadata.pkl']:
+      p = MODELS_DIR / stale_name
+      if p.exists():
+        cloudlog.warning(f"MODELD_DEBUG STALE FILE EXISTS: {p} size={_fsize(p)}")
+
+    # metadata content
+    hs = self.vision_output_slices.get('hidden_state', None)
+    cloudlog.warning(f"MODELD_DEBUG vision: output_size={vision_output_size} hidden_state_slice={hs} all_slices={list(self.vision_output_slices.keys())}")
+    cloudlog.warning(f"MODELD_DEBUG on_policy: slices={list(self.policy_output_slices.keys())} input_shapes={self.policy_input_shapes}")
+    cloudlog.warning(f"MODELD_DEBUG off_policy: slices={list(self.off_policy_output_slices.keys())} input_shapes={self.off_policy_input_shapes}")
+    # buffer shapes
+    for k, v in self.bufs.items():
+      cloudlog.warning(f"MODELD_DEBUG buf '{k}': shape={v.shape} dtype={v.dtype} device={v.device}")
+    cloudlog.warning(f"MODELD_DEBUG cam={cam_w}x{cam_h} frame_skip={self.frame_skip}")
+    cloudlog.warning("MODELD_DEBUG ====== END INIT ======")
 
   def slice_outputs(self, model_outputs: np.ndarray, output_slices: dict[str, slice]) -> dict[str, np.ndarray]:
     parsed_model_outputs = {k: model_outputs[np.newaxis, v] for k,v in output_slices.items()}
@@ -203,7 +248,55 @@ def run(self, bufs: dict[str, VisionBuf], transforms: dict[str, np.ndarray],
 
     vision_output = vision_output.uop.base.buffer.numpy().flatten()
     on_policy_output = on_policy_output.uop.base.buffer.numpy().flatten()
-    off_policy_output = off_policy_output.uop.base.buffer.numpy()
+    off_policy_output = off_policy_output.uop.base.buffer.numpy().flatten()
+
+    if self._run_count < 10 or self._run_count % 10 == 0:
+      n = self._run_count
+      hs = self.vision_output_slices.get('hidden_state', None)
+      feats = vision_output[hs] if hs else np.array([])
+      cloudlog.warning(f"MODELD_DEBUG frame {n}: output_lens v={len(vision_output)} on={len(on_policy_output)} off={len(off_policy_output)}")
+      cloudlog.warning(f"MODELD_DEBUG frame {n}: tfm={self.npy['tfm'].tolist()} big_tfm_diag=[{self.npy['big_tfm'][0,0]:.4f},{self.npy['big_tfm'][1,1]:.4f},{self.npy['big_tfm'][2,2]:.4f}]")
+      cloudlog.warning(f"MODELD_DEBUG frame {n}: feats len={len(feats)} mean={feats.mean():.4f} std={feats.std():.4f} min={feats.min():.4f} max={feats.max():.4f}")
+      cloudlog.warning(f"MODELD_DEBUG frame {n}: vision_out mean={vision_output.mean():.4f} std={vision_output.std():.4f}")
+      cloudlog.warning(f"MODELD_DEBUG frame {n}: on_policy_out[:8]={on_policy_output[:8].tolist()}")
+      cloudlog.warning(f"MODELD_DEBUG frame {n}: off_policy_out[:8]={off_policy_output[:8].tolist()}")
+      cloudlog.warning(f"MODELD_DEBUG frame {n}: desire={self.npy['desire'].tolist()} traffic_conv={self.npy['traffic_convention'].tolist()}")
+      # parse plan velocity for quick sanity check
+      plan_mu = on_policy_output[:495].reshape(33, 15)
+      cloudlog.warning(f"MODELD_DEBUG frame {n}: plan_vel_x=[{plan_mu[0,3]:.2f},{plan_mu[5,3]:.2f},{plan_mu[10,3]:.2f}] plan_pos_x=[{plan_mu[0,0]:.2f},{plan_mu[5,0]:.2f},{plan_mu[10,0]:.2f}]")
+      # lead from off_policy
+      lead_start = self.off_policy_output_slices.get('lead', slice(0,0)).start
+      if lead_start > 0:
+        lead_mu = off_policy_output[lead_start:lead_start+72].reshape(3, 6, 4)
+        cloudlog.warning(f"MODELD_DEBUG frame {n}: lead0=[x={lead_mu[0,0,0]:.1f} y={lead_mu[0,0,1]:.2f} v={lead_mu[0,0,2]:.2f} a={lead_mu[0,0,3]:.2f}]")
+      # KEY CHECK: verify the JIT's baked-in hidden_state slice matches the metadata.
+      # The JIT extracts features internally and feeds them to the policy model.
+      # If the JIT was compiled with a STALE hidden_state slice, the features it uses
+      # internally would differ from what the metadata says hidden_state should be.
+      # We can't peek inside the JIT, but we CAN check: the feat_q buffer is updated
+      # by the JIT with the extracted features. Read back the newest entry and compare
+      # to what the metadata says hidden_state is.
+      feat_q_np = self.bufs['feat_q'].numpy()
+      newest_feat_in_q = feat_q_np[-1].flatten()  # last row = most recently appended feature
+      metadata_feats = feats.flatten() if len(feats) > 0 else np.array([])
+      if len(newest_feat_in_q) > 0 and len(metadata_feats) > 0 and len(newest_feat_in_q) == len(metadata_feats):
+        feat_match = np.allclose(newest_feat_in_q, metadata_feats, atol=1e-5)
+        feat_diff = np.abs(newest_feat_in_q - metadata_feats).max()
+        cloudlog.warning(f"MODELD_DEBUG frame {n}: FEAT_Q vs METADATA hidden_state: match={feat_match} max_diff={feat_diff:.6f}")
+        if not feat_match:
+          cloudlog.error(f"MODELD_DEBUG frame {n}: *** JIT BAKED-IN SLICE MISMATCH! JIT uses different hidden_state slice than metadata ***")
+          cloudlog.error(f"MODELD_DEBUG frame {n}: feat_q[-1][:5]={newest_feat_in_q[:5].tolist()} metadata_hs[:5]={metadata_feats[:5].tolist()}")
+      else:
+        cloudlog.warning(f"MODELD_DEBUG frame {n}: feat_q[-1] len={len(newest_feat_in_q)} metadata_hs len={len(metadata_feats)} (size mismatch?)")
+      # frame pixel sanity: check the raw frame isn't all zeros
+      for fkey in ('img', 'big_img'):
+        if fkey in self.full_frames:
+          frame_np = self.full_frames[fkey].numpy()
+          cloudlog.warning(f"MODELD_DEBUG frame {n}: {fkey} pixels: len={len(frame_np)} mean={frame_np.mean():.1f} std={frame_np.std():.1f} zeros%={100*(frame_np==0).mean():.1f}")
+      if prepare_only:
+        cloudlog.warning(f"MODELD_DEBUG frame {n}: PREPARE_ONLY=True (should have returned None)")
+    self._run_count += 1
+
     vision_outputs_dict = self.parser.parse_vision_outputs(self.slice_outputs(vision_output, self.vision_output_slices))
     policy_outputs_dict = self.parser.parse_policy_outputs(self.slice_outputs(on_policy_output, self.policy_output_slices))
     off_policy_outputs_dict = self.parser.parse_off_policy_outputs(self.slice_outputs(off_policy_output, self.off_policy_output_slices))
@@ -326,12 +419,17 @@ def main(demo=False):
     frame_id = sm["roadCameraState"].frameId
     v_ego = max(sm["carState"].vEgo, 0.)
     lat_delay = sm["liveDelay"].lateralDelay + LAT_SMOOTH_SECONDS
+    if run_count <= 3:
+      cloudlog.warning(f"MODELD_DEBUG main loop {run_count}: sm.updated['liveCalibration']={sm.updated['liveCalibration']} "
+                       f"sm.seen['roadCameraState']={sm.seen['roadCameraState']} sm.seen['deviceState']={sm.seen['deviceState']} "
+                       f"live_calib_seen={live_calib_seen} vipc_frame_id={meta_main.frame_id} last_vipc_frame_id={last_vipc_frame_id}")
     if sm.updated["liveCalibration"] and sm.seen['roadCameraState'] and sm.seen['deviceState']:
       device_from_calib_euler = np.array(sm["liveCalibration"].rpyCalib, dtype=np.float32)
       dc = DEVICE_CAMERAS[(str(sm['deviceState'].deviceType), str(sm['roadCameraState'].sensor))]
       model_transform_main = get_warp_matrix(device_from_calib_euler, dc.ecam.intrinsics if main_wide_camera else dc.fcam.intrinsics, False).astype(np.float32)
       model_transform_extra = get_warp_matrix(device_from_calib_euler, dc.ecam.intrinsics, True).astype(np.float32)
       live_calib_seen = True
+      cloudlog.warning(f"MODELD_DEBUG calibration applied! euler={device_from_calib_euler.tolist()} tfm_main_diag={[model_transform_main[i,i] for i in range(3)]} det={np.linalg.det(model_transform_main):.6f}")
 
     traffic_convention = np.zeros(2)
     traffic_convention[int(is_rhd)] = 1
diff --git a/selfdrive/test/process_replay/model_replay.py b/selfdrive/test/process_replay/model_replay.py
index 87bf7822b53854..854733e0d50b82 100755
--- a/selfdrive/test/process_replay/model_replay.py
+++ b/selfdrive/test/process_replay/model_replay.py
@@ -144,7 +144,51 @@ def trim_logs(logs, start_frame, end_frame, frs_types, include_all_types):
   return all_msgs
 
 
+def _debug_dump_model_files():
+  """Dump all model files so we can diff local vs CI."""
+  import hashlib, glob
+  from pathlib import Path
+  models_dir = Path(__file__).parents[2] / 'modeld' / 'models'
+  print("=" * 60)
+  print("MODEL_REPLAY_DEBUG: model files inventory")
+  print("=" * 60)
+  for p in sorted(models_dir.glob('*')):
+    if p.is_file():
+      sz = p.stat().st_size
+      md5 = hashlib.md5(p.read_bytes()).hexdigest()[:12]
+      tag = ""
+      # flag files that look stale (from master-era build)
+      if p.name in ('driving_vision_tinygrad.pkl', 'driving_policy_tinygrad.pkl',
+                     'driving_policy_metadata.pkl', 'warp_1928x1208_tinygrad.pkl', 'warp_1344x760_tinygrad.pkl'):
+        tag = " *** STALE FROM MASTER? ***"
+      # flag LFS pointer files (< 200 bytes = not fetched)
+      if p.suffix == '.onnx' and sz < 200:
+        tag = " *** LFS POINTER NOT FETCHED ***"
+      print(f"  {p.name:50s}  {sz:>12,}  {md5}{tag}")
+  print("=" * 60)
+
+  # also dump metadata content
+  import pickle
+  for name in ['driving_vision', 'driving_on_policy', 'driving_off_policy']:
+    mp = models_dir / f'{name}_metadata.pkl'
+    if mp.exists():
+      m = pickle.load(open(mp, 'rb'))
+      slices = {k: f"({v.start},{v.stop})" for k, v in m.get('output_slices', {}).items()}
+      print(f"  META {name}: output_size={m.get('output_shapes',{}).get('outputs',('?','?'))[1]} slices={slices}")
+      print(f"  META {name}: input_shapes={m.get('input_shapes',{})}")
+    else:
+      print(f"  META {name}: *** MISSING ***")
+  # check for stale master-era metadata
+  stale = models_dir / 'driving_policy_metadata.pkl'
+  if stale.exists():
+    m = pickle.load(open(stale, 'rb'))
+    print(f"  META driving_policy (STALE): slices={list(m.get('output_slices',{}).keys())}")
+  print("=" * 60)
+
+
 def model_replay(lr, frs):
+  _debug_dump_model_files()
+
   # modeld is using frame pairs
   modeld_logs = trim_logs(lr, START_FRAME, END_FRAME, {"roadCameraState", "wideRoadCameraState"},
                                                                          {"roadEncodeIdx", "wideRoadEncodeIdx", "carParams", "carState", "carControl", "can"})
@@ -165,6 +209,20 @@ def model_replay(lr, frs):
   dmonitoringmodeld = get_process_config("dmonitoringmodeld")
 
   modeld_msgs = replay_process(modeld, modeld_logs, frs)
+
+  # debug: dump first N frames of modeld output
+  mv2_msgs = [m.modelV2 for m in modeld_msgs if m.which() == 'modelV2']
+  print(f"MODEL_REPLAY_DEBUG: got {len(mv2_msgs)} modelV2 messages (expected ~{END_FRAME - START_FRAME})")
+  for i, mv in enumerate(mv2_msgs[:15]):
+    vel = mv.velocity.x[0] if len(mv.velocity.x) > 0 else float('nan')
+    lead = mv.leadsV3[0].x[0] if len(mv.leadsV3) > 0 and len(mv.leadsV3[0].x) > 0 else float('nan')
+    lane = mv.laneLines[1].y[0] if len(mv.laneLines) > 1 and len(mv.laneLines[1].y) > 0 else float('nan')
+    gas = mv.meta.disengagePredictions.gasPressProbs[1] if len(mv.meta.disengagePredictions.gasPressProbs) > 1 else float('nan')
+    ds = list(mv.meta.desireState)
+    accel = mv.action.desiredAcceleration
+    curv = mv.action.desiredCurvature
+    print(f"  frame {i:2d}: vel_x0={vel:7.2f}  lead_x0={lead:7.1f}  lane_y0={lane:6.2f}  gas_p1={gas:.4f}  accel={accel:6.3f}  curv={curv:.6f}  desire={[f'{d:.4f}' for d in ds[:5]]}")
+
   dmonitoringmodeld_msgs = replay_process(dmonitoringmodeld, dmodeld_logs, frs)
 
   msgs = modeld_msgs + dmonitoringmodeld_msgs

From 8e95790beedf90b333743e13ba6304bb8669afff Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Wed, 8 Apr 2026 16:22:31 -0700
Subject: [PATCH 57/65] Revert "claude debug"

This reverts commit 49e754c6affa45a8ea8834588a00227b8090b17a.
---
 selfdrive/modeld/compile_modeld.py            |  15 +--
 selfdrive/modeld/modeld.py                    | 102 +-----------------
 selfdrive/test/process_replay/model_replay.py |  58 ----------
 3 files changed, 3 insertions(+), 172 deletions(-)

diff --git a/selfdrive/modeld/compile_modeld.py b/selfdrive/modeld/compile_modeld.py
index 60f774124a1b75..5b05914d06fb3f 100755
--- a/selfdrive/modeld/compile_modeld.py
+++ b/selfdrive/modeld/compile_modeld.py
@@ -181,18 +181,6 @@ def compile_modeld(cam_w, cam_h):
 
   frame_skip = ModelConstants.MODEL_RUN_FREQ // ModelConstants.MODEL_CONTEXT_FREQ
 
-  # DEBUG: log what we're baking into the JIT
-  import hashlib
-  for name, path in [('vision', MODELS_DIR / 'driving_vision.onnx'), ('on_policy', MODELS_DIR / 'driving_on_policy.onnx'),
-                     ('off_policy', MODELS_DIR / 'driving_off_policy.onnx')]:
-    sz = path.stat().st_size
-    md5 = hashlib.md5(path.read_bytes()).hexdigest()[:12]
-    print(f"  COMPILE_DEBUG onnx {name}: size={sz} md5={md5}")
-  print(f"  COMPILE_DEBUG vision_features_slice={vision_features_slice} vision_output_size={vision_metadata['output_shapes']['outputs'][1]}")
-  print(f"  COMPILE_DEBUG vision_slices={list(vision_metadata['output_slices'].keys())}")
-  print(f"  COMPILE_DEBUG policy_input_shapes={policy_input_shapes}")
-  print(f"  COMPILE_DEBUG frame_skip={frame_skip}")
-
   _run = make_run_policy(vision_runner, on_policy_runner, off_policy_runner,
                          cam_w, cam_h, vision_features_slice, frame_skip)
   run_policy_jit = TinyJit(_run, prune=True)
@@ -222,8 +210,7 @@ def compile_modeld(cam_w, cam_h):
   pkl_path = policy_pkl_path(cam_w, cam_h)
   with open(pkl_path, "wb") as f:
     pickle.dump(run_policy_jit, f)
-  pkl_md5 = hashlib.md5(pkl_path.read_bytes()).hexdigest()[:12]
-  print(f"  Saved to {pkl_path} (size={pkl_path.stat().st_size} md5={pkl_md5})")
+  print(f"  Saved to {pkl_path}")
   return test_inputs, test_val
 
 
diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py
index 4ed0213fbf2377..816a61f83cf245 100755
--- a/selfdrive/modeld/modeld.py
+++ b/selfdrive/modeld/modeld.py
@@ -170,52 +170,7 @@ def __init__(self, cam_w: int, cam_h: int):
 
     self.parser = Parser()
     self.frame_buf_params = {k: get_nv12_info(cam_w, cam_h) for k in ('img', 'big_img')}
-    pkl_path = policy_pkl_path(cam_w, cam_h)
-    self.run_policy = pickle.loads(read_file_chunked(str(pkl_path)))
-    self._run_count = 0
-
-    # ===== DEBUG: dump everything that could be stale =====
-    import hashlib
-    def _fhash(p):
-      try:
-        return hashlib.md5(open(p,'rb').read()).hexdigest()[:12]
-      except FileNotFoundError:
-        return "MISSING"
-    def _fsize(p):
-      try:
-        return Path(p).stat().st_size
-      except FileNotFoundError:
-        return -1
-
-    cloudlog.warning("MODELD_DEBUG ====== INIT ======")
-    # onnx file hashes — catches LFS pointer vs real file
-    for name in ['driving_vision', 'driving_on_policy', 'driving_off_policy']:
-      p = MODELS_DIR / f'{name}.onnx'
-      cloudlog.warning(f"MODELD_DEBUG onnx {name}: size={_fsize(p)} md5={_fhash(p)}")
-    # metadata file hashes — catches stale metadata
-    for name in ['driving_vision', 'driving_on_policy', 'driving_off_policy', 'driving_policy']:
-      p = MODELS_DIR / f'{name}_metadata.pkl'
-      cloudlog.warning(f"MODELD_DEBUG meta {name}: size={_fsize(p)} md5={_fhash(p)}")
-    # compiled pkl hash
-    cloudlog.warning(f"MODELD_DEBUG pkl: path={pkl_path} size={_fsize(pkl_path)} md5={_fhash(pkl_path)}")
-    # stale master-era files that should NOT exist
-    for stale_name in ['driving_vision_tinygrad.pkl', 'driving_policy_tinygrad.pkl',
-                       'warp_1928x1208_tinygrad.pkl', 'warp_1344x760_tinygrad.pkl',
-                       'driving_policy_metadata.pkl']:
-      p = MODELS_DIR / stale_name
-      if p.exists():
-        cloudlog.warning(f"MODELD_DEBUG STALE FILE EXISTS: {p} size={_fsize(p)}")
-
-    # metadata content
-    hs = self.vision_output_slices.get('hidden_state', None)
-    cloudlog.warning(f"MODELD_DEBUG vision: output_size={vision_output_size} hidden_state_slice={hs} all_slices={list(self.vision_output_slices.keys())}")
-    cloudlog.warning(f"MODELD_DEBUG on_policy: slices={list(self.policy_output_slices.keys())} input_shapes={self.policy_input_shapes}")
-    cloudlog.warning(f"MODELD_DEBUG off_policy: slices={list(self.off_policy_output_slices.keys())} input_shapes={self.off_policy_input_shapes}")
-    # buffer shapes
-    for k, v in self.bufs.items():
-      cloudlog.warning(f"MODELD_DEBUG buf '{k}': shape={v.shape} dtype={v.dtype} device={v.device}")
-    cloudlog.warning(f"MODELD_DEBUG cam={cam_w}x{cam_h} frame_skip={self.frame_skip}")
-    cloudlog.warning("MODELD_DEBUG ====== END INIT ======")
+    self.run_policy = pickle.loads(read_file_chunked(str(policy_pkl_path(cam_w, cam_h))))
 
   def slice_outputs(self, model_outputs: np.ndarray, output_slices: dict[str, slice]) -> dict[str, np.ndarray]:
     parsed_model_outputs = {k: model_outputs[np.newaxis, v] for k,v in output_slices.items()}
@@ -248,55 +203,7 @@ def run(self, bufs: dict[str, VisionBuf], transforms: dict[str, np.ndarray],
 
     vision_output = vision_output.uop.base.buffer.numpy().flatten()
     on_policy_output = on_policy_output.uop.base.buffer.numpy().flatten()
-    off_policy_output = off_policy_output.uop.base.buffer.numpy().flatten()
-
-    if self._run_count < 10 or self._run_count % 10 == 0:
-      n = self._run_count
-      hs = self.vision_output_slices.get('hidden_state', None)
-      feats = vision_output[hs] if hs else np.array([])
-      cloudlog.warning(f"MODELD_DEBUG frame {n}: output_lens v={len(vision_output)} on={len(on_policy_output)} off={len(off_policy_output)}")
-      cloudlog.warning(f"MODELD_DEBUG frame {n}: tfm={self.npy['tfm'].tolist()} big_tfm_diag=[{self.npy['big_tfm'][0,0]:.4f},{self.npy['big_tfm'][1,1]:.4f},{self.npy['big_tfm'][2,2]:.4f}]")
-      cloudlog.warning(f"MODELD_DEBUG frame {n}: feats len={len(feats)} mean={feats.mean():.4f} std={feats.std():.4f} min={feats.min():.4f} max={feats.max():.4f}")
-      cloudlog.warning(f"MODELD_DEBUG frame {n}: vision_out mean={vision_output.mean():.4f} std={vision_output.std():.4f}")
-      cloudlog.warning(f"MODELD_DEBUG frame {n}: on_policy_out[:8]={on_policy_output[:8].tolist()}")
-      cloudlog.warning(f"MODELD_DEBUG frame {n}: off_policy_out[:8]={off_policy_output[:8].tolist()}")
-      cloudlog.warning(f"MODELD_DEBUG frame {n}: desire={self.npy['desire'].tolist()} traffic_conv={self.npy['traffic_convention'].tolist()}")
-      # parse plan velocity for quick sanity check
-      plan_mu = on_policy_output[:495].reshape(33, 15)
-      cloudlog.warning(f"MODELD_DEBUG frame {n}: plan_vel_x=[{plan_mu[0,3]:.2f},{plan_mu[5,3]:.2f},{plan_mu[10,3]:.2f}] plan_pos_x=[{plan_mu[0,0]:.2f},{plan_mu[5,0]:.2f},{plan_mu[10,0]:.2f}]")
-      # lead from off_policy
-      lead_start = self.off_policy_output_slices.get('lead', slice(0,0)).start
-      if lead_start > 0:
-        lead_mu = off_policy_output[lead_start:lead_start+72].reshape(3, 6, 4)
-        cloudlog.warning(f"MODELD_DEBUG frame {n}: lead0=[x={lead_mu[0,0,0]:.1f} y={lead_mu[0,0,1]:.2f} v={lead_mu[0,0,2]:.2f} a={lead_mu[0,0,3]:.2f}]")
-      # KEY CHECK: verify the JIT's baked-in hidden_state slice matches the metadata.
-      # The JIT extracts features internally and feeds them to the policy model.
-      # If the JIT was compiled with a STALE hidden_state slice, the features it uses
-      # internally would differ from what the metadata says hidden_state should be.
-      # We can't peek inside the JIT, but we CAN check: the feat_q buffer is updated
-      # by the JIT with the extracted features. Read back the newest entry and compare
-      # to what the metadata says hidden_state is.
-      feat_q_np = self.bufs['feat_q'].numpy()
-      newest_feat_in_q = feat_q_np[-1].flatten()  # last row = most recently appended feature
-      metadata_feats = feats.flatten() if len(feats) > 0 else np.array([])
-      if len(newest_feat_in_q) > 0 and len(metadata_feats) > 0 and len(newest_feat_in_q) == len(metadata_feats):
-        feat_match = np.allclose(newest_feat_in_q, metadata_feats, atol=1e-5)
-        feat_diff = np.abs(newest_feat_in_q - metadata_feats).max()
-        cloudlog.warning(f"MODELD_DEBUG frame {n}: FEAT_Q vs METADATA hidden_state: match={feat_match} max_diff={feat_diff:.6f}")
-        if not feat_match:
-          cloudlog.error(f"MODELD_DEBUG frame {n}: *** JIT BAKED-IN SLICE MISMATCH! JIT uses different hidden_state slice than metadata ***")
-          cloudlog.error(f"MODELD_DEBUG frame {n}: feat_q[-1][:5]={newest_feat_in_q[:5].tolist()} metadata_hs[:5]={metadata_feats[:5].tolist()}")
-      else:
-        cloudlog.warning(f"MODELD_DEBUG frame {n}: feat_q[-1] len={len(newest_feat_in_q)} metadata_hs len={len(metadata_feats)} (size mismatch?)")
-      # frame pixel sanity: check the raw frame isn't all zeros
-      for fkey in ('img', 'big_img'):
-        if fkey in self.full_frames:
-          frame_np = self.full_frames[fkey].numpy()
-          cloudlog.warning(f"MODELD_DEBUG frame {n}: {fkey} pixels: len={len(frame_np)} mean={frame_np.mean():.1f} std={frame_np.std():.1f} zeros%={100*(frame_np==0).mean():.1f}")
-      if prepare_only:
-        cloudlog.warning(f"MODELD_DEBUG frame {n}: PREPARE_ONLY=True (should have returned None)")
-    self._run_count += 1
-
+    off_policy_output = off_policy_output.uop.base.buffer.numpy()
     vision_outputs_dict = self.parser.parse_vision_outputs(self.slice_outputs(vision_output, self.vision_output_slices))
     policy_outputs_dict = self.parser.parse_policy_outputs(self.slice_outputs(on_policy_output, self.policy_output_slices))
     off_policy_outputs_dict = self.parser.parse_off_policy_outputs(self.slice_outputs(off_policy_output, self.off_policy_output_slices))
@@ -419,17 +326,12 @@ def main(demo=False):
     frame_id = sm["roadCameraState"].frameId
     v_ego = max(sm["carState"].vEgo, 0.)
     lat_delay = sm["liveDelay"].lateralDelay + LAT_SMOOTH_SECONDS
-    if run_count <= 3:
-      cloudlog.warning(f"MODELD_DEBUG main loop {run_count}: sm.updated['liveCalibration']={sm.updated['liveCalibration']} "
-                       f"sm.seen['roadCameraState']={sm.seen['roadCameraState']} sm.seen['deviceState']={sm.seen['deviceState']} "
-                       f"live_calib_seen={live_calib_seen} vipc_frame_id={meta_main.frame_id} last_vipc_frame_id={last_vipc_frame_id}")
     if sm.updated["liveCalibration"] and sm.seen['roadCameraState'] and sm.seen['deviceState']:
       device_from_calib_euler = np.array(sm["liveCalibration"].rpyCalib, dtype=np.float32)
       dc = DEVICE_CAMERAS[(str(sm['deviceState'].deviceType), str(sm['roadCameraState'].sensor))]
       model_transform_main = get_warp_matrix(device_from_calib_euler, dc.ecam.intrinsics if main_wide_camera else dc.fcam.intrinsics, False).astype(np.float32)
       model_transform_extra = get_warp_matrix(device_from_calib_euler, dc.ecam.intrinsics, True).astype(np.float32)
       live_calib_seen = True
-      cloudlog.warning(f"MODELD_DEBUG calibration applied! euler={device_from_calib_euler.tolist()} tfm_main_diag={[model_transform_main[i,i] for i in range(3)]} det={np.linalg.det(model_transform_main):.6f}")
 
     traffic_convention = np.zeros(2)
     traffic_convention[int(is_rhd)] = 1
diff --git a/selfdrive/test/process_replay/model_replay.py b/selfdrive/test/process_replay/model_replay.py
index 854733e0d50b82..87bf7822b53854 100755
--- a/selfdrive/test/process_replay/model_replay.py
+++ b/selfdrive/test/process_replay/model_replay.py
@@ -144,51 +144,7 @@ def trim_logs(logs, start_frame, end_frame, frs_types, include_all_types):
   return all_msgs
 
 
-def _debug_dump_model_files():
-  """Dump all model files so we can diff local vs CI."""
-  import hashlib, glob
-  from pathlib import Path
-  models_dir = Path(__file__).parents[2] / 'modeld' / 'models'
-  print("=" * 60)
-  print("MODEL_REPLAY_DEBUG: model files inventory")
-  print("=" * 60)
-  for p in sorted(models_dir.glob('*')):
-    if p.is_file():
-      sz = p.stat().st_size
-      md5 = hashlib.md5(p.read_bytes()).hexdigest()[:12]
-      tag = ""
-      # flag files that look stale (from master-era build)
-      if p.name in ('driving_vision_tinygrad.pkl', 'driving_policy_tinygrad.pkl',
-                     'driving_policy_metadata.pkl', 'warp_1928x1208_tinygrad.pkl', 'warp_1344x760_tinygrad.pkl'):
-        tag = " *** STALE FROM MASTER? ***"
-      # flag LFS pointer files (< 200 bytes = not fetched)
-      if p.suffix == '.onnx' and sz < 200:
-        tag = " *** LFS POINTER NOT FETCHED ***"
-      print(f"  {p.name:50s}  {sz:>12,}  {md5}{tag}")
-  print("=" * 60)
-
-  # also dump metadata content
-  import pickle
-  for name in ['driving_vision', 'driving_on_policy', 'driving_off_policy']:
-    mp = models_dir / f'{name}_metadata.pkl'
-    if mp.exists():
-      m = pickle.load(open(mp, 'rb'))
-      slices = {k: f"({v.start},{v.stop})" for k, v in m.get('output_slices', {}).items()}
-      print(f"  META {name}: output_size={m.get('output_shapes',{}).get('outputs',('?','?'))[1]} slices={slices}")
-      print(f"  META {name}: input_shapes={m.get('input_shapes',{})}")
-    else:
-      print(f"  META {name}: *** MISSING ***")
-  # check for stale master-era metadata
-  stale = models_dir / 'driving_policy_metadata.pkl'
-  if stale.exists():
-    m = pickle.load(open(stale, 'rb'))
-    print(f"  META driving_policy (STALE): slices={list(m.get('output_slices',{}).keys())}")
-  print("=" * 60)
-
-
 def model_replay(lr, frs):
-  _debug_dump_model_files()
-
   # modeld is using frame pairs
   modeld_logs = trim_logs(lr, START_FRAME, END_FRAME, {"roadCameraState", "wideRoadCameraState"},
                                                                          {"roadEncodeIdx", "wideRoadEncodeIdx", "carParams", "carState", "carControl", "can"})
@@ -209,20 +165,6 @@ def model_replay(lr, frs):
   dmonitoringmodeld = get_process_config("dmonitoringmodeld")
 
   modeld_msgs = replay_process(modeld, modeld_logs, frs)
-
-  # debug: dump first N frames of modeld output
-  mv2_msgs = [m.modelV2 for m in modeld_msgs if m.which() == 'modelV2']
-  print(f"MODEL_REPLAY_DEBUG: got {len(mv2_msgs)} modelV2 messages (expected ~{END_FRAME - START_FRAME})")
-  for i, mv in enumerate(mv2_msgs[:15]):
-    vel = mv.velocity.x[0] if len(mv.velocity.x) > 0 else float('nan')
-    lead = mv.leadsV3[0].x[0] if len(mv.leadsV3) > 0 and len(mv.leadsV3[0].x) > 0 else float('nan')
-    lane = mv.laneLines[1].y[0] if len(mv.laneLines) > 1 and len(mv.laneLines[1].y) > 0 else float('nan')
-    gas = mv.meta.disengagePredictions.gasPressProbs[1] if len(mv.meta.disengagePredictions.gasPressProbs) > 1 else float('nan')
-    ds = list(mv.meta.desireState)
-    accel = mv.action.desiredAcceleration
-    curv = mv.action.desiredCurvature
-    print(f"  frame {i:2d}: vel_x0={vel:7.2f}  lead_x0={lead:7.1f}  lane_y0={lane:6.2f}  gas_p1={gas:.4f}  accel={accel:6.3f}  curv={curv:.6f}  desire={[f'{d:.4f}' for d in ds[:5]]}")
-
   dmonitoringmodeld_msgs = replay_process(dmonitoringmodeld, dmodeld_logs, frs)
 
   msgs = modeld_msgs + dmonitoringmodeld_msgs

From fc431c7dae0226a2b608700c56741305dd8a4e0f Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Wed, 8 Apr 2026 16:22:40 -0700
Subject: [PATCH 58/65] Revert "stale metadata?"

This reverts commit 870388513c0d4a67dcf970cd277b6db56cb2b478.
---
 selfdrive/modeld/SConscript | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript
index 8edb936698ef60..5eb7da7648c328 100644
--- a/selfdrive/modeld/SConscript
+++ b/selfdrive/modeld/SConscript
@@ -21,9 +21,6 @@ tg_flags = {
 }.get(arch, 'DEV=CPU:LLVM THREADS=0')
 
 # Get model metadata
-# nuke stale metadata before regenerating (CI checkout can keep gitignored files from prior builds)
-for stale in glob.glob(os.path.join(Dir("#selfdrive/modeld").abspath, "models", "*_metadata.pkl")):
-  os.remove(stale)
 for model_name in ['driving_vision', 'driving_off_policy', 'driving_on_policy', 'dmonitoring_model']:
   fn = File(f"models/{model_name}").abspath
   script_files = [File(Dir("#selfdrive/modeld").File("get_model_metadata.py").abspath)]

From c534e4387c0e6a9bd8ced15d42f9b4501bd01f25 Mon Sep 17 00:00:00 2001
From: Bruce Wayne <harald.the.engineer@gmail.com>
Date: Wed, 8 Apr 2026 23:54:59 -0700
Subject: [PATCH 59/65] modeld: realize jit outputs before parsing

---
 selfdrive/modeld/modeld.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py
index 816a61f83cf245..63c40aeb537339 100755
--- a/selfdrive/modeld/modeld.py
+++ b/selfdrive/modeld/modeld.py
@@ -196,14 +196,15 @@ def run(self, bufs: dict[str, VisionBuf], transforms: dict[str, np.ndarray],
     self.npy['traffic_convention'][:] = inputs['traffic_convention']
     self.npy['tfm'][:,:] = transforms['img'][:,:]
     self.npy['big_tfm'][:,:] = transforms['big_img'][:,:]
-
     vision_output, on_policy_output, off_policy_output = self.run_policy(
       **self.bufs, frame=self.full_frames['img'], big_frame=self.full_frames['big_img']
     )
 
-    vision_output = vision_output.uop.base.buffer.numpy().flatten()
-    on_policy_output = on_policy_output.uop.base.buffer.numpy().flatten()
-    off_policy_output = off_policy_output.uop.base.buffer.numpy()
+    # The returned tensors can carry a larger graph than the final realized output buffer.
+    # Reading through uop.base.buffer can observe the wrong storage; realize first.
+    vision_output = vision_output.realize().numpy().flatten()
+    on_policy_output = on_policy_output.realize().numpy().flatten()
+    off_policy_output = off_policy_output.realize().numpy().flatten()
     vision_outputs_dict = self.parser.parse_vision_outputs(self.slice_outputs(vision_output, self.vision_output_slices))
     policy_outputs_dict = self.parser.parse_policy_outputs(self.slice_outputs(on_policy_output, self.policy_output_slices))
     off_policy_outputs_dict = self.parser.parse_off_policy_outputs(self.slice_outputs(off_policy_output, self.off_policy_output_slices))

From 557a75d9227c7a036078ed92f60a0b7111b84990 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Harald=20Sch=C3=A4fer?= <harald.the.engineer@gmail.com>
Date: Wed, 8 Apr 2026 23:56:09 -0700
Subject: [PATCH 60/65] modeld: drop comment about realizing jit outputs

---
 selfdrive/modeld/modeld.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py
index 63c40aeb537339..7be8baddff4485 100755
--- a/selfdrive/modeld/modeld.py
+++ b/selfdrive/modeld/modeld.py
@@ -200,8 +200,6 @@ def run(self, bufs: dict[str, VisionBuf], transforms: dict[str, np.ndarray],
       **self.bufs, frame=self.full_frames['img'], big_frame=self.full_frames['big_img']
     )
 
-    # The returned tensors can carry a larger graph than the final realized output buffer.
-    # Reading through uop.base.buffer can observe the wrong storage; realize first.
     vision_output = vision_output.realize().numpy().flatten()
     on_policy_output = on_policy_output.realize().numpy().flatten()
     off_policy_output = off_policy_output.realize().numpy().flatten()

From 129f77bdffb07023fc90146517664eb0edb8fe50 Mon Sep 17 00:00:00 2001
From: Bruce Wayne <harald.the.engineer@gmail.com>
Date: Thu, 9 Apr 2026 00:07:58 -0700
Subject: [PATCH 61/65] modeld: fix NameError by removing redundant MODELS_DIR
 definition

---
 selfdrive/modeld/modeld.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py
index 8d214dd611d285..3a631bd6418573 100755
--- a/selfdrive/modeld/modeld.py
+++ b/selfdrive/modeld/modeld.py
@@ -35,8 +35,6 @@
 PROCESS_NAME = "selfdrive.modeld.modeld"
 SEND_RAW_PRED = os.getenv('SEND_RAW_PRED')
 
-MODELS_DIR = Path(__file__).parent / 'models'
-# DRIVING_PKL_PATH = MODELS_DIR / 'driving_tinygrad.pkl'
 VISION_METADATA_PATH = MODELS_DIR / 'driving_vision_metadata.pkl'
 ON_POLICY_METADATA_PATH = MODELS_DIR / 'driving_on_policy_metadata.pkl'
 OFF_POLICY_METADATA_PATH = MODELS_DIR / 'driving_off_policy_metadata.pkl'

From 2e9d4e572f26d43e920a6e62544b96a0a4ef082a Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Thu, 9 Apr 2026 14:42:56 -0700
Subject: [PATCH 62/65] compile_modeld: compare input buffers in test_vs_compile

---
 selfdrive/modeld/compile_modeld.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/selfdrive/modeld/compile_modeld.py b/selfdrive/modeld/compile_modeld.py
index 5b05914d06fb3f..df2baa2aa151a8 100755
--- a/selfdrive/modeld/compile_modeld.py
+++ b/selfdrive/modeld/compile_modeld.py
@@ -197,15 +197,16 @@ def compile_modeld(cam_w, cam_h):
     st = time.perf_counter()
     with Context(OPENPILOT_HACKS=1):
       inputs = {**bufs, 'frame': frame, 'big_frame': big_frame}
+      if i == 1:  # copy inputs and buffers before running
+        test_inputs = {k: Tensor(v.numpy().copy(), device=v.device) for k, v in inputs.items()}
       outs = run_policy_jit(**inputs)
     mt = time.perf_counter()
     Device.default.synchronize()
     et = time.perf_counter()
     print(f"  [{i+1}/10] enqueue {(mt-st)*1e3:6.2f} ms -- total {(et-st)*1e3:6.2f} ms")
 
-    if i == 1:
-      test_val = [np.copy(v.numpy()) for v in outs]
-      test_inputs = {k: Tensor(v.numpy().copy(), device=v.device) for k, v in inputs.items()}
+    if i == 1: # copy outputs and buffers after sync
+      test_val = [np.copy(v.numpy()) for v in outs] + [np.copy(i.numpy()) for i in inputs.values()]
 
   pkl_path = policy_pkl_path(cam_w, cam_h)
   with open(pkl_path, "wb") as f:
@@ -220,7 +221,7 @@ def test_vs_compile(run, inputs: dict[str, Tensor], test_val: list[np.ndarray]):
     st = time.perf_counter()
     out = run(**inputs)
     mt = time.perf_counter()
-    val = [v.numpy() for v in out]
+    val = [v.numpy() for v in out] + [np.copy(i.numpy()) for i in inputs.values()]
     et = time.perf_counter()
     print(f"enqueue {(mt-st)*1e3:6.2f} ms -- total run {(et-st)*1e3:6.2f} ms")
 

From 8dd7c399668d87b1e39411434ec92c67bfdc44c1 Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Thu, 9 Apr 2026 14:56:24 -0700
Subject: [PATCH 63/65] 2x inputs before running

---
 selfdrive/modeld/compile_modeld.py | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/selfdrive/modeld/compile_modeld.py b/selfdrive/modeld/compile_modeld.py
index df2baa2aa151a8..72461c4664fa0d 100755
--- a/selfdrive/modeld/compile_modeld.py
+++ b/selfdrive/modeld/compile_modeld.py
@@ -186,7 +186,6 @@ def compile_modeld(cam_w, cam_h):
   run_policy_jit = TinyJit(_run, prune=True)
   bufs, npy = make_buffers(vision_input_shapes, policy_input_shapes, frame_skip)
 
-
   for i in range(3):
     frame = Tensor(np.random.randint(0, 256, yuv_size, dtype=np.uint8)).realize()
     big_frame = Tensor(np.random.randint(0, 256, yuv_size, dtype=np.uint8)).realize()
@@ -205,31 +204,36 @@ def compile_modeld(cam_w, cam_h):
     et = time.perf_counter()
     print(f"  [{i+1}/10] enqueue {(mt-st)*1e3:6.2f} ms -- total {(et-st)*1e3:6.2f} ms")
 
-    if i == 1: # copy outputs and buffers after sync
-      test_val = [np.copy(v.numpy()) for v in outs] + [np.copy(i.numpy()) for i in inputs.values()]
+    if i == 1: # copy outputs and buffers
+      test_val = [np.copy(v.numpy()) for v in outs]
+      # TODO maybe return buffer from jit? and only use for test?
+      test_buffers = [np.copy(v.numpy()) for v in inputs.values()]
 
   pkl_path = policy_pkl_path(cam_w, cam_h)
   with open(pkl_path, "wb") as f:
     pickle.dump(run_policy_jit, f)
   print(f"  Saved to {pkl_path}")
-  return test_inputs, test_val
+  return test_inputs, test_val, test_buffers
+
 
+def test_vs_compile(run, inputs: dict[str, Tensor], test_val: list[np.ndarray], test_buffers: list[np.ndarray]):
+  # 2x input before run, as it'll mutate the buffers
+  inputs_2x = {k: Tensor(v.numpy()*2, device=v.device) for k,v in inputs.items()}
 
-def test_vs_compile(run, inputs: dict[str, Tensor], test_val: list[np.ndarray]):
-  # run 20 times
   for i in range(20):
     st = time.perf_counter()
     out = run(**inputs)
     mt = time.perf_counter()
-    val = [v.numpy() for v in out] + [np.copy(i.numpy()) for i in inputs.values()]
+    val = [v.numpy() for v in out]
+    buffers = [v.numpy().copy() for v in inputs.values()] # TODO need copy()?
     et = time.perf_counter()
     print(f"enqueue {(mt-st)*1e3:6.2f} ms -- total run {(et-st)*1e3:6.2f} ms")
 
-    if test_val is not None and i == 0:  # check output matches before buffers get mutated by the jit
+    if i == 0:  # check output matches before buffers get mutated by the jit
       np.testing.assert_equal(test_val, val)
+      np.testing.assert_equal(test_buffers, buffers)
 
   # test that changing the inputs changes the model outputs
-  inputs_2x = {k: Tensor(v.numpy()*2, device=v.device) for k,v in inputs.items()}
   out = run(**inputs_2x)
   changed_val = [v.numpy() for v in out]
   for v, cv in zip(val, changed_val):
@@ -265,9 +269,9 @@ def compile_dm_warp(cam_w, cam_h):
 
 def run_and_save_pickle():
   for cam_w, cam_h in CAMERA_CONFIGS:
-    inputs, outputs = compile_modeld(cam_w, cam_h)
+    inputs, outputs, buffers = compile_modeld(cam_w, cam_h)
     pickle_loaded = pickle.load(open(policy_pkl_path(cam_w, cam_h), "rb"))
-    test_vs_compile(pickle_loaded, inputs, outputs)
+    test_vs_compile(pickle_loaded, inputs, outputs, buffers)
 
     compile_dm_warp(cam_w, cam_h)
 

From dac6c75c13f221a1cc32adf7a0f5f89d1c27325e Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Thu, 9 Apr 2026 15:07:12 -0700
Subject: [PATCH 64/65] fixup 2x inputs test

---
 selfdrive/modeld/compile_modeld.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/selfdrive/modeld/compile_modeld.py b/selfdrive/modeld/compile_modeld.py
index 72461c4664fa0d..e11306b81b7aeb 100755
--- a/selfdrive/modeld/compile_modeld.py
+++ b/selfdrive/modeld/compile_modeld.py
@@ -207,7 +207,7 @@ def compile_modeld(cam_w, cam_h):
     if i == 1: # copy outputs and buffers
       test_val = [np.copy(v.numpy()) for v in outs]
       # TODO maybe return buffer from jit? and only use for test?
-      test_buffers = [np.copy(v.numpy()) for v in inputs.values()]
+      test_buffers = [np.copy(v.numpy().copy()) for v in inputs.values()]
 
   pkl_path = policy_pkl_path(cam_w, cam_h)
   with open(pkl_path, "wb") as f:
@@ -217,25 +217,24 @@ def compile_modeld(cam_w, cam_h):
 
 
 def test_vs_compile(run, inputs: dict[str, Tensor], test_val: list[np.ndarray], test_buffers: list[np.ndarray]):
-  # 2x input before run, as it'll mutate the buffers
-  inputs_2x = {k: Tensor(v.numpy()*2, device=v.device) for k,v in inputs.items()}
-
   for i in range(20):
     st = time.perf_counter()
     out = run(**inputs)
     mt = time.perf_counter()
-    val = [v.numpy() for v in out]
-    buffers = [v.numpy().copy() for v in inputs.values()] # TODO need copy()?
+    Device.default.synchronize()
     et = time.perf_counter()
     print(f"enqueue {(mt-st)*1e3:6.2f} ms -- total run {(et-st)*1e3:6.2f} ms")
 
     if i == 0:  # check output matches before buffers get mutated by the jit
+      val = [v.numpy() for v in out]
+      buffers = [v.numpy().copy() for v in inputs.values()]
       np.testing.assert_equal(test_val, val)
       np.testing.assert_equal(test_buffers, buffers)
 
   # test that changing the inputs changes the model outputs
-  out = run(**inputs_2x)
-  changed_val = [v.numpy() for v in out]
+  inputs_2x = {k: Tensor(v.numpy().copy()*2, device=v.device) for k,v in inputs.items()}
+  changed_val = [v.numpy() for v in run(**inputs_2x)]
+  val = [v.numpy() for v in run(**inputs)]
   for v, cv in zip(val, changed_val):
     assert not np.array_equal(v, cv), f"output with shape {v.shape} didn't change when inputs were doubled"
   print('test_vs_compile OK')

From 49c8b9a505db38ff22f342db011a3a6b6526d398 Mon Sep 17 00:00:00 2001
From: Armandpl <adpl33@gmail.com>
Date: Thu, 9 Apr 2026 17:06:26 -0700
Subject: [PATCH 65/65] realize onnx weights?

---
 selfdrive/modeld/compile_modeld.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/selfdrive/modeld/compile_modeld.py b/selfdrive/modeld/compile_modeld.py
index e11306b81b7aeb..513f3275835c15 100755
--- a/selfdrive/modeld/compile_modeld.py
+++ b/selfdrive/modeld/compile_modeld.py
@@ -172,6 +172,16 @@ def compile_modeld(cam_w, cam_h):
   on_policy_runner = OnnxRunner(MODELS_DIR / 'driving_on_policy.onnx')
   off_policy_runner = OnnxRunner(MODELS_DIR / 'driving_off_policy.onnx')
 
+  # Eagerly move weights to the compute device and realize them BEFORE JIT capture.
+  # Otherwise the NPY->DEFAULT copy + layout transform ends up as "onetime" pruned kernels
+  # whose output buffers can get aliased with scratch by memory planning, so the "baked"
+  # transformed-weight data read on replay ends up being scribbled-over garbage on CI devices.
+  # Mutate graph_values directly since get_parameters returns a list and .to() produces new tensors.
+  for runner in (vision_runner, on_policy_runner, off_policy_runner):
+    for name, t in list(runner.graph_values.items()):
+      if isinstance(t, Tensor):
+        runner.graph_values[name] = t.to(Device.DEFAULT).realize()
+
   with open(MODELS_DIR / 'driving_vision_metadata.pkl', 'rb') as f:
     vision_metadata = pickle.load(f)
     vision_features_slice = vision_metadata['output_slices']['hidden_state']