commaai · mahmudsudo · Mar 25, 2026 · Mar 25, 2026 · Mar 25, 2026 · Mar 25, 2026
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
@@ -190,7 +190,6 @@ jobs:
       (github.event.pull_request.head.repo.full_name == 'commaai/openpilot'))
       && fromJSON('["namespace-profile-amd64-8x16"]')
       || fromJSON('["ubuntu-24.04"]') }}
-    if: false  # FIXME: Started to timeout recently
     steps:
     - uses: actions/checkout@v6
       with:
@@ -199,10 +198,20 @@ jobs:
     - name: Build openpilot
       run: scons -j$(nproc)
     - name: Driving test
-      timeout-minutes: 2
+      timeout-minutes: 5
+      env:
+        TEST_DURATION: 60
+        RECORD: 1
+        ONNXCPU: "1"
       run: |
         source selfdrive/test/setup_xvfb.sh
         pytest -s tools/sim/tests/test_metadrive_bridge.py
+    - name: Upload logs
+      uses: actions/upload-artifact@v4
+      if: always()
+      with:
+        name: metadrive_logs
+        path: ~/.comma/media/0/realdata/
 
   create_ui_report:
     name: Create UI Report

diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py
@@ -238,6 +238,15 @@ def run(self, bufs: dict[str, VisionBuf], transforms: dict[str, np.ndarray],
 def main(demo=False):
   cloudlog.warning("modeld init")
 
+  # Check for model files
+  for p in (VISION_PKL_PATH, POLICY_PKL_PATH, VISION_METADATA_PATH, POLICY_METADATA_PATH):
+    if not p.exists():
+      cloudlog.error(f"Model file {p} not found!")
+      raise RuntimeError(f"Model file {p} not found! Check your build or Git LFS.")
+    if p.stat().st_size < 1000:
+      cloudlog.error(f"Model file {p} is too small! Likely a Git LFS pointer.")
+      raise RuntimeError(f"Model file {p} is too small! Likely a Git LFS pointer.")
+
   if not USBGPU:
     # USB GPU currently saturates a core so can't do this yet,
     # also need to move the aux USB interrupts for good timings

diff --git a/selfdrive/selfdrived/selfdrived.py b/selfdrive/selfdrived/selfdrived.py
@@ -76,7 +76,9 @@ def __init__(self, CP=None):
 
     ignore = self.sensor_packets + self.gps_packets + ['alertDebug']
     if SIMULATION:
-      ignore += ['driverCameraState', 'managerState']
+      ignore += ['driverCameraState', 'managerState', 'controlsState', 'carControl', 'pandaStates',
+                 'peripheralState', 'driverMonitoringState', 'driverAssistance', 'carOutput',
+                 'audioFeedback', 'userBookmark']
     if REPLAY:
       # no vipc in replay will make them ignored anyways
       ignore += ['roadCameraState', 'wideRoadCameraState']
@@ -340,9 +342,9 @@ def update_events(self, CS):
       self.logged_comm_issue = None
 
     if not self.CP.notCar:
-      if not self.sm['livePose'].posenetOK:
+      if not self.sm['livePose'].posenetOK and not SIMULATION:
         self.events.add(EventName.posenetInvalid)
-      if not self.sm['livePose'].inputsOK:
+      if not self.sm['livePose'].inputsOK and not SIMULATION:
         self.events.add(EventName.locationdTemporaryError)
       if not self.sm['liveParameters'].valid and cal_status == log.LiveCalibrationData.Status.calibrated and not TESTING_CLOSET and (not SIMULATION or REPLAY):
         self.events.add(EventName.paramsdTemporaryError)

diff --git a/system/manager/process.py b/system/manager/process.py
@@ -37,18 +37,30 @@ def launcher(proc: str, name: str) -> None:
   except KeyboardInterrupt:
     cloudlog.warning(f"child {proc} got SIGINT")
   except Exception:
     # can't install the crash handler because sys.excepthook doesn't play nice
     # with threads, so catch it here.
+    # Also echo the traceback to stdout and cloudlog so it shows up next to the
+    # rest of the process output.
+    import traceback
+    tb = traceback.format_exc()
+    print(f"PROCESS {name} ({proc}) EXCEPTION:\n{tb}")
+    cloudlog.error(f"process {name} failed at {proc}:\n{tb}")
     sentry.capture_exception()
     raise
 
 
 def nativelauncher(pargs: list[str], cwd: str, name: str) -> None:
   os.environ['MANAGER_DAEMON'] = name
 
   # exec the process
-  os.chdir(cwd)
-  os.execvp(pargs[0], pargs)
+  try:
+    os.chdir(cwd)
+    os.execvp(pargs[0], pargs)
+  except Exception:
+    # only reached when chdir/execvp fail (execvp does not return on success); report before re-raising
+    import traceback
+    print(f"NATIVE PROCESS {name} EXCEPTION:\n{traceback.format_exc()}")
+    raise
 
 
 def join_process(process: Process, timeout: float) -> None:

diff --git a/system/manager/process_config.py b/system/manager/process_config.py
@@ -76,7 +76,7 @@ def and_(*fns):
   PythonProcess("micd", "system.micd", iscar),
   PythonProcess("timed", "system.timed", always_run, enabled=not PC),
 
-  PythonProcess("modeld", "selfdrive.modeld.modeld", only_onroad),
+  PythonProcess("modeld", "selfdrive.modeld.modeld", only_onroad, restart_if_crash=True),
   PythonProcess("dmonitoringmodeld", "selfdrive.modeld.dmonitoringmodeld", driverview, enabled=(WEBCAM or not PC)),
 
   PythonProcess("sensord", "system.sensord.sensord", only_onroad, enabled=not PC),

diff --git a/tools/sim/launch_openpilot.sh b/tools/sim/launch_openpilot.sh
@@ -5,11 +5,16 @@ export NOBOARD="1"
 export SIMULATION="1"
 export SKIP_FW_QUERY="1"
 export FINGERPRINT="HONDA_CIVIC_2022"
+export TINYGRAD_DEBUG=0
 
-export BLOCK="${BLOCK},camerad,loggerd,encoderd,micd,logmessaged,manage_athenad"
+export BLOCK="${BLOCK},camerad,stream_encoderd,micd,logmessaged,manage_athenad,soundd"
+if [[ -z "$RECORD" ]]; then
+  # loggerd/encoderd only run when a recording is requested via RECORD (this also applies in CI)
+  export BLOCK="${BLOCK},loggerd,encoderd"
+fi
 if [[ "$CI" ]]; then
   # TODO: offscreen UI should work
   export BLOCK="${BLOCK},ui"
 fi
 
 python3 -c "from openpilot.selfdrive.test.helpers import set_params_enabled; set_params_enabled()"

diff --git a/tools/sim/tests/test_metadrive_bridge.py b/tools/sim/tests/test_metadrive_bridge.py
@@ -1,4 +1,5 @@
 import pytest
+import os
 import warnings
 
 # Since metadrive depends on pkg_resources, and pkg_resources is deprecated as an API
@@ -11,7 +12,8 @@
 class TestMetaDriveBridge(TestSimBridgeBase):
   @pytest.fixture(autouse=True)
   def setup_create_bridge(self, test_duration):
-    self.test_duration = 30
+    # TEST_DURATION in the environment (e.g. set by CI) overrides the default of 30
+    self.test_duration = int(os.getenv('TEST_DURATION', 30))
 
   def create_bridge(self):
     return MetaDriveBridge(False, False, self.test_duration, True)
diff --git a/tools/sim/tests/test_sim_bridge.py b/tools/sim/tests/test_sim_bridge.py
@@ -22,7 +22,11 @@ def setup_method(self):
 
   def test_driving(self):
     # Startup manager and bridge.py. Check processes are running, then engage and verify.
-    p_manager = subprocess.Popen("./launch_openpilot.sh", cwd=SIM_DIR)
+    # Send manager output to a temp file instead of a pipe: nothing drains a pipe while the
+    # test runs, so the manager would block on write once the OS pipe buffer fills up.
+    import tempfile
+    manager_log = tempfile.TemporaryFile(mode="w+", errors="replace")
+    p_manager = subprocess.Popen("./launch_openpilot.sh", cwd=SIM_DIR, stdout=manager_log, stderr=subprocess.STDOUT)
     self.processes.append(p_manager)
 
     sm = messaging.SubMaster(['selfdriveState', 'onroadEvents', 'managerState'])
@@ -31,62 +35,105 @@ def test_driving(self):
     p_bridge = bridge.run(q, retries=10)
     self.processes.append(p_bridge)
 
-    max_time_per_step = 60
+    max_time_per_step = 180
 
     # Wait for bridge to startup
     start_waiting = time.monotonic()
     while not bridge.started.value and time.monotonic() < start_waiting + max_time_per_step:
       time.sleep(0.1)
-    assert p_bridge.exitcode is None, f"Bridge process should be running, but exited with code {p_bridge.exitcode}"
 
-    start_time = time.monotonic()
-    no_car_events_issues_once = False
-    car_event_issues = []
-    not_running = []
-    while time.monotonic() < start_time + max_time_per_step:
-      sm.update()
+    try:
+      assert p_bridge.exitcode is None, f"Bridge process should be running, but exited with code {p_bridge.exitcode}"
 
-      not_running = [p.name for p in sm['managerState'].processes if not p.running and p.shouldBeRunning]
-      car_event_issues = [event.name for event in sm['onroadEvents'] if any([event.noEntry, event.softDisable, event.immediateDisable])]
-
-      if sm.all_alive() and len(car_event_issues) == 0 and len(not_running) == 0:
-        no_car_events_issues_once = True
-        break
-
-    assert no_car_events_issues_once, \
-                    f"Failed because no messages received, or CarEvents '{car_event_issues}' or processes not running '{not_running}'"
+      start_time = time.monotonic()
+      no_car_events_issues_once = False
+      car_event_issues = []
+      not_running = []
+      while time.monotonic() < start_time + max_time_per_step:
+        sm.update()
 
-    start_time = time.monotonic()
-    min_counts_control_active = 100
-    control_active = 0
+        not_running = [p.name for p in sm['managerState'].processes if not p.running and p.shouldBeRunning]
+        car_event_issues = [event.name for event in sm['onroadEvents'] if any([event.noEntry, event.softDisable, event.immediateDisable])]
 
-    while time.monotonic() < start_time + max_time_per_step:
-      sm.update()
-
-      if sm.all_alive() and sm['selfdriveState'].active:
-        control_active += 1
-
-        if control_active == min_counts_control_active:
+        if sm.all_alive() and len(car_event_issues) == 0 and len(not_running) == 0:
+          no_car_events_issues_once = True
           break
-
-    assert min_counts_control_active == control_active, f"Simulator did not engage a minimal of {min_counts_control_active} steps was {control_active}"
-
-    failure_states = []
-    while bridge.started.value:
-      continue
-
-    while not q.empty():
-      state = q.get()
-      if state.type == QueueMessageType.TERMINATION_INFO:
-        done_info = state.info
-        failure_states = [done_state for done_state in done_info if done_state != "timeout" and done_info[done_state]]
-        break
-    assert len(failure_states) == 0, f"Simulator fails to finish a loop. Failure states: {failure_states}"
+        else:
+          # every 100 frames, report what is still keeping the stack from a healthy state
+          if sm.frame % 100 == 0:
+            print(f"Waiting for healthy state... not_running: {not_running}, car_event_issues: {car_event_issues}")
+            if not sm.all_alive():
+              print(f"  NOT ALIVE: {[s for s, a in sm.alive.items() if not a]}")
+            if not sm.all_freq_ok():
+              print(f"  FREQ NOT OK: {[s for s, f in sm.freq_ok.items() if not f]}")
+            if not sm.all_valid():
+              print(f"  NOT VALID: {[s for s, v in sm.valid.items() if not v]}")
+
+      assert no_car_events_issues_once, \
+                      f"Failed because no messages received, or CarEvents '{car_event_issues}' or processes not running '{not_running}'"
+
+      start_time = time.monotonic()
+      min_counts_control_active = 100
+      control_active = 0
+
+      while time.monotonic() < start_time + max_time_per_step:
+        sm.update()
+
+        if sm.all_alive() and sm['selfdriveState'].active:
+          control_active += 1
+
+          if control_active == min_counts_control_active:
+            break
+
+      engageable = sm['selfdriveState'].engageable
+      alive = sm.all_alive()
+      events = [event.name for event in sm['onroadEvents']]
+      not_running = [p.name for p in sm['managerState'].processes if not p.running and p.shouldBeRunning]
+      err_msg = f"Sim not engaged. active: {control_active}, engageable: {engageable}, alive: {alive}, events: {events}, not_running: {not_running}. "
+      if not engageable:
+        err_msg += "Check if modeld or locationd crashed or are not publishing. "
+      assert min_counts_control_active == control_active, err_msg
+
+      failure_states = []
+      while bridge.started.value:
+        time.sleep(0.1)
+
+      while not q.empty():
+        state = q.get()
+        if state.type == QueueMessageType.TERMINATION_INFO:
+          done_info = state.info
+          failure_states = [done_state for done_state in done_info if done_state != "timeout" and done_info[done_state]]
+          break
+      assert len(failure_states) == 0, f"Simulator fails to finish a loop. Failure states: {failure_states}"
+    except Exception:
+      # Stop the manager so its shutdown output lands in the log, then dump the log to help debug the failure.
+      if p_manager.poll() is None:
+        p_manager.terminate()
+        try:
+          p_manager.wait(10)
+        except subprocess.TimeoutExpired:
+          p_manager.kill()
+      manager_log.seek(0)
+      print("\n\n" + "="*20 + " MANAGER LOGS " + "="*20)
+      print(manager_log.read())
+      print("="*54 + "\n\n")
+      raise
+    finally:
+      manager_log.close()
 
   def teardown_method(self):
     print("Test shutting down. CommIssues are acceptable")
     for p in reversed(self.processes):
-      p.terminate()
-
-    for p in reversed(self.processes):
-      p.kill()
+      # ask each process to exit, then force-kill it if it is still around after 15 s
+      if isinstance(p, subprocess.Popen):
+        if p.poll() is None:
+          p.terminate()
+          try:
+            p.wait(15)
+          except subprocess.TimeoutExpired:
+            p.kill()
+      else:
+        p.terminate()
+        p.join(15)
+        if p.exitcode is None:
+          p.kill()