Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ __pycache__/

# C extensions
*.so
!lib/*.so
!lib/*.so.*

# Distribution / packaging
.Python
Expand All @@ -15,6 +17,7 @@ downloads/
eggs/
.eggs/
lib/
!lib/
lib64/
parts/
sdist/
Expand Down
Binary file added ffmpeg-new
Binary file not shown.
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ dependencies = [
"tqdm",
"pillow",
"av",
"charset-normalizer",
"requests",
"urllib3",
]

[dependency-groups]
Expand Down
81 changes: 81 additions & 0 deletions submissions/av1_roi_lanczos_unsharp/compress.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#!/usr/bin/env bash
set -euo pipefail

# Resolve the submission directory and the project root relative to this file.
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PD="$(cd "${HERE}/../.." && pwd)"
TMP_DIR="${PD}/tmp/av1_roi_lanczos_unsharp"

# Defaults; each is overridable via the CLI flags parsed below.
IN_DIR="${PD}/videos"
VIDEO_NAMES_FILE="${PD}/public_test_video_names.txt"
ARCHIVE_DIR="${HERE}/archive"
JOBS="1"

usage() {
  echo "Usage: $0 [--in-dir <dir>] [--jobs <n>] [--video-names-file <file>]" >&2
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --in-dir|--in_dir)
      # Strip a trailing slash so later "${IN_DIR}/${rel}" joins stay clean.
      IN_DIR="${2%/}"; shift 2 ;;
    --jobs)
      JOBS="$2"; shift 2 ;;
    --video-names-file|--video_names_file)
      VIDEO_NAMES_FILE="$2"; shift 2 ;;
    *)
      echo "Unknown arg: $1" >&2
      usage
      exit 2 ;;
  esac
done

# Start from a clean archive dir; the scratch dir may persist across runs.
rm -rf "$ARCHIVE_DIR"
mkdir -p "$ARCHIVE_DIR" "$TMP_DIR"

# Workers read configuration from the environment. Exporting the paths —
# instead of splicing them into the quoted child script with '"${VAR}"' —
# keeps paths containing quotes, spaces or '$' intact.
export IN_DIR ARCHIVE_DIR PD HERE TMP_DIR

# Feed the name list straight to xargs: the previous
# `head -n "$(wc -l < f)" f` pass read the whole file and was a no-op.
# -I{} consumes one input line per item, so names with spaces survive.
# `bash -c` (not -lc): a login shell would source user profiles and make
# worker behavior environment-dependent.
xargs -P"$JOBS" -I{} bash -c '
set -euo pipefail   # fail fast: do not run ffmpeg on a missing preprocess output
rel="$1"
[[ -z "$rel" ]] && exit 0

IN="${IN_DIR}/${rel}"
BASE="${rel%.*}"
OUT="${ARCHIVE_DIR}/${BASE}.mkv"
PRE_IN="${TMP_DIR}/${BASE}.pre.mkv"

echo "→ ${IN} → ${OUT}"

# Step 1: ROI preprocess — denoise outside driving corridor
rm -f "$PRE_IN"
python "${HERE}/preprocess.py" \
  --input "$IN" \
  --output "$PRE_IN" \
  --outside-luma-denoise 2.5 \
  --outside-chroma-mode medium \
  --feather-radius 24 \
  --outside-blend 0.50

# Step 2: Downscale + AV1 encode with the bundled ffmpeg (fall back to PATH).
FFMPEG="${HERE}/ffmpeg-new"
[ ! -x "$FFMPEG" ] && FFMPEG="ffmpeg"
export LD_LIBRARY_PATH="${HERE}/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
"$FFMPEG" -nostdin -y -hide_banner -loglevel warning \
  -r 20 -fflags +genpts -i "$PRE_IN" \
  -vf "scale=trunc(iw*0.45/2)*2:trunc(ih*0.45/2)*2:flags=lanczos" \
  -pix_fmt yuv420p -c:v libsvtav1 -preset 0 -crf 33 \
  -svtav1-params "film-grain=22:keyint=180:scd=0" \
  -r 20 "$OUT"

rm -f "$PRE_IN"
' _ {} < "$VIDEO_NAMES_FILE"

# Zip the archive directory. Prefer the zip(1) binary; otherwise fall back
# to a Python zipfile walk that, unlike the old os.listdir() version, is
# recursive — matching `zip -r` when outputs land in subdirectories.
cd "$ARCHIVE_DIR"
if command -v zip &>/dev/null; then
  zip -r "${HERE}/archive.zip" .
else
  # Archive path is passed via argv, not interpolated into the Python
  # source, so paths with quotes or other special characters survive.
  python3 - "${HERE}/archive.zip" <<'PYEOF'
import os, sys, zipfile
with zipfile.ZipFile(sys.argv[1], 'w', zipfile.ZIP_STORED) as zf:
    for root, _dirs, files in os.walk('.'):
        for name in files:
            zf.write(os.path.join(root, name))
PYEOF
fi
echo "Compressed to ${HERE}/archive.zip"
Binary file added submissions/av1_roi_lanczos_unsharp/ffmpeg-new
Binary file not shown.
41 changes: 41 additions & 0 deletions submissions/av1_roi_lanczos_unsharp/inflate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/usr/bin/env python
import av, torch, numpy as np
import torch.nn.functional as F
from PIL import Image
from frame_utils import camera_size, yuv420_to_rgb

# Run the blur/unsharp tensor math on GPU when one is available.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 9-tap binomial row (C(8, k)) — a cheap separable Gaussian approximation.
# The outer product forms a 9x9 blur kernel; dividing by _r.sum()**2
# normalizes it to sum to 1. expand(3, 1, 9, 9) yields one kernel per RGB
# channel for a grouped (depthwise) conv2d.
_r = torch.tensor([1., 8., 28., 56., 70., 56., 28., 8., 1.])
KERNEL = (torch.outer(_r, _r) / (_r.sum()**2)).to(DEVICE).expand(3, 1, 9, 9)
# Unsharp-mask amount: out = x + STRENGTH * (x - blur(x)).
STRENGTH = 0.40


def decode_and_resize_to_file(video_path: str, dst: str):
    """Decode a video, resize every frame to the camera size, and dump raw RGB.

    Frames are appended to `dst` as flat uint8 RGB bytes. Frames already at
    the target size are written untouched; resized frames get a Lanczos
    resample followed by a light unsharp pass. Returns the frame count.
    """
    target_w, target_h = camera_size
    # `.hevc` files are raw elementary streams; hint the demuxer explicitly.
    container = av.open(video_path, format='hevc' if video_path.endswith('.hevc') else None)
    video = container.streams.video[0]
    count = 0
    with open(dst, 'wb') as out:
        for frame in container.decode(video):
            rgb = yuv420_to_rgb(frame)  # (H, W, 3)
            height, width, _ = rgb.shape
            if height != target_h or width != target_w:
                resized = Image.fromarray(rgb.numpy()).resize((target_w, target_h), Image.LANCZOS)
                x = torch.from_numpy(np.array(resized)).permute(2, 0, 1).unsqueeze(0).float().to(DEVICE)
                # Reflect-pad by 4 so the 9x9 depthwise blur keeps the size.
                blurred = F.conv2d(F.pad(x, (4, 4, 4, 4), mode='reflect'), KERNEL, padding=0, groups=3)
                x = x + STRENGTH * (x - blurred)
                rgb = x.clamp(0, 255).squeeze(0).permute(1, 2, 0).round().cpu().to(torch.uint8)
            out.write(rgb.contiguous().numpy().tobytes())
            count += 1
    container.close()
    return count


if __name__ == "__main__":
    import sys

    # Usage: inflate.py <src_video> <dst_raw>
    source, target = sys.argv[1], sys.argv[2]
    frames_written = decode_and_resize_to_file(source, target)
    print(f"saved {frames_written} frames")
28 changes: 28 additions & 0 deletions submissions/av1_roi_lanczos_unsharp/inflate.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/usr/bin/env bash
# Must produce a raw video file at `<output_dir>/<base_name>.raw`.
# A `.raw` file is a flat binary dump of uint8 RGB frames with shape `(N, H, W, 3)`
# where N is the number of frames, H and W match the original video dimensions, no header.
set -euo pipefail

HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$HERE/../.." && pwd)"
SUB_NAME="$(basename "$HERE")"

DATA_DIR="$1"      # directory holding the compressed <base>.mkv files
OUTPUT_DIR="$2"    # where <base>.raw dumps are written
FILE_LIST="$3"     # newline-separated list of original video names

mkdir -p "$OUTPUT_DIR"

# `|| [ -n "$line" ]` keeps the final entry when FILE_LIST lacks a trailing newline.
while IFS= read -r line || [ -n "$line" ]; do
  [ -z "$line" ] && continue
  BASE="${line%.*}"
  SRC="${DATA_DIR}/${BASE}.mkv"
  DST="${OUTPUT_DIR}/${BASE}.raw"

  if [ ! -f "$SRC" ]; then
    echo "ERROR: ${SRC} not found" >&2
    exit 1
  fi

  printf "Decoding + resizing %s ... " "$line"
  # Run as a module from the repo root so intra-repo imports resolve.
  cd "$ROOT"
  # </dev/null keeps the child from draining the loop's redirected stdin
  # (otherwise a stdin-reading tool would eat the rest of FILE_LIST).
  python -m "submissions.${SUB_NAME}.inflate" "$SRC" "$DST" </dev/null
done < "$FILE_LIST"
1 change: 1 addition & 0 deletions submissions/av1_roi_lanczos_unsharp/lib/libSvtAv1Enc.so.2
Binary file not shown.
157 changes: 157 additions & 0 deletions submissions/av1_roi_lanczos_unsharp/preprocess.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
#!/usr/bin/env python3
import argparse
import sys
from pathlib import Path

import av
import torch
import torch.nn.functional as F
from PIL import Image, ImageDraw, ImageFilter

# Make the repository root importable so `frame_utils` (which lives two
# directories above this file) resolves when the script is run directly
# rather than as a module.
ROOT = Path(__file__).resolve().parents[2]
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from frame_utils import yuv420_to_rgb


def collapse_chroma(x: torch.Tensor, mode: str) -> torch.Tensor:
    """Box-blur the U/V planes of an NCHW YUV tensor in place.

    `mode` picks the blur radius: "normal" is a no-op, while
    "soft"/"medium"/"strong" use radii 1/2/4. The luma plane (channel 0)
    is never touched. Returns the same tensor object it was given.
    """
    if mode == "normal":
        return x
    radii = {"soft": 1, "medium": 2, "strong": 4}
    if mode not in radii:
        raise ValueError(f"unknown chroma mode: {mode}")
    r = radii[mode]
    # Size-preserving box filter over both chroma channels.
    chroma = F.avg_pool2d(x[:, 1:3], kernel_size=2 * r + 1, stride=1, padding=r)
    x[:, 1:3] = chroma
    return x


def apply_luma_denoise(x: torch.Tensor, strength: float) -> torch.Tensor:
    """Gaussian-blur the Y plane of an NCHW YUV tensor in place.

    `strength <= 0` is a no-op. Larger strengths widen the kernel (3 taps
    up to 2.0, then 5), raise sigma, and increase the blend toward the
    blurred plane (capped at 0.9). Returns the same tensor object.
    """
    if strength <= 0:
        return x
    ksize = 3 if strength <= 2.0 else 5
    sigma = max(0.1, strength * 0.35)
    # Build a normalized separable Gaussian, then its 2-D outer product.
    offsets = torch.arange(ksize, device=x.device) - ksize // 2
    weights = torch.exp(-(offsets ** 2) / (2 * sigma * sigma))
    taps = (weights / weights.sum()).float()
    kern = torch.outer(taps, taps).view(1, 1, ksize, ksize)
    luma = x[:, 0:1]
    blurred = F.conv2d(luma, kern, padding=ksize // 2)
    mix = min(0.9, strength / 3.0)
    x[:, 0:1] = (1 - mix) * luma + mix * blurred
    return x


def rgb_to_yuv(rgb: torch.Tensor) -> torch.Tensor:
    """Convert an NCHW RGB tensor (0-255 floats) to YUV with chroma centered at 128."""
    red, green, blue = rgb[:, 0:1], rgb[:, 1:2], rgb[:, 2:3]
    # BT.601-style luma weights.
    luma = 0.299 * red + 0.587 * green + 0.114 * blue
    cb = (blue - luma) / 1.772 + 128.0
    cr = (red - luma) / 1.402 + 128.0
    return torch.cat([luma, cb, cr], dim=1)


def yuv_to_rgb(yuv: torch.Tensor) -> torch.Tensor:
    """Inverse of rgb_to_yuv: NCHW YUV (chroma centered at 128) back to RGB."""
    luma = yuv[:, 0:1]
    cb = yuv[:, 1:2] - 128.0
    cr = yuv[:, 2:3] - 128.0
    red = luma + 1.402 * cr
    green = luma - 0.344136 * cb - 0.714136 * cr
    blue = luma + 1.772 * cb
    return torch.cat([red, green, blue], dim=1)


def segment_polygon(frame_idx: int, width: int, height: int) -> list[tuple[float, float]]:
    """Return the driving-corridor quad for a frame in pixel coordinates.

    Vertices are hand-tuned per 300-frame segment and stored as (x, y)
    fractions of the frame size; indices past the table get a generic
    fallback corridor.
    """
    table = (
        (0, 299, ((0.14, 0.52), (0.82, 0.48), (0.98, 1.00), (0.05, 1.00))),
        (300, 599, ((0.10, 0.50), (0.76, 0.47), (0.92, 1.00), (0.00, 1.00))),
        (600, 899, ((0.18, 0.50), (0.84, 0.47), (0.98, 1.00), (0.06, 1.00))),
        (900, 1199, ((0.22, 0.52), (0.90, 0.49), (1.00, 1.00), (0.10, 1.00))),
    )
    for lo, hi, verts in table:
        if lo <= frame_idx <= hi:
            return [(fx * width, fy * height) for fx, fy in verts]
    return [(0.15 * width, 0.52 * height), (0.85 * width, 0.48 * height), (width, height), (0, height)]


def build_mask(frame_idx: int, width: int, height: int, feather_radius: int) -> torch.Tensor:
    """Rasterize the ROI polygon into a (1, 1, H, W) float mask in [0, 1].

    The polygon interior is 1.0, the outside 0.0; a positive
    `feather_radius` Gaussian-blurs the edge for a soft transition.
    """
    canvas = Image.new("L", (width, height), 0)
    ImageDraw.Draw(canvas).polygon(segment_polygon(frame_idx, width, height), fill=255)
    if feather_radius > 0:
        canvas = canvas.filter(ImageFilter.GaussianBlur(radius=feather_radius))
    # .clone() detaches from the (read-only) byte buffer before reshaping.
    raw = torch.frombuffer(memoryview(canvas.tobytes()), dtype=torch.uint8).clone()
    mask = raw.view(height, width).float() / 255.0
    return mask[None, None]


def process_frame(
    frame_rgb: torch.Tensor,
    frame_idx: int,
    outside_luma_denoise: float,
    outside_chroma_mode: str,
    feather_radius: int,
    outside_blend: float,
) -> torch.Tensor:
    """Blend a denoised/chroma-flattened copy of the frame into the area
    outside the driving-corridor ROI.

    `frame_rgb` is (H, W, 3) uint8; the return value has the same shape and
    dtype. Inside the (feathered) ROI the original pixels are kept; outside,
    the filtered version is mixed in with weight `outside_blend`.
    """
    batch = frame_rgb.permute(2, 0, 1).float().unsqueeze(0)  # (1, 3, H, W)
    w, h = batch.shape[-1], batch.shape[-2]
    roi = build_mask(frame_idx, w, h, feather_radius).to(batch.device)
    # Filter in YUV so luma denoise and chroma collapse stay independent.
    filtered = rgb_to_yuv(batch).clone()
    filtered = apply_luma_denoise(filtered, outside_luma_denoise)
    filtered = collapse_chroma(filtered, outside_chroma_mode)
    filtered_rgb = yuv_to_rgb(filtered)
    # alpha is 0 inside the ROI, up to outside_blend outside it.
    alpha = (1.0 - roi) * outside_blend
    blended = batch * (1.0 - alpha) + filtered_rgb * alpha
    return blended.clamp(0, 255).round().to(torch.uint8).squeeze(0).permute(1, 2, 0)


def main() -> None:
    """CLI entry point: decode the input video, apply the per-frame ROI
    processing, and write the result losslessly (FFV1, yuv420p) at 20 fps."""
    parser = argparse.ArgumentParser(description="Hand-authored ROI preprocessor for AV1 encode.")
    parser.add_argument("--input", type=Path, required=True)
    parser.add_argument("--output", type=Path, required=True)
    parser.add_argument("--outside-luma-denoise", type=float, default=0.0)
    parser.add_argument("--outside-chroma-mode", type=str, default="normal")
    parser.add_argument("--feather-radius", type=int, default=32)
    parser.add_argument("--outside-blend", type=float, default=1.0)
    args = parser.parse_args()

    src_container = av.open(str(args.input))
    src_stream = src_container.streams.video[0]

    dst_container = av.open(str(args.output), mode="w")
    encoder = dst_container.add_stream("ffv1", rate=20)
    encoder.width = src_stream.width
    encoder.height = src_stream.height
    encoder.pix_fmt = "yuv420p"

    for idx, frame in enumerate(src_container.decode(src_stream)):
        processed = process_frame(
            yuv420_to_rgb(frame),
            frame_idx=idx,
            outside_luma_denoise=args.outside_luma_denoise,
            outside_chroma_mode=args.outside_chroma_mode,
            feather_radius=args.feather_radius,
            outside_blend=args.outside_blend,
        )
        out_frame = av.VideoFrame.from_ndarray(processed.cpu().numpy(), format="rgb24")
        for packet in encoder.encode(out_frame):
            dst_container.mux(packet)

    # Flush the encoder's buffered packets before closing.
    for packet in encoder.encode():
        dst_container.mux(packet)

    dst_container.close()
    src_container.close()


if __name__ == "__main__":
    main()