|
| 1 | +#!/usr/bin/env python3 |
| 2 | +"""ROI-aware preprocessing: denoise outside driving corridor to save bits for encoder.""" |
| 3 | +import argparse |
| 4 | +import sys |
| 5 | +from pathlib import Path |
| 6 | + |
| 7 | +import av |
| 8 | +import torch |
| 9 | +import torch.nn.functional as F |
| 10 | +from PIL import Image, ImageDraw, ImageFilter |
| 11 | + |
| 12 | +ROOT = Path(__file__).resolve().parents[2] |
| 13 | +if str(ROOT) not in sys.path: |
| 14 | + sys.path.insert(0, str(ROOT)) |
| 15 | + |
| 16 | +from frame_utils import yuv420_to_rgb |
| 17 | + |
| 18 | + |
def collapse_chroma(x: torch.Tensor, mode: str) -> torch.Tensor:
    """Box-blur the chroma (U/V) planes in place to cut chroma bitrate.

    Args:
        x: (N, 3, H, W) float tensor in Y/U/V channel order; modified in place.
        mode: one of "normal" (no-op), "soft", "medium", "strong".

    Returns:
        The same tensor ``x`` with channels 1-2 smoothed.

    Raises:
        ValueError: if ``mode`` is not a recognized option.
    """
    if mode == "normal":
        return x
    radii = {"soft": 1, "medium": 2, "strong": 4}
    if mode not in radii:
        raise ValueError(f"unknown chroma mode {mode!r}; expected one of {sorted(radii) + ['normal']}")
    k = radii[mode]
    uv = x[:, 1:3]
    # count_include_pad=False: with the default, the zero padding is averaged
    # into border pixels, dragging U/V toward 0 (a green cast at frame edges).
    uv = F.avg_pool2d(uv, kernel_size=k * 2 + 1, stride=1, padding=k, count_include_pad=False)
    x[:, 1:3] = uv
    return x
| 27 | + |
| 28 | + |
def apply_luma_denoise(x: torch.Tensor, strength: float) -> torch.Tensor:
    """Gaussian-blur the luma (Y) plane in place, blended by ``strength``.

    Args:
        x: (N, 3, H, W) float tensor in Y/U/V channel order; modified in place.
        strength: blur aggressiveness; <= 0 is a no-op. Controls kernel size
            (3 up to 2.0, else 5), sigma, and the blur/original blend weight.

    Returns:
        The same tensor ``x`` with channel 0 partially blurred.
    """
    if strength <= 0:
        return x
    kernel_size = 3 if strength <= 2.0 else 5
    sigma = max(0.1, strength * 0.35)
    coords = torch.arange(kernel_size, device=x.device) - kernel_size // 2
    g = torch.exp(-(coords ** 2) / (2 * sigma * sigma))
    kernel_1d = (g / g.sum()).float()
    kernel_2d = torch.outer(kernel_1d, kernel_1d).view(1, 1, kernel_size, kernel_size)
    y = x[:, 0:1]
    # Replicate-pad before the conv: conv2d's implicit zero padding would
    # darken the frame borders by averaging in black.
    pad = kernel_size // 2
    y_padded = F.pad(y, (pad, pad, pad, pad), mode="replicate")
    y_blur = F.conv2d(y_padded, kernel_2d)
    # Blend capped at 0.9 so some original detail always survives.
    blend = min(0.9, strength / 3.0)
    x[:, 0:1] = (1 - blend) * y + blend * y_blur
    return x
| 43 | + |
| 44 | + |
def rgb_to_yuv(rgb: torch.Tensor) -> torch.Tensor:
    """Convert an (N, 3, H, W) RGB tensor (0-255 range) to full-range Y/U/V.

    Uses BT.601 luma weights; chroma is centered at 128.
    """
    red = rgb[:, 0:1]
    green = rgb[:, 1:2]
    blue = rgb[:, 2:3]
    luma = 0.299 * red + 0.587 * green + 0.114 * blue
    chroma_u = (blue - luma) / 1.772 + 128.0
    chroma_v = (red - luma) / 1.402 + 128.0
    return torch.cat([luma, chroma_u, chroma_v], dim=1)
| 51 | + |
| 52 | + |
def yuv_to_rgb(yuv: torch.Tensor) -> torch.Tensor:
    """Convert an (N, 3, H, W) full-range Y/U/V tensor back to RGB.

    Inverse of ``rgb_to_yuv``; chroma is assumed centered at 128.
    """
    luma = yuv[:, 0:1]
    cu = yuv[:, 1:2] - 128.0
    cv = yuv[:, 2:3] - 128.0
    channels = [
        luma + 1.402 * cv,                     # R
        luma - 0.344136 * cu - 0.714136 * cv,  # G
        luma + 1.772 * cu,                     # B
    ]
    return torch.cat(channels, dim=1)
| 60 | + |
| 61 | + |
def segment_polygon(frame_idx: int, width: int, height: int) -> list[tuple[float, float]]:
    """Return the driving-corridor polygon for a frame, in pixel coordinates.

    Polygons are stored as normalized (x, y) vertices per frame-index range
    and scaled to the frame size; indices outside every range get a default
    corridor.
    """
    corridor_by_range = (
        (range(0, 300), [(0.14, 0.52), (0.82, 0.48), (0.98, 1.00), (0.05, 1.00)]),
        (range(300, 600), [(0.10, 0.50), (0.76, 0.47), (0.92, 1.00), (0.00, 1.00)]),
        (range(600, 900), [(0.18, 0.50), (0.84, 0.47), (0.98, 1.00), (0.06, 1.00)]),
        (range(900, 1200), [(0.22, 0.52), (0.90, 0.49), (1.00, 1.00), (0.10, 1.00)]),
    )
    for frame_range, normalized in corridor_by_range:
        if frame_idx in frame_range:
            return [(nx * width, ny * height) for nx, ny in normalized]
    # Fallback corridor for frames past the annotated segments.
    return [(0.15 * width, 0.52 * height), (0.85 * width, 0.48 * height), (width, height), (0, height)]
| 73 | + |
| 74 | + |
def build_mask(frame_idx: int, width: int, height: int, feather_radius: int) -> torch.Tensor:
    """Rasterize the corridor polygon into a (1, 1, H, W) float mask in [0, 1].

    Args:
        frame_idx: frame index used to pick the corridor polygon.
        width: frame width in pixels.
        height: frame height in pixels.
        feather_radius: Gaussian-blur radius for softening the mask edge;
            <= 0 leaves a hard edge.

    Returns:
        (1, 1, height, width) float tensor, 1.0 inside the corridor.
    """
    img = Image.new("L", (width, height), 0)
    draw = ImageDraw.Draw(img)
    draw.polygon(segment_polygon(frame_idx, width, height), fill=255)
    if feather_radius > 0:
        img = img.filter(ImageFilter.GaussianBlur(radius=feather_radius))
    # Copy into a writable bytearray: torch.frombuffer emits a UserWarning on
    # read-only buffers (bytes). The tensor owns the copy, so no clone needed.
    raw = bytearray(img.tobytes())
    mask = torch.frombuffer(raw, dtype=torch.uint8).view(height, width).float() / 255.0
    return mask.unsqueeze(0).unsqueeze(0)
| 83 | + |
| 84 | + |
def process_frame(
    frame_rgb: torch.Tensor,
    frame_idx: int,
    outside_luma_denoise: float,
    outside_chroma_mode: str,
    feather_radius: int,
    outside_blend: float,
) -> torch.Tensor:
    """Denoise pixels outside the driving corridor and blend with the original.

    Args:
        frame_rgb: (H, W, 3) uint8-like RGB frame.
        frame_idx: frame index, used to select the corridor polygon.
        outside_luma_denoise: luma blur strength applied outside the corridor.
        outside_chroma_mode: chroma smoothing mode applied outside the corridor.
        feather_radius: mask feathering radius in pixels.
        outside_blend: max weight of the degraded version outside the corridor.

    Returns:
        (H, W, 3) uint8 RGB frame with the outside region softened.
    """
    frame = frame_rgb.permute(2, 0, 1).float().unsqueeze(0)
    height, width = frame.shape[-2], frame.shape[-1]
    mask = build_mask(frame_idx, width, height, feather_radius).to(frame.device)
    # Degrade a copy of the whole frame in YUV space, then mask in only the
    # outside region.
    degraded = rgb_to_yuv(frame).clone()
    degraded = apply_luma_denoise(degraded, outside_luma_denoise)
    degraded = collapse_chroma(degraded, outside_chroma_mode)
    degraded_rgb = yuv_to_rgb(degraded)
    # alpha is 0 inside the corridor (keep original), up to outside_blend out.
    alpha = (1.0 - mask) * outside_blend
    blended = frame * (1.0 - alpha) + degraded_rgb * alpha
    return blended.clamp(0, 255).round().to(torch.uint8).squeeze(0).permute(1, 2, 0)
| 103 | + |
| 104 | + |
def main() -> None:
    """CLI entry point: decode input video, ROI-process each frame, and encode
    the result losslessly with FFV1 at the source frame rate."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--input", type=Path, required=True)
    parser.add_argument("--output", type=Path, required=True)
    parser.add_argument("--outside-luma-denoise", type=float, default=2.5)
    parser.add_argument("--outside-chroma-mode", type=str, default="medium")
    parser.add_argument("--feather-radius", type=int, default=24)
    parser.add_argument("--outside-blend", type=float, default=0.60)
    args = parser.parse_args()

    in_container = av.open(str(args.input))
    try:
        in_stream = in_container.streams.video[0]
        width, height = in_stream.width, in_stream.height
        # Preserve the source frame rate instead of hard-coding one; fall back
        # to 20 fps when the container does not report a rate.
        rate = in_stream.average_rate or 20

        out_container = av.open(str(args.output), mode="w")
        try:
            out_stream = out_container.add_stream("ffv1", rate=rate)
            out_stream.width = width
            out_stream.height = height
            out_stream.pix_fmt = "yuv420p"

            for frame_idx, frame in enumerate(in_container.decode(in_stream)):
                rgb = yuv420_to_rgb(frame)
                out_rgb = process_frame(
                    rgb, frame_idx,
                    outside_luma_denoise=args.outside_luma_denoise,
                    outside_chroma_mode=args.outside_chroma_mode,
                    feather_radius=args.feather_radius,
                    outside_blend=args.outside_blend,
                )
                video_frame = av.VideoFrame.from_ndarray(out_rgb.cpu().numpy(), format="rgb24")
                for packet in out_stream.encode(video_frame):
                    out_container.mux(packet)

            # Flush any frames buffered inside the encoder.
            for packet in out_stream.encode():
                out_container.mux(packet)
        finally:
            out_container.close()
    finally:
        in_container.close()
| 143 | + |
| 144 | + |
| 145 | +if __name__ == "__main__": |
| 146 | + main() |
0 commit comments