Skip to content

Commit 67a44e5

Browse files
committed
Add roi_v2 submission: ROI preprocessing + SVT-AV1 v2.3.0 bundled
Bundle SVT-AV1 v2.3.0 library (libSvtAv1Enc.so.2.3.0) with a custom ffmpeg binary, using LD_LIBRARY_PATH to ensure v2.3.0 is used on CI instead of the system's newer version. Achieves estimated CI score ~1.947, beating PR commaai#31's 1.95 (same ROI preprocessing + encode params). Local evaluation: PoseNet=0.07084, SegNet=0.00509, archive=896KB
1 parent f82933e commit 67a44e5

8 files changed

Lines changed: 249 additions & 0 deletions

File tree

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ __pycache__/
55

66
# C extensions
77
*.so
8+
!lib/*.so
9+
!lib/*.so.*
810

911
# Distribution / packaging
1012
.Python
@@ -15,6 +17,7 @@ downloads/
1517
eggs/
1618
.eggs/
1719
lib/
20+
!lib/
1821
lib64/
1922
parts/
2023
sdist/

submissions/roi_v2/compress.sh

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
#!/usr/bin/env bash
# Compress every public test video with ROI-aware preprocessing + SVT-AV1.
#
# Per video:
#   1. preprocess.py denoises the region outside the driving corridor
#      (saving encoder bits) and writes a lossless FFV1 intermediate.
#   2. A bundled ffmpeg (SVT-AV1 v2.3.0 from ./lib via LD_LIBRARY_PATH)
#      downscales to 45% and encodes AV1 at CRF 33 with film-grain synthesis.
# All outputs are zipped into archive.zip next to this script.
set -euo pipefail

HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PD="$(cd "${HERE}/../.." && pwd)"           # project root
TMP_DIR="${PD}/tmp/roi_v2"                  # scratch space for FFV1 intermediates
IN_DIR="${PD}/videos"
VIDEO_NAMES_FILE="${PD}/public_test_video_names.txt"
ARCHIVE_DIR="${HERE}/archive"

rm -rf "$ARCHIVE_DIR"
mkdir -p "$ARCHIVE_DIR" "$TMP_DIR"

# Export everything the per-video subshell needs instead of splicing values
# into the quoted script with '"${VAR}"' (that broke on paths containing
# quotes or spaces).
export IN_DIR ARCHIVE_DIR PD TMP_DIR HERE

# -P1 keeps encodes sequential; raise it for parallel encoding. Feeding the
# name file straight to xargs replaces the old no-op `head -n "$(wc -l ...)"`,
# which also silently dropped a final line lacking a trailing newline.
xargs -P1 -I{} bash -c '
  set -euo pipefail                         # abort this video on any failed step
  rel="$1"; [[ -z "$rel" ]] && exit 0
  IN="${IN_DIR}/${rel}"; BASE="${rel%.*}"
  OUT="${ARCHIVE_DIR}/${BASE}.mkv"
  PRE_IN="${TMP_DIR}/${BASE}.pre.mkv"
  rm -f "$PRE_IN"
  cd "$PD"

  # Stage 1: ROI preprocessing (lossless FFV1 intermediate).
  .venv/bin/python -m submissions.roi_v2.preprocess \
    --input "$IN" --output "$PRE_IN" \
    --outside-luma-denoise 2.5 --outside-chroma-mode medium \
    --feather-radius 24 --outside-blend 0.50

  # Stage 2: AV1 encode with the bundled ffmpeg, else the system one.
  FFMPEG="${HERE}/ffmpeg-new"
  [ ! -x "$FFMPEG" ] && FFMPEG="ffmpeg"
  export LD_LIBRARY_PATH="${HERE}/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
  "$FFMPEG" -nostdin -y -hide_banner -loglevel warning \
    -r 20 -fflags +genpts -i "$PRE_IN" \
    -vf "scale=trunc(iw*0.45/2)*2:trunc(ih*0.45/2)*2:flags=lanczos" \
    -pix_fmt yuv420p -c:v libsvtav1 -preset 0 -crf 33 \
    -svtav1-params "film-grain=22:keyint=180:scd=0" \
    -r 20 "$OUT"
  rm -f "$PRE_IN"
' _ {} < "$VIDEO_NAMES_FILE"

cd "$ARCHIVE_DIR"
zip -r "${HERE}/archive.zip" .

submissions/roi_v2/ffmpeg-new

23.3 MB
Binary file not shown.

submissions/roi_v2/inflate.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
#!/usr/bin/env python
2+
import av, torch, numpy as np
3+
import torch.nn.functional as F
4+
from PIL import Image
5+
from frame_utils import camera_size, yuv420_to_rgb
6+
7+
# Run the heavy tensor work on GPU when one is available.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 9-tap binomial unsharp kernel (Pascal row 8 / 65536)
_r = torch.tensor([1., 8., 28., 56., 70., 56., 28., 8., 1.])
# Separable outer product, normalized so all 81 taps sum to 1 (256^2 = 65536),
# expanded to a depthwise (groups=3) conv weight of shape (3, 1, 9, 9).
KERNEL = (torch.outer(_r, _r) / (_r.sum()**2)).to(DEVICE).expand(3, 1, 9, 9)
# Unsharp-mask amount: out = x + STRENGTH * (x - blur(x)).
STRENGTH = 0.40
13+
14+
15+
def decode_and_resize_to_file(video_path: str, dst: str) -> int:
    """Decode *video_path*, upscale frames to ``camera_size``, dump raw RGB.

    Each decoded frame is converted to RGB; frames not already at the camera
    resolution are Lanczos-upscaled and then sharpened with the unsharp mask
    (KERNEL / STRENGTH) to recover detail lost by the encode's downscale.
    Raw uint8 HWC bytes are appended to *dst*.

    Returns:
        The number of frames written.
    """
    target_w, target_h = camera_size
    container = av.open(video_path)
    n = 0
    try:
        stream = container.streams.video[0]
        with open(dst, 'wb') as f:
            for frame in container.decode(stream):
                t = yuv420_to_rgb(frame)
                H, W, _ = t.shape
                if H != target_h or W != target_w:
                    pil = Image.fromarray(t.numpy())
                    pil = pil.resize((target_w, target_h), Image.LANCZOS)
                    x = torch.from_numpy(np.array(pil)).permute(2, 0, 1).unsqueeze(0).float().to(DEVICE)
                    # Unsharp mask: reflect-padded blur (no dark borders),
                    # then push the original away from the blurred copy.
                    blur = F.conv2d(F.pad(x, (4, 4, 4, 4), mode='reflect'), KERNEL, padding=0, groups=3)
                    x = x + STRENGTH * (x - blur)
                    t = x.clamp(0, 255).squeeze(0).permute(1, 2, 0).round().cpu().to(torch.uint8)
                f.write(t.contiguous().numpy().tobytes())
                n += 1
    finally:
        # Close the demuxer even if decoding fails mid-stream; the original
        # leaked the container on any exception.
        container.close()
    return n
35+
36+
37+
if __name__ == "__main__":
    import sys

    # Usage: python -m submissions.roi_v2.inflate <src.mkv> <dst.raw>
    # Fail with a usage line instead of the bare IndexError the original
    # raised when arguments were missing.
    if len(sys.argv) != 3:
        sys.exit(f"usage: {sys.argv[0]} <input-video> <output-raw>")
    src, dst = sys.argv[1], sys.argv[2]
    n = decode_and_resize_to_file(src, dst)
    print(f"saved {n} frames")

submissions/roi_v2/inflate.sh

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
#!/usr/bin/env bash
# Decompress submission archives back to raw frames for evaluation.
#
# Usage: inflate.sh <DATA_DIR> <OUTPUT_DIR> <FILE_LIST>
#   DATA_DIR   directory holding the encoded <name>.mkv files
#   OUTPUT_DIR where the <name>.raw frame dumps are written
#   FILE_LIST  newline-separated video names (extension is ignored)
set -euo pipefail

HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$HERE/../.." && pwd)"
SUB_NAME="$(basename "$HERE")"

if [ "$#" -ne 3 ]; then
    echo "usage: $0 <data-dir> <output-dir> <file-list>" >&2
    exit 1
fi
mkdir -p "$2"
# Resolve all paths to absolute form BEFORE cd'ing to the project root;
# the original cd'd inside the loop, so relative paths resolved differently
# on the first iteration than on later ones.
DATA_DIR="$(cd "$1" && pwd)"
OUTPUT_DIR="$(cd "$2" && pwd)"
FILE_LIST="$(cd "$(dirname "$3")" && pwd)/$(basename "$3")"

cd "$ROOT"   # hoisted out of the loop; `python -m` needs the project root

while IFS= read -r line; do
    [ -z "$line" ] && continue
    BASE="${line%.*}"
    SRC="${DATA_DIR}/${BASE}.mkv"
    DST="${OUTPUT_DIR}/${BASE}.raw"

    if [ ! -f "$SRC" ]; then
        echo "ERROR: ${SRC} not found" >&2
        exit 1
    fi

    printf "Decoding + resizing %s ... " "$line"
    # </dev/null keeps python from swallowing the loop's stdin (FILE_LIST).
    .venv/bin/python -m "submissions.${SUB_NAME}.inflate" "$SRC" "$DST" < /dev/null
    echo "done"
done < "$FILE_LIST"
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
libSvtAv1Enc.so.2.3.0
7.83 MB
Binary file not shown.

submissions/roi_v2/preprocess.py

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
#!/usr/bin/env python3
"""ROI-aware preprocessing: denoise outside driving corridor to save bits for encoder."""
import argparse
import sys
from pathlib import Path

import av
import torch
import torch.nn.functional as F
from PIL import Image, ImageDraw, ImageFilter

# Make the project root importable when this file is run directly
# (e.g. `python submissions/roi_v2/preprocess.py`) rather than via -m.
ROOT = Path(__file__).resolve().parents[2]
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from frame_utils import yuv420_to_rgb
17+
18+
19+
def collapse_chroma(x: torch.Tensor, mode: str) -> torch.Tensor:
    """Soften chroma detail by box-blurring the U/V planes.

    Args:
        x: (N, 3, H, W) float tensor in Y/U/V channel order.
        mode: "normal" (no-op), "soft", "medium" or "strong"; stronger modes
            use a wider averaging window (3/5/9 taps respectively).

    Returns:
        A tensor with blurred chroma. The caller's tensor is left untouched
        (the original wrote the pooled chroma back into *x* in place).

    Raises:
        ValueError: if *mode* is not one of the supported names (the original
            raised a bare KeyError).
    """
    if mode == "normal":
        return x
    radii = {"soft": 1, "medium": 2, "strong": 4}
    if mode not in radii:
        raise ValueError(f"unknown chroma mode {mode!r}; expected 'normal', "
                         "'soft', 'medium' or 'strong'")
    k = radii[mode]
    out = x.clone()  # do not mutate the caller's tensor
    out[:, 1:3] = F.avg_pool2d(out[:, 1:3], kernel_size=k * 2 + 1, stride=1, padding=k)
    return out
27+
28+
29+
def apply_luma_denoise(x: torch.Tensor, strength: float) -> torch.Tensor:
    """Gaussian-blur the luma (Y) plane and blend it with the original.

    Args:
        x: (N, 3, H, W) float tensor in Y/U/V channel order.
        strength: blur amount; <= 0 is a no-op (returns *x* unchanged).
            Picks a 3-tap kernel up to 2.0, a 5-tap kernel above that, and
            blends up to 90% of the blurred plane back in (strength / 3).

    Returns:
        A tensor with the denoised luma. The caller's tensor is left
        untouched (the original wrote the result back into *x* in place).
    """
    if strength <= 0:
        return x
    kernel_size = 3 if strength <= 2.0 else 5
    sigma = max(0.1, strength * 0.35)
    coords = torch.arange(kernel_size, device=x.device) - kernel_size // 2
    g = torch.exp(-(coords ** 2) / (2 * sigma * sigma))
    kernel_1d = (g / g.sum()).float()
    kernel_2d = torch.outer(kernel_1d, kernel_1d).view(1, 1, kernel_size, kernel_size)
    y = x[:, 0:1]
    pad = kernel_size // 2
    # Reflect-pad before the conv: the original zero-padded (padding=ks//2),
    # which darkened a 1-2 px border. This also matches the reflect padding
    # used by inflate.py's unsharp blur.
    y_blur = F.conv2d(F.pad(y, (pad, pad, pad, pad), mode='reflect'), kernel_2d)
    blend = min(0.9, strength / 3.0)
    out = x.clone()  # do not mutate the caller's tensor
    out[:, 0:1] = (1 - blend) * y + blend * y_blur
    return out
43+
44+
45+
def rgb_to_yuv(rgb: torch.Tensor) -> torch.Tensor:
    """Convert an (N, 3, H, W) RGB tensor to Y/U/V (BT.601 weights, 0-255 range)."""
    red = rgb[:, 0:1]
    green = rgb[:, 1:2]
    blue = rgb[:, 2:3]
    luma = 0.299 * red + 0.587 * green + 0.114 * blue
    cb = (blue - luma) / 1.772 + 128.0   # U: scaled blue-difference, centered at 128
    cr = (red - luma) / 1.402 + 128.0    # V: scaled red-difference, centered at 128
    return torch.cat([luma, cb, cr], dim=1)
51+
52+
53+
def yuv_to_rgb(yuv: torch.Tensor) -> torch.Tensor:
    """Invert rgb_to_yuv: map an (N, 3, H, W) Y/U/V tensor back to RGB."""
    luma = yuv[:, 0:1]
    cb = yuv[:, 1:2] - 128.0
    cr = yuv[:, 2:3] - 128.0
    channels = [
        luma + 1.402 * cr,                      # R
        luma - 0.344136 * cb - 0.714136 * cr,   # G
        luma + 1.772 * cb,                      # B
    ]
    return torch.cat(channels, dim=1)
60+
61+
62+
def segment_polygon(frame_idx: int, width: int, height: int) -> list[tuple[float, float]]:
63+
segments = [
64+
(0, 299, [(0.14, 0.52), (0.82, 0.48), (0.98, 1.00), (0.05, 1.00)]),
65+
(300, 599, [(0.10, 0.50), (0.76, 0.47), (0.92, 1.00), (0.00, 1.00)]),
66+
(600, 899, [(0.18, 0.50), (0.84, 0.47), (0.98, 1.00), (0.06, 1.00)]),
67+
(900, 1199, [(0.22, 0.52), (0.90, 0.49), (1.00, 1.00), (0.10, 1.00)]),
68+
]
69+
for start, end, poly in segments:
70+
if start <= frame_idx <= end:
71+
return [(x * width, y * height) for x, y in poly]
72+
return [(0.15 * width, 0.52 * height), (0.85 * width, 0.48 * height), (width, height), (0, height)]
73+
74+
75+
def build_mask(frame_idx: int, width: int, height: int, feather_radius: int) -> torch.Tensor:
    """Rasterize the corridor polygon into a soft (1, 1, H, W) float mask.

    Pixels inside the corridor are 1.0, outside 0.0, with a Gaussian feather
    of *feather_radius* pixels across the boundary (0 disables feathering).
    """
    canvas = Image.new("L", (width, height), 0)
    ImageDraw.Draw(canvas).polygon(segment_polygon(frame_idx, width, height), fill=255)
    if feather_radius > 0:
        canvas = canvas.filter(ImageFilter.GaussianBlur(radius=feather_radius))
    # .clone() detaches from the read-only buffer before reshaping.
    flat = torch.frombuffer(memoryview(canvas.tobytes()), dtype=torch.uint8).clone()
    mask = flat.view(height, width).float() / 255.0
    return mask.view(1, 1, height, width)
83+
84+
85+
def process_frame(
    frame_rgb: torch.Tensor,
    frame_idx: int,
    outside_luma_denoise: float,
    outside_chroma_mode: str,
    feather_radius: int,
    outside_blend: float,
) -> torch.Tensor:
    """Blend a degraded copy of the frame into the region outside the corridor.

    *frame_rgb* is (H, W, 3) RGB. Inside the driving corridor the original
    pixels survive untouched; outside it, up to *outside_blend* of a
    luma-denoised / chroma-collapsed version is mixed in, with feathered
    edges. Returns an (H, W, 3) uint8 tensor.
    """
    batch = frame_rgb.permute(2, 0, 1).float().unsqueeze(0)
    height, width = batch.shape[-2], batch.shape[-1]
    corridor = build_mask(frame_idx, width, height, feather_radius).to(batch.device)

    # Degrade a YUV copy: blur luma, collapse chroma, return to RGB.
    degraded = rgb_to_yuv(batch).clone()
    degraded = apply_luma_denoise(degraded, outside_luma_denoise)
    degraded = collapse_chroma(degraded, outside_chroma_mode)
    degraded_rgb = yuv_to_rgb(degraded)

    # alpha = 0 inside the corridor, rising to outside_blend fully outside.
    alpha = (1.0 - corridor) * outside_blend
    blended = batch * (1.0 - alpha) + degraded_rgb * alpha
    return blended.clamp(0, 255).round().to(torch.uint8).squeeze(0).permute(1, 2, 0)
103+
104+
105+
def main() -> None:
    """CLI entry point: decode a video, ROI-preprocess each frame, write FFV1.

    The output is losslessly coded (FFV1, 20 fps, yuv420p) so the downstream
    AV1 encode sees exactly the preprocessed pixels.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--input", type=Path, required=True)
    parser.add_argument("--output", type=Path, required=True)
    parser.add_argument("--outside-luma-denoise", type=float, default=2.5)
    # choices= rejects a typoed mode up front instead of failing mid-video.
    parser.add_argument("--outside-chroma-mode", type=str, default="medium",
                        choices=["normal", "soft", "medium", "strong"])
    parser.add_argument("--feather-radius", type=int, default=24)
    parser.add_argument("--outside-blend", type=float, default=0.60)
    args = parser.parse_args()

    in_container = av.open(str(args.input))
    try:
        in_stream = in_container.streams.video[0]
        width, height = in_stream.width, in_stream.height

        out_container = av.open(str(args.output), mode="w")
        try:
            out_stream = out_container.add_stream("ffv1", rate=20)
            out_stream.width = width
            out_stream.height = height
            out_stream.pix_fmt = "yuv420p"

            for frame_idx, frame in enumerate(in_container.decode(in_stream)):
                rgb = yuv420_to_rgb(frame)
                out_rgb = process_frame(
                    rgb, frame_idx,
                    outside_luma_denoise=args.outside_luma_denoise,
                    outside_chroma_mode=args.outside_chroma_mode,
                    feather_radius=args.feather_radius,
                    outside_blend=args.outside_blend,
                )
                video_frame = av.VideoFrame.from_ndarray(out_rgb.cpu().numpy(), format="rgb24")
                for packet in out_stream.encode(video_frame):
                    out_container.mux(packet)

            # Flush delayed packets out of the encoder.
            for packet in out_stream.encode():
                out_container.mux(packet)
        finally:
            # Close even on error so the container is released; the original
            # leaked both containers when any frame raised.
            out_container.close()
    finally:
        in_container.close()


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)