Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ __pycache__/

# C extensions
*.so
!lib/*.so
!lib/*.so.*

# Distribution / packaging
.Python
Expand All @@ -15,6 +17,7 @@ downloads/
eggs/
.eggs/
lib/
!lib/
lib64/
parts/
sdist/
Expand Down
Binary file added ffmpeg-new
Binary file not shown.
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ dependencies = [
"tqdm",
"pillow",
"av",
"charset-normalizer",
"requests",
"urllib3",
]

[dependency-groups]
Expand Down
81 changes: 81 additions & 0 deletions submissions/av1_roi_lanczos_unsharp/compress.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#!/usr/bin/env bash
set -euo pipefail

# Resolve the submission directory and the project root relative to this file.
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PD="$(cd "${HERE}/../.." && pwd)"
TMP_DIR="${PD}/tmp/av1_roi_lanczos_unsharp"

# Defaults; each is overridable via the CLI flags parsed below.
IN_DIR="${PD}/videos"
VIDEO_NAMES_FILE="${PD}/public_test_video_names.txt"
ARCHIVE_DIR="${HERE}/archive"
JOBS="1"

usage() {
  echo "Usage: $0 [--in-dir <dir>] [--jobs <n>] [--video-names-file <file>]" >&2
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --in-dir|--in_dir)
      # Strip a trailing slash so later "${IN_DIR}/${rel}" joins stay clean.
      IN_DIR="${2%/}"; shift 2 ;;
    --jobs)
      JOBS="$2"; shift 2 ;;
    --video-names-file|--video_names_file)
      VIDEO_NAMES_FILE="$2"; shift 2 ;;
    *)
      echo "Unknown arg: $1" >&2
      usage
      exit 2 ;;
  esac
done

# Start from a clean archive dir; the scratch dir may persist across runs.
rm -rf "$ARCHIVE_DIR"
mkdir -p "$ARCHIVE_DIR" "$TMP_DIR"

# Workers read configuration from the environment. Exporting the paths —
# instead of splicing them into the quoted child script with '"${VAR}"' —
# keeps paths containing quotes, spaces or '$' intact.
export IN_DIR ARCHIVE_DIR PD HERE TMP_DIR

# Feed the name list straight to xargs: the previous
# `head -n "$(wc -l < f)" f` pass read the whole file and was a no-op.
# -I{} consumes one input line per item, so names with spaces survive.
# `bash -c` (not -lc): a login shell would source user profiles and make
# worker behavior environment-dependent.
xargs -P"$JOBS" -I{} bash -c '
set -euo pipefail   # fail fast: do not run ffmpeg on a missing preprocess output
rel="$1"
[[ -z "$rel" ]] && exit 0

IN="${IN_DIR}/${rel}"
BASE="${rel%.*}"
OUT="${ARCHIVE_DIR}/${BASE}.mkv"
PRE_IN="${TMP_DIR}/${BASE}.pre.mkv"

echo "→ ${IN} → ${OUT}"

# Step 1: ROI preprocess — denoise outside driving corridor
rm -f "$PRE_IN"
python "${HERE}/preprocess.py" \
  --input "$IN" \
  --output "$PRE_IN" \
  --outside-luma-denoise 2.5 \
  --outside-chroma-mode medium \
  --feather-radius 24 \
  --outside-blend 0.50

# Step 2: Downscale + AV1 encode with the bundled ffmpeg (fall back to PATH).
FFMPEG="${HERE}/ffmpeg-new"
[ ! -x "$FFMPEG" ] && FFMPEG="ffmpeg"
export LD_LIBRARY_PATH="${HERE}/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
"$FFMPEG" -nostdin -y -hide_banner -loglevel warning \
  -r 20 -fflags +genpts -i "$PRE_IN" \
  -vf "scale=trunc(iw*0.45/2)*2:trunc(ih*0.45/2)*2:flags=lanczos" \
  -pix_fmt yuv420p -c:v libsvtav1 -preset 0 -crf 33 \
  -svtav1-params "film-grain=22:keyint=180:scd=0" \
  -r 20 "$OUT"

rm -f "$PRE_IN"
' _ {} < "$VIDEO_NAMES_FILE"

# Zip the archive directory. Prefer the zip(1) binary; otherwise fall back
# to a Python zipfile walk that, unlike the old os.listdir() version, is
# recursive — matching `zip -r` when outputs land in subdirectories.
cd "$ARCHIVE_DIR"
if command -v zip &>/dev/null; then
  zip -r "${HERE}/archive.zip" .
else
  # Archive path is passed via argv, not interpolated into the Python
  # source, so paths with quotes or other special characters survive.
  python3 - "${HERE}/archive.zip" <<'PYEOF'
import os, sys, zipfile
with zipfile.ZipFile(sys.argv[1], 'w', zipfile.ZIP_STORED) as zf:
    for root, _dirs, files in os.walk('.'):
        for name in files:
            zf.write(os.path.join(root, name))
PYEOF
fi
echo "Compressed to ${HERE}/archive.zip"
Binary file added submissions/av1_roi_lanczos_unsharp/ffmpeg-new
Binary file not shown.
41 changes: 41 additions & 0 deletions submissions/av1_roi_lanczos_unsharp/inflate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/usr/bin/env python
import av, torch, numpy as np
import torch.nn.functional as F
from PIL import Image
from frame_utils import camera_size, yuv420_to_rgb

# Run the blur/unsharp tensor math on GPU when one is available.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 9-tap binomial row (C(8, k)) — a cheap separable Gaussian approximation.
# The outer product forms a 9x9 blur kernel; dividing by _r.sum()**2
# normalizes it to sum to 1. expand(3, 1, 9, 9) yields one kernel per RGB
# channel for a grouped (depthwise) conv2d.
_r = torch.tensor([1., 8., 28., 56., 70., 56., 28., 8., 1.])
KERNEL = (torch.outer(_r, _r) / (_r.sum()**2)).to(DEVICE).expand(3, 1, 9, 9)
# Unsharp-mask amount: out = x + STRENGTH * (x - blur(x)).
STRENGTH = 0.40


def decode_and_resize_to_file(video_path: str, dst: str):
    """Decode a video, resize every frame to the camera size, and dump raw RGB.

    Frames are appended to `dst` as flat uint8 RGB bytes. Frames already at
    the target size are written untouched; resized frames get a Lanczos
    resample followed by a light unsharp pass. Returns the frame count.
    """
    target_w, target_h = camera_size
    # `.hevc` files are raw elementary streams; hint the demuxer explicitly.
    container = av.open(video_path, format='hevc' if video_path.endswith('.hevc') else None)
    video = container.streams.video[0]
    count = 0
    with open(dst, 'wb') as out:
        for frame in container.decode(video):
            rgb = yuv420_to_rgb(frame)  # (H, W, 3)
            height, width, _ = rgb.shape
            if height != target_h or width != target_w:
                resized = Image.fromarray(rgb.numpy()).resize((target_w, target_h), Image.LANCZOS)
                x = torch.from_numpy(np.array(resized)).permute(2, 0, 1).unsqueeze(0).float().to(DEVICE)
                # Reflect-pad by 4 so the 9x9 depthwise blur keeps the size.
                blurred = F.conv2d(F.pad(x, (4, 4, 4, 4), mode='reflect'), KERNEL, padding=0, groups=3)
                x = x + STRENGTH * (x - blurred)
                rgb = x.clamp(0, 255).squeeze(0).permute(1, 2, 0).round().cpu().to(torch.uint8)
            out.write(rgb.contiguous().numpy().tobytes())
            count += 1
    container.close()
    return count


if __name__ == "__main__":
    import sys

    # Usage: inflate.py <src_video> <dst_raw>
    source, target = sys.argv[1], sys.argv[2]
    frames_written = decode_and_resize_to_file(source, target)
    print(f"saved {frames_written} frames")
28 changes: 28 additions & 0 deletions submissions/av1_roi_lanczos_unsharp/inflate.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/usr/bin/env bash
# Must produce a raw video file at `<output_dir>/<base_name>.raw`.
# A `.raw` file is a flat binary dump of uint8 RGB frames with shape `(N, H, W, 3)`
# where N is the number of frames, H and W match the original video dimensions, no header.
set -euo pipefail

HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$HERE/../.." && pwd)"
SUB_NAME="$(basename "$HERE")"

DATA_DIR="$1"      # directory holding the compressed <base>.mkv files
OUTPUT_DIR="$2"    # where <base>.raw dumps are written
FILE_LIST="$3"     # newline-separated list of original video names

mkdir -p "$OUTPUT_DIR"

# `|| [ -n "$line" ]` keeps the final entry when FILE_LIST lacks a trailing newline.
while IFS= read -r line || [ -n "$line" ]; do
  [ -z "$line" ] && continue
  BASE="${line%.*}"
  SRC="${DATA_DIR}/${BASE}.mkv"
  DST="${OUTPUT_DIR}/${BASE}.raw"

  if [ ! -f "$SRC" ]; then
    echo "ERROR: ${SRC} not found" >&2
    exit 1
  fi

  printf "Decoding + resizing %s ... " "$line"
  # Run as a module from the repo root so intra-repo imports resolve.
  cd "$ROOT"
  # </dev/null keeps the child from draining the loop's redirected stdin
  # (otherwise a stdin-reading tool would eat the rest of FILE_LIST).
  python -m "submissions.${SUB_NAME}.inflate" "$SRC" "$DST" </dev/null
done < "$FILE_LIST"
1 change: 1 addition & 0 deletions submissions/av1_roi_lanczos_unsharp/lib/libSvtAv1Enc.so.2
Binary file not shown.
157 changes: 157 additions & 0 deletions submissions/av1_roi_lanczos_unsharp/preprocess.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
#!/usr/bin/env python3
import argparse
import sys
from pathlib import Path

import av
import torch
import torch.nn.functional as F
from PIL import Image, ImageDraw, ImageFilter

# Make the repository root importable so `frame_utils` (which lives two
# directories above this file) resolves when the script is run directly
# rather than as a module.
ROOT = Path(__file__).resolve().parents[2]
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from frame_utils import yuv420_to_rgb


def collapse_chroma(x: torch.Tensor, mode: str) -> torch.Tensor:
    """Box-blur the U/V planes of an NCHW YUV tensor in place.

    `mode` picks the blur radius: "normal" is a no-op, while
    "soft"/"medium"/"strong" use radii 1/2/4. The luma plane (channel 0)
    is never touched. Returns the same tensor object it was given.
    """
    if mode == "normal":
        return x
    radii = {"soft": 1, "medium": 2, "strong": 4}
    if mode not in radii:
        raise ValueError(f"unknown chroma mode: {mode}")
    r = radii[mode]
    # Size-preserving box filter over both chroma channels.
    chroma = F.avg_pool2d(x[:, 1:3], kernel_size=2 * r + 1, stride=1, padding=r)
    x[:, 1:3] = chroma
    return x


def apply_luma_denoise(x: torch.Tensor, strength: float) -> torch.Tensor:
    """Gaussian-blur the Y plane of an NCHW YUV tensor in place.

    `strength <= 0` is a no-op. Larger strengths widen the kernel (3 taps
    up to 2.0, then 5), raise sigma, and increase the blend toward the
    blurred plane (capped at 0.9). Returns the same tensor object.
    """
    if strength <= 0:
        return x
    ksize = 3 if strength <= 2.0 else 5
    sigma = max(0.1, strength * 0.35)
    # Build a normalized separable Gaussian, then its 2-D outer product.
    offsets = torch.arange(ksize, device=x.device) - ksize // 2
    weights = torch.exp(-(offsets ** 2) / (2 * sigma * sigma))
    taps = (weights / weights.sum()).float()
    kern = torch.outer(taps, taps).view(1, 1, ksize, ksize)
    luma = x[:, 0:1]
    blurred = F.conv2d(luma, kern, padding=ksize // 2)
    mix = min(0.9, strength / 3.0)
    x[:, 0:1] = (1 - mix) * luma + mix * blurred
    return x


def rgb_to_yuv(rgb: torch.Tensor) -> torch.Tensor:
    """Convert an NCHW RGB tensor (0-255 floats) to YUV with chroma centered at 128."""
    red, green, blue = rgb[:, 0:1], rgb[:, 1:2], rgb[:, 2:3]
    # BT.601-style luma weights.
    luma = 0.299 * red + 0.587 * green + 0.114 * blue
    cb = (blue - luma) / 1.772 + 128.0
    cr = (red - luma) / 1.402 + 128.0
    return torch.cat([luma, cb, cr], dim=1)


def yuv_to_rgb(yuv: torch.Tensor) -> torch.Tensor:
    """Inverse of rgb_to_yuv: NCHW YUV (chroma centered at 128) back to RGB."""
    luma = yuv[:, 0:1]
    cb = yuv[:, 1:2] - 128.0
    cr = yuv[:, 2:3] - 128.0
    red = luma + 1.402 * cr
    green = luma - 0.344136 * cb - 0.714136 * cr
    blue = luma + 1.772 * cb
    return torch.cat([red, green, blue], dim=1)


def segment_polygon(frame_idx: int, width: int, height: int) -> list[tuple[float, float]]:
    """Return the driving-corridor quad for a frame in pixel coordinates.

    Vertices are hand-tuned per 300-frame segment and stored as (x, y)
    fractions of the frame size; indices past the table get a generic
    fallback corridor.
    """
    table = (
        (0, 299, ((0.14, 0.52), (0.82, 0.48), (0.98, 1.00), (0.05, 1.00))),
        (300, 599, ((0.10, 0.50), (0.76, 0.47), (0.92, 1.00), (0.00, 1.00))),
        (600, 899, ((0.18, 0.50), (0.84, 0.47), (0.98, 1.00), (0.06, 1.00))),
        (900, 1199, ((0.22, 0.52), (0.90, 0.49), (1.00, 1.00), (0.10, 1.00))),
    )
    for lo, hi, verts in table:
        if lo <= frame_idx <= hi:
            return [(fx * width, fy * height) for fx, fy in verts]
    return [(0.15 * width, 0.52 * height), (0.85 * width, 0.48 * height), (width, height), (0, height)]


def build_mask(frame_idx: int, width: int, height: int, feather_radius: int) -> torch.Tensor:
    """Rasterize the ROI polygon into a (1, 1, H, W) float mask in [0, 1].

    The polygon interior is 1.0, the outside 0.0; a positive
    `feather_radius` Gaussian-blurs the edge for a soft transition.
    """
    canvas = Image.new("L", (width, height), 0)
    ImageDraw.Draw(canvas).polygon(segment_polygon(frame_idx, width, height), fill=255)
    if feather_radius > 0:
        canvas = canvas.filter(ImageFilter.GaussianBlur(radius=feather_radius))
    # .clone() detaches from the (read-only) byte buffer before reshaping.
    raw = torch.frombuffer(memoryview(canvas.tobytes()), dtype=torch.uint8).clone()
    mask = raw.view(height, width).float() / 255.0
    return mask[None, None]


def process_frame(
    frame_rgb: torch.Tensor,
    frame_idx: int,
    outside_luma_denoise: float,
    outside_chroma_mode: str,
    feather_radius: int,
    outside_blend: float,
) -> torch.Tensor:
    """Blend a denoised/chroma-flattened copy of the frame into the area
    outside the driving-corridor ROI.

    `frame_rgb` is (H, W, 3) uint8; the return value has the same shape and
    dtype. Inside the (feathered) ROI the original pixels are kept; outside,
    the filtered version is mixed in with weight `outside_blend`.
    """
    batch = frame_rgb.permute(2, 0, 1).float().unsqueeze(0)  # (1, 3, H, W)
    w, h = batch.shape[-1], batch.shape[-2]
    roi = build_mask(frame_idx, w, h, feather_radius).to(batch.device)
    # Filter in YUV so luma denoise and chroma collapse stay independent.
    filtered = rgb_to_yuv(batch).clone()
    filtered = apply_luma_denoise(filtered, outside_luma_denoise)
    filtered = collapse_chroma(filtered, outside_chroma_mode)
    filtered_rgb = yuv_to_rgb(filtered)
    # alpha is 0 inside the ROI, up to outside_blend outside it.
    alpha = (1.0 - roi) * outside_blend
    blended = batch * (1.0 - alpha) + filtered_rgb * alpha
    return blended.clamp(0, 255).round().to(torch.uint8).squeeze(0).permute(1, 2, 0)


def main() -> None:
    """CLI entry point: decode the input video, apply the per-frame ROI
    processing, and write the result losslessly (FFV1, yuv420p) at 20 fps."""
    parser = argparse.ArgumentParser(description="Hand-authored ROI preprocessor for AV1 encode.")
    parser.add_argument("--input", type=Path, required=True)
    parser.add_argument("--output", type=Path, required=True)
    parser.add_argument("--outside-luma-denoise", type=float, default=0.0)
    parser.add_argument("--outside-chroma-mode", type=str, default="normal")
    parser.add_argument("--feather-radius", type=int, default=32)
    parser.add_argument("--outside-blend", type=float, default=1.0)
    args = parser.parse_args()

    src_container = av.open(str(args.input))
    src_stream = src_container.streams.video[0]

    dst_container = av.open(str(args.output), mode="w")
    encoder = dst_container.add_stream("ffv1", rate=20)
    encoder.width = src_stream.width
    encoder.height = src_stream.height
    encoder.pix_fmt = "yuv420p"

    for idx, frame in enumerate(src_container.decode(src_stream)):
        processed = process_frame(
            yuv420_to_rgb(frame),
            frame_idx=idx,
            outside_luma_denoise=args.outside_luma_denoise,
            outside_chroma_mode=args.outside_chroma_mode,
            feather_radius=args.feather_radius,
            outside_blend=args.outside_blend,
        )
        out_frame = av.VideoFrame.from_ndarray(processed.cpu().numpy(), format="rgb24")
        for packet in encoder.encode(out_frame):
            dst_container.mux(packet)

    # Flush the encoder's buffered packets before closing.
    for packet in encoder.encode():
        dst_container.mux(packet)

    dst_container.close()
    src_container.close()


if __name__ == "__main__":
    main()