diff --git a/.gitignore b/.gitignore index 8eb1ae4..8936beb 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,8 @@ __pycache__/ # C extensions *.so +!lib/*.so +!lib/*.so.* # Distribution / packaging .Python @@ -15,6 +17,7 @@ downloads/ eggs/ .eggs/ lib/ +!lib/ lib64/ parts/ sdist/ diff --git a/ffmpeg-new b/ffmpeg-new new file mode 100755 index 0000000..3dc330e Binary files /dev/null and b/ffmpeg-new differ diff --git a/pyproject.toml b/pyproject.toml index a92afe2..8595876 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,6 +11,9 @@ dependencies = [ "tqdm", "pillow", "av", + "charset-normalizer", + "requests", + "urllib3", ] [dependency-groups] diff --git a/submissions/av1_roi_lanczos_unsharp/compress.sh b/submissions/av1_roi_lanczos_unsharp/compress.sh new file mode 100644 index 0000000..bf246df --- /dev/null +++ b/submissions/av1_roi_lanczos_unsharp/compress.sh @@ -0,0 +1,81 @@ +#!/usr/bin/env bash +set -euo pipefail + +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PD="$(cd "${HERE}/../.." 
&& pwd)" +TMP_DIR="${PD}/tmp/av1_roi_lanczos_unsharp" + +IN_DIR="${PD}/videos" +VIDEO_NAMES_FILE="${PD}/public_test_video_names.txt" +ARCHIVE_DIR="${HERE}/archive" +JOBS="1" + +while [[ $# -gt 0 ]]; do + case "$1" in + --in-dir|--in_dir) + IN_DIR="${2%/}"; shift 2 ;; + --jobs) + JOBS="$2"; shift 2 ;; + --video-names-file|--video_names_file) + VIDEO_NAMES_FILE="$2"; shift 2 ;; + *) + echo "Unknown arg: $1" >&2 + echo "Usage: $0 [--in-dir ] [--jobs ] [--video-names-file ]" >&2 + exit 2 ;; + esac +done + +rm -rf "$ARCHIVE_DIR" +mkdir -p "$ARCHIVE_DIR" +mkdir -p "$TMP_DIR" + +export IN_DIR ARCHIVE_DIR PD + +head -n "$(wc -l < "$VIDEO_NAMES_FILE")" "$VIDEO_NAMES_FILE" | xargs -P"$JOBS" -I{} bash -lc ' + rel="$1" + [[ -z "$rel" ]] && exit 0 + + IN="${IN_DIR}/${rel}" + BASE="${rel%.*}" + OUT="${ARCHIVE_DIR}/${BASE}.mkv" + PRE_IN="'"${TMP_DIR}"'/${BASE}.pre.mkv" + + echo "→ ${IN} → ${OUT}" + + # Step 1: ROI preprocess — denoise outside driving corridor + rm -f "$PRE_IN" + python "'"${HERE}"'/preprocess.py" \ + --input "$IN" \ + --output "$PRE_IN" \ + --outside-luma-denoise 2.5 \ + --outside-chroma-mode medium \ + --feather-radius 24 \ + --outside-blend 0.50 + + # Step 2: Downscale + AV1 encode + FFMPEG="'"${HERE}"'/ffmpeg-new" + [ ! -x "$FFMPEG" ] && FFMPEG="ffmpeg" + export LD_LIBRARY_PATH="'"${HERE}"'/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" + "$FFMPEG" -nostdin -y -hide_banner -loglevel warning \ + -r 20 -fflags +genpts -i "$PRE_IN" \ + -vf "scale=trunc(iw*0.45/2)*2:trunc(ih*0.45/2)*2:flags=lanczos" \ + -pix_fmt yuv420p -c:v libsvtav1 -preset 0 -crf 33 \ + -svtav1-params "film-grain=22:keyint=180:scd=0" \ + -r 20 "$OUT" + + rm -f "$PRE_IN" +' _ {} + +# zip archive +cd "$ARCHIVE_DIR" +if command -v zip &>/dev/null; then + zip -r "${HERE}/archive.zip" . 
def decode_and_resize_to_file(video_path: str, dst: str) -> int:
    """Decode a video and dump every frame to ``dst`` as raw RGB bytes.

    Each frame of the first video stream is converted with
    ``yuv420_to_rgb``. When its height/width differ from ``camera_size`` it
    is resized with Pillow's Lanczos filter and then sharpened with an
    unsharp mask (``x + STRENGTH * (x - blur)``, where ``blur`` is a
    depthwise convolution with the module-level ``KERNEL``). Frames that
    already match ``camera_size`` are written unchanged.

    Args:
        video_path: Path of the encoded video. Paths ending in ``.hevc`` are
            opened with the ``hevc`` format forced; anything else is probed.
        dst: Path of the output file; it is created/truncated and filled
            with the frames' bytes back to back, with no header.

    Returns:
        The number of frames written.
    """
    target_w, target_h = camera_size
    fmt = 'hevc' if video_path.endswith('.hevc') else None
    container = av.open(video_path, format=fmt)
    n = 0
    try:
        stream = container.streams.video[0]
        with open(dst, 'wb') as f:
            for frame in container.decode(stream):
                # NOTE(review): assumes yuv420_to_rgb returns a uint8 CPU
                # tensor laid out (H, W, 3) -- confirm against frame_utils.
                t = yuv420_to_rgb(frame)
                H, W, _ = t.shape
                if H != target_h or W != target_w:
                    pil = Image.fromarray(t.numpy())
                    pil = pil.resize((target_w, target_h), Image.LANCZOS)
                    x = torch.from_numpy(np.array(pil)).permute(2, 0, 1).unsqueeze(0).float().to(DEVICE)
                    # Reflect-pad by 4 so the 9x9 kernel yields a same-size
                    # blur without injecting zeros at the frame border.
                    blur = F.conv2d(F.pad(x, (4, 4, 4, 4), mode='reflect'), KERNEL, padding=0, groups=3)
                    x = x + STRENGTH * (x - blur)
                    t = x.clamp(0, 255).squeeze(0).permute(1, 2, 0).round().cpu().to(torch.uint8)
                f.write(t.contiguous().numpy().tobytes())
                n += 1
    finally:
        # Release the demuxer/decoder even when a frame fails part-way.
        container.close()
    return n
src, dst = sys.argv[1], sys.argv[2] + n = decode_and_resize_to_file(src, dst) + print(f"saved {n} frames") diff --git a/submissions/av1_roi_lanczos_unsharp/inflate.sh b/submissions/av1_roi_lanczos_unsharp/inflate.sh new file mode 100644 index 0000000..a54b563 --- /dev/null +++ b/submissions/av1_roi_lanczos_unsharp/inflate.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash +# Must produce a raw video file at `/.raw`. +# A `.raw` file is a flat binary dump of uint8 RGB frames with shape `(N, H, W, 3)` +# where N is the number of frames, H and W match the original video dimensions, no header. +set -euo pipefail + +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT="$(cd "$HERE/../.." && pwd)" +SUB_NAME="$(basename "$HERE")" + +DATA_DIR="$1" +OUTPUT_DIR="$2" +FILE_LIST="$3" + +mkdir -p "$OUTPUT_DIR" + +while IFS= read -r line; do + [ -z "$line" ] && continue + BASE="${line%.*}" + SRC="${DATA_DIR}/${BASE}.mkv" + DST="${OUTPUT_DIR}/${BASE}.raw" + + [ ! -f "$SRC" ] && echo "ERROR: ${SRC} not found" >&2 && exit 1 + + printf "Decoding + resizing %s ... 
def collapse_chroma(x: torch.Tensor, mode: str) -> torch.Tensor:
    """Smooth the chroma channels of a YUV batch with a box filter.

    Channels 1 and 2 of ``x`` (``x[:, 1:3]``) are replaced, in place, by a
    stride-1 average pool with a ``(2 * k + 1)``-pixel square window.
    Channel 0 is left untouched.

    Args:
        x: Tensor indexed as ``(batch, channel, height, width)`` with at
            least three channels. It is modified in place.
        mode: ``"normal"`` (no-op), ``"soft"`` (k=1), ``"medium"`` (k=2) or
            ``"strong"`` (k=4).

    Returns:
        The same tensor object that was passed in.

    Raises:
        ValueError: If ``mode`` is not one of the four names above.
    """
    if mode == "normal":
        return x

    radius_by_mode = {"soft": 1, "medium": 2, "strong": 4}
    if mode not in radius_by_mode:
        raise ValueError(f"unknown chroma mode: {mode}")
    k = radius_by_mode[mode]

    # count_include_pad=False: average only over real pixels. With the
    # default (True) the zero padding is counted, which drags chroma near
    # the frame border toward 0 and tints the edges.
    x[:, 1:3] = F.avg_pool2d(
        x[:, 1:3],
        kernel_size=2 * k + 1,
        stride=1,
        padding=k,
        count_include_pad=False,
    )
    return x
def segment_polygon(frame_idx: int, width: int, height: int) -> list[tuple[float, float]]:
    """Return the four corner points of the ROI polygon for one frame.

    A fixed schedule maps inclusive frame-index ranges to quadrilaterals
    given in normalised (0..1) coordinates. The matching quadrilateral is
    scaled to pixel coordinates. Frame indices outside every range get a
    default trapezoid whose bottom edge spans the full frame width.

    Args:
        frame_idx: Index of the frame within the video.
        width: Frame width in pixels.
        height: Frame height in pixels.

    Returns:
        Four ``(x, y)`` points in pixel coordinates.
    """
    # (first frame, last frame, normalised corners) -- both bounds inclusive.
    schedule = (
        (0, 299, ((0.14, 0.52), (0.82, 0.48), (0.98, 1.00), (0.05, 1.00))),
        (300, 599, ((0.10, 0.50), (0.76, 0.47), (0.92, 1.00), (0.00, 1.00))),
        (600, 899, ((0.18, 0.50), (0.84, 0.47), (0.98, 1.00), (0.06, 1.00))),
        (900, 1199, ((0.22, 0.52), (0.90, 0.49), (1.00, 1.00), (0.10, 1.00))),
    )

    unit_corners = None
    for first, last, corners in schedule:
        if first <= frame_idx and frame_idx <= last:
            unit_corners = corners
            break

    if unit_corners is None:
        # Default trapezoid; the bottom corners use width/height as given.
        top_left = (0.15 * width, 0.52 * height)
        top_right = (0.85 * width, 0.48 * height)
        return [top_left, top_right, (width, height), (0, height)]

    scaled = []
    for fx, fy in unit_corners:
        scaled.append((fx * width, fy * height))
    return scaled
def main() -> None:
    """Command-line entry point: ROI-preprocess a video into an FFV1 file.

    Reads the first video stream of ``--input``, runs every decoded frame
    through ``process_frame`` with the given options, and writes the result
    to ``--output`` as an FFV1 stream (rate 20, ``yuv420p``) with the same
    width and height as the input.

    Options:
        --input / --output: Source and destination paths (required).
        --outside-luma-denoise: Strength forwarded to ``process_frame``
            (default 0.0).
        --outside-chroma-mode: Mode name forwarded to ``process_frame``
            (default "normal").
        --feather-radius: Feather radius forwarded to ``process_frame``
            (default 32).
        --outside-blend: Blend factor forwarded to ``process_frame``
            (default 1.0).
    """
    parser = argparse.ArgumentParser(description="Hand-authored ROI preprocessor for AV1 encode.")
    parser.add_argument("--input", type=Path, required=True)
    parser.add_argument("--output", type=Path, required=True)
    parser.add_argument("--outside-luma-denoise", type=float, default=0.0)
    parser.add_argument("--outside-chroma-mode", type=str, default="normal")
    parser.add_argument("--feather-radius", type=int, default=32)
    parser.add_argument("--outside-blend", type=float, default=1.0)
    args = parser.parse_args()

    # Context managers close both containers even when a frame fails to
    # decode, process or encode. The output is opened only after the input
    # stream has been located, and is closed before the input.
    with av.open(str(args.input)) as in_container:
        in_stream = in_container.streams.video[0]

        with av.open(str(args.output), mode="w") as out_container:
            out_stream = out_container.add_stream("ffv1", rate=20)
            out_stream.width = in_stream.width
            out_stream.height = in_stream.height
            out_stream.pix_fmt = "yuv420p"

            for frame_idx, frame in enumerate(in_container.decode(in_stream)):
                rgb = yuv420_to_rgb(frame)
                out_rgb = process_frame(
                    rgb,
                    frame_idx=frame_idx,
                    outside_luma_denoise=args.outside_luma_denoise,
                    outside_chroma_mode=args.outside_chroma_mode,
                    feather_radius=args.feather_radius,
                    outside_blend=args.outside_blend,
                )
                video_frame = av.VideoFrame.from_ndarray(out_rgb.cpu().numpy(), format="rgb24")
                for packet in out_stream.encode(video_frame):
                    out_container.mux(packet)

            # Calling encode() with no frame flushes the encoder's
            # buffered packets.
            for packet in out_stream.encode():
                out_container.mux(packet)