From ced98978059b1e08b8c7c367b6b7f4cd4b6eb9e7 Mon Sep 17 00:00:00 2001
From: weihexiang <w.cranesoar@gmail.com>
Date: Thu, 2 Jul 2026 13:11:03 +0800
Subject: [PATCH] =?UTF-8?q?Fix=20off-by-one=20crash=20in=20forward=5Fslidi?=
 =?UTF-8?q?ng1=20for=20T=20=E2=89=A1=201=20(mod=20chunk=5Fsize)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The per-chunk loop iterates `num_chunks = ceil((T-1)/chunk_size)` times
(driven by the pairwise frame count B_pair = T-1), but points / masks /
world_points / camera_poses contain T frames and are split into
`ceil(T/chunk_size)` chunks. When T ≡ 1 (mod chunk_size) — e.g. T=133 with
chunk_size=12, which gives 11 loop iterations but 12 chunks — the last chunk
is never appended, so each of these returned sequences is one frame short.

This surfaces downstream as a RuntimeError in infer_pair, at
`flow3d = flows3d_e[None] + points[None, 0:-1]`:
  "The size of tensor a (132) must match the size of tensor b (131)"
(for T=133: 132 = T-1 pairwise flows, versus 131 frames left after slicing
the truncated 132-frame points), and also in get_aligned_scene_flow_temporal.

Fix: append the leftover trailing chunks after the loop so points/masks/
world_points/camera_poses regain their full T frames. Verified by rerunning
a 133-frame clip in 3d_efep mode (previously crashed, now completes).
---
 track4world/nets/model.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/track4world/nets/model.py b/track4world/nets/model.py
index a5740be..df55e34 100644
--- a/track4world/nets/model.py
+++ b/track4world/nets/model.py
@@ -1409,6 +1409,16 @@ def forward_sliding1(
             del fmaps_chunk, ctxfeats_chunk, pms_chunk, fmaps3d_detail_chunk
             torch.cuda.empty_cache()
 
+        # The loop runs num_chunks = ceil((T-1)/chunk_size) times, but points/masks/
+        # world_points/camera_poses are split into ceil(T/chunk_size) chunks. When
+        # T ≡ 1 (mod chunk_size) the last chunk is never visited, leaving all four
+        # one frame short (shape mismatch in infer_pair). Append any leftover chunks.
+        for j in range(num_chunks, len(points_chunks)):
+            points_list.append(padder.unpad(points_chunks[j]))
+            masks_list.append(padder.unpad(masks_chunks[j]))
+            world_points_list.append(padder.unpad(world_points_chunks[j]))
+            camera_poses_list.append(camera_poses_chunks[j])
+
         # --- Aggregate Chunks into Final Tensors ---
         all_flow_preds, all_visconf_preds = [], []
         all_flow3d_preds = [] if tracking3d else None