From ced98978059b1e08b8c7c367b6b7f4cd4b6eb9e7 Mon Sep 17 00:00:00 2001 From: weihexiang Date: Thu, 2 Jul 2026 13:11:03 +0800 Subject: [PATCH] =?UTF-8?q?Fix=20off-by-one=20crash=20in=20forward=5Fslidi?= =?UTF-8?q?ng1=20for=20T=20=E2=89=A1=201=20(mod=20chunk=5Fsize)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The per-chunk loop iterates `num_chunks = ceil((T-1)/chunk_size)` times (driven by the pairwise frame count B_pair = T-1), but points / masks / world_points / camera_poses contain T frames and are split into `ceil(T/chunk_size)` chunks. When T ≡ 1 (mod chunk_size) — e.g. T=133 with chunk_size=12 — the last chunk is never appended, so the returned points sequence is one frame short. This surfaces downstream as a RuntimeError in infer_pair: "The size of tensor a (132) must match the size of tensor b (131)" at `flow3d = flows3d_e[None] + points[None, 0:-1]`, and also in get_aligned_scene_flow_temporal. Fix: append the leftover trailing chunks after the loop so points/masks/ world_points/camera_poses regain their full T frames. Verified by rerunning a 133-frame clip in 3d_efep mode (previously crashed, now completes). --- track4world/nets/model.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/track4world/nets/model.py b/track4world/nets/model.py index a5740be..df55e34 100644 --- a/track4world/nets/model.py +++ b/track4world/nets/model.py @@ -1409,6 +1409,16 @@ def forward_sliding1( del fmaps_chunk, ctxfeats_chunk, pms_chunk, fmaps3d_detail_chunk torch.cuda.empty_cache() + # The loop runs num_chunks = ceil((T-1)/chunk_size) times, but points/masks/ + # world_points/camera_poses are split into ceil(T/chunk_size) chunks. When + # T ≡ 1 (mod chunk_size) the last chunk is dropped, leaving points one frame + # short (causes a shape mismatch in infer_pair). Append the leftover chunks. + for j in range(num_chunks, len(points_chunks)): + points_list.append(padder.unpad(points_chunks[j])) + masks_list.append(padder.unpad(masks_chunks[j])) + world_points_list.append(padder.unpad(world_points_chunks[j])) + camera_poses_list.append(camera_poses_chunks[j]) + # --- Aggregate Chunks into Final Tensors --- all_flow_preds, all_visconf_preds = [], [] all_flow3d_preds = [] if tracking3d else None