diff --git a/environment_linux.yml b/environment_linux.yml index 4c7da25b0..267475af1 100644 --- a/environment_linux.yml +++ b/environment_linux.yml @@ -56,7 +56,7 @@ dependencies: - pydegensac - colour - trimesh[easy] - - gtsam-develop==4.3a1.dev202603090403 + - gtsam-develop==4.3a1.dev202604240153 - pydot # dust3r/mast3r - roma diff --git a/environment_linux_cpuonly.yml b/environment_linux_cpuonly.yml index 03362fdf2..6858531d4 100644 --- a/environment_linux_cpuonly.yml +++ b/environment_linux_cpuonly.yml @@ -56,7 +56,7 @@ dependencies: - pydegensac - colour - trimesh[easy] - - gtsam-develop==4.3a1.dev202603090403 + - gtsam-develop==4.3a1.dev202604240153 - pydot - roma - tqdm diff --git a/environment_mac.yml b/environment_mac.yml index 5f4c0a488..89a8359e5 100644 --- a/environment_mac.yml +++ b/environment_mac.yml @@ -57,7 +57,7 @@ dependencies: - pydegensac - colour - trimesh[easy] - - gtsam-develop==4.3a1.dev202603090403 + - gtsam-develop==4.3a1.dev202604240153 - pydot # dust3r/mast3r - roma diff --git a/gtsfm/averaging/translation/averaging_1dsfm.py b/gtsfm/averaging/translation/averaging_1dsfm.py index 2c4463176..0600fbf73 100644 --- a/gtsfm/averaging/translation/averaging_1dsfm.py +++ b/gtsfm/averaging/translation/averaging_1dsfm.py @@ -45,6 +45,7 @@ from gtsfm.evaluation.metrics import GtsfmMetric, GtsfmMetricsGroup from gtsfm.products.visibility_graph import AnnotatedGraph, ImageIndexPair, ImageIndexPairs from gtsfm.utils import align, transform +from gtsfm.utils.tracks import compute_world_directions # Hyperparameters for 1D-SFM # maximum number of times 1dsfm will project the Unit3's to a 1d subspace for outlier rejection @@ -416,25 +417,9 @@ def _get_landmark_directions( Returns: Dictionary of unit directions from camera to track in world frame indexed by (track_id, camera_id). """ - landmark_directions = {} - for track_id, track in enumerate(tracks_2d): - for j in range(track.number_measurements()): - measurement = track.measurement(j) - cam_idx = measurement.i - - # TODO(akshay-krishnan): consider changing input type and checking elsewhere - intrinsics_i = intrinsics[cam_idx] - wRi = wRi_list[cam_idx] - assert wRi is not None, "Camera must have valid rotation to use tracks for averaging." - assert intrinsics_i is not None, "Camera must be calibrated to use tracks for averaging." - - measurement_xy = intrinsics_i.calibrate(measurement.uv) - measurement_homogeneous = Point3(measurement_xy[0], measurement_xy[1], 1.0) - w_cam_U_track = Unit3(wRi.rotate(Unit3(measurement_homogeneous).point3())) - - # Direction starts at camera, but first index is track_id. - landmark_directions[(track_id, cam_idx)] = w_cam_U_track - return landmark_directions + observations = compute_world_directions(tracks_2d, intrinsics, wRi_list) + return {(track_id, cam_idx): Unit3(Point3(*direction)) + for track_id, cam_idx, direction in observations} def __run_averaging( self, diff --git a/gtsfm/configs/sift_global_positioner.yaml b/gtsfm/configs/sift_global_positioner.yaml new file mode 100644 index 000000000..2e27472e0 --- /dev/null +++ b/gtsfm/configs/sift_global_positioner.yaml @@ -0,0 +1,128 @@ +# MegaLoc + SIFT + Global Positioner with Single partition. +# Based on unified_megaloc.yaml with SIFT detector/matcher swap + GP. 
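+# Example invocation (dataset path is illustrative; see scripts/visualize_gp_convergence.py):
+#   ./run --config_name sift_global_positioner --loader olsson --dataset_dir tests/data/set1_lund_door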
+ +# @package _global_ +_target_: gtsfm.scene_optimizer.SceneOptimizer + +loader: + _target_: gtsfm.loader.Olsson + +image_pairs_generator: + _target_: gtsfm.retriever.image_pairs_generator.ImagePairsGenerator + global_descriptor: + _target_: gtsfm.frontend.cacher.global_descriptor_cacher.GlobalDescriptorCacher + global_descriptor_obj: + _target_: gtsfm.frontend.global_descriptor.MegaLoc + retriever: + _target_: gtsfm.retriever.Similarity + num_matched: 100 + min_score: 0.15 + use_diffusion_preprocessing: false + use_mst_augmentation: false + batch_size: 16 + +graph_partitioner: + _target_: gtsfm.graph_partitioner.Single + +cluster_optimizer: + _target_: gtsfm.cluster_optimizer.Multiview + + correspondence_generator: + _target_: gtsfm.frontend.correspondence_generator.det_desc_correspondence_generator.DetDescCorrespondenceGenerator + + detector_descriptor: + _target_: gtsfm.frontend.cacher.detector_descriptor_cacher.DetectorDescriptorCacher + detector_descriptor_obj: + _target_: gtsfm.frontend.detector_descriptor.colmap_sift.ColmapSIFTDetectorDescriptor + max_keypoints: 8192 + + matcher: + _target_: gtsfm.frontend.cacher.matcher_cacher.MatcherCacher + matcher_obj: + _target_: gtsfm.frontend.matcher.twoway_matcher.TwoWayMatcher + ratio_test_threshold: 0.8 + + two_view_estimator: + _target_: gtsfm.two_view_estimator_cacher.TwoViewEstimatorCacher + two_view_estimator_obj: + _target_: gtsfm.two_view_estimator.TwoViewEstimator + bundle_adjust_2view: True + eval_threshold_px: 4 + ba_reproj_error_thresholds: [0.5] + bundle_adjust_2view_maxiters: 100 + + verifier: + _target_: gtsfm.frontend.verifier.poselib_verifier.PoseLibVerifier + estimation_threshold_px: 2 + + triangulation_options: + _target_: gtsfm.data_association.point3d_initializer.TriangulationOptions + mode: + _target_: gtsfm.data_association.point3d_initializer.TriangulationSamplingMode + value: NO_RANSAC + + inlier_support_processor: + _target_: gtsfm.two_view_estimator.InlierSupportProcessor + min_num_inliers_est_model: 30 + min_inlier_ratio_est_model: 0.15 + save_gtsfm_data: True + save_3d_viz: False + save_two_view_viz: False + pose_angular_error_thresh: 5 + multiview_optimizer: + _target_: gtsfm.multi_view_optimizer.MultiViewOptimizer + + view_graph_estimator: + _target_: gtsfm.view_graph_estimator.cycle_consistent_rotation_estimator.CycleConsistentRotationViewGraphEstimator + edge_error_aggregation_criterion: MIN_EDGE_ERROR + error_threshold: 7.0 + + rot_avg_module: + _target_: gtsfm.averaging.rotation.shonan.ShonanRotationAveraging + weight_by_inliers: True + + trans_avg_module: + _target_: gtsfm.averaging.translation.averaging_1dsfm.TranslationAveraging1DSFM + robust_measurement_noise: True + projection_sampling_method: SAMPLE_INPUT_MEASUREMENTS + reject_outliers: False + use_all_tracks_for_averaging: True + use_relative_camera_poses: True + + data_association_module: + _target_: gtsfm.data_association.data_assoc.DataAssociation + min_track_len: 3 + triangulation_options: + _target_: gtsfm.data_association.point3d_initializer.TriangulationOptions + reproj_error_threshold: 10 + mode: + _target_: gtsfm.data_association.point3d_initializer.TriangulationSamplingMode + value: RANSAC_SAMPLE_UNIFORM + max_num_hypotheses: 100 + save_track_patches_viz: False + + global_positioner: + _target_: gtsfm.global_positioner.global_positioner.GlobalPositioner + noise_sigma: 0.01 + huber_loss_scale: 0.1 + max_iterations: 50 + min_track_measurements: 3 + max_reproj_error: 10.0 + save_iteration_visualization: True + + run_view_graph_calibration: 
true + rotation_outlier_threshold_deg: 0 + + bundle_adjustment_module: + _target_: gtsfm.bundle.bundle_adjustment.BundleAdjustmentOptimizer + reproj_error_thresholds: [10, 5, 3] + robust_ba_mode: "HUBER" + shared_calib: False + cam_pose3_prior_noise_sigma: 0.1 + calibration_prior_focal_sigma: 200.0 + measurement_noise_sigma: 1.0 + use_calibration_prior: true + + dense_multiview_optimizer: null + gaussian_splatting_optimizer: null + diff --git a/gtsfm/global_positioner/__init__.py b/gtsfm/global_positioner/__init__.py new file mode 100644 index 000000000..b70013e14 --- /dev/null +++ b/gtsfm/global_positioner/__init__.py @@ -0,0 +1 @@ +"""Global positioner module — joint camera + point estimation using bearing constraints.""" diff --git a/gtsfm/global_positioner/global_positioner.py b/gtsfm/global_positioner/global_positioner.py new file mode 100644 index 000000000..69bfbc437 --- /dev/null +++ b/gtsfm/global_positioner/global_positioner.py @@ -0,0 +1,403 @@ +"""Global positioning of cameras and 3D points via direction constraints. + +Implements the global positioning step from GLOMAP (Pan et al., ECCV 2024), +which jointly estimates camera positions and 3D point positions using +unit direction constraints from 2D feature tracks. + +This module replaces BOTH translation averaging AND data association +(triangulation) in the GTSFM pipeline. It outputs GtsfmData directly — +cameras with full poses + 3D tracks with landmarks — ready for bundle +adjustment. + +Pipeline integration: + BEFORE: rot_avg → trans_avg → data_association → BA + AFTER: rot_avg → GlobalPositioner → BA + +Uses gtsam.GlobalPositioner, which solves the bipartite camera+landmark +estimation problem directly using BilinearAngleTranslationFactor (BATA). + +References: + Pan, L., Barath, D., Pollefeys, M., Schonberger, J.L. + "Global Structure-from-Motion Revisited." ECCV 2024. 
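+
+    Zhuang, B., Cheong, L.-F., Lee, G.H.
+    "Baseline Desensitizing in Translation Averaging." CVPR 2018. (BATA)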
+ +Authors: Kathir Gounder +""" + +import os +import pickle +import time +from pathlib import Path +from typing import List, Optional, Set, Tuple, Union + +import gtsam +import numpy as np +from gtsam import ( + BinaryMeasurementUnit3, + BinaryMeasurementsUnit3, + KeySet, + Point3, + Pose3, + Rot3, + SfmTrack, + Unit3, + Values, +) +from gtsam.symbol_shorthand import A as C # Camera position variables +from gtsam.symbol_shorthand import B as L # Landmark position variables + +import gtsfm.common.types as gtsfm_types +import gtsfm.utils.logger as logger_utils +from gtsfm.common.gtsfm_data import GtsfmData +from gtsfm.common.sfm_track import SfmTrack2d +from gtsfm.evaluation.metrics import GtsfmMetric, GtsfmMetricsGroup +from gtsfm.utils.tracks import compute_world_directions, filter_tracks_by_measurements + +logger = logger_utils.get_logger() + +MIN_TRACK_MEASUREMENTS = 3 +DEFAULT_HUBER_LOSS_SCALE = 0.1 +DEFAULT_DIRECTION_NOISE_SIGMA = 0.01 +DEFAULT_MAX_ITERATIONS = 100 +DEFAULT_MAX_REPROJ_ERROR = 5.0 +DEFAULT_MAX_TRACKS = 0 + + +def _build_inputs( + observations: List[Tuple[int, int, np.ndarray]], + noise_sigma: float, + huber_loss_scale: float, +) -> Tuple[BinaryMeasurementsUnit3, KeySet, KeySet]: + """Build direction measurements and explicit camera/landmark key sets.""" + noise_model = gtsam.noiseModel.Isotropic.Sigma(2, noise_sigma) + huber = gtsam.noiseModel.mEstimator.Huber.Create(huber_loss_scale) + robust_noise = gtsam.noiseModel.Robust.Create(huber, noise_model) + + measurements = BinaryMeasurementsUnit3() + camera_keys = KeySet() + landmark_keys = KeySet() + + for track_idx, cam_idx, direction in observations: + measurements.append(BinaryMeasurementUnit3( + C(cam_idx), L(track_idx), Unit3(Point3(*direction)), robust_noise, + )) + camera_keys.insert(C(cam_idx)) + landmark_keys.insert(L(track_idx)) + + return measurements, camera_keys, landmark_keys + + +def build_and_solve( + observations: List[Tuple[int, int, np.ndarray]], + noise_sigma: float, + huber_loss_scale: float, + max_iterations: int, +) -> Tuple[Values, Set[int], Set[int]]: + """Solve via gtsam.GlobalPositioner — graph, gauge, init, and LM in one call.""" + measurements, camera_keys, landmark_keys = _build_inputs(observations, noise_sigma, huber_loss_scale) + anchor_key = min(camera_keys) + + logger.info("GlobalPositioner: %d measurements, %d cameras, %d landmarks", + len(observations), len(camera_keys), len(landmark_keys)) + + lm_params = gtsam.LevenbergMarquardtParams() + lm_params.setMaxIterations(max_iterations) + lm_params.setRelativeErrorTol(1e-5) + lm_params.setVerbosityLM("SUMMARY") + + gp = gtsam.GlobalPositioner(lm_params) + result = gp.run(measurements, camera_keys, landmark_keys, anchor_key) + + cam_indices = {gtsam.Symbol(k).index() for k in camera_keys} + track_indices = {gtsam.Symbol(k).index() for k in landmark_keys} + return result, cam_indices, track_indices + + +def build_and_solve_with_trace( + observations: List[Tuple[int, int, np.ndarray]], + noise_sigma: float, + huber_loss_scale: float, + max_iterations: int, +) -> Tuple[Values, Set[int], Set[int], List[Values], List[float]]: + """Solve with per-iteration capture for convergence visualization.""" + measurements, camera_keys, landmark_keys = _build_inputs(observations, noise_sigma, huber_loss_scale) + anchor_key = min(camera_keys) + + logger.info("GlobalPositioner (trace): %d measurements, %d cameras, %d landmarks", + len(observations), len(camera_keys), len(landmark_keys)) + + lm_params = gtsam.LevenbergMarquardtParams() + 
lm_params.setMaxIterations(max_iterations) + lm_params.setRelativeErrorTol(1e-5) + lm_params.setVerbosityLM("SILENT") + + gp = gtsam.GlobalPositioner(lm_params) + graph = gp.buildGraph(measurements, True) + gp.addAnchorPrior(anchor_key, graph) + initial = gp.initializeRandomly(camera_keys, landmark_keys, measurements) + + initial_error = graph.error(initial) + logger.info("GlobalPositioner (trace): initial error = %.2f", initial_error) + + optimizer = gtsam.LevenbergMarquardtOptimizer(graph, initial, lm_params) + values_trace = [initial] + error_trace = [initial_error] + + for iteration in range(1, max_iterations + 1): + t0 = time.time() + optimizer.iterate() + dt = time.time() - t0 + error = optimizer.error() + values_trace.append(optimizer.values()) + error_trace.append(error) + if iteration <= 5 or iteration % 5 == 0: + logger.info(" iter %3d: error=%.2f (%.2fs)", iteration, error, dt) + # Early stopping on convergence. + if len(error_trace) >= 2: + relative_change = abs(error_trace[-2] - error) / max(error_trace[-2], 1e-12) + if relative_change < 1e-5: + logger.info(" Converged at iteration %d (relative change %.2e)", iteration, relative_change) + break + + result = optimizer.values() + cam_indices = {gtsam.Symbol(k).index() for k in camera_keys} + track_indices = {gtsam.Symbol(k).index() for k in landmark_keys} + logger.info("GlobalPositioner (trace): final error = %.4f (%d frames)", error_trace[-1], len(values_trace)) + return result, cam_indices, track_indices, values_trace, error_trace + + +def build_gtsfm_data( + result_values: Values, + num_images: int, + wRi_list: List[Optional[Rot3]], + intrinsics: List[Optional[gtsfm_types.CALIBRATION_TYPE]], + tracks_2d: List[SfmTrack2d], + camera_indices: Set[int], + track_indices: Set[int], + valid_cameras: Set[int], + max_reproj_error: float = DEFAULT_MAX_REPROJ_ERROR, + max_angle_error_deg: float = 1.0, +) -> Tuple[GtsfmData, int, int]: + """Build GtsfmData from optimized camera + landmark positions.""" + gtsfm_data = GtsfmData(number_images=num_images) + camera_class = gtsfm_types.get_camera_class_for_calibration(intrinsics[0]) + + cameras_added = 0 + for cam_idx in camera_indices: + wRi, Ki = wRi_list[cam_idx], intrinsics[cam_idx] + if wRi is None or Ki is None: + continue + try: + pose = Pose3(wRi, result_values.atPoint3(C(cam_idx))) + gtsfm_data.add_camera(cam_idx, camera_class(pose, Ki)) + cameras_added += 1 + except RuntimeError: + logger.warning("GlobalPositioner: failed to build camera %d.", cam_idx) + + num_tracks_before = 0 + num_tracks_after = 0 + estimated_camera_set = set(gtsfm_data.get_valid_camera_indices()) + + for track_idx in sorted(track_indices): + try: + point_3d = result_values.atPoint3(L(track_idx)) + except RuntimeError: + continue + if track_idx >= len(tracks_2d): + continue + + sfm_track = SfmTrack(point_3d) + num_valid = 0 + for m_idx in range(tracks_2d[track_idx].number_measurements()): + m = tracks_2d[track_idx].measurement(m_idx) + if m.i in estimated_camera_set: + sfm_track.addMeasurement(m.i, m.uv) + num_valid += 1 + if num_valid < 2: + continue + + num_tracks_before += 1 + # Per-observation angular error filter (GLOMAP FilterTracksByAngle). + # Remove observations where camera ray disagrees with 3D point direction. 
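+        # A measurement is kept only if (a) the angle between the camera-to-point
+        # ray and the calibrated feature ray is below max_angle_error_deg, and
+        # (b) its reprojection error is below max_reproj_error.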
+ cos_angle_thresh = np.cos(np.radians(max_angle_error_deg)) + filtered_track = SfmTrack(point_3d) + for m_idx in range(sfm_track.numberMeasurements()): + cam_idx_m, uv_m = sfm_track.measurement(m_idx) + camera_m = gtsfm_data.get_camera(cam_idx_m) + if camera_m is None: + continue + try: + # Compute normalized ray from camera to 3D point. + pt_cam = camera_m.pose().transformTo(point_3d) + if pt_cam[2] < 1e-12: + continue # Behind camera. + pt_cam_dir = pt_cam / np.linalg.norm(pt_cam) + # Compute observed feature ray direction. + Ki = intrinsics[cam_idx_m] + uv_norm = Ki.calibrate(uv_m) + obs_dir = np.array([uv_norm[0], uv_norm[1], 1.0]) + obs_dir /= np.linalg.norm(obs_dir) + # Angular consistency check. + if pt_cam_dir.dot(obs_dir) < cos_angle_thresh: + continue # Angle too large, skip this observation. + # Also check reprojection error. + proj = camera_m.project(point_3d) + if np.sqrt((proj[0] - uv_m[0])**2 + (proj[1] - uv_m[1])**2) > max_reproj_error: + continue + filtered_track.addMeasurement(cam_idx_m, uv_m) + except RuntimeError: + continue + if filtered_track.numberMeasurements() >= 2: + gtsfm_data.add_track(filtered_track) + num_tracks_after += 1 + + logger.info("GlobalPositioner: %d cameras, %d/%d tracks (%.1f%% kept).", + cameras_added, num_tracks_after, num_tracks_before, + 100 * num_tracks_after / max(num_tracks_before, 1)) + return gtsfm_data, num_tracks_before, num_tracks_after + + +class GlobalPositioner: + """Joint camera + landmark position estimation via BATA direction constraints. + + Uses gtsam.GlobalPositioner to solve the bipartite estimation problem. + + Pipeline: rot_avg → GlobalPositioner.run() → BA + """ + + def __init__( + self, + noise_sigma: float = DEFAULT_DIRECTION_NOISE_SIGMA, + huber_loss_scale: float = DEFAULT_HUBER_LOSS_SCALE, + max_iterations: int = DEFAULT_MAX_ITERATIONS, + min_track_measurements: int = MIN_TRACK_MEASUREMENTS, + max_reproj_error: float = DEFAULT_MAX_REPROJ_ERROR, + max_tracks: int = DEFAULT_MAX_TRACKS, + save_iteration_visualization: bool = False, + ) -> None: + self._noise_sigma = noise_sigma + self._huber_loss_scale = huber_loss_scale + self._max_iterations = max_iterations + self._min_track_measurements = min_track_measurements + self._max_reproj_error = max_reproj_error + self._max_tracks = max_tracks + self._save_iteration_visualization = save_iteration_visualization + + def run( + self, + num_images: int, + wRi_list: List[Optional[Rot3]], + tracks_2d: List[SfmTrack2d], + intrinsics: List[Optional[gtsfm_types.CALIBRATION_TYPE]], + output_root: Optional[Union[str, Path]] = None, + ) -> Tuple[GtsfmData, GtsfmMetricsGroup]: + """Run global positioning and produce GtsfmData for BA.""" + start_time = time.time() + + if not tracks_2d: + logger.error("GlobalPositioner: no 2D tracks provided.") + return GtsfmData(number_images=num_images), GtsfmMetricsGroup("global_positioning_metrics", []) + if intrinsics is None: + logger.error("GlobalPositioner: no intrinsics provided.") + return GtsfmData(number_images=num_images), GtsfmMetricsGroup("global_positioning_metrics", []) + + valid_cameras = {i for i, wRi in enumerate(wRi_list) if wRi is not None} + if len(valid_cameras) < 2: + logger.error("GlobalPositioner: need >= 2 cameras with rotations.") + return GtsfmData(number_images=num_images), GtsfmMetricsGroup("global_positioning_metrics", []) + + logger.info("GlobalPositioner: %d valid cameras, %d input tracks.", len(valid_cameras), len(tracks_2d)) + + filtered_tracks = filter_tracks_by_measurements( + tracks_2d, valid_cameras, 
self._min_track_measurements, self._max_tracks, + ) + logger.info("GlobalPositioner: %d tracks after filtering.", len(filtered_tracks)) + if not filtered_tracks: + logger.error("GlobalPositioner: no tracks survived filtering.") + return GtsfmData(number_images=num_images), GtsfmMetricsGroup("global_positioning_metrics", []) + + t0 = time.time() + observations = compute_world_directions(filtered_tracks, intrinsics, wRi_list, valid_cameras) + dt_dir = time.time() - t0 + logger.info("GlobalPositioner: %d direction observations (%.2fs).", len(observations), dt_dir) + + if not observations: + return GtsfmData(number_images=num_images), GtsfmMetricsGroup("global_positioning_metrics", []) + + t0 = time.time() + if self._save_iteration_visualization: + result_values, cam_indices, track_indices, values_trace, error_trace = build_and_solve_with_trace( + observations, self._noise_sigma, self._huber_loss_scale, self._max_iterations, + ) + # Extract positions as numpy arrays (avoids GTSAM pickling issues) + sorted_cams = sorted(cam_indices) + sorted_tracks = sorted(track_indices) + positions_trace = [] + for vals in values_trace: + cam_pos = {ci: np.array(vals.atPoint3(C(ci))) for ci in sorted_cams} + lmk_pos = {} + for ti in sorted_tracks: + try: + lmk_pos[ti] = np.array(vals.atPoint3(L(ti))) + except RuntimeError: + pass + positions_trace.append({"cameras": cam_pos, "landmarks": lmk_pos}) + + # Convert wRi to rotation matrices (pure numpy, no GTSAM objects) + wRi_matrices = {} + for ci in sorted_cams: + wRi = wRi_list[ci] + if wRi is not None: + wRi_matrices[ci] = wRi.matrix() + + # Save trace under the dataset-specific output root so multiple + # runs don't clobber each other's traces. + if output_root is not None: + trace_path = str(Path(output_root) / "results" / "gp_convergence_trace.pkl") + else: + trace_path = "results/gp_convergence_trace.pkl" + os.makedirs(os.path.dirname(trace_path), exist_ok=True) + with open(trace_path, "wb") as f: + pickle.dump({ + "positions_trace": positions_trace, "error_trace": error_trace, + "camera_indices": cam_indices, "track_indices": track_indices, + "wRi_matrices": wRi_matrices, + "filtered_tracks": filtered_tracks, + }, f) + logger.info("GlobalPositioner: saved trace (%d frames) to %s", len(positions_trace), trace_path) + else: + result_values, cam_indices, track_indices = build_and_solve( + observations, self._noise_sigma, self._huber_loss_scale, self._max_iterations, + ) + dt_solve = time.time() - t0 + logger.info("GlobalPositioner: optimization took %.2fs.", dt_solve) + + t0 = time.time() + gtsfm_data, tracks_before, tracks_after = build_gtsfm_data( + result_values, num_images, wRi_list, intrinsics, filtered_tracks, + cam_indices, track_indices, valid_cameras, self._max_reproj_error, + ) + dt_build = time.time() - t0 + total_duration = time.time() - start_time + + metrics = GtsfmMetricsGroup("global_positioning_metrics", [ + GtsfmMetric("num_valid_cameras", len(valid_cameras)), + GtsfmMetric("num_tracks_used", len(filtered_tracks)), + GtsfmMetric("num_direction_observations", len(observations)), + GtsfmMetric("num_camera_variables", len(cam_indices)), + GtsfmMetric("num_landmark_variables", len(track_indices)), + GtsfmMetric("num_cameras_in_output", len(gtsfm_data.get_valid_camera_indices()) if gtsfm_data else 0), + GtsfmMetric("num_tracks_before_filter", tracks_before), + GtsfmMetric("num_tracks_after_filter", tracks_after), + GtsfmMetric("direction_computation_sec", dt_dir), + GtsfmMetric("optimization_sec", dt_solve), + 
GtsfmMetric("gtsfm_data_build_sec", dt_build), + GtsfmMetric("total_duration_sec", total_duration), + ]) + + logger.info( + "GlobalPositioner: done in %.2fs — %d cameras, %d tracks.", + total_duration, + len(gtsfm_data.get_valid_camera_indices()) if gtsfm_data else 0, + gtsfm_data.number_tracks() if gtsfm_data else 0, + ) + return gtsfm_data, metrics diff --git a/gtsfm/multi_view_optimizer.py b/gtsfm/multi_view_optimizer.py index 571c766ff..e2bc3e8d5 100644 --- a/gtsfm/multi_view_optimizer.py +++ b/gtsfm/multi_view_optimizer.py @@ -25,6 +25,7 @@ from gtsfm.data_association.cpp_dsf_tracks_estimator import CppDsfTracksEstimator from gtsfm.data_association.data_assoc import DataAssociation from gtsfm.evaluation.metrics import GtsfmMetricsGroup +from gtsfm.global_positioner.global_positioner import GlobalPositioner from gtsfm.products.one_view_data import OneViewData from gtsfm.products.two_view_result import TwoViewResult from gtsfm.products.visibility_graph import AnnotatedGraph @@ -71,12 +72,14 @@ def __init__( data_association_module: DataAssociation, bundle_adjustment_module: GlobalBundleAdjustment, view_graph_estimator: Optional[ViewGraphEstimatorBase] = None, + global_positioner: Optional[GlobalPositioner] = None, ) -> None: self.view_graph_estimator = view_graph_estimator self.rot_avg_module = rot_avg_module self.trans_avg_module = trans_avg_module self.data_association_module = data_association_module self.ba_optimizer = bundle_adjustment_module + self.global_positioner = global_positioner self._run_view_graph_estimator: bool = self.view_graph_estimator is not None self.view_graph_estimator_v2 = CycleConsistentRotationViewGraphEstimator( @@ -185,32 +188,43 @@ def create_computation_graph( tracks2d_graph = delayed(get_2d_tracks)(viewgraph_v_corr_idxs_graph, keypoints_graph) absolute_pose_priors = [one_view_data_dict[idx].absolute_pose_prior for idx in range(num_images)] - wTi_graph, ta_metrics, ta_inlier_idx_i1_i2 = self.trans_avg_module.create_computation_graph( - num_images, - pruned_i2Ui1_graph, - delayed_wRi, - tracks2d_graph, - all_intrinsics, - absolute_pose_priors, - pose_priors_graph, - gt_wTi_list=gt_wTi_list, - ) - ta_v_corr_idxs_graph = delayed(filter_corr_by_idx)(viewgraph_v_corr_idxs_graph, ta_inlier_idx_i1_i2) - ta_inlier_tracks_2d_graph = delayed(get_2d_tracks)(ta_v_corr_idxs_graph, keypoints_graph) - # TODO(akshay-krishnan): update pose priors also with the same inlier indices, right now these are unused. - - init_cameras_graph = delayed(init_cameras)(wTi_graph, all_intrinsics) - cameras_gt = [one_view_data_dict[idx].camera_gt for idx in sorted(list(one_view_data_dict.keys()))] - images: List[Future] = [image_future_map[idx] for idx in sorted(list(one_view_data_dict.keys()))] - ba_input_graph, data_assoc_metrics_graph = self.data_association_module.create_computation_graph( - num_images, - init_cameras_graph, - ta_inlier_tracks_2d_graph, - cameras_gt, - pose_priors_graph, - images, - ) + + if self.global_positioner is not None: + # Path B: Global positioner replaces trans_avg + data_assoc. + ba_input_graph, gp_metrics = delayed(self.global_positioner.run, nout=2)( + num_images, delayed_wRi, tracks2d_graph, all_intrinsics, + output_root=output_root, + ) + ta_metrics = gp_metrics + data_assoc_metrics_graph = delayed(GtsfmMetricsGroup)("data_association_metrics", []) + else: + # Path A: Existing pipeline — trans_avg + data_assoc. 
+ wTi_graph, ta_metrics, ta_inlier_idx_i1_i2 = self.trans_avg_module.create_computation_graph( + num_images, + pruned_i2Ui1_graph, + delayed_wRi, + tracks2d_graph, + all_intrinsics, + absolute_pose_priors, + pose_priors_graph, + gt_wTi_list=gt_wTi_list, + ) + ta_v_corr_idxs_graph = delayed(filter_corr_by_idx)(viewgraph_v_corr_idxs_graph, ta_inlier_idx_i1_i2) + ta_inlier_tracks_2d_graph = delayed(get_2d_tracks)(ta_v_corr_idxs_graph, keypoints_graph) + # TODO(akshay-krishnan): update pose priors also with the same inlier indices, right now these are unused. + + init_cameras_graph = delayed(init_cameras)(wTi_graph, all_intrinsics) + + images: List[Future] = [image_future_map[idx] for idx in sorted(list(one_view_data_dict.keys()))] + ba_input_graph, data_assoc_metrics_graph = self.data_association_module.create_computation_graph( + num_images, + init_cameras_graph, + ta_inlier_tracks_2d_graph, + cameras_gt, + pose_priors_graph, + images, + ) ba_result_graph, ba_metrics_graph = self.ba_optimizer.create_computation_graph( ba_input_graph, diff --git a/gtsfm/utils/metrics.py b/gtsfm/utils/metrics.py index a0dafdc73..aae2eadcb 100644 --- a/gtsfm/utils/metrics.py +++ b/gtsfm/utils/metrics.py @@ -10,6 +10,9 @@ from pathlib import Path from typing import List, Optional, Sequence, Set, Tuple, Union +import matplotlib + +matplotlib.use("Agg") import matplotlib.pyplot as plt import numpy as np import seaborn as sns # type: ignore diff --git a/gtsfm/utils/tracks.py b/gtsfm/utils/tracks.py index ea973e41e..12f1b15c5 100644 --- a/gtsfm/utils/tracks.py +++ b/gtsfm/utils/tracks.py @@ -4,9 +4,10 @@ """ import itertools -from typing import Dict, List +from typing import Dict, List, Optional, Set, Tuple -from gtsam import PinholeCameraCal3Bundler, SfmTrack # type: ignore +import numpy as np +from gtsam import PinholeCameraCal3Bundler, Point3, Rot3, SfmTrack # type: ignore import gtsfm.common.types as gtsfm_types from gtsfm.common.sfm_track import SfmMeasurement, SfmTrack2d @@ -80,6 +81,85 @@ def classify_tracks3d_with_gt_cameras( return classify_tracks2d_with_gt_cameras(tracks_2d, cameras_gt, reproj_error_thresh_px) +def filter_tracks_by_measurements( + tracks_2d: List[SfmTrack2d], + valid_cameras: Set[int], + min_measurements: int = 3, + max_tracks: int = 0, +) -> List[SfmTrack2d]: + """Filter 2D tracks by minimum count of measurements from valid cameras. + + Tracks are sorted longest-first; if `max_tracks > 0` only the top-K survive. + Used by global positioning to drop short / orphan tracks before solving. + """ + scored: List[Tuple[int, SfmTrack2d]] = [] + for track in tracks_2d: + valid_count = sum( + 1 + for m_idx in range(track.number_measurements()) + if track.measurement(m_idx).i in valid_cameras + ) + if valid_count >= min_measurements: + scored.append((valid_count, track)) + scored.sort(key=lambda x: x[0], reverse=True) + if max_tracks > 0 and len(scored) > max_tracks: + scored = scored[:max_tracks] + return [track for _, track in scored] + + +def compute_world_directions( + tracks_2d: List[SfmTrack2d], + intrinsics: List[Optional[gtsfm_types.CALIBRATION_TYPE]], + wRi_list: List[Optional[Rot3]], + valid_cameras: Optional[Set[int]] = None, +) -> List[Tuple[int, int, np.ndarray]]: + """Compute world-frame unit directions from each camera to each observed track point. + + For every (track, measurement) pair, the function: + 1. Calibrates the pixel observation via the camera's intrinsics. + 2. Lifts the calibrated point to a unit ray in the camera frame. + 3. 
Rotates the ray into the world frame using `wRi`. + + Used by the global positioner (BATA factor graph). The 1DSfM translation-averaging + module computes the same quantity in averaging_1dsfm.py:_get_landmark_directions — + TODO(akshay-krishnan): unify those two implementations behind this util. + + Args: + tracks_2d: 2D tracks across images. + intrinsics: Per-camera calibration; entries may be None for invalid cameras. + wRi_list: Per-camera rotations to world frame; entries may be None. + valid_cameras: If given, only measurements from these cameras are returned. + Measurements from cameras with None rotation/intrinsics are always skipped. + + Returns: + List of `(track_idx, cam_idx, world_direction)` triplets where `world_direction` + is a unit-norm `np.ndarray` of shape (3,). + """ + observations: List[Tuple[int, int, np.ndarray]] = [] + for track_idx, track in enumerate(tracks_2d): + for m_idx in range(track.number_measurements()): + m = track.measurement(m_idx) + cam_idx, uv = m.i, m.uv + if valid_cameras is not None and cam_idx not in valid_cameras: + continue + wRi, Ki = wRi_list[cam_idx], intrinsics[cam_idx] + if wRi is None or Ki is None: + continue + try: + normalized_pt = Ki.calibrate(uv) + except RuntimeError: + continue + camera_ray = np.array([normalized_pt[0], normalized_pt[1], 1.0]) + camera_ray /= np.linalg.norm(camera_ray) + world_dir = wRi.rotate(Point3(*camera_ray)) + wd = np.array([world_dir[0], world_dir[1], world_dir[2]]) + norm = np.linalg.norm(wd) + if norm < 1e-12: + continue + observations.append((track_idx, cam_idx, wd / norm)) + return observations + + def get_max_triangulation_angle(track3d: SfmTrack, cameras: Dict[int, gtsfm_types.CAMERA_TYPE]) -> float: """Get the angle (in degrees) subtended by the cameras at the 3D landmark of the track. diff --git a/pyproject.toml b/pyproject.toml index 93fae53e1..401169758 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,7 +76,7 @@ dependencies = [ "pydegensac", "colour", "trimesh[easy]", - "gtsam-develop==4.3a1.dev202603090403", + "gtsam-develop==4.3a1.dev202604240153", "pydot", # Dust3r/Mast3r diff --git a/scripts/visualize_gp_convergence.py b/scripts/visualize_gp_convergence.py new file mode 100644 index 000000000..febe8a9a8 --- /dev/null +++ b/scripts/visualize_gp_convergence.py @@ -0,0 +1,561 @@ +"""Visualize global positioner convergence from random initialization. + +Creates a frame-by-frame animation showing cameras (red) and 3D points (black) +converging from random positions to a structured scene over LM iterations. + +Two-phase workflow: + 1. Run the GTSFM pipeline normally (saves results to results/ directory) + 2. Run this script to load the saved results, replay GP with per-iteration + capture, and render frames. 
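+
+If the saved convergence trace (gp_convergence_trace.pkl) exists, frames are
+rendered from it directly; otherwise the script replays the optimization from
+COLMAP-format results.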
+
+Usage:
+    # First, run the pipeline:
+    ./run --config_name sift_global_positioner --loader olsson \
+        --dataset_dir tests/data/set1_lund_door --num_workers 1
+
+    # Then, render the convergence animation (uses the saved trace if present,
+    # otherwise replays from the COLMAP-format results):
+    python scripts/visualize_gp_convergence.py \
+        --trace_file results/gp_convergence_trace.pkl \
+        --results_dir results/ba_input \
+        --max_iterations 50 \
+        --output_dir /tmp/gp_frames
+
+    # Stitch to video:
+    ffmpeg -framerate 5 -i /tmp/gp_frames/frame_%04d.png \
+        -c:v libx264 -pix_fmt yuv420p convergence.mp4
+"""
+
+import argparse
+import os
+import sys
+import time
+from pathlib import Path
+from typing import List, Optional, Set, Tuple
+
+import matplotlib
+
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+import numpy as np
+from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
+
+import gtsam
+from gtsam import (
+    BinaryMeasurementUnit3,
+    BinaryMeasurementsUnit3,
+    Point3,
+    Rot3,
+    Symbol,
+    TranslationRecovery,
+    Unit3,
+    Values,
+)
+from gtsam.symbol_shorthand import A as C  # Camera keys
+from gtsam.symbol_shorthand import B as L  # Landmark keys
+
+# Add project root to path
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+
+import gtsfm.common.types as gtsfm_types
+import gtsfm.utils.io as io_utils
+from gtsfm.common.sfm_track import SfmTrack2d
+from gtsfm.utils.tracks import compute_world_directions, filter_tracks_by_measurements
+
+
+def load_rotations_and_tracks_from_colmap(
+    colmap_dir: str,
+    images_dir: Optional[str] = None,
+) -> Tuple[List[Optional[Rot3]], List[Optional[gtsfm_types.CALIBRATION_TYPE]], List[SfmTrack2d], int, Optional[np.ndarray]]:
+    """Load rotations, intrinsics, 2D tracks, and per-track RGB colors from COLMAP-format output.
+
+    Returns:
+        wRi_list, intrinsics, tracks_2d, num_images, track_colors (N,3) float [0,1] or None
+    """
+    wTi_list, img_fnames, calibrations, point_cloud, rgb, img_dims = io_utils.read_scene_data_from_colmap_format(
+        colmap_dir
+    )
+
+    num_images = len(wTi_list)
+
+    # Extract rotations
+    wRi_list: List[Optional[Rot3]] = []
+    for wTi in wTi_list:
+        if wTi is not None:
+            wRi_list.append(wTi.rotation())
+        else:
+            wRi_list.append(None)
+
+    # Build intrinsics list
+    intrinsics: List[Optional[gtsfm_types.CALIBRATION_TYPE]] = list(calibrations)
+
+    # Build 2D tracks from the COLMAP points3D data
+    import pycolmap
+    from gtsfm.common.sfm_track import SfmMeasurement
+    from PIL import Image
+
+    reconstruction = pycolmap.Reconstruction(colmap_dir)
+
+    # Try to load images for color sampling
+    loaded_images = {}
+    if images_dir is None:
+        # Try common relative paths
+        for candidate in [os.path.join(os.path.dirname(colmap_dir), "images"),
+                          os.path.join(colmap_dir, "..", "images")]:
+            if os.path.isdir(candidate):
+                images_dir = candidate
+                break
+
+    if images_dir and os.path.isdir(images_dir):
+        # Try matching by COLMAP name first, then fall back to sorted directory listing
+        for img_id, image in reconstruction.images.items():
+            img_path = os.path.join(images_dir, image.name)
+            if os.path.exists(img_path):
+                try:
+                    loaded_images[img_id] = np.array(Image.open(img_path))
+                except Exception:
+                    pass
+
+        # If no matches by name, load by sorted order (GTSFM renames images internally)
+        if not loaded_images:
+            sorted_img_ids = sorted(reconstruction.images.keys())
+            img_files = sorted([f for f in os.listdir(images_dir)
+                                if f.lower().endswith(('.jpg', '.jpeg', '.png', '.tif', '.tiff'))])
+            for img_id, fname in zip(sorted_img_ids, img_files):
+                try:
+                    loaded_images[img_id] = np.array(Image.open(os.path.join(images_dir, fname)))
+                except Exception:
+                    pass
+
+        print(f"Loaded {len(loaded_images)} images for color sampling")
+
+    tracks_2d = []
+    track_colors = []
+
+    for point3D_id, point3D in reconstruction.points3D.items():
+        measurements = []
+        sampled_color = None
+
+        for track_elem in point3D.track.elements:
+            img_id = track_elem.image_id
+            point2D_idx = track_elem.point2D_idx
+            image = reconstruction.images[img_id]
+            uv = image.points2D[point2D_idx].xy
+            cam_idx = img_id - 1 if img_id > 0 else img_id
+            measurements.append(SfmMeasurement(i=cam_idx, uv=np.array(uv)))
+
+            # Sample color from first available image
+            if sampled_color is None and img_id in loaded_images:
+                img_arr = loaded_images[img_id]
+                u, v = int(round(uv[0])), int(round(uv[1]))
+                if 0 <= v < img_arr.shape[0] and 0 <= u < img_arr.shape[1]:
+                    pixel = img_arr[v, u]
+                    if len(pixel) >= 3:
+                        sampled_color = pixel[:3] / 255.0
+
+        if len(measurements) >= 2:
+            tracks_2d.append(SfmTrack2d(measurements=measurements))
+            track_colors.append(sampled_color if sampled_color is not None else [0.3, 0.3, 0.3])
+
+    track_colors_arr = np.array(track_colors) if track_colors else None
+    print(f"Loaded {num_images} cameras, {len(tracks_2d)} tracks from {colmap_dir}")
+    if track_colors_arr is not None and len(loaded_images) > 0:
+        colored = np.sum(np.any(track_colors_arr != 0.3, axis=1))
+        print(f"Sampled RGB color for {colored}/{len(tracks_2d)} tracks")
+
+    return wRi_list, intrinsics, tracks_2d, num_images, track_colors_arr
+
+
+def build_graph_and_iterate(
+    wRi_list: List[Optional[Rot3]],
+    intrinsics: List[Optional[gtsfm_types.CALIBRATION_TYPE]],
+    tracks_2d: List[SfmTrack2d],
+    num_images: int,
+    max_iterations: int = 50,
+    min_track_measurements: int = 3,
+) -> Tuple[List[Values], List[float], Set[int], Set[int]]:
+    """Build the global positioner graph and capture values at each iteration."""
+    valid_cameras = {i for i, wRi in enumerate(wRi_list) if wRi is not None}
+    filtered_tracks = filter_tracks_by_measurements(tracks_2d, valid_cameras, min_track_measurements)
+    observations = compute_world_directions(filtered_tracks, intrinsics, wRi_list, valid_cameras)
+
+    print(f"Building graph: {len(observations)} observations, {len(valid_cameras)} cameras, {len(filtered_tracks)} tracks")
+
+    # Build measurements
+    noise_model = gtsam.noiseModel.Isotropic.Sigma(2, 0.01)
+    huber = gtsam.noiseModel.mEstimator.Huber.Create(0.1)
+    robust_noise = gtsam.noiseModel.Robust.Create(huber, noise_model)
+
+    measurements = BinaryMeasurementsUnit3()
+    camera_indices: Set[int] = set()
+    track_indices: Set[int] = set()
+
+    for track_idx, cam_idx, bearing in observations:
+        measurements.append(BinaryMeasurementUnit3(
+            C(cam_idx), L(track_idx), Unit3(Point3(*bearing)), robust_noise,
+        ))
+        camera_indices.add(cam_idx)
+        track_indices.add(track_idx)
+
+    # Build graph via TranslationRecovery (uses native C++ BilinearAngleTranslationFactor)
+    lm_params = gtsam.LevenbergMarquardtParams()
+    lm_params.setMaxIterations(max_iterations)
+    lm_params.setVerbosityLM("SILENT")
+    tr = TranslationRecovery(lm_params, use_bilinear_translation_factor=True)
+
+    graph = tr.buildGraph(measurements)
+    tr.addPrior(measurements, 1.0, gtsam.BinaryMeasurementsPoint3(), graph)
+
+    # Initialize randomly (matching TranslationRecovery C++ code: uniform(-1, 1))
+    rng = np.random.RandomState(42)
+    initial = Values()
+    for cam_idx in camera_indices:
+        initial.insert(C(cam_idx), Point3(*rng.uniform(-1, 1, 3)))
+    for track_idx in track_indices:
+        initial.insert(L(track_idx), Point3(*rng.uniform(-1, 1, 3)))
+    # Scale variables (one per observation, initialized to 1.0)
for i in range(len(observations)): + initial.insert(Symbol('S', i).key(), np.array([[1.0]])) + + initial_error = graph.error(initial) + print(f"Initial error: {initial_error:.2f}") + + # Iterate and capture values at each step + optimizer = gtsam.LevenbergMarquardtOptimizer(graph, initial, lm_params) + values_trace = [initial] + error_trace = [initial_error] + + for iteration in range(1, max_iterations + 1): + t0 = time.time() + optimizer.iterate() + dt = time.time() - t0 + error = optimizer.error() + values_trace.append(optimizer.values()) + error_trace.append(error) + + if iteration <= 5 or iteration % 5 == 0: + print(f" iter {iteration:3d}: error={error:.2f} ({dt:.2f}s)") + + print(f"Final error: {error_trace[-1]:.4f} ({len(values_trace)} frames captured)") + return values_trace, error_trace, camera_indices, track_indices + + +def compute_auto_view_angle(cam_positions: np.ndarray) -> Tuple[float, float]: + """Compute elev/azim that looks along the normal to the camera plane. + + Uses PCA to find the plane that best fits the camera positions. + The view direction is set along the least-variance axis (normal to the + camera arrangement), giving a natural front-on or overhead view depending + on the scene geometry. + """ + centered = cam_positions - cam_positions.mean(axis=0) + _, _, Vt = np.linalg.svd(centered, full_matrices=False) + + # 3rd principal component = normal to camera plane (least variance direction) + normal = Vt[2] + + # Ensure we look from the "positive" side (arbitrary but consistent) + if normal[2] < 0: + normal = -normal + + # Convert to matplotlib elev/azim (degrees) + elev = np.degrees(np.arcsin(np.clip(normal[2], -1, 1))) + azim = np.degrees(np.arctan2(normal[1], normal[0])) + + return elev, azim + + +def extract_positions( + values: Values, + wRi_list: List[Optional[Rot3]], + camera_indices: Set[int], + track_indices: Set[int], +) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + """Extract camera positions, forward directions, and landmark positions.""" + cam_positions = [] + cam_forwards = [] + for cam_idx in sorted(camera_indices): + pos = values.atPoint3(C(cam_idx)) + cam_positions.append(pos) + wRi = wRi_list[cam_idx] + if wRi is not None: + fwd = wRi.rotate(Point3(0, 0, 1)) + cam_forwards.append(np.array([fwd[0], fwd[1], fwd[2]])) + else: + cam_forwards.append(np.array([0, 0, 1])) + + landmarks = [] + for track_idx in sorted(track_indices): + try: + pt = values.atPoint3(L(track_idx)) + landmarks.append(pt) + except RuntimeError: + continue + + return np.array(cam_positions), np.array(cam_forwards), np.array(landmarks) + + +def render_frame( + cam_positions: np.ndarray, + cam_forwards: np.ndarray, + landmarks: np.ndarray, + iteration: int, + error: float, + initial_error: float, + output_path: str, + view_center: np.ndarray, + view_range: float, + point_colors: Optional[np.ndarray] = None, + elev: float = 30, + azim: float = -60, +): + """Render a single frame with cameras (red) and points (colored or black).""" + fig = plt.figure(figsize=(16, 10), facecolor='white') + ax = fig.add_subplot(111, projection='3d', facecolor='white') + + # Points + if len(landmarks) > 0: + # Subsample for rendering speed + max_pts = 5000 + # Match color array size to landmarks (trace may have more tracks than COLMAP output) + if point_colors is not None and len(point_colors) < len(landmarks): + point_colors = None # size mismatch, fall back to no color + + if len(landmarks) > max_pts: + idx = np.random.RandomState(0).choice(len(landmarks), max_pts, replace=False) + lm_plot = 
landmarks[idx]
+            colors_plot = point_colors[idx] if point_colors is not None else None
+        else:
+            lm_plot = landmarks
+            colors_plot = point_colors
+
+        if colors_plot is not None:
+            ax.scatter(lm_plot[:, 0], lm_plot[:, 1], lm_plot[:, 2],
+                       c=colors_plot, s=0.8, alpha=0.7, depthshade=False)
+        else:
+            ax.scatter(lm_plot[:, 0], lm_plot[:, 1], lm_plot[:, 2],
+                       c='black', s=0.5, alpha=0.4, depthshade=True)
+
+    # Cameras (red triangles with forward direction)
+    if len(cam_positions) > 0:
+        ax.scatter(cam_positions[:, 0], cam_positions[:, 1], cam_positions[:, 2],
+                   c='red', s=60, marker='^', alpha=0.9, depthshade=True,
+                   edgecolors='darkred', linewidths=0.3)
+
+        # Draw forward direction as short lines
+        scale = view_range * 0.03
+        for pos, fwd in zip(cam_positions, cam_forwards):
+            end = pos + fwd * scale
+            ax.plot([pos[0], end[0]], [pos[1], end[1]], [pos[2], end[2]],
+                    c='red', alpha=0.6, linewidth=1)
+
+    # Consistent view bounds
+    ax.set_xlim(view_center[0] - view_range, view_center[0] + view_range)
+    ax.set_ylim(view_center[1] - view_range, view_center[1] + view_range)
+    ax.set_zlim(view_center[2] - view_range, view_center[2] + view_range)
+
+    ax.view_init(elev=elev, azim=azim)
+    ax.set_axis_off()
+
+    # Compute error reduction percentage
+    reduction = (1.0 - error / initial_error) * 100 if initial_error > 0 else 0
+
+    ax.set_title(
+        f"Global Positioner Convergence — GLOMAP-style BATA via GTSAM\n"
+        f"Iteration {iteration} | Cost: {error:,.0f} | "
+        f"Reduction: {reduction:.1f}% | "
+        f"{len(cam_positions)} cameras, {len(landmarks)} points",
+        fontsize=13, fontweight='bold', pad=20,
+    )
+
+    plt.tight_layout()
+    plt.savefig(output_path, dpi=150, bbox_inches='tight', facecolor='white')
+    plt.close(fig)
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Visualize global positioner convergence")
+    parser.add_argument("--trace_file", type=str, default="results/gp_convergence_trace.pkl",
+                        help="Path to GP convergence trace pickle from a pipeline run with save_iteration_visualization=True")
+    parser.add_argument("--results_dir", type=str, default="results/ba_input",
+                        help="Fallback: Path to COLMAP-format results (used if trace file not found)")
+    parser.add_argument("--images_dir", type=str, default=None,
+                        help="Path to images directory for RGB color sampling")
+    parser.add_argument("--max_iterations", type=int, default=50)
+    parser.add_argument("--min_track_measurements", type=int, default=3)
+    parser.add_argument("--output_dir", type=str, default="/tmp/gp_convergence_frames")
+    parser.add_argument("--interp_steps", type=int, default=5, help="Interpolated sub-frames between each iteration")
+    parser.add_argument("--elev", type=float, default=None, help="Camera elevation angle (auto-computed if not set)")
+    parser.add_argument("--azim", type=float, default=None, help="Camera azimuth angle (auto-computed if not set)")
+    args = parser.parse_args()
+
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    # Try loading pre-saved trace from pipeline run (preferred)
+    if os.path.exists(args.trace_file):
+        print("=" * 60)
+        print(f"Loading convergence trace from {args.trace_file}...")
+        print("=" * 60)
+        import pickle
+        with open(args.trace_file, "rb") as f:
+            trace_data = pickle.load(f)
+        # The pipeline stores plain numpy data (GTSAM objects do not pickle), so
+        # rebuild the GTSAM types needed downstream.
+        positions_trace = trace_data["positions_trace"]
+        error_trace = trace_data["error_trace"]
+        camera_indices = trace_data["camera_indices"]
+        track_indices = trace_data["track_indices"]
+        wRi_matrices = trace_data["wRi_matrices"]
+        wRi_list: List[Optional[Rot3]] = [None] * (max(wRi_matrices) + 1)
+        for idx, R in wRi_matrices.items():
+            wRi_list[idx] = Rot3(R)
+        # Rebuild per-iteration Values so extract_positions() works unchanged.
+        values_trace = []
+        for frame in positions_trace:
+            vals = Values()
+            for ci, p in frame["cameras"].items():
+                vals.insert(C(ci), Point3(*p))
+            for ti, p in frame["landmarks"].items():
+                vals.insert(L(ti), Point3(*p))
+            values_trace.append(vals)
+        track_colors = trace_data.get("track_colors", None)
+        filtered_tracks = trace_data.get("filtered_tracks", None)
+        print(f"Loaded {len(values_trace)} frames, {len(camera_indices)} cameras, {len(track_indices)} tracks")
+
+        # Sample colors from images using the saved 2D tracks
+        if track_colors is None and filtered_tracks is not None and args.images_dir:
+            print("Sampling colors from images using saved tracks...")
+            from PIL import Image as PILImage
+            from gtsfm.common.sfm_track import SfmMeasurement
+
+            # Load images by sorted filename
+            img_files = sorted([f for f in os.listdir(args.images_dir)
+                                if f.lower().endswith(('.jpg', '.jpeg', '.png', '.tif', '.tiff'))])
+            loaded_images = {}
+            for i, fname in enumerate(img_files):
+                try:
+                    loaded_images[i] = np.array(PILImage.open(os.path.join(args.images_dir, fname)))
+                except Exception:
+                    pass
+            print(f"  Loaded {len(loaded_images)} images")
+
+            # Compute UV scale factor: pipeline may have downscaled images
+            sample_img = next(iter(loaded_images.values()))
+            img_h, img_w = sample_img.shape[:2]
+            max_uv_u, max_uv_v = 0, 0
+            for t in filtered_tracks[:200]:
+                for m_idx in range(t.number_measurements()):
+                    m = t.measurement(m_idx)
+                    max_uv_u = max(max_uv_u, m.uv[0])
+                    max_uv_v = max(max_uv_v, m.uv[1])
+            # If UVs are much smaller than image dims, features were detected at lower res
+            scale_u = img_w / max(max_uv_u, 1) if max_uv_u < img_w * 0.8 else 1.0
+            scale_v = img_h / max(max_uv_v, 1) if max_uv_v < img_h * 0.8 else 1.0
+            if scale_u > 1.5 or scale_v > 1.5:
+                print(f"  UV scale correction: {scale_u:.2f}x, {scale_v:.2f}x (pipeline used downscaled images)")
+            else:
+                scale_u, scale_v = 1.0, 1.0
+
+            colors_list = []
+            for track in filtered_tracks:
+                sampled = None
+                # Collect colors from all observing cameras, take median
+                all_colors = []
+                for m_idx in range(track.number_measurements()):
+                    m = track.measurement(m_idx)
+                    if m.i in loaded_images:
+                        img = loaded_images[m.i]
+                        u = int(round(m.uv[0] * scale_u))
+                        v = int(round(m.uv[1] * scale_v))
+                        if 0 <= v < img.shape[0] and 0 <= u < img.shape[1]:
+                            pixel = img[v, u]
+                            if img.ndim == 3 and img.shape[2] >= 3:
+                                all_colors.append(pixel[:3] / 255.0)
+                            elif img.ndim == 2 or (img.ndim == 3 and img.shape[2] == 1):
+                                gray = float(pixel if img.ndim == 2 else pixel[0]) / 255.0
+                                all_colors.append([gray, gray, gray])
+                if all_colors:
+                    sampled = np.median(all_colors, axis=0)
+                colors_list.append(sampled if sampled is not None else [0.3, 0.3, 0.3])
+            track_colors = np.array(colors_list)
+            colored = np.sum(np.any(track_colors != 0.3, axis=1))
+            print(f"  Sampled color for {colored}/{len(filtered_tracks)} tracks")
+    else:
+        # Fallback: load from COLMAP results and replay
+        print("=" * 60)
+        print(f"No trace file found at {args.trace_file}")
+        print("Loading from COLMAP results and replaying optimization...")
+        print("=" * 60)
+        wRi_list, intrinsics, tracks_2d, num_images, track_colors = load_rotations_and_tracks_from_colmap(
+            args.results_dir, args.images_dir
+        )
+
+        values_trace, error_trace, camera_indices, track_indices = build_graph_and_iterate(
+            wRi_list, intrinsics, tracks_2d, num_images,
+            max_iterations=args.max_iterations,
+            min_track_measurements=args.min_track_measurements,
+        )
+
+    # Compute view bounds from the final converged result
+    print("\n" + "=" * 60)
+    print(f"Rendering {len(values_trace)} frames...")
+    print("=" * 60)
+    final_cams, _, final_lmks = 
extract_positions( + values_trace[-1], wRi_list, camera_indices, track_indices + ) + + all_pts = np.vstack([final_cams, final_lmks]) if len(final_lmks) > 0 else final_cams + view_center = np.median(all_pts, axis=0) + dists = np.linalg.norm(all_pts - view_center, axis=1) + view_range = np.percentile(dists, 90) * 1.05 + + # Auto-compute view angle from camera geometry (or use manual override) + if args.elev is None or args.azim is None: + auto_elev, auto_azim = compute_auto_view_angle(final_cams) + # Cap elevation so it's never too top-down — keep between 20° and 45° + auto_elev = np.clip(auto_elev, 20.0, 45.0) + elev = auto_elev if args.elev is None else args.elev + azim = auto_azim if args.azim is None else args.azim + print(f"Auto view angle: elev={elev:.1f}°, azim={azim:.1f}°") + else: + elev, azim = args.elev, args.azim + print(f"Manual view angle: elev={elev:.1f}°, azim={azim:.1f}°") + + # Render frames with interpolation between iterations + slow orbit + initial_error = error_trace[0] + num_iters = len(values_trace) + interp_steps = args.interp_steps # sub-frames between each iteration + total_frames = (num_iters - 1) * interp_steps + 1 + azim_sweep = 120 # total degrees to sweep + + frame_idx = 0 + for i in range(num_iters): + cam_pos_i, cam_fwd_i, lmk_pos_i = extract_positions(values_trace[i], wRi_list, camera_indices, track_indices) + + if i < num_iters - 1: + cam_pos_next, cam_fwd_next, lmk_pos_next = extract_positions( + values_trace[i + 1], wRi_list, camera_indices, track_indices + ) + steps = interp_steps + else: + steps = 1 # last iteration, no interpolation + + for k in range(steps): + t = k / interp_steps if steps > 1 else 0.0 + # Lerp positions + cam_pos = cam_pos_i * (1 - t) + cam_pos_next * t if i < num_iters - 1 else cam_pos_i + lmk_pos = lmk_pos_i * (1 - t) + lmk_pos_next * t if i < num_iters - 1 else lmk_pos_i + error = error_trace[i] * (1 - t) + error_trace[min(i + 1, num_iters - 1)] * t + + frame_azim = azim + (frame_idx / max(total_frames - 1, 1)) * azim_sweep + frame_path = os.path.join(args.output_dir, f"frame_{frame_idx:04d}.png") + render_frame( + cam_pos, cam_fwd_i, lmk_pos, i, error, initial_error, + frame_path, view_center, view_range, track_colors, elev, frame_azim, + ) + + if frame_idx % 20 == 0 or frame_idx == total_frames - 1: + print(f" Rendered frame {frame_idx}/{total_frames - 1} (iter {i}, error: {error:,.0f})") + frame_idx += 1 + + print(f"\n{'=' * 60}") + print(f"{frame_idx} frames saved to {args.output_dir}") + print(f"\nTo create video:") + print(f" ffmpeg -y -framerate 10 -i {args.output_dir}/frame_%04d.png \\") + print(f" -vf \"scale=trunc(iw/2)*2:trunc(ih/2)*2\" \\") + print(f" -c:v libx264 -pix_fmt yuv420p convergence.mp4") + print(f"{'=' * 60}") + + +if __name__ == "__main__": + main() diff --git a/uv.lock b/uv.lock index db6a2f3ac..ed4503ebb 100644 --- a/uv.lock +++ b/uv.lock @@ -779,16 +779,16 @@ wheels = [ [[package]] name = "gtsam-develop" -version = "4.3a1.dev202603090403" +version = "4.3a1.dev202604240153" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "numpy" }, { name = "pytest" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/17/6d/2b643109654d967bdfff721b4d68d4dd606050ad8b0986ba9733bb80c712/gtsam_develop-4.3a1.dev202603090403-cp312-cp312-macosx_10_15_universal2.whl", hash = "sha256:796db0c4e43532555471ed942d3bc2a086d796203c9ddc3265ee0952053b97f2", size = 40361647, upload-time = "2026-03-09T04:46:49.76Z" }, - { url = 
"https://files.pythonhosted.org/packages/95/57/e23378236435385a749f7929a9ed43a14bf848a13a479f3525be55401891/gtsam_develop-4.3a1.dev202603090403-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:54cb9fac1d794ec65e19763895704fa74dfd992c06f65d068ae1b06f799578c5", size = 28581917, upload-time = "2026-03-09T04:46:52.522Z" }, - { url = "https://files.pythonhosted.org/packages/8d/77/5cff4bca2c936bedc8bc576db0b5dce2750b4e7831257beb32909eea5ef1/gtsam_develop-4.3a1.dev202603090403-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:79bcbc23889b7dc289b62938bb96da8d6d93728824b9a83636bc993d4bec131d", size = 29907408, upload-time = "2026-03-09T04:46:55.76Z" }, + { url = "https://files.pythonhosted.org/packages/d3/f0/ef174952c8ee654536b7e03d82196a8b35f2ffc00755e125a2b1fbe113c7/gtsam_develop-4.3a1.dev202604240153-cp312-cp312-macosx_10_15_universal2.whl", hash = "sha256:2f79d4fdf7acbbaf47cccb15bdc5a8a6d0cc42e3d9372999e0b8dd8f35039f1a", size = 41381638, upload-time = "2026-04-24T02:54:29.027Z" }, + { url = "https://files.pythonhosted.org/packages/4a/85/afafe81e8fb0ab6ae204c97960f6a6df5476edc4ce20f171d88c6e3070cf/gtsam_develop-4.3a1.dev202604240153-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f1625828122764ecf02f9337c97f31ca5193700262414bb9a42ea2244d03606f", size = 29265671, upload-time = "2026-04-24T02:54:35.989Z" }, + { url = "https://files.pythonhosted.org/packages/d0/e7/902fe4fb7d8f97801047298eec75c3f65900c288462d06398358b3799b3a/gtsam_develop-4.3a1.dev202604240153-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:79e6bc7581f24f35a9cf1f7ea44cfa8931fca03b609e5c2441db164ed2b9be4e", size = 30660189, upload-time = "2026-04-24T02:54:38.773Z" }, ] [[package]] @@ -905,7 +905,7 @@ requires-dist = [ { name = "etils", specifier = "==1.7.0" }, { name = "flask", specifier = ">=3.0,<4.0" }, { name = "gsplat", specifier = "==1.5.3" }, - { name = "gtsam-develop", specifier = "==4.3a1.dev202603090403" }, + { name = "gtsam-develop", specifier = "==4.3a1.dev202604240153" }, { name = "h5py" }, { name = "huggingface-hub", extras = ["torch"], specifier = "==0.34.4" }, { name = "hydra-core" }, @@ -2803,8 +2803,8 @@ dependencies = [ { name = "typing-extensions", marker = "sys_platform == 'linux'" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:6bba7dca5d9a729f1e8e9befb98055498e551efaf5ed034824c168b560afc1ac" }, - { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:7c0f08d1c44a02abad389373dddfce75904b969a410be2f4e5109483dd3dc0ce" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:6bba7dca5d9a729f1e8e9befb98055498e551efaf5ed034824c168b560afc1ac", upload-time = "2025-05-14T03:37:30Z" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:7c0f08d1c44a02abad389373dddfce75904b969a410be2f4e5109483dd3dc0ce", upload-time = "2025-04-22T18:20:33Z" }, ] [[package]] @@ -2824,7 +2824,7 @@ dependencies = [ { name = "typing-extensions", marker = "sys_platform == 'darwin'" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/cpu/torch-2.7.1-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:7b4f8b2b83bd08f7d399025a9a7b323bdbb53d20566f1e0d584689bb92d82f9a" }, + { url = 
"https://download-r2.pytorch.org/whl/cpu/torch-2.7.1-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:7b4f8b2b83bd08f7d399025a9a7b323bdbb53d20566f1e0d584689bb92d82f9a", upload-time = "2025-06-03T18:28:06Z" }, ] [[package]] @@ -2877,7 +2877,7 @@ dependencies = [ { name = "torch", version = "2.7.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "platform_machine == 'aarch64' and platform_python_implementation == 'CPython' and sys_platform == 'linux'" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/cu128/torchvision-0.22.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:6e9752b48c1cdd7f6428bcd30c3d198b30ecea348d16afb651f95035e5252506" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.22.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:6e9752b48c1cdd7f6428bcd30c3d198b30ecea348d16afb651f95035e5252506", upload-time = "2025-04-22T18:30:21Z" }, ] [[package]] @@ -2893,7 +2893,7 @@ dependencies = [ { name = "torch", version = "2.7.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (platform_python_implementation != 'CPython' and sys_platform == 'linux')" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/cu128/torchvision-0.22.0%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:06c101f40e1ff94869be14487c91fd5352e376f202fdeafb8f53c58cee2fbeb5" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.22.0%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:06c101f40e1ff94869be14487c91fd5352e376f202fdeafb8f53c58cee2fbeb5", upload-time = "2025-04-22T18:30:20Z" }, ] [[package]] @@ -2909,7 +2909,7 @@ dependencies = [ { name = "torch", version = "2.7.1", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/cpu/torchvision-0.22.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:153f1790e505bd6da123e21eee6e83e2e155df05c0fe7d56347303067d8543c5" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torchvision-0.22.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:153f1790e505bd6da123e21eee6e83e2e155df05c0fe7d56347303067d8543c5", upload-time = "2025-06-03T18:37:22Z" }, ] [[package]]