isl-org · asrathore-ai · Aug 13, 2025 · Sep 13, 2025 · Sep 13, 2025 · Sep 15, 2025
diff --git a/.github/workflows/core_tests.yml b/.github/workflows/core_tests.yml
@@ -18,15 +18,15 @@ jobs:
         with:
           submodules: 'recursive'
 
-      - name: Set up Python 3.8.12
+      - name: Set up Python 3.9
         uses: actions/setup-python@v5
         with:
-          python-version: "3.8.12"
+          python-version: "3.9"
       - name: Install dependencies
         run: |
           pip install black[jupyter]==22.3.0 pytest
-          pip install torch==2.0.0 --index-url https://download.pytorch.org/whl/cpu
-          BUILD_NO_CUDA=1 pip install .
+          pip install torch==2.6.0 --index-url https://download.pytorch.org/whl/cpu
+          BUILD_NO_CUDA=1 pip install --no-build-isolation .
       - name: Run Black Format Check
         run: black . gsplat/ tests/ examples/ profiling/ --check
       - name: Run Tests.

diff --git a/.github/workflows/doc.yml b/.github/workflows/doc.yml
@@ -26,8 +26,8 @@ jobs:
       - name: Install dependencies
         run: |
           pip install -r docs/requirements.txt
-          pip install torch==2.0.0 --index-url https://download.pytorch.org/whl/cpu
-          BUILD_NO_CUDA=1 pip install .
+          pip install torch==2.6.0 --index-url https://download.pytorch.org/whl/cpu
+          BUILD_NO_CUDA=1 pip install --no-build-isolation .
 
       # Get version.
       - name: Get version + subdirectory

diff --git a/docs/source/apis/utils.rst b/docs/source/apis/utils.rst
@@ -27,8 +27,6 @@ Below are the basic functions that supports the rasterization.
 
 .. autofunction:: rasterize_to_indices_in_range
 
-.. autofunction:: accumulate
-
 .. autofunction:: rasterization_inria_wrapper
 
 2DGS
@@ -41,6 +39,4 @@ Below are the basic functions that supports the rasterization.
 
 .. autofunction:: rasterize_to_indices_in_range_2dgs
 
-.. autofunction:: accumulate_2dgs
-
 .. autofunction:: rasterization_2dgs_inria_wrapper
diff --git a/examples/image_fitting.py b/examples/image_fitting.py
@@ -10,7 +10,7 @@
 from PIL import Image
 from torch import Tensor, optim
 
-from gsplat import rasterization, rasterization_2dgs
+from gsplat import torch_acc, rasterization, rasterization_2dgs
 
 
 class SimpleTrainer:
@@ -21,7 +21,7 @@ def __init__(
         gt_image: Tensor,
         num_points: int = 2000,
     ):
-        self.device = torch.device("cuda:0")
+        self.device = torch_acc._get_device(0)
         self.gt_image = gt_image.to(device=self.device)
         self.num_points = num_points
 
@@ -117,13 +117,13 @@ def train(
                 packed=False,
             )[0]
             out_img = renders[0]
-            torch.cuda.synchronize()
+            torch_acc.synchronize()
             times[0] += time.time() - start
             loss = mse_loss(out_img, self.gt_image)
             optimizer.zero_grad()
             start = time.time()
             loss.backward()
-            torch.cuda.synchronize()
+            torch_acc.synchronize()
             times[1] += time.time() - start
             optimizer.step()
             print(f"Iteration {iter + 1}/{iterations}, Loss: {loss.item()}")

diff --git a/examples/requirements.txt b/examples/requirements.txt
@@ -19,6 +19,6 @@ tensorboard
 tensorly
 pyyaml
 matplotlib
-git+https://github.com/rahul-goel/fused-ssim@328dc9836f513d00c4b5bc38fe30478b4435cbb5
-git+https://github.com/harry7557558/fused-bilagrid@90f9788e57d3545e3a033c1038bb9986549632fe
+#git+https://github.com/rahul-goel/fused-ssim@328dc9836f513d00c4b5bc38fe30478b4435cbb5
+#git+https://github.com/harry7557558/fused-bilagrid@90f9788e57d3545e3a033c1038bb9986549632fe
 splines
diff --git a/examples/simple_trainer.py b/examples/simple_trainer.py
@@ -21,7 +21,7 @@
     generate_interpolated_path,
     generate_spiral_path,
 )
-from fused_ssim import fused_ssim
+from fusedssim_sycl import fusedssim
 from torch import Tensor
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.utils.tensorboard import SummaryWriter
@@ -30,7 +30,7 @@
 from typing_extensions import Literal, assert_never
 from utils import AppearanceOptModule, CameraOptModule, knn, rgb_to_sh, set_random_seed
 
-from gsplat import export_splats
+from gsplat import export_splats, torch_acc
 from gsplat.compression import PngCompression
 from gsplat.distributed import cli
 from gsplat.optimizers import SelectiveAdam
@@ -79,13 +79,13 @@ class Config:
     # Number of training steps
     max_steps: int = 30_000
     # Steps to evaluate the model
-    eval_steps: List[int] = field(default_factory=lambda: [7_000, 30_000])
+    eval_steps: List[int] = field(default_factory=lambda: [2_000, 7_000, 30_000])
     # Steps to save the model
-    save_steps: List[int] = field(default_factory=lambda: [7_000, 30_000])
+    save_steps: List[int] = field(default_factory=lambda: [2_000, 7_000, 30_000])
     # Whether to save ply file (storage size can be large)
     save_ply: bool = False
     # Steps to save the model as ply
-    ply_steps: List[int] = field(default_factory=lambda: [7_000, 30_000])
+    ply_steps: List[int] = field(default_factory=lambda: [2_000, 7_000, 30_000])
     # Whether to disable video generation during training and evaluation
     disable_video: bool = False
 
@@ -227,7 +227,7 @@ def create_splats_with_optimizers(
     visible_adam: bool = False,
     batch_size: int = 1,
     feature_dim: Optional[int] = None,
-    device: str = "cuda",
+    device: str = torch_acc._device(0).type,
     world_rank: int = 0,
     world_size: int = 1,
 ) -> Tuple[torch.nn.ParameterDict, Dict[str, torch.optim.Optimizer]]:
@@ -312,7 +312,7 @@ def __init__(
         self.world_rank = world_rank
         self.local_rank = local_rank
         self.world_size = world_size
-        self.device = f"cuda:{local_rank}"
+        self.device = str(torch_acc._device(local_rank))
 
         # Where to dump results.
         os.makedirs(cfg.result_dir, exist_ok=True)
@@ -681,7 +681,7 @@ def train(self):
 
             # loss
             l1loss = F.l1_loss(colors, pixels)
-            ssimloss = 1.0 - fused_ssim(
+            ssimloss = 1.0 - fusedssim(
                 colors.permute(0, 3, 1, 2), pixels.permute(0, 3, 1, 2), padding="valid"
             )
             loss = l1loss * (1.0 - cfg.ssim_lambda) + ssimloss * cfg.ssim_lambda
@@ -735,7 +735,7 @@ def train(self):
             #     )
 
             if world_rank == 0 and cfg.tb_every > 0 and step % cfg.tb_every == 0:
-                mem = torch.cuda.max_memory_allocated() / 1024**3
+                mem = torch_acc.max_memory_allocated() / 1024**3
                 self.writer.add_scalar("train/loss", loss.item(), step)
                 self.writer.add_scalar("train/l1loss", l1loss.item(), step)
                 self.writer.add_scalar("train/ssimloss", ssimloss.item(), step)
@@ -753,7 +753,7 @@ def train(self):
 
             # save checkpoint before updating the model
             if step in [i - 1 for i in cfg.save_steps] or step == max_steps - 1:
-                mem = torch.cuda.max_memory_allocated() / 1024**3
+                mem = torch_acc.max_memory_allocated() / 1024**3
                 stats = {
                     "mem": mem,
                     "ellipse_time": time.time() - global_tic,
@@ -923,7 +923,7 @@ def eval(self, step: int, stage: str = "val"):
             masks = data["mask"].to(device) if "mask" in data else None
             height, width = pixels.shape[1:3]
 
-            torch.cuda.synchronize()
+            torch_acc.synchronize()
             tic = time.time()
             colors, _, _ = self.rasterize_splats(
                 camtoworlds=camtoworlds,
@@ -935,7 +935,7 @@ def eval(self, step: int, stage: str = "val"):
                 far_plane=cfg.far_plane,
                 masks=masks,
             )  # [1, H, W, 3]
-            torch.cuda.synchronize()
+            torch_acc.synchronize()
             ellipse_time += max(time.time() - tic, 1e-10)
 
             colors = torch.clamp(colors, 0.0, 1.0)
@@ -1177,6 +1177,37 @@ def main(local_rank: int, world_rank, world_size: int, cfg: Config):
         step = ckpts[0]["step"]
         runner.eval(step=step)
         runner.render_traj(step=step)
+        if cfg.save_ply:
+            if runner.cfg.app_opt:
+                # eval at origin to bake the appearance into the colors
+                rgb = runner.app_module(
+                    features=runner.splats["features"],
+                    embed_ids=None,
+                    dirs=torch.zeros_like(runner.splats["means"][None, :, :]),
+                    sh_degree=runner.cfg.sh_degree,
+                )
+                rgb = rgb + runner.splats["colors"]
+                rgb = torch.sigmoid(rgb).squeeze(0).unsqueeze(1)
+                sh0 = rgb_to_sh(rgb)
+                shN = torch.empty([sh0.shape[0], 0, 3], device=sh0.device)
+            else:
+                sh0 = runner.splats["sh0"]
+                shN = runner.splats["shN"]
+
+            means = runner.splats["means"]
+            scales = runner.splats["scales"]
+            quats = runner.splats["quats"]
+            opacities = runner.splats["opacities"]
+            export_splats(
+                means=means,
+                scales=scales,
+                quats=quats,
+                opacities=opacities,
+                sh0=sh0,
+                shN=shN,
+                format="ply",
+                save_to=f"{cfg.result_dir}/point_cloud_{step}.ply",
+            )
         if cfg.compression is not None:
             runner.run_compression(step=step)
     else:

diff --git a/examples/simple_trainer_2dgs.py b/examples/simple_trainer_2dgs.py
@@ -29,6 +29,7 @@
     rgb_to_sh,
     set_random_seed,
 )
+from gsplat import torch_acc
 from gsplat_viewer_2dgs import GsplatViewer, GsplatRenderTabState
 from gsplat.rendering import rasterization_2dgs, rasterization_2dgs_inria_wrapper
 from gsplat.strategy import DefaultStrategy
@@ -194,7 +195,7 @@ def create_splats_with_optimizers(
     sparse_grad: bool = False,
     batch_size: int = 1,
     feature_dim: Optional[int] = None,
-    device: str = "cuda",
+    device: str = torch_acc._device(0).type,
 ) -> Tuple[torch.nn.ParameterDict, Dict[str, torch.optim.Optimizer]]:
     if init_type == "sfm":
         points = torch.from_numpy(parser.points).float()
@@ -257,7 +258,7 @@ def __init__(self, cfg: Config) -> None:
         set_random_seed(42)
 
         self.cfg = cfg
-        self.device = "cuda"
+        self.device = torch_acc._device(0).type
 
         # Where to dump results.
         os.makedirs(cfg.result_dir, exist_ok=True)
@@ -650,7 +651,7 @@ def train(self):
             pbar.set_description(desc)
 
             if cfg.tb_every > 0 and step % cfg.tb_every == 0:
-                mem = torch.cuda.max_memory_allocated() / 1024**3
+                mem = torch_acc.max_memory_allocated() / 1024**3
                 self.writer.add_scalar("train/loss", loss.item(), step)
                 self.writer.add_scalar("train/l1loss", l1loss.item(), step)
                 self.writer.add_scalar("train/ssimloss", ssimloss.item(), step)
@@ -712,7 +713,7 @@ def train(self):
 
             # save checkpoint
             if step in [i - 1 for i in cfg.save_steps] or step == max_steps - 1:
-                mem = torch.cuda.max_memory_allocated() / 1024**3
+                mem = torch_acc.max_memory_allocated() / 1024**3
                 stats = {
                     "mem": mem,
                     "ellipse_time": time.time() - global_tic,
@@ -765,7 +766,7 @@ def eval(self, step: int):
             pixels = data["image"].to(device) / 255.0
             height, width = pixels.shape[1:3]
 
-            torch.cuda.synchronize()
+            torch_acc.synchronize()
             tic = time.time()
             (
                 colors,
@@ -787,7 +788,7 @@ def eval(self, step: int):
             )  # [1, H, W, 3]
             colors = torch.clamp(colors, 0.0, 1.0)
             colors = colors[..., :3]  # Take RGB channels
-            torch.cuda.synchronize()
+            torch_acc.synchronize()
             ellipse_time += max(time.time() - tic, 1e-10)
 
             # write images

diff --git a/examples/simple_viewer.py b/examples/simple_viewer.py
@@ -20,7 +20,7 @@
 
 def main(local_rank: int, world_rank, world_size: int, args):
     torch.manual_seed(42)
-    device = torch.device("cuda", local_rank)
+    device = torch.device(local_rank)
 
     if args.ckpt is None:
         (

diff --git a/gsplat/__init__.py b/gsplat/__init__.py
@@ -1,9 +1,12 @@
-import warnings
+import os
+import sys
+import torch
 
-from .compression import PngCompression
-from .cuda._torch_impl import accumulate
-from .cuda._torch_impl_2dgs import accumulate_2dgs
-from .cuda._wrapper import (
+BACKEND: str = ""
+torch_acc = torch.cpu
+_force_backend = os.getenv("GSPLAT_BACKEND", "").lower()
+
+from .cuda._wrapper import (  # Default to CUDA imports, works even if no CUDA is available
     RollingShutterType,
     fully_fused_projection,
     fully_fused_projection_2dgs,
@@ -20,6 +23,44 @@
     spherical_harmonics,
     world_to_cam,
 )
+
+if _force_backend == "cuda" or (_force_backend == "" and torch.cuda.is_available()):
+    BACKEND = "cuda"
+    torch_acc = torch.cuda
+    print("gsplat: Using CUDA backend.", file=sys.stderr)
+    # Functions already imported above
+
+# `not BACKEND` must guard auto-detection too, so CUDA is never overridden.
+if not BACKEND and (
+    _force_backend in ("sycl", "xpu")
+    or _force_backend == ""
+    and hasattr(torch, "xpu")
+    and torch.xpu.is_available()
+):
+    from .sycl._wrapper import (  # Overwrite imports for SYCL backend
+        RollingShutterType,
+        fully_fused_projection,
+        fully_fused_projection_2dgs,
+        fully_fused_projection_with_ut,
+        isect_offset_encode,
+        isect_tiles,
+        proj,
+        quat_scale_to_covar_preci,
+        rasterize_to_indices_in_range,
+        rasterize_to_indices_in_range_2dgs,
+        rasterize_to_pixels,
+        rasterize_to_pixels_2dgs,
+        rasterize_to_pixels_eval3d,
+        spherical_harmonics,
+        world_to_cam,
+    )
+
+    BACKEND = "sycl"
+    torch_acc = torch.xpu
+    print("gsplat: Using SYCL XPU backend.", file=sys.stderr)
+
+
+from .compression import PngCompression
 from .exporter import export_splats
 from .optimizers import SelectiveAdam
 from .rendering import (
@@ -31,7 +72,10 @@
 from .strategy import DefaultStrategy, MCMCStrategy, Strategy
 from .version import __version__
 
-all = [
+
+__all__ = [
+    "BACKEND",
+    "torch_acc",
     "PngCompression",
     "DefaultStrategy",
     "MCMCStrategy",
@@ -47,16 +91,16 @@
     "quat_scale_to_covar_preci",
     "rasterize_to_pixels",
     "world_to_cam",
-    "accumulate",
     "rasterize_to_indices_in_range",
     "fully_fused_projection_2dgs",
     "rasterize_to_pixels_2dgs",
     "rasterize_to_indices_in_range_2dgs",
-    "accumulate_2dgs",
     "rasterization_2dgs_inria_wrapper",
     "RollingShutterType",
     "fully_fused_projection_with_ut",
     "rasterize_to_pixels_eval3d",
     "export_splats",
     "__version__",
+    "SelectiveAdam",
+    # Note: accumulate and accumulate_2dgs are not typically part of the public API
 ]