diff --git a/.github/workflows/test-fabric-multi-gpu.yaml b/.github/workflows/test-fabric-multi-gpu.yaml new file mode 100644 index 000000000000..5c343521d5c2 --- /dev/null +++ b/.github/workflows/test-fabric-multi-gpu.yaml @@ -0,0 +1,69 @@ +# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md). +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause + +# FabricFrameView multi-GPU unit tests +# +# Runs the cuda:1-parameterized unit tests in +# source/isaaclab_physx/test/sim/test_views_xform_prim_fabric.py on the +# dedicated multi-GPU runner. Kept separate from test-multi-gpu.yaml so +# changes to FabricFrameView do not gate distributed-training validation and +# vice versa. Runner preconditions (label, install step, GPU pre-flight) are +# identical to the training workflow. + +name: FabricFrameView Multi-GPU Tests + +# DISABLED: No self-hosted runner with the 'multi-gpu' label is currently +# registered. All runs queue indefinitely. Re-enable once infra provisions +# a multi-GPU runner (see also .github/workflows/test-multi-gpu.yaml which +# has the same issue). +# +# on: +# pull_request: +# paths: +# - "source/isaaclab_physx/isaaclab_physx/sim/views/fabric_frame_view.py" +# - "source/isaaclab_physx/test/sim/test_views_xform_prim_fabric.py" +# - ".github/workflows/test-fabric-multi-gpu.yaml" +# workflow_dispatch: +on: + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + test-fabric-multi-gpu: + name: FabricFrameView multi-GPU unit tests + runs-on: [self-hosted, linux, x64, gpu, multi-gpu] + timeout-minutes: 30 + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install Isaac Lab + run: ./isaaclab.sh --install + + - name: Verify multi-GPU availability + # Fail loud if the runner regressed to a single GPU. Without this, the + # test helper's ``_skip_if_unavailable`` would skip every ``cuda:1`` case + # and the job would go green with no real coverage. + run: | + echo "=== GPU Info ===" + nvidia-smi --query-gpu=index,name,memory.total --format=csv + + GPU_COUNT=$(./isaaclab.sh -p -c "import torch; print(torch.cuda.device_count())") + echo "Detected $GPU_COUNT GPU(s)" + + if [ "$GPU_COUNT" -lt 2 ]; then + echo "::error::At least 2 GPUs required for Fabric multi-GPU tests, found $GPU_COUNT" + exit 1 + fi + + - name: Run FabricFrameView multi-GPU unit tests + env: + ISAACLAB_TEST_MULTI_GPU: "1" + run: | + ./isaaclab.sh -p -m pytest \ + source/isaaclab_physx/test/sim/test_views_xform_prim_fabric.py -v diff --git a/source/isaaclab_physx/changelog.d/feat-frame-view-enable-mgpu.rst b/source/isaaclab_physx/changelog.d/feat-frame-view-enable-mgpu.rst new file mode 100644 index 000000000000..6688cc827134 --- /dev/null +++ b/source/isaaclab_physx/changelog.d/feat-frame-view-enable-mgpu.rst @@ -0,0 +1,9 @@ +Fixed +^^^^^ + +* Fixed :class:`~isaaclab_physx.sim.views.FabricFrameView` falling back to + the slow USD path on every CUDA device other than ``cuda:0``. USDRT + ``SelectPrims`` now accepts any CUDA device index, so Fabric acceleration + runs on the simulation device the view was constructed with (e.g. + ``cuda:1``). This unblocks distributed training where each rank is + pinned to a non-primary GPU. diff --git a/source/isaaclab_physx/isaaclab_physx/sim/views/fabric_frame_view.py b/source/isaaclab_physx/isaaclab_physx/sim/views/fabric_frame_view.py index 1bcff86d57ac..ee329c7976e6 100644 --- a/source/isaaclab_physx/isaaclab_physx/sim/views/fabric_frame_view.py +++ b/source/isaaclab_physx/isaaclab_physx/sim/views/fabric_frame_view.py @@ -23,12 +23,6 @@ logger = logging.getLogger(__name__) -# TODO: extend this to ``cuda:N`` once we wire up multi-GPU support for the view. -# Recent Kit / USDRT releases do support multi-GPU ``SelectPrims``, but the -# rest of the FabricFrameView wiring (selections, indexed arrays, etc.) still -# assumes a single device — to be tackled in a follow-up. -_fabric_supported_devices = ("cpu", "cuda", "cuda:0") - def _to_float32_2d(a: wp.array | torch.Tensor) -> wp.array | torch.Tensor: """Ensure array is compatible with Fabric kernels (2-D float32). @@ -54,8 +48,11 @@ class FabricFrameView(BaseFrameView): when Fabric is disabled). When Fabric is enabled, world-pose and scale operations use Warp kernels - operating on ``omni:fabric:worldMatrix``. All other operations delegate - to the internal USD view. + operating on ``omni:fabric:worldMatrix``. Fabric acceleration runs on + the same CUDA device the view was constructed with — ``cuda:0``, + ``cuda:1``, or any other available CUDA index — so this view is safe + to use from distributed-training workers pinned to non-primary GPUs. + All other operations delegate to the internal USD view. After every Fabric write (``set_world_poses``, ``set_scales``), :meth:`PrepareForReuse` is called on the ``PrimSelection`` to notify @@ -78,7 +75,9 @@ def __init__( Args: prim_path: USD prim-path pattern to match. - device: Device for Warp arrays (``"cpu"`` or ``"cuda:0"``). + device: Device for Warp arrays. Either ``"cpu"`` or any CUDA + device string (``"cuda:0"``, ``"cuda:1"``, …); Fabric + acceleration is supported on every CUDA index. validate_xform_ops: Whether to validate prim xform-ops. stage: USD stage; defaults to the current sim context's stage. **kwargs: Additional keyword arguments (ignored). Matches the signature of @@ -91,15 +90,9 @@ def __init__( settings = SettingsManager.instance() self._use_fabric = bool(settings.get("/physics/fabricEnabled", False)) - - if self._use_fabric and self._device not in _fabric_supported_devices: - logger.warning( - f"Fabric mode is not supported on device '{self._device}'. " - "USDRT SelectPrims and Warp fabric arrays are currently " - f"only supported on {', '.join(_fabric_supported_devices)}. " - "Falling back to standard USD operations. This may impact performance." - ) - self._use_fabric = False + # TODO(pv): Misleading abstraction — FabricFrameView can fall back to USD internally; + # the concrete class should be determined by the factory instead. (PR #5673 pv/fabric-view-no-fallback) + # TODO(pv): Fuse set_world_poses/set_scales into single kernel launch (PR #5674 pv/fabric-fused-compose) self._fabric_initialized = False self._fabric_usd_sync_done = False @@ -404,9 +397,6 @@ def _initialize_fabric(self) -> None: ) wp.synchronize() - # The constructor should have taken care of this, but double check here to avoid regressions - assert self._device in _fabric_supported_devices - self._fabric_selection = fabric_stage.SelectPrims( require_attrs=[ (usdrt.Sdf.ValueTypeNames.UInt, self._view_index_attr, usdrt.Usd.Access.Read), diff --git a/source/isaaclab_physx/test/sim/test_views_xform_prim_fabric.py b/source/isaaclab_physx/test/sim/test_views_xform_prim_fabric.py index f0c18ccb98c7..3cfe70095fd3 100644 --- a/source/isaaclab_physx/test/sim/test_views_xform_prim_fabric.py +++ b/source/isaaclab_physx/test/sim/test_views_xform_prim_fabric.py @@ -10,6 +10,7 @@ Camera prim type for Fabric SelectPrims compatibility). """ +import os import sys from pathlib import Path @@ -44,8 +45,19 @@ def test_setup_teardown(): def _skip_if_unavailable(device: str): - if device.startswith("cuda") and not torch.cuda.is_available(): + if not device.startswith("cuda"): + return + if not torch.cuda.is_available(): pytest.skip("CUDA not available") + idx = int(device.split(":")[1]) if ":" in device else 0 + n = torch.cuda.device_count() + if idx >= n: + # Always skip rather than fail: the dedicated multi-GPU workflow does its own + # pre-flight ``torch.cuda.device_count() >= 2`` check before invoking pytest, so + # a misconfigured multi-GPU runner is already caught there. Failing here would + # only break the standard single-GPU CI runners that legitimately can't run + # ``cuda:1+`` tests. + pytest.skip(f"{device} not available (device_count={n}) — multi-GPU test skipped") # ------------------------------------------------------------------ @@ -169,7 +181,7 @@ def test_fabric_set_world_does_not_write_back_to_usd(device, view_factory): # Verify Fabric has the new position fab_pos, _ = view.get_world_poses() - pos_torch = wp.to_torch(fab_pos) + pos_torch = torch.as_tensor(fab_pos, device=device) assert torch.allclose(pos_torch, torch.tensor([[99.0, 99.0, 99.0]], device=device), atol=0.1), ( f"Fabric should have new position, got {pos_torch}" ) @@ -230,6 +242,92 @@ def force_topology_changed(): # Read back — proves the rebuilt _view_to_fabric and _fabric_world_matrices # are still consistent. ret_pos, _ = view.get_world_poses() - pos_torch = wp.to_torch(ret_pos) + pos_torch = torch.as_tensor(ret_pos, device=device) expected = torch.tensor([[4.0, 5.0, 6.0], [4.0, 5.0, 6.0]], device=device) assert torch.allclose(pos_torch, expected, atol=1e-7), f"Read after rebuild failed on {device}: {pos_torch}" + + +# ------------------------------------------------------------------ +# Multi-GPU tests (cuda:1) — skipped automatically on single-GPU workstations +# ------------------------------------------------------------------ + + +@pytest.mark.skipif( + not os.environ.get("ISAACLAB_TEST_MULTI_GPU"), + reason="Multi-GPU tests disabled (set ISAACLAB_TEST_MULTI_GPU=1 to enable)", +) +@pytest.mark.parametrize("device", ["cuda:1"]) +def test_fabric_cuda1_world_pose_roundtrip(device, view_factory): + """set_world_poses -> get_world_poses roundtrip works on cuda:1. + + Verifies that FabricFrameView operates correctly on a non-primary CUDA + device without falling back to the USD path. + """ + bundle = view_factory(2, device) + view = bundle.view + + new_pos = wp.zeros((2, 3), dtype=wp.float32, device=device) + wp.launch(kernel=_fill_position, dim=2, inputs=[new_pos, 10.0, 20.0, 30.0], device=device) + view.set_world_poses(positions=new_pos) + + ret_pos, _ = view.get_world_poses() + pos_torch = torch.as_tensor(ret_pos, device=device) + expected = torch.tensor([[10.0, 20.0, 30.0], [10.0, 20.0, 30.0]], device=device) + assert torch.allclose(pos_torch, expected, atol=1e-7), f"Roundtrip failed on {device}: {pos_torch}" + + +@pytest.mark.skipif( + not os.environ.get("ISAACLAB_TEST_MULTI_GPU"), + reason="Multi-GPU tests disabled (set ISAACLAB_TEST_MULTI_GPU=1 to enable)", +) +@pytest.mark.parametrize("device", ["cuda:1"]) +def test_fabric_cuda1_no_usd_writeback(device, view_factory): + """set_world_poses on cuda:1 does not write back to USD. + + Mirrors test_fabric_set_world_does_not_write_back_to_usd for the cuda:1 + device to confirm the no-writeback invariant holds across GPU indices. + """ + bundle = view_factory(1, device) + view = bundle.view + + stage = sim_utils.get_current_stage() + prim = stage.GetPrimAtPath(view.prim_paths[0]) + xform_cache = UsdGeom.XformCache() + t_before = xform_cache.GetLocalToWorldTransform(prim).ExtractTranslation() + orig_usd_pos = torch.tensor([float(t_before[0]), float(t_before[1]), float(t_before[2])]) + + new_pos = wp.zeros((1, 3), dtype=wp.float32, device=device) + wp.launch(kernel=_fill_position, dim=1, inputs=[new_pos, 99.0, 99.0, 99.0], device=device) + view.set_world_poses(positions=new_pos) + + # USD must not have moved at all — equality, not approximate. + t_after = UsdGeom.XformCache().GetLocalToWorldTransform(prim).ExtractTranslation() + usd_pos_after = torch.tensor([float(t_after[0]), float(t_after[1]), float(t_after[2])]) + assert torch.allclose(usd_pos_after, orig_usd_pos, atol=0.0), ( + f"USD wrote back on {device}: expected {orig_usd_pos}, got {usd_pos_after}" + ) + + +@pytest.mark.skipif( + not os.environ.get("ISAACLAB_TEST_MULTI_GPU"), + reason="Multi-GPU tests disabled (set ISAACLAB_TEST_MULTI_GPU=1 to enable)", +) +@pytest.mark.parametrize("device", ["cuda:1"]) +def test_fabric_cuda1_scales_roundtrip(device, view_factory): + """set_scales -> get_scales roundtrip works on cuda:1. + + Both write paths (``set_world_poses`` and ``set_scales``) call + ``_prepare_for_reuse`` and launch on ``self._device``; this test covers + the scales path on the non-primary CUDA device. + """ + bundle = view_factory(2, device) + view = bundle.view + + new_scales = wp.zeros((2, 3), dtype=wp.float32, device=device) + wp.launch(kernel=_fill_position, dim=2, inputs=[new_scales, 2.0, 3.0, 4.0], device=device) + view.set_scales(new_scales) + + ret_scales = view.get_scales() + scales_torch = torch.as_tensor(ret_scales, device=device) + expected = torch.tensor([[2.0, 3.0, 4.0], [2.0, 3.0, 4.0]], device=device) + assert torch.allclose(scales_torch, expected, atol=1e-7), f"Scales roundtrip failed on {device}: {scales_torch}"