Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions docs/source/setup/installation/uv_run.rst
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,33 @@ Clone the repo and start training immediately — no virtual environment setup r
``uv`` resolves and manages the environment automatically on each invocation. Supported
libraries for ``--rl_library`` are: ``rsl_rl``, ``rl_games``, ``skrl``, ``sb3``, and ``rlinf``.

Multi-GPU Training
------------------

Use ``train_multigpu`` for torch distributed training. It defaults to ``rsl_rl``, launches one
process per visible GPU, adds ``--distributed`` automatically, and forwards the remaining
arguments to the selected training library:

.. code-block:: bash

   uv run train_multigpu \
       --task Isaac-Dexsuite-Kuka-Allegro-Reorient-v0 \
       --headless --num_envs 4096 --max_iterations 100 \
       --run_name gpu4_vis presets=newton

Override the GPU count or torch distributed settings when needed:

.. code-block:: bash

   uv run train_multigpu --num_gpus 4 --master_port 29504 \
       --task Isaac-Dexsuite-Kuka-Allegro-Reorient-v0 \
       --headless --num_envs 4096 --max_iterations 100 \
       --run_name gpu4_vis presets=newton

Use ``--rl_library`` to select the distributed-capable library: ``rsl_rl`` (the default), ``rl_games``, or ``skrl``.
For multi-node jobs, pass torchrun settings such as ``--nnodes``, ``--node_rank``,
Copy link
Copy Markdown
Contributor

@Toni-SM Toni-SM May 18, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about if I want to run multi-gpu with JAX: https://isaac-sim.github.io/IsaacLab/main/source/features/multi_gpu.html#jax-implementation ?

Also, in the JAX multi-GPU setup, parameters such as rdzv_backend, rdzv_endpoint and rdzv_id do not exist

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hey @Toni-SM, good question! There's a couple choices we can try. We can detect in the arguments if the user has skrl + jax and modify the command for them. We could also create a special simple argument for jax (--jax) that can be used in combination with --rl_library skrl. For args validation, we can add that as well, that's what makes this new entry point script really strong, we can do very quick early parsing and make sure the correct args are there and error out very early if they arent.

Which option would you prefer?

``--rdzv_backend``, ``--rdzv_endpoint``, and ``--rdzv_id`` before the training arguments.

Play / Evaluation
-----------------

Expand Down
150 changes: 150 additions & 0 deletions scripts/reinforcement_learning/train_multigpu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md).
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause

"""Multi-GPU training entrypoint for Isaac Lab reinforcement learning workflows."""

from __future__ import annotations

import argparse
import shlex
import subprocess
import sys
from pathlib import Path

# Absolute directory that contains this launcher script.
SCRIPT_DIR = Path(__file__).resolve().parents[0]
# Single-process training script that torchrun executes in every worker.
TRAIN_SCRIPT = SCRIPT_DIR.joinpath("train.py")

# Values accepted by the launcher's ``--rl_library`` option.
DISTRIBUTED_LIBRARIES = ("rl_games", "rsl_rl", "skrl")


def _parse_args(argv: list[str]) -> tuple[argparse.Namespace, list[str]]:
    """Split the command line into launcher options and forwarded training arguments.

    Args:
        argv: Command-line tokens, excluding the program name.

    Returns:
        A pair ``(args_cli, train_args)``: the namespace of options this launcher recognizes, and
        the tokens it does not recognize, with a single leading ``--`` separator removed.
    """
    usage_epilog = (
        "Examples:\n"
        " train_multigpu --num_gpus 4 --task Isaac-Cartpole-v0 --headless\n"
        " train_multigpu --rl_library skrl --num_gpus 2 --task Isaac-Cartpole-v0 --headless\n"
        "\n"
        "All unrecognized arguments are forwarded to the selected training library."
    )
    parser = argparse.ArgumentParser(
        description="Launch multi-GPU RL training with torch.distributed.run.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        allow_abbrev=False,
        epilog=usage_epilog,
    )
    add_option = parser.add_argument

    add_option(
        "--rl_library",
        choices=DISTRIBUTED_LIBRARIES,
        default="rsl_rl",
        help="Distributed-capable training library to use. Defaults to rsl_rl.",
    )
    add_option(
        "--num_gpus",
        "--nproc_per_node",
        dest="nproc_per_node",
        default="gpu",
        help=(
            "Number of trainer processes to launch on each node. Accepts an integer or torchrun values "
            "'gpu', 'cpu', and 'auto'. Defaults to 'gpu'."
        ),
    )

    # Plain pass-through options: each takes one value and stays ``None`` when not supplied.
    # Registration order is kept so the generated ``--help`` text lists them in the same sequence.
    for flag, description in (
        ("--nnodes", "Number of nodes to use for distributed training."),
        ("--node_rank", "Rank of this node in a multi-node job."),
        ("--master_addr", "Master node address for static rendezvous."),
        ("--master_port", "Master node port for static rendezvous."),
        ("--rdzv_backend", "Rendezvous backend used by torchrun."),
        ("--rdzv_endpoint", "Rendezvous endpoint used by torchrun."),
        ("--rdzv_id", "User-defined rendezvous id used by torchrun."),
        ("--max_restarts", "Maximum worker group restarts before failing."),
        ("--monitor_interval", "Worker monitor interval [s]."),
    ):
        add_option(flag, default=None, help=description)

    add_option(
        "--start_method",
        choices=("spawn", "fork", "forkserver"),
        default=None,
        help="Multiprocessing start method used by torchrun.",
    )

    for flag, description in (
        ("--role", "User-defined worker role used by torchrun."),
        ("--tee", "Tee selected worker stdout/stderr streams."),
        ("--redirects", "Redirect selected worker stdout/stderr streams."),
        ("--local_ranks_filter", "Only show logs from the listed local ranks."),
        ("--log_dir", "Directory used by torchrun for worker logs."),
    ):
        add_option(flag, default=None, help=description)

    add_option("--dry_run", action="store_true", help="Print the torchrun command without launching it.")

    args_cli, train_args = parser.parse_known_args(argv)
    # Drop an explicit ``--`` separator so it is not forwarded to the training script.
    if train_args and train_args[0] == "--":
        train_args = train_args[1:]
    return args_cli, train_args


def _append_optional_torchrun_arg(command: list[str], args_cli: argparse.Namespace, name: str) -> None:
    """Add ``--<name> <value>`` to ``command`` unless the option is unset.

    Args:
        command: Argument list under construction; it is extended in place.
        args_cli: Parsed launcher options from which the value is read.
        name: Attribute name on ``args_cli``; also used as the flag name.
    """
    option_value = getattr(args_cli, name)
    if option_value is None:
        # ``None`` means the option was not given; leave the command untouched.
        return
    command += [f"--{name}", str(option_value)]


def _with_distributed_arg(train_args: list[str]) -> list[str]:
    """Return training arguments that are guaranteed to contain ``--distributed``.

    When the flag is already present the input list itself is returned; otherwise a new list
    with the flag placed first is returned and the input is left unmodified.
    """
    flag = "--distributed"
    return train_args if flag in train_args else [flag, *train_args]


def _build_torchrun_command(args_cli: argparse.Namespace, train_args: list[str]) -> list[str]:
    """Assemble the full ``python -m torch.distributed.run ...`` argument list.

    Args:
        args_cli: Parsed launcher options.
        train_args: Tokens to forward to the training script.

    Returns:
        The command as a list: interpreter and torchrun module, torchrun options, the training
        script path, ``--rl_library <name>``, then the training arguments including ``--distributed``.
    """
    # Emitted in this order, and only for options that were actually supplied.
    optional_flag_names = (
        "nnodes",
        "node_rank",
        "master_addr",
        "master_port",
        "rdzv_backend",
        "rdzv_endpoint",
        "rdzv_id",
        "max_restarts",
        "monitor_interval",
        "start_method",
        "role",
        "tee",
        "redirects",
        "local_ranks_filter",
        "log_dir",
    )

    launcher_part = [sys.executable, "-m", "torch.distributed.run"]
    launcher_part += ["--nproc_per_node", str(args_cli.nproc_per_node)]
    for flag_name in optional_flag_names:
        _append_optional_torchrun_arg(launcher_part, args_cli, flag_name)

    # Everything from the script path onwards is consumed by the training script, not torchrun.
    trainer_part = [
        str(TRAIN_SCRIPT),
        "--rl_library",
        args_cli.rl_library,
        *_with_distributed_arg(train_args),
    ]
    return launcher_part + trainer_part


def main(argv: list[str] | None = None) -> int:
    """Launch multi-GPU training with ``torch.distributed.run``.

    Args:
        argv: Command-line tokens without the program name. Defaults to ``sys.argv[1:]``.

    Returns:
        ``0`` after a dry run; otherwise the return code of the launched torchrun process.
    """
    if argv is None:
        argv = sys.argv[1:]

    args_cli, train_args = _parse_args(argv)
    command = _build_torchrun_command(args_cli, train_args)
    rendered_command = shlex.join(command)

    if args_cli.dry_run:
        print(rendered_command)
        return 0

    # Flush before spawning the child: when stdout is a pipe or file it is block-buffered, and
    # without the flush this line would only be written after all of the child's output.
    print(f"[INFO] Launching distributed training with: {rendered_command}", flush=True)
    # ``check=False``: a failing trainer is reported through the return code, not an exception.
    return subprocess.run(command, check=False).returncode


if __name__ == "__main__":
    # Use the launcher's return value as the process exit status.
    sys.exit(main())
5 changes: 5 additions & 0 deletions source/isaaclab/changelog.d/train-multigpu-entrypoint.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Added
^^^^^

* Added the ``train_multigpu`` entry point for launching distributed RL training with
``torch.distributed.run``.
20 changes: 17 additions & 3 deletions source/isaaclab/isaaclab/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,15 @@ def train(args: list[str] | None = None) -> None:
run_python_command(ISAACLAB_ROOT / "scripts" / "reinforcement_learning" / "train.py", args, check=True)


def train_multigpu(args: list[str] | None = None) -> None:
    """Dispatch to ``scripts/reinforcement_learning/train_multigpu.py``.

    Args:
        args: Arguments handed to the script. When ``None``, ``sys.argv[1:]`` is used.
    """
    script_path = ISAACLAB_ROOT / "scripts" / "reinforcement_learning" / "train_multigpu.py"
    forwarded_args = sys.argv[1:] if args is None else args
    run_python_command(script_path, forwarded_args, check=True)


def play(args: list[str] | None = None) -> None:
"""Run the unified reinforcement learning play script."""
if args is None:
Expand All @@ -44,6 +53,9 @@ def cli() -> None:
if len(sys.argv) > 1 and sys.argv[1] == "train":
train(sys.argv[2:])
return
if len(sys.argv) > 1 and sys.argv[1] == "train_multigpu":
train_multigpu(sys.argv[2:])
return
if len(sys.argv) > 1 and sys.argv[1] == "play":
play(sys.argv[2:])
return
Expand All @@ -56,8 +68,9 @@ def cli() -> None:
formatter_class=argparse.RawTextHelpFormatter,
epilog=(
"commands:\n"
" train Run scripts/reinforcement_learning/train.py\n"
" play Run scripts/reinforcement_learning/play.py"
" train Run scripts/reinforcement_learning/train.py\n"
" train_multigpu Run scripts/reinforcement_learning/train_multigpu.py\n"
" play Run scripts/reinforcement_learning/play.py"
),
)

Expand All @@ -73,7 +86,8 @@ def cli() -> None:
"Accepts a comma-separated list of submodule names, one of the RL frameworks, or a special value.\n"
"\n"
f"* Isaac Lab submodules: {_submodules_str}\n"
" Any submodule accepts an editable selector, e.g. visualizers[all|kit|newton|rerun|viser], rl[rsl_rl|skrl].\n"
" Any submodule accepts an editable selector, e.g. "
"visualizers[all|kit|newton|rerun|viser], rl[rsl_rl|skrl].\n"
"\n"
f"* RL frameworks: {_frameworks_str}\n"
" Passing an RL framework name installs all Isaac Lab submodules + that framework.\n"
Expand Down
1 change: 1 addition & 0 deletions source/isaaclab/isaaclab/cli/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
# Short script names supported by ``isaaclab -p``.
_PYTHON_SCRIPT_ALIASES = {
    # Every alias resolves to the file of the same name under scripts/reinforcement_learning.
    script_name: ISAACLAB_ROOT / "scripts" / "reinforcement_learning" / script_name
    for script_name in ("train.py", "train_multigpu.py", "play.py")
}

Expand Down
1 change: 1 addition & 0 deletions source/isaaclab/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@
"isaaclab=isaaclab.cli:cli",
"play=isaaclab.cli:play",
"train=isaaclab.cli:train",
"train_multigpu=isaaclab.cli:train_multigpu",
],
},
dependency_links=PYTORCH_INDEX_URL,
Expand Down
133 changes: 133 additions & 0 deletions source/isaaclab/test/cli/test_train_multigpu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md).
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause

"""Tests for the multi-GPU training launcher."""

import importlib.util
import subprocess
import sys
from pathlib import Path
from unittest import mock

from isaaclab.cli.utils import ISAACLAB_ROOT


def _load_train_multigpu_module():
    """Import ``train_multigpu.py`` from its file path and return the module object.

    The module is registered in ``sys.modules`` under a test-specific name before its code is
    executed.
    """
    script_file = ISAACLAB_ROOT / "scripts" / "reinforcement_learning" / "train_multigpu.py"
    test_module_name = "isaaclab_test_train_multigpu"

    module_spec = importlib.util.spec_from_file_location(test_module_name, script_file)
    assert module_spec is not None
    assert module_spec.loader is not None

    loaded_module = importlib.util.module_from_spec(module_spec)
    sys.modules[module_spec.name] = loaded_module
    module_spec.loader.exec_module(loaded_module)
    return loaded_module


# Load the launcher script once at import time; the tests below all use this module object.
TRAIN_MULTIGPU = _load_train_multigpu_module()


def test_builds_single_node_rsl_rl_torchrun_command():
    """A single-node launch keeps the user's training args and adds ``--distributed``."""
    forwarded_tail = [
        "--headless",
        "--num_envs=4096",
        "--max_iterations=100",
        "--run_name=gpu4_vis",
        "presets=newton",
    ]
    launcher_argv = [
        "--num_gpus",
        "4",
        "--master_port",
        "29504",
        "--task=Isaac-Dexsuite-Kuka-Allegro-Reorient-v0",
        *forwarded_tail,
    ]

    args_cli, train_args = TRAIN_MULTIGPU._parse_args(launcher_argv)
    command = TRAIN_MULTIGPU._build_torchrun_command(args_cli, train_args)
    script_pos = command.index(str(TRAIN_MULTIGPU.TRAIN_SCRIPT))

    expected_prefix = [sys.executable, "-m", "torch.distributed.run", "--nproc_per_node", "4"]
    assert command[:5] == expected_prefix
    assert command[5:7] == ["--master_port", "29504"]
    # ``rsl_rl`` is the default library, and the distributed flag is injected right after it.
    assert command[script_pos + 1 : script_pos + 4] == ["--rl_library", "rsl_rl", "--distributed"]
    assert command[-5:] == forwarded_tail


def test_builds_multi_node_skrl_torchrun_command():
    """Multi-node torchrun options must be placed ahead of the training script."""
    multi_node_options = [
        "--nnodes",
        "2",
        "--node_rank",
        "1",
        "--rdzv_backend",
        "c10d",
        "--rdzv_endpoint",
        "host.example.com:5555",
        "--rdzv_id",
        "job-1",
    ]
    launcher_argv = [
        "--rl_library",
        "skrl",
        "--nproc_per_node",
        "2",
        *multi_node_options,
        "--task",
        "Isaac-Cartpole-v0",
        "--distributed",
    ]

    args_cli, train_args = TRAIN_MULTIGPU._parse_args(launcher_argv)
    command = TRAIN_MULTIGPU._build_torchrun_command(args_cli, train_args)
    script_pos = command.index(str(TRAIN_MULTIGPU.TRAIN_SCRIPT))

    expected_prefix = [sys.executable, "-m", "torch.distributed.run", "--nproc_per_node", "2"]
    assert command[:5] == expected_prefix
    assert command[5:script_pos] == multi_node_options
    assert command[script_pos + 1 : script_pos + 3] == ["--rl_library", "skrl"]
    # The user already passed the flag, so the launcher must not add a second copy.
    assert command.count("--distributed") == 1


def test_dry_run_prints_command_without_launching(capsys):
    """``--dry_run`` prints the command line and never calls ``subprocess.run``."""
    dry_run_argv = ["--dry_run", "--num_gpus", "2", "--task", "Isaac-Cartpole-v0"]

    with mock.patch.object(subprocess, "run") as fake_run:
        exit_code = TRAIN_MULTIGPU.main(dry_run_argv)

    assert exit_code == 0
    fake_run.assert_not_called()

    printed = capsys.readouterr().out
    expected_fragments = (
        "torch.distributed.run",
        "--nproc_per_node 2",
        "--distributed --task Isaac-Cartpole-v0",
    )
    for fragment in expected_fragments:
        assert fragment in printed


def test_cli_helper_runs_multigpu_script():
    """``cli.train_multigpu`` must hand its arguments to the multi-GPU training script."""
    from isaaclab import cli

    expected_script = Path(ISAACLAB_ROOT) / "scripts" / "reinforcement_learning" / "train_multigpu.py"
    cli_args = ["--dry_run"]

    with mock.patch("isaaclab.cli.run_python_command") as fake_run:
        cli.train_multigpu(cli_args)

    fake_run.assert_called_once_with(expected_script, ["--dry_run"], check=True)