diff --git a/docs/source/setup/installation/uv_run.rst b/docs/source/setup/installation/uv_run.rst
index 63429658166..018a49c8dfe 100644
--- a/docs/source/setup/installation/uv_run.rst
+++ b/docs/source/setup/installation/uv_run.rst
@@ -31,6 +31,33 @@ Clone the repo and start training immediately — no virtual environment setup r
 ``uv`` resolves and manages the environment automatically on each invocation.
 Supported libraries for ``--rl_library`` are: ``rsl_rl``, ``rl_games``, ``skrl``, ``sb3``, and ``rlinf``.
 
+Multi-GPU Training
+------------------
+
+Use ``train_multigpu`` for torch distributed training. It defaults to ``rsl_rl``, launches one
+process per visible GPU, adds ``--distributed`` automatically, and forwards the remaining
+arguments to the selected training library:
+
+.. code-block:: bash
+
+   uv run train_multigpu \
+       --task Isaac-Dexsuite-Kuka-Allegro-Reorient-v0 \
+       --headless --num_envs 4096 --max_iterations 100 \
+       --run_name gpu4_vis presets=newton
+
+Override the GPU count or torch distributed settings when needed:
+
+.. code-block:: bash
+
+   uv run train_multigpu --num_gpus 4 --master_port 29504 \
+       --task Isaac-Dexsuite-Kuka-Allegro-Reorient-v0 \
+       --headless --num_envs 4096 --max_iterations 100 \
+       --run_name gpu4_vis presets=newton
+
+Use ``--rl_library`` to select another distributed-capable library: ``rl_games`` or ``skrl`` (``rsl_rl`` is the default).
+For multi-node jobs, pass torchrun settings such as ``--nnodes``, ``--node_rank``,
+``--rdzv_backend``, ``--rdzv_endpoint``, and ``--rdzv_id`` before the training arguments.
+
 Play / Evaluation
 -----------------
 
diff --git a/scripts/reinforcement_learning/train_multigpu.py b/scripts/reinforcement_learning/train_multigpu.py
new file mode 100644
index 00000000000..1230c48b595
--- /dev/null
+++ b/scripts/reinforcement_learning/train_multigpu.py
@@ -0,0 +1,191 @@
+# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md).
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+"""Multi-GPU training entrypoint for Isaac Lab reinforcement learning workflows."""
+
+from __future__ import annotations
+
+import argparse
+import shlex
+import signal
+import subprocess
+import sys
+from pathlib import Path
+from types import FrameType
+
+SCRIPT_DIR = Path(__file__).resolve().parent
+TRAIN_SCRIPT = SCRIPT_DIR / "train.py"
+
+DISTRIBUTED_LIBRARIES = ("rl_games", "rsl_rl", "skrl")
+
+
+def _parse_args(argv: list[str]) -> tuple[argparse.Namespace, list[str]]:
+    """Parse multi-GPU launcher arguments and return forwarded training arguments."""
+    parser = argparse.ArgumentParser(
+        description="Launch multi-GPU RL training with torch.distributed.run.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        allow_abbrev=False,
+        epilog=(
+            "Examples:\n"
+            "  train_multigpu --num_gpus 4 --task Isaac-Cartpole-v0 --headless\n"
+            "  train_multigpu --rl_library skrl --num_gpus 2 --task Isaac-Cartpole-v0 --headless\n"
+            "\n"
+            "All unrecognized arguments are forwarded to the selected training library."
+        ),
+    )
+    parser.add_argument(
+        "--rl_library",
+        choices=DISTRIBUTED_LIBRARIES,
+        default="rsl_rl",
+        help="Distributed-capable training library to use. Defaults to rsl_rl.",
+    )
+    parser.add_argument(
+        "--num_gpus",
+        "--nproc_per_node",
+        dest="nproc_per_node",
+        default="gpu",
+        help=(
+            "Number of trainer processes to launch on each node. Accepts an integer or torchrun values "
+            "'gpu', 'cpu', and 'auto'. Defaults to 'gpu'."
+        ),
+    )
+    parser.add_argument("--nnodes", default=None, help="Number of nodes to use for distributed training.")
+    parser.add_argument("--node_rank", default=None, help="Rank of this node in a multi-node job.")
+    parser.add_argument("--master_addr", default=None, help="Master node address for static rendezvous.")
+    parser.add_argument("--master_port", default=None, help="Master node port for static rendezvous.")
+    parser.add_argument("--rdzv_backend", default=None, help="Rendezvous backend used by torchrun.")
+    parser.add_argument("--rdzv_endpoint", default=None, help="Rendezvous endpoint used by torchrun.")
+    parser.add_argument("--rdzv_id", default=None, help="User-defined rendezvous id used by torchrun.")
+    parser.add_argument("--max_restarts", default=None, help="Maximum worker group restarts before failing.")
+    parser.add_argument("--monitor_interval", default=None, help="Worker monitor interval [s].")
+    parser.add_argument(
+        "--start_method",
+        choices=("spawn", "fork", "forkserver"),
+        default=None,
+        help="Multiprocessing start method used by torchrun.",
+    )
+    parser.add_argument("--role", default=None, help="User-defined worker role used by torchrun.")
+    parser.add_argument("--tee", default=None, help="Tee selected worker stdout/stderr streams.")
+    parser.add_argument("--redirects", default=None, help="Redirect selected worker stdout/stderr streams.")
+    parser.add_argument("--local_ranks_filter", default=None, help="Only show logs from the listed local ranks.")
+    parser.add_argument("--log_dir", default=None, help="Directory used by torchrun for worker logs.")
+    parser.add_argument("--dry_run", action="store_true", help="Print the torchrun command without launching it.")
+
+    args_cli, train_args = parser.parse_known_args(argv)
+    if train_args[:1] == ["--"]:
+        train_args = train_args[1:]
+    return args_cli, train_args
+
+
+def _append_optional_torchrun_arg(command: list[str], args_cli: argparse.Namespace, name: str) -> None:
+    """Append a torchrun argument when it was provided."""
+    value = getattr(args_cli, name)
+    if value is not None:
+        command.extend([f"--{name}", str(value)])
+
+
+def _with_distributed_arg(train_args: list[str]) -> list[str]:
+    """Ensure the selected training library receives the distributed flag.
+
+    The flag is placed ahead of any literal ``--`` separator: argparse treats every token
+    after ``--`` as positional, so a flag appended there would not be parsed as an option.
+    """
+    split = train_args.index("--") if "--" in train_args else len(train_args)
+    options, trailing = train_args[:split], train_args[split:]
+    if "--distributed" in options:
+        return train_args
+    return [*options, "--distributed", *trailing]
+
+
+def _run_torchrun_command(command: list[str]) -> int:
+    """Run torchrun and forward termination signals to the child process.
+
+    Returns:
+        The torchrun exit code. A child killed by signal ``N`` is reported as ``128 + N``
+        (the shell convention) rather than the negative value from ``Popen.wait``.
+    """
+    proc: subprocess.Popen | None = None
+    stop_requested = False
+
+    def _terminate_child(_signum: int, _frame: FrameType | None) -> None:
+        nonlocal stop_requested
+        stop_requested = True
+        if proc is not None:
+            proc.terminate()
+
+    # Install the handlers before spawning so that a signal arriving while the child is
+    # starting is still forwarded instead of orphaning the torchrun process.
+    previous_sigterm = signal.signal(signal.SIGTERM, _terminate_child)
+    previous_sigint = signal.signal(signal.SIGINT, _terminate_child)
+    try:
+        proc = subprocess.Popen(command)
+        if stop_requested:
+            proc.terminate()
+        returncode = proc.wait()
+    finally:
+        signal.signal(signal.SIGTERM, previous_sigterm)
+        signal.signal(signal.SIGINT, previous_sigint)
+
+    # ``Popen`` reports death by signal N as -N, which ``SystemExit`` would wrap to 256 - N.
+    return 128 - returncode if returncode < 0 else returncode
+
+
+def _build_torchrun_command(args_cli: argparse.Namespace, train_args: list[str]) -> list[str]:
+    """Build the torchrun command for multi-GPU training."""
+    command = [
+        sys.executable,
+        "-m",
+        "torch.distributed.run",
+        "--nproc_per_node",
+        str(args_cli.nproc_per_node),
+    ]
+    for name in (
+        "nnodes",
+        "node_rank",
+        "master_addr",
+        "master_port",
+        "rdzv_backend",
+        "rdzv_endpoint",
+        "rdzv_id",
+        "max_restarts",
+        "monitor_interval",
+        "start_method",
+        "role",
+        "tee",
+        "redirects",
+        "local_ranks_filter",
+        "log_dir",
+    ):
+        _append_optional_torchrun_arg(command, args_cli, name)
+
+    command.extend(
+        [
+            str(TRAIN_SCRIPT),
+            "--rl_library",
+            args_cli.rl_library,
+            *_with_distributed_arg(train_args),
+        ]
+    )
+    return command
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Launch multi-GPU training with ``torch.distributed.run``."""
+    if argv is None:
+        argv = sys.argv[1:]
+
+    args_cli, train_args = _parse_args(argv)
+    command = _build_torchrun_command(args_cli, train_args)
+
+    if args_cli.dry_run:
+        print(shlex.join(command))
+        return 0
+
+    print(f"[INFO] Launching distributed training with: {shlex.join(command)}", flush=True)
+    return _run_torchrun_command(command)
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/source/isaaclab/changelog.d/train-multigpu-entrypoint.rst b/source/isaaclab/changelog.d/train-multigpu-entrypoint.rst
new file mode 100644
index 00000000000..4608f06c617
--- /dev/null
+++ b/source/isaaclab/changelog.d/train-multigpu-entrypoint.rst
@@ -0,0 +1,5 @@
+Added
+^^^^^
+
+* Added the ``train_multigpu`` entry point for launching distributed RL training with
+  ``torch.distributed.run``.
diff --git a/source/isaaclab/isaaclab/cli/__init__.py b/source/isaaclab/isaaclab/cli/__init__.py
index 0d71d4e48bc..1253f67103a 100644
--- a/source/isaaclab/isaaclab/cli/__init__.py
+++ b/source/isaaclab/isaaclab/cli/__init__.py
@@ -37,6 +37,15 @@ def train(args: list[str] | None = None) -> None:
     run_python_command(ISAACLAB_ROOT / "scripts" / "reinforcement_learning" / "train.py", args, check=True)
 
 
+def train_multigpu(args: list[str] | None = None) -> None:
+    """Run the unified multi-GPU reinforcement learning training script."""
+    if args is None:
+        args = sys.argv[1:]
+    run_python_command(
+        ISAACLAB_ROOT / "scripts" / "reinforcement_learning" / "train_multigpu.py", args, check=True
+    )
+
+
 def play(args: list[str] | None = None) -> None:
     """Run the unified reinforcement learning play script."""
     if args is None:
@@ -49,6 +58,9 @@ def cli() -> None:
     if len(sys.argv) > 1 and sys.argv[1] == "train":
         train(sys.argv[2:])
         return
+    if len(sys.argv) > 1 and sys.argv[1] == "train_multigpu":
+        train_multigpu(sys.argv[2:])
+        return
     if len(sys.argv) > 1 and sys.argv[1] == "play":
         play(sys.argv[2:])
         return
@@ -61,8 +73,9 @@ def cli() -> None:
         formatter_class=argparse.RawTextHelpFormatter,
         epilog=(
             "commands:\n"
-            "  train  Run scripts/reinforcement_learning/train.py\n"
-            "  play   Run scripts/reinforcement_learning/play.py"
+            "  train           Run scripts/reinforcement_learning/train.py\n"
+            "  train_multigpu  Run scripts/reinforcement_learning/train_multigpu.py\n"
+            "  play            Run scripts/reinforcement_learning/play.py"
         ),
     )
 
diff --git a/source/isaaclab/isaaclab/cli/utils.py b/source/isaaclab/isaaclab/cli/utils.py
index c7de00d9656..211930b2b46 100644
--- a/source/isaaclab/isaaclab/cli/utils.py
+++ b/source/isaaclab/isaaclab/cli/utils.py
@@ -20,6 +20,7 @@
 # Short script names supported by ``isaaclab -p``.
 _PYTHON_SCRIPT_ALIASES = {
     "train.py": ISAACLAB_ROOT / "scripts" / "reinforcement_learning" / "train.py",
+    "train_multigpu.py": ISAACLAB_ROOT / "scripts" / "reinforcement_learning" / "train_multigpu.py",
     "play.py": ISAACLAB_ROOT / "scripts" / "reinforcement_learning" / "play.py",
 }
 
diff --git a/source/isaaclab/setup.py b/source/isaaclab/setup.py
index bc64609de3f..3eb70374927 100644
--- a/source/isaaclab/setup.py
+++ b/source/isaaclab/setup.py
@@ -130,6 +130,7 @@
             "isaaclab=isaaclab.cli:cli",
             "play=isaaclab.cli:play",
             "train=isaaclab.cli:train",
+            "train_multigpu=isaaclab.cli:train_multigpu",
         ],
     },
     dependency_links=PYTORCH_INDEX_URL,