From 1c8ac8ae4e62eb60a32bf0d068013b389db0c3c6 Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Mon, 3 Nov 2025 11:21:21 -0800 Subject: [PATCH 1/3] Remove the `--profile` option Instead, check whether the script is under nsys via `NSYS_PROFILING_SESSION_ID`. Note that it's still possible to profile warmup iterations -- just don't specify `--capture-range=cudaProfilerApi` in the `nsys` command. --- thunder/benchmarks/benchmark_inference.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/thunder/benchmarks/benchmark_inference.py b/thunder/benchmarks/benchmark_inference.py index e7dd90361a..93adfe5b9f 100644 --- a/thunder/benchmarks/benchmark_inference.py +++ b/thunder/benchmarks/benchmark_inference.py @@ -155,7 +155,6 @@ class InferenceBenchmarkConfig: mode: str disable_moe_replacement: bool attn_implementation: str | None - profile: bool @dataclass @@ -484,10 +483,11 @@ def run_benchmark(self) -> InferenceMetrics: for _ in tqdm(range(self.config.num_iterations), disable=LOCAL_RANK != 0): past_key_values.reset() - if self.config.profile: + is_under_nsys = bool(os.environ.get("NSYS_PROFILING_SESSION_ID")) + if is_under_nsys: torch.cuda.cudart().cudaProfilerStart() iter_metrics = self.measure_inference_step(input_ids, past_key_values, self.config.output_length) - if self.config.profile: + if is_under_nsys: torch.cuda.cudart().cudaProfilerStop() all_metrics.append(iter_metrics) @@ -671,11 +671,6 @@ def parse_args() -> argparse.Namespace: action="store_true", help="let nvfuser take care of linear and matmul, note that this might fail with distributed run. See: https://github.com/NVIDIA/Fuser/issues/4507", ) - parser.add_argument( - "--profile", - action="store_true", - help="Wrap each non-warmup iteration with cudaProfilerStart() and cudaProfilerStop(). This allows us to run `nsys profile --capture-range=cudaProfilerApi --capture-range-end=repeat: ... 
--profile` to record only the non-warmup iterations.", - ) parser.add_argument("--save-results", action="store_true", help="Save results to JSON file") parser.add_argument("--output-dir", type=str, default="./results", help="Directory to save results") @@ -712,7 +707,6 @@ def main(): enable_nv_linear=args.enable_nv_linear, disable_moe_replacement=args.disable_moe_replacement, attn_implementation=args.attn_implementation, - profile=args.profile, ) benchmark = InferenceBenchmark(config) From b22e297074f2f6c0f54f888aea7d2e6d4e99f31e Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Mon, 3 Nov 2025 11:28:03 -0800 Subject: [PATCH 2/3] Comment --- thunder/benchmarks/benchmark_inference.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/thunder/benchmarks/benchmark_inference.py b/thunder/benchmarks/benchmark_inference.py index 93adfe5b9f..1be483ee96 100644 --- a/thunder/benchmarks/benchmark_inference.py +++ b/thunder/benchmarks/benchmark_inference.py @@ -484,6 +484,10 @@ def run_benchmark(self) -> InferenceMetrics: past_key_values.reset() is_under_nsys = bool(os.environ.get("NSYS_PROFILING_SESSION_ID")) + # Wrap each non-warmup iteration with cudaProfilerStart() and + # cudaProfilerStop(). This allows the user to run `nsys profile + # --capture-range=cudaProfilerApi --capture-range-end=repeat: + # ... --profile` to record only the non-warmup iterations. 
if is_under_nsys: torch.cuda.cudart().cudaProfilerStart() iter_metrics = self.measure_inference_step(input_ids, past_key_values, self.config.output_length) From cff6d6db194633a3dc3d8a6229ac34f1b36678d4 Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Tue, 4 Nov 2025 09:20:03 -0800 Subject: [PATCH 3/3] Fix comment --- thunder/benchmarks/benchmark_inference.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/thunder/benchmarks/benchmark_inference.py b/thunder/benchmarks/benchmark_inference.py index 09e1bdf461..083b8dbcc0 100644 --- a/thunder/benchmarks/benchmark_inference.py +++ b/thunder/benchmarks/benchmark_inference.py @@ -494,9 +494,11 @@ def run_benchmark(self) -> InferenceMetrics: is_under_nsys = bool(os.environ.get("NSYS_PROFILING_SESSION_ID")) # Wrap each non-warmup iteration with cudaProfilerStart() and - # cudaProfilerStop(). This allows the user to run `nsys profile - # --capture-range=cudaProfilerApi --capture-range-end=repeat: - # ... --profile` to record only the non-warmup iterations. + # cudaProfilerStop(). This allows the user to run + # ```shell + # nsys profile --capture-range=cudaProfilerApi --capture-range-end=repeat: ... + # ``` + # to record only the non-warmup iterations. if is_under_nsys: torch.cuda.cudart().cudaProfilerStart() iter_metrics = self.measure_inference_step(input_ids, past_key_values, self.config.output_length)