From 1c8ac8ae4e62eb60a32bf0d068013b389db0c3c6 Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Mon, 3 Nov 2025 11:21:21 -0800 Subject: [PATCH 1/3] Remove the `--profile` option Instead, check whether the script is under nsys via `NSYS_PROFILING_SESSION_ID`. Note that it's still possible to profile warmup iterations -- just don't specify `--capture-range=cudaProfilerApi` in the `nsys` command. --- thunder/benchmarks/benchmark_inference.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/thunder/benchmarks/benchmark_inference.py b/thunder/benchmarks/benchmark_inference.py index e7dd90361a..93adfe5b9f 100644 --- a/thunder/benchmarks/benchmark_inference.py +++ b/thunder/benchmarks/benchmark_inference.py @@ -155,7 +155,6 @@ class InferenceBenchmarkConfig: mode: str disable_moe_replacement: bool attn_implementation: str | None - profile: bool @dataclass @@ -484,10 +483,11 @@ def run_benchmark(self) -> InferenceMetrics: for _ in tqdm(range(self.config.num_iterations), disable=LOCAL_RANK != 0): past_key_values.reset() - if self.config.profile: + is_under_nsys = bool(os.environ.get("NSYS_PROFILING_SESSION_ID")) + if is_under_nsys: torch.cuda.cudart().cudaProfilerStart() iter_metrics = self.measure_inference_step(input_ids, past_key_values, self.config.output_length) - if self.config.profile: + if is_under_nsys: torch.cuda.cudart().cudaProfilerStop() all_metrics.append(iter_metrics) @@ -671,11 +671,6 @@ def parse_args() -> argparse.Namespace: action="store_true", help="let nvfuser take care of linear and matmul, note that this might fail with distributed run. See: https://github.com/NVIDIA/Fuser/issues/4507", ) - parser.add_argument( - "--profile", - action="store_true", - help="Wrap each non-warmup iteration with cudaProfilerStart() and cudaProfilerStop(). This allows us to run `nsys profile --capture-range=cudaProfilerApi --capture-range-end=repeat: ... 
--profile` to record only the non-warmup iterations.", - ) parser.add_argument("--save-results", action="store_true", help="Save results to JSON file") parser.add_argument("--output-dir", type=str, default="./results", help="Directory to save results") @@ -712,7 +707,6 @@ def main(): enable_nv_linear=args.enable_nv_linear, disable_moe_replacement=args.disable_moe_replacement, attn_implementation=args.attn_implementation, - profile=args.profile, ) benchmark = InferenceBenchmark(config) From b22e297074f2f6c0f54f888aea7d2e6d4e99f31e Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Mon, 3 Nov 2025 11:28:03 -0800 Subject: [PATCH 2/3] Comment --- thunder/benchmarks/benchmark_inference.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/thunder/benchmarks/benchmark_inference.py b/thunder/benchmarks/benchmark_inference.py index 93adfe5b9f..1be483ee96 100644 --- a/thunder/benchmarks/benchmark_inference.py +++ b/thunder/benchmarks/benchmark_inference.py @@ -484,6 +484,10 @@ def run_benchmark(self) -> InferenceMetrics: past_key_values.reset() is_under_nsys = bool(os.environ.get("NSYS_PROFILING_SESSION_ID")) + # Wrap each non-warmup iteration with cudaProfilerStart() and + # cudaProfilerStop(). This allows the user to run `nsys profile + # --capture-range=cudaProfilerApi --capture-range-end=repeat: + # ... --profile` to record only the non-warmup iterations. 
if is_under_nsys: torch.cuda.cudart().cudaProfilerStart() iter_metrics = self.measure_inference_step(input_ids, past_key_values, self.config.output_length) From cff6d6db194633a3dc3d8a6229ac34f1b36678d4 Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Tue, 4 Nov 2025 09:20:03 -0800 Subject: [PATCH 3/3] Fix comment --- thunder/benchmarks/benchmark_inference.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/thunder/benchmarks/benchmark_inference.py b/thunder/benchmarks/benchmark_inference.py index 09e1bdf461..083b8dbcc0 100644 --- a/thunder/benchmarks/benchmark_inference.py +++ b/thunder/benchmarks/benchmark_inference.py @@ -494,9 +494,11 @@ def run_benchmark(self) -> InferenceMetrics: is_under_nsys = bool(os.environ.get("NSYS_PROFILING_SESSION_ID")) # Wrap each non-warmup iteration with cudaProfilerStart() and - # cudaProfilerStop(). This allows the user to run `nsys profile - # --capture-range=cudaProfilerApi --capture-range-end=repeat: - # ... --profile` to record only the non-warmup iterations. + # cudaProfilerStop(). This allows the user to run + # ```shell + # nsys profile --capture-range=cudaProfilerApi --capture-range-end=repeat: ... + # ``` + # to record only the non-warmup iterations. if is_under_nsys: torch.cuda.cudart().cudaProfilerStart() iter_metrics = self.measure_inference_step(input_ids, past_key_values, self.config.output_length)