NVIDIA · mmacklin · Apr 10, 2026 · Apr 10, 2026 · Apr 10, 2026 · Apr 10, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
diff --git a/asv/benchmarks/atomics.py b/asv/benchmarks/atomics.py
@@ -13,12 +13,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Benchmarks for atomic operations under high thread contention.
+"""Benchmarks for atomic operations and deterministic mode overhead.
 
 All threads write to a single output location (index 0) to maximize contention
 and measure worst-case atomic operation performance.
 """
 
+import inspect
 from typing import Any
 
 import numpy as np
@@ -32,9 +33,21 @@
 }
 
 NUM_ELEMENTS = 32 * 1024 * 1024
-
-
-@wp.kernel
+# Element counts swept by the deterministic-overhead benchmarks below.
+DETERMINISTIC_BENCHMARK_SIZES = [64 * 1024, 256 * 1024, 1024 * 1024]
+# True when wp.kernel's signature exposes a "deterministic" parameter.
+DETERMINISM_SUPPORTED = "deterministic" in inspect.signature(wp.kernel).parameters
+# Values of the "mode" benchmark parameter.
+DETERMINISTIC_BENCHMARK_MODES = ("normal", "deterministic")
+# Keyword arguments passed to wp.kernel for the *_deterministic kernels. The
+# deterministic-specific options are only added when wp.kernel accepts them;
+# otherwise these kernels are built with enable_backward=False alone
+# (presumably so this module still imports on Warp versions without the option).
+DETERMINISTIC_KERNEL_OPTIONS = {"enable_backward": False}
+if DETERMINISM_SUPPORTED:
+    DETERMINISTIC_KERNEL_OPTIONS.update(
+        {
+            "deterministic": True,
+            "deterministic_max_records": 1,
+            "module": "unique",
+        }
+    )
+
+
+@wp.kernel(enable_backward=False)
 def max_kernel(
     vals: wp.array(dtype=Any),
     out: wp.array(dtype=Any),
@@ -44,7 +57,7 @@ def max_kernel(
     wp.atomic_max(out, 0, val)  # All threads contend on out[0]
 
 
-@wp.kernel
+@wp.kernel(enable_backward=False)
 def min_kernel(
     vals: wp.array(dtype=Any),
     out: wp.array(dtype=Any),
@@ -54,6 +67,60 @@ def min_kernel(
     wp.atomic_min(out, 0, val)  # All threads contend on out[0]
 
 
+# Baseline scatter-add: each thread atomically adds vals[tid] into
+# out[indices[tid]]. Built without the deterministic kernel options.
+@wp.kernel(enable_backward=False)
+def scatter_add_kernel(
+    vals: wp.array(dtype=wp.float32),
+    indices: wp.array(dtype=wp.int32),
+    out: wp.array(dtype=wp.float32),
+):
+    tid = wp.tid()
+    wp.atomic_add(out, indices[tid], vals[tid])
+
+
+# Same scatter-add body as the baseline kernel, but decorated with
+# DETERMINISTIC_KERNEL_OPTIONS. Those options only include the deterministic
+# settings when DETERMINISM_SUPPORTED is true.
+@wp.kernel(**DETERMINISTIC_KERNEL_OPTIONS)
+def scatter_add_kernel_deterministic(
+    vals: wp.array(dtype=wp.float32),
+    indices: wp.array(dtype=wp.int32),
+    out: wp.array(dtype=wp.float32),
+):
+    tid = wp.tid()
+    wp.atomic_add(out, indices[tid], vals[tid])
+
+
+# Baseline counter/allocator pattern: each thread atomically adds 1 to
+# counter[0] and uses the value returned by that atomic as the index in `out`
+# at which it stores vals[tid].
+@wp.kernel(enable_backward=False)
+def counter_kernel(
+    vals: wp.array(dtype=wp.float32),
+    counter: wp.array(dtype=wp.int32),
+    out: wp.array(dtype=wp.float32),
+):
+    tid = wp.tid()
+    slot = wp.atomic_add(counter, 0, 1)
+    out[slot] = vals[tid]
+
+
+# Same counter/allocator body as the baseline kernel, but decorated with
+# DETERMINISTIC_KERNEL_OPTIONS. Those options only include the deterministic
+# settings when DETERMINISM_SUPPORTED is true.
+@wp.kernel(**DETERMINISTIC_KERNEL_OPTIONS)
+def counter_kernel_deterministic(
+    vals: wp.array(dtype=wp.float32),
+    counter: wp.array(dtype=wp.int32),
+    out: wp.array(dtype=wp.float32),
+):
+    tid = wp.tid()
+    slot = wp.atomic_add(counter, 0, 1)
+    out[slot] = vals[tid]
+
+
+# Writes 0.0 to out[tid]; the benchmarks launch it to reset float32 output
+# arrays, including inside the captured CUDA graph.
+@wp.kernel(enable_backward=False)
+def zero_float_array_kernel(out: wp.array(dtype=wp.float32)):
+    tid = wp.tid()
+    out[tid] = 0.0
+
+
+# Writes 0 to out[tid]; the counter benchmark launches it to reset the int32
+# counter array, including inside the captured CUDA graph.
+@wp.kernel(enable_backward=False)
+def zero_int_array_kernel(out: wp.array(dtype=wp.int32)):
+    tid = wp.tid()
+    out[tid] = 0
+
+
 class AtomicMax:
     """Benchmark wp.atomic_max() with high thread contention.
 
@@ -166,3 +233,165 @@ def time_cuda(self, vals_np_dict, dtype_str):
         self.out.zero_()
         self.cmd.launch()
         wp.synchronize_device(self.device)
+
+
+class AtomicAddDeterminismOverhead:
+    """Measure what deterministic mode costs for accumulating ``wp.atomic_add``.
+
+    Every timed sample replays a CUDA graph that zeroes the output array and
+    then runs the scatter-add kernel, in either its normal or its
+    deterministic variant. The sweep covers a few element counts and two
+    output sizes:
+
+    - ``1``: every thread adds into the same output element.
+    - ``65536``: destination indices are drawn at random from ``[0, 65536)``.
+    """
+
+    params = (
+        DETERMINISTIC_BENCHMARK_MODES,
+        (1, 65536),
+        tuple(DETERMINISTIC_BENCHMARK_SIZES),
+    )
+    param_names = ("mode", "num_outputs", "num_elements")
+
+    repeat = 10
+    number = 5
+
+    def setup_cache(self):
+        """Build the host-side values and destination indices for every size."""
+        rng = np.random.default_rng(123)
+
+        # All value arrays are drawn before any index array so the random
+        # stream is consumed in a fixed order.
+        vals_np = {}
+        for count in DETERMINISTIC_BENCHMARK_SIZES:
+            vals_np[count] = rng.random(count, dtype=np.float32)
+
+        indices_np = {
+            count: {
+                1: np.zeros(count, dtype=np.int32),
+                65536: rng.integers(0, 65536, size=count, dtype=np.int32),
+            }
+            for count in DETERMINISTIC_BENCHMARK_SIZES
+        }
+        return vals_np, indices_np
+
+    def _enqueue_pass(self, num_outputs, num_elements):
+        """Launch one reset of ``self.out`` followed by one scatter-add."""
+        wp.launch(
+            zero_float_array_kernel,
+            dim=num_outputs,
+            inputs=[self.out],
+            device=self.device,
+        )
+        wp.launch(
+            self.kernel,
+            (num_elements,),
+            inputs=[self.vals, self.indices],
+            outputs=[self.out],
+            device=self.device,
+        )
+
+    def setup(self, cache, mode, num_outputs, num_elements):
+        """Upload the inputs, run one eager pass, then capture and warm up the graph."""
+        wp.init()
+        self.device = wp.get_device("cuda:0")
+
+        deterministic = mode == "deterministic"
+        if deterministic and not DETERMINISM_SUPPORTED:
+            raise NotImplementedError("deterministic kernel options are not supported by this Warp version")
+
+        host_vals, host_indices = cache
+        self.vals = wp.array(host_vals[num_elements], dtype=wp.float32, device=self.device)
+        self.indices = wp.array(host_indices[num_elements][num_outputs], dtype=wp.int32, device=self.device)
+        self.out = wp.zeros(shape=(num_outputs,), dtype=wp.float32, device=self.device)
+
+        if deterministic:
+            self.kernel = scatter_add_kernel_deterministic
+        else:
+            self.kernel = scatter_add_kernel
+
+        # One eager pass ahead of the capture, which is opened with
+        # force_module_load=False (presumably so the kernels are already
+        # loaded when capture begins).
+        self._enqueue_pass(num_outputs, num_elements)
+        wp.synchronize_device(self.device)
+
+        with wp.ScopedCapture(device=self.device, force_module_load=False) as capture:
+            self._enqueue_pass(num_outputs, num_elements)
+        self.graph = capture.graph
+
+        # Replay a few times so the timed runs do not include first-launch cost.
+        for _ in range(5):
+            wp.capture_launch(self.graph)
+        wp.synchronize_device(self.device)
+
+    def time_cuda(self, cache, mode, num_outputs, num_elements):
+        """Time one replay of the captured graph, including the device sync."""
+        wp.capture_launch(self.graph)
+        wp.synchronize_device(self.device)
+
+
+class AtomicCounterDeterminismOverhead:
+    """Benchmark the overhead of deterministic counter/allocator atomics.
+
+    The timed path uses CUDA graph replay and includes resetting the output
+    state inside the captured graph so the benchmark isolates device work.
+
+    The ``"deterministic"`` mode is skipped (``setup`` raises
+    ``NotImplementedError``) when ``wp.kernel`` does not accept the
+    deterministic options, because in that case
+    ``counter_kernel_deterministic`` is built like an ordinary kernel and
+    would be reported under a misleading label.
+    """
+
+    params = (DETERMINISTIC_BENCHMARK_MODES, tuple(DETERMINISTIC_BENCHMARK_SIZES))
+    param_names = ("mode", "num_elements")
+
+    repeat = 10
+    number = 5
+
+    def setup_cache(self):
+        """Return a dict mapping each benchmark size to random float32 values."""
+        rng = np.random.default_rng(321)
+        return {n: rng.random(n, dtype=np.float32) for n in DETERMINISTIC_BENCHMARK_SIZES}
+
+    def setup(self, vals_np, mode, num_elements):
+        """Upload inputs, run one eager pass, then capture and warm up the graph.
+
+        Raises:
+            NotImplementedError: If ``mode`` is ``"deterministic"`` but this
+                Warp version does not support deterministic kernel options.
+        """
+        wp.init()
+        self.device = wp.get_device("cuda:0")
+
+        # Same guard as AtomicAddDeterminismOverhead.setup: without it the
+        # "deterministic" parameter would silently time a kernel built with
+        # only enable_backward=False.
+        if mode == "deterministic" and not DETERMINISM_SUPPORTED:
+            raise NotImplementedError("deterministic kernel options are not supported by this Warp version")
+
+        self.vals = wp.array(vals_np[num_elements], dtype=wp.float32, device=self.device)
+        self.counter = wp.zeros(shape=(1,), dtype=wp.int32, device=self.device)
+        self.out = wp.zeros(shape=(num_elements,), dtype=wp.float32, device=self.device)
+
+        self.kernel = counter_kernel_deterministic if mode == "deterministic" else counter_kernel
+
+        # One eager pass ahead of the capture, which is opened with
+        # force_module_load=False (presumably so the kernels are already
+        # loaded when capture begins).
+        wp.launch(
+            zero_int_array_kernel,
+            dim=1,
+            inputs=[self.counter],
+            device=self.device,
+        )
+        wp.launch(
+            zero_float_array_kernel,
+            dim=num_elements,
+            inputs=[self.out],
+            device=self.device,
+        )
+        wp.launch(
+            self.kernel,
+            (num_elements,),
+            inputs=[self.vals, self.counter],
+            outputs=[self.out],
+            device=self.device,
+        )
+        wp.synchronize_device(self.device)
+
+        # The captured graph resets the counter and the output before the
+        # counter kernel, so every replay starts from the same state.
+        with wp.ScopedCapture(device=self.device, force_module_load=False) as capture:
+            wp.launch(
+                zero_int_array_kernel,
+                dim=1,
+                inputs=[self.counter],
+                device=self.device,
+            )
+            wp.launch(
+                zero_float_array_kernel,
+                dim=num_elements,
+                inputs=[self.out],
+                device=self.device,
+            )
+            wp.launch(
+                self.kernel,
+                (num_elements,),
+                inputs=[self.vals, self.counter],
+                outputs=[self.out],
+                device=self.device,
+            )
+
+        self.graph = capture.graph
+
+        # Replay a few times so the timed runs do not include first-launch cost.
+        for _ in range(5):
+            wp.capture_launch(self.graph)
+        wp.synchronize_device(self.device)
+
+    def time_cuda(self, vals_np, mode, num_elements):
+        """Time one replay of the captured graph, including the device sync."""
+        wp.capture_launch(self.graph)
+        wp.synchronize_device(self.device)
diff --git a/build_lib.py b/build_lib.py
@@ -522,6 +522,7 @@ def main(argv: list[str] | None = None) -> int:
             "native/texture.cpp",
             "native/mathdx.cpp",
             "native/coloring.cpp",
+            "native/deterministic.cpp",
             "native/fastcall.cpp",
         ]
         warp_cpp_paths = [os.path.join(build_path, cpp) for cpp in cpp_sources]
@@ -533,6 +534,7 @@ def main(argv: list[str] | None = None) -> int:
         else:
             cuda_sources = [
                 "native/bvh.cu",
+                "native/deterministic.cu",
                 "native/mesh.cu",
                 "native/sort.cu",
                 "native/hashgrid.cu",