Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
72 commits
Select commit Hold shift + click to select a range
968c7bb
Add deterministic execution mode for atomic operations
Apr 10, 2026
d5ef316
Fix deterministic launch edge cases
Apr 10, 2026
5ec9b25
Document deterministic options
Apr 10, 2026
72e8e3f
Add deterministic capture controls
Apr 10, 2026
48e7207
Support composite deterministic atomics
Apr 10, 2026
3170255
Refine deterministic record sizing
Apr 12, 2026
50b7ebc
Fix deterministic cache-hit launches
Apr 12, 2026
cc6720d
Benchmark deterministic graph replay
Apr 12, 2026
5b5038f
Optimize scalar deterministic reduction
Apr 12, 2026
e34a93c
Add deterministic mode levels
Apr 13, 2026
6974788
Support deterministic functions
mmacklin Apr 20, 2026
3d6d4ba
Clean deterministic ABI naming
mmacklin Apr 21, 2026
be06d95
Support deterministic struct targets
eric-heiden May 1, 2026
b010ea0
Address deterministic review comments
eric-heiden May 5, 2026
a925a72
Support deterministic sliced atomics
eric-heiden May 8, 2026
ff873d6
Update deterministic changelog ref
eric-heiden May 8, 2026
75c886e
Resolve deterministic merge conflicts
eric-heiden May 8, 2026
6e6c9e6
Resolve changelog merge conflict
eric-heiden May 8, 2026
8d74aa3
Resolve changelog conflict
eric-heiden May 8, 2026
218af6f
Fix deterministic counter edge cases
eric-heiden May 8, 2026
3833309
Avoid deterministic merge conflicts
eric-heiden May 8, 2026
768c669
Document deterministic helper flow
eric-heiden May 8, 2026
fdf8e8e
Fix deterministic option cache toggles
eric-heiden May 8, 2026
d2f6bd6
Guard integer atomics in counter passes
eric-heiden May 8, 2026
d8ce57d
Clarify deterministic comments
eric-heiden May 8, 2026
a7d8c1f
Address deterministic review cleanup
eric-heiden May 8, 2026
145bf6e
Address deterministic capture edge cases
eric-heiden May 9, 2026
28380ab
Avoid graph replay overflow sync
eric-heiden May 9, 2026
d079bf4
Clean up deterministic review nits
eric-heiden May 9, 2026
381a4aa
Avoid import-time benchmark options
eric-heiden May 9, 2026
c506742
Fix helper counter phase suppression
eric-heiden May 9, 2026
ce4ce4e
Clarify deterministic graph limits
eric-heiden May 9, 2026
85fac0c
Guard deterministic helper stores
eric-heiden May 9, 2026
2367ce1
Regenerate config docs
eric-heiden May 9, 2026
731ed6c
Address deterministic review feedback
eric-heiden May 9, 2026
f7fc45b
Guard deterministic capture limits
eric-heiden May 9, 2026
fe53ed4
Fix deterministic graph workspaces
eric-heiden May 9, 2026
2c38196
Update deterministic limitations doc
eric-heiden May 9, 2026
93a1b6d
Avoid sorting unused scatter records
eric-heiden May 9, 2026
adbb878
Fix deterministic enum test Sonar hotspot
eric-heiden May 9, 2026
a7c9335
Fix block-dependent tile codegen reuse
eric-heiden May 9, 2026
c7f4f96
Clarify deterministic scatter capture limit
eric-heiden May 9, 2026
659876e
Reduce launch benchmark overhead
eric-heiden May 9, 2026
771578a
Tighten deterministic atomic validation
eric-heiden May 9, 2026
ebe317a
Restore direct CUDA launch path
eric-heiden May 9, 2026
e2f5977
Avoid launch-time option resolution
eric-heiden May 9, 2026
4f0bfca
Document deterministic execution
eric-heiden May 9, 2026
803984c
Clarify deterministic counter slots
eric-heiden May 9, 2026
945f829
Reject counter graph capture
eric-heiden May 9, 2026
0b8fe83
Refocus determinism quick start
eric-heiden May 9, 2026
b3a79e0
Clarify boolean deterministic aliases
eric-heiden May 9, 2026
dc17c2c
Fix deterministic counter ASV
eric-heiden May 9, 2026
4b1d244
Fix deterministic benchmark capture
eric-heiden May 9, 2026
5f7ea05
Clarify determinism docs
eric-heiden May 9, 2026
18a1b51
Stabilize deterministic ASV params
eric-heiden May 9, 2026
c09fc8f
Clarify deterministic bool aliases
eric-heiden May 9, 2026
cd882e1
Add manual determinism docs
eric-heiden May 9, 2026
0abd155
Clarify deterministic reduction cost
eric-heiden May 9, 2026
40b670f
Expand determinism performance docs
eric-heiden May 9, 2026
4a153c6
Clarify determinism benchmark timing
eric-heiden May 9, 2026
c4718cc
Optimize deterministic counter writeback
eric-heiden May 10, 2026
edef2aa
Emit block tile strides symbolically
eric-heiden May 10, 2026
2e86921
Support deterministic counters in graphs
eric-heiden May 10, 2026
bc8a3ed
Support indexed deterministic counters
eric-heiden May 10, 2026
53755ea
Fix custom adjoint nondeterministic mode
eric-heiden May 10, 2026
e557947
Address deterministic review comments
eric-heiden May 11, 2026
be260a9
Remove unused deterministic counter export
eric-heiden May 11, 2026
2340d94
Clarify custom adjoint determinism
eric-heiden May 11, 2026
d4f5d83
Fix deterministic counter review nits
eric-heiden May 11, 2026
5d663b8
Guard APIC capture checks
eric-heiden May 11, 2026
39f41fc
Adapt deterministic sync to GitHub base
eric-heiden May 11, 2026
d0284f4
Align APIC capture guard with main
eric-heiden May 11, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
233 changes: 197 additions & 36 deletions CHANGELOG.md

Large diffs are not rendered by default.

239 changes: 234 additions & 5 deletions asv/benchmarks/atomics.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.

"""Benchmarks for atomic operations under high thread contention.
"""Benchmarks for atomic operations and deterministic mode overhead.

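In the max/min contention benchmarks, all threads write to a single output
location (index 0) to maximize contention and measure worst-case atomic
operation performance. The determinism-overhead benchmarks compare normal and
deterministic kernels for scatter-add and counter workloads.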
"""

import inspect
from typing import Any

import numpy as np
Expand All @@ -32,9 +33,21 @@
}

NUM_ELEMENTS = 32 * 1024 * 1024


@wp.kernel
# Element counts swept by the determinism-overhead benchmarks.
DETERMINISTIC_BENCHMARK_SIZES = [64 * 1024, 256 * 1024, 1024 * 1024]

# Feature probe: True only when ``wp.kernel`` accepts a ``deterministic``
# keyword, so this file can still be imported against Warp versions without it.
DETERMINISM_SUPPORTED = "deterministic" in inspect.signature(wp.kernel).parameters

# Values of the ``mode`` benchmark parameter.
DETERMINISTIC_BENCHMARK_MODES = ("normal", "deterministic")

# Keyword arguments for the ``*_deterministic`` kernels. The deterministic
# settings are added only when the probe above succeeds; otherwise those
# kernels are declared with ``enable_backward=False`` alone.
DETERMINISTIC_KERNEL_OPTIONS = {"enable_backward": False}
if DETERMINISM_SUPPORTED:
    DETERMINISTIC_KERNEL_OPTIONS["deterministic"] = True
    DETERMINISTIC_KERNEL_OPTIONS["deterministic_max_records"] = 1
    DETERMINISTIC_KERNEL_OPTIONS["module"] = "unique"


@wp.kernel(enable_backward=False)
def max_kernel(
vals: wp.array(dtype=Any),
out: wp.array(dtype=Any),
Expand All @@ -44,7 +57,7 @@ def max_kernel(
wp.atomic_max(out, 0, val) # All threads contend on out[0]


@wp.kernel
@wp.kernel(enable_backward=False)
def min_kernel(
vals: wp.array(dtype=Any),
out: wp.array(dtype=Any),
Expand All @@ -54,6 +67,60 @@ def min_kernel(
wp.atomic_min(out, 0, val) # All threads contend on out[0]


# Baseline scatter-add: thread ``i`` atomically adds ``vals[i]`` into the
# output element selected by ``indices[i]``.
@wp.kernel(enable_backward=False)
def scatter_add_kernel(
    vals: wp.array(dtype=wp.float32),
    indices: wp.array(dtype=wp.int32),
    out: wp.array(dtype=wp.float32),
):
    i = wp.tid()
    wp.atomic_add(out, indices[i], vals[i])


# Same body as the baseline scatter-add kernel, but declared with
# DETERMINISTIC_KERNEL_OPTIONS so the only difference between the two is the
# set of kernel options being benchmarked.
@wp.kernel(**DETERMINISTIC_KERNEL_OPTIONS)
def scatter_add_kernel_deterministic(
    vals: wp.array(dtype=wp.float32),
    indices: wp.array(dtype=wp.int32),
    out: wp.array(dtype=wp.float32),
):
    i = wp.tid()
    wp.atomic_add(out, indices[i], vals[i])


# Baseline counter/allocator pattern: each thread bumps ``counter[0]`` by one,
# uses the value returned by the atomic as its output slot, and stores its
# input value there.
@wp.kernel(enable_backward=False)
def counter_kernel(
    vals: wp.array(dtype=wp.float32),
    counter: wp.array(dtype=wp.int32),
    out: wp.array(dtype=wp.float32),
):
    i = wp.tid()
    reserved = wp.atomic_add(counter, 0, 1)
    out[reserved] = vals[i]


# Same counter/allocator body as the baseline kernel, declared with
# DETERMINISTIC_KERNEL_OPTIONS so only the kernel options differ between the
# two benchmarked variants.
@wp.kernel(**DETERMINISTIC_KERNEL_OPTIONS)
def counter_kernel_deterministic(
    vals: wp.array(dtype=wp.float32),
    counter: wp.array(dtype=wp.int32),
    out: wp.array(dtype=wp.float32),
):
    i = wp.tid()
    reserved = wp.atomic_add(counter, 0, 1)
    out[reserved] = vals[i]


# Reset helper: every thread clears its own float32 element. Implemented as a
# kernel so the reset can be launched like the benchmarked kernels.
@wp.kernel(enable_backward=False)
def zero_float_array_kernel(out: wp.array(dtype=wp.float32)):
    out[wp.tid()] = 0.0


# Reset helper: every thread clears its own int32 element. Implemented as a
# kernel so the reset can be launched like the benchmarked kernels.
@wp.kernel(enable_backward=False)
def zero_int_array_kernel(out: wp.array(dtype=wp.int32)):
    out[wp.tid()] = 0


class AtomicMax:
"""Benchmark wp.atomic_max() with high thread contention.

Expand Down Expand Up @@ -166,3 +233,165 @@ def time_cuda(self, vals_np_dict, dtype_str):
self.out.zero_()
self.cmd.launch()
wp.synchronize_device(self.device)


class AtomicAddDeterminismOverhead:
    """Measure what deterministic mode costs for accumulating atomic adds.

    The same scatter-add kernel body is timed through the normal atomic-add
    path and through deterministic scatter-sort-reduce, both replayed from a
    CUDA graph. Sweeping a few sizes exposes where deterministic execution
    crosses over. Two destination counts are covered:

    - ``1``: worst-case contention, every thread targets the same output.
    - ``65536``: lower contention, closer to a scatter workload.
    """

    params = (DETERMINISTIC_BENCHMARK_MODES, (1, 65536), tuple(DETERMINISTIC_BENCHMARK_SIZES))
    param_names = ("mode", "num_outputs", "num_elements")

    repeat = 10
    number = 5

    def setup_cache(self):
        """Build the host-side inputs once for all parameter combinations.

        Returns:
            A ``(values, indices)`` pair. ``values`` maps each element count to
            a random float32 array; ``indices`` maps each element count to a
            dict keyed by destination count (``1`` or ``65536``) holding the
            int32 target indices.
        """
        rng = np.random.default_rng(123)
        values_by_size = {size: rng.random(size, dtype=np.float32) for size in DETERMINISTIC_BENCHMARK_SIZES}
        indices_by_size = {
            size: {
                1: np.zeros(size, dtype=np.int32),
                65536: rng.integers(0, 65536, size=size, dtype=np.int32),
            }
            for size in DETERMINISTIC_BENCHMARK_SIZES
        }
        return values_by_size, indices_by_size

    def _enqueue_pass(self, num_outputs, num_elements):
        """Launch one reset of ``self.out`` followed by one scatter-add pass."""
        wp.launch(
            zero_float_array_kernel,
            dim=num_outputs,
            inputs=[self.out],
            device=self.device,
        )
        wp.launch(
            self.kernel,
            (num_elements,),
            inputs=[self.vals, self.indices],
            outputs=[self.out],
            device=self.device,
        )

    def setup(self, cache, mode, num_outputs, num_elements):
        """Upload inputs, capture the reset + kernel pass as a graph, and warm it up."""
        wp.init()
        self.device = wp.get_device("cuda:0")

        deterministic = mode == "deterministic"
        if deterministic and not DETERMINISM_SUPPORTED:
            raise NotImplementedError("deterministic kernel options are not supported by this Warp version")

        values_by_size, indices_by_size = cache
        self.vals = wp.array(values_by_size[num_elements], dtype=wp.float32, device=self.device)
        self.indices = wp.array(indices_by_size[num_elements][num_outputs], dtype=wp.int32, device=self.device)
        self.out = wp.zeros(shape=(num_outputs,), dtype=wp.float32, device=self.device)

        if deterministic:
            self.kernel = scatter_add_kernel_deterministic
        else:
            self.kernel = scatter_add_kernel

        # One eager pass before capture. Presumably this ensures the kernels
        # are already loaded, since the capture below is opened with
        # force_module_load=False -- TODO confirm.
        self._enqueue_pass(num_outputs, num_elements)
        wp.synchronize_device(self.device)

        with wp.ScopedCapture(device=self.device, force_module_load=False) as capture:
            self._enqueue_pass(num_outputs, num_elements)

        self.graph = capture.graph

        # Untimed warm-up replays of the captured graph.
        warmup_replays = 5
        for _ in range(warmup_replays):
            wp.capture_launch(self.graph)
        wp.synchronize_device(self.device)

    def time_cuda(self, cache, mode, num_outputs, num_elements):
        """Time one graph replay (reset + scatter-add) including the device sync."""
        wp.capture_launch(self.graph)
        wp.synchronize_device(self.device)


class AtomicCounterDeterminismOverhead:
    """Benchmark the overhead of deterministic counter/allocator atomics.

    Each thread reserves an output slot with ``wp.atomic_add(counter, 0, 1)``
    and stores its input value in that slot. The timed path uses CUDA graph
    replay and includes resetting the counter and output arrays inside the
    captured graph so the benchmark isolates device work.

    The ``"deterministic"`` mode is skipped when the installed Warp version
    does not accept deterministic kernel options. In that case
    ``counter_kernel_deterministic`` is declared without them, so timing it
    would report the normal path under the wrong label.
    """

    params = (DETERMINISTIC_BENCHMARK_MODES, tuple(DETERMINISTIC_BENCHMARK_SIZES))
    param_names = ("mode", "num_elements")

    repeat = 10
    number = 5

    def setup_cache(self):
        """Build one fixed-seed random float32 input array per element count.

        Returns:
            A dict mapping each entry of ``DETERMINISTIC_BENCHMARK_SIZES`` to a
            NumPy float32 array of that length.
        """
        rng = np.random.default_rng(321)
        return {n: rng.random(n, dtype=np.float32) for n in DETERMINISTIC_BENCHMARK_SIZES}

    def _enqueue_pass(self, num_elements):
        """Launch counter reset, output reset, and one counter-kernel pass."""
        wp.launch(
            zero_int_array_kernel,
            dim=1,
            inputs=[self.counter],
            device=self.device,
        )
        wp.launch(
            zero_float_array_kernel,
            dim=num_elements,
            inputs=[self.out],
            device=self.device,
        )
        wp.launch(
            self.kernel,
            (num_elements,),
            inputs=[self.vals, self.counter],
            outputs=[self.out],
            device=self.device,
        )

    def setup(self, vals_np, mode, num_elements):
        """Upload inputs, capture the reset + kernel pass as a graph, and warm it up.

        Args:
            vals_np: Mapping from element count to host input array, as
                returned by ``setup_cache``.
            mode: ``"normal"`` or ``"deterministic"``.
            num_elements: Number of threads / input values for this run.

        Raises:
            NotImplementedError: If ``mode`` is ``"deterministic"`` but this
                Warp version lacks deterministic kernel options; asv reports
                the parameter combination as skipped.
        """
        wp.init()
        self.device = wp.get_device("cuda:0")

        # Same guard as AtomicAddDeterminismOverhead.setup: without it the
        # "deterministic" rows would silently time an ordinary kernel.
        if mode == "deterministic" and not DETERMINISM_SUPPORTED:
            raise NotImplementedError("deterministic kernel options are not supported by this Warp version")

        self.vals = wp.array(vals_np[num_elements], dtype=wp.float32, device=self.device)
        self.counter = wp.zeros(shape=(1,), dtype=wp.int32, device=self.device)
        self.out = wp.zeros(shape=(num_elements,), dtype=wp.float32, device=self.device)

        self.kernel = counter_kernel_deterministic if mode == "deterministic" else counter_kernel

        # One eager pass before capture. Presumably this ensures the kernels
        # are already loaded, since the capture below is opened with
        # force_module_load=False -- TODO confirm.
        self._enqueue_pass(num_elements)
        wp.synchronize_device(self.device)

        with wp.ScopedCapture(device=self.device, force_module_load=False) as capture:
            self._enqueue_pass(num_elements)

        self.graph = capture.graph

        # Untimed warm-up replays of the captured graph.
        for _ in range(5):
            wp.capture_launch(self.graph)
        wp.synchronize_device(self.device)

    def time_cuda(self, vals_np, mode, num_elements):
        """Time one graph replay (resets + counter kernel) including the device sync."""
        wp.capture_launch(self.graph)
        wp.synchronize_device(self.device)
2 changes: 2 additions & 0 deletions build_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -522,6 +522,7 @@ def main(argv: list[str] | None = None) -> int:
"native/texture.cpp",
"native/mathdx.cpp",
"native/coloring.cpp",
"native/deterministic.cpp",
"native/fastcall.cpp",
]
warp_cpp_paths = [os.path.join(build_path, cpp) for cpp in cpp_sources]
Expand All @@ -533,6 +534,7 @@ def main(argv: list[str] | None = None) -> int:
else:
cuda_sources = [
"native/bvh.cu",
"native/deterministic.cu",
"native/mesh.cu",
"native/sort.cu",
"native/hashgrid.cu",
Expand Down
Loading