CliMA · imreddyTeja · May 4, 2026 · May 4, 2026 · Apr 13, 2026
diff --git a/ext/cuda/data_layouts_copyto.jl b/ext/cuda/data_layouts_copyto.jl
@@ -52,14 +52,9 @@ if VERSION ≥ v"1.11.0-beta"
             else
                 cartesian_indices_mask(us, mask)
             end
-            args = (dest, bc, us, mask, cart_inds)
-            threads = threads_via_occupancy(knl_copyto!, args)
-            n_max_threads = min(threads, get_N(us))
-            p = if mask isa NoMask
-                linear_partition(prod(size(dest)), n_max_threads)
-            else
-                masked_partition(mask, n_max_threads, us)
-            end
+            args = cudaconvert((dest, bc, us, mask, cart_inds))
+            nitems = length(cart_inds)
+            p = config_via_occupancy(knl_copyto!, nitems, args)
             auto_launch!(
                 knl_copyto!,
                 args;
@@ -81,10 +76,9 @@ else
                 bc′ = Base.Broadcast.instantiate(
                     DataLayouts.to_non_extruded_broadcasted(bc),
                 )
-                args = (dest, bc′, us)
-                threads = threads_via_occupancy(knl_copyto_linear!, args)
-                n_max_threads = min(threads, get_N(us))
-                p = linear_partition(prod(size(dest)), n_max_threads)
+                args = cudaconvert((dest, bc′, us))
+                nitems = prod(size(dest))
+                p = config_via_occupancy(knl_copyto_linear!, nitems, args)
                 auto_launch!(
                     knl_copyto_linear!,
                     args;
@@ -97,14 +91,9 @@ else
                 else
                     cartesian_indices_mask(us, mask)
                 end
-                args = (dest, bc, us, mask, cart_inds)
-                threads = threads_via_occupancy(knl_copyto!, args)
-                n_max_threads = min(threads, get_N(us))
-                p = if mask isa NoMask
-                    linear_partition(prod(size(dest)), n_max_threads)
-                else
-                    masked_partition(mask, n_max_threads, us)
-                end
+                args = cudaconvert((dest, bc, us, mask, cart_inds))
+                nitems = length(cart_inds)
+                p = config_via_occupancy(knl_copyto!, nitems, args)
                 auto_launch!(
                     knl_copyto!,
                     args;
@@ -189,10 +178,8 @@ function DataLayouts.copyto_per_field!(
     # leverage linear indexing:
     nitems = prod(size(array))
     N = prod(size(array))
-    args = (array, bc′, N)
-    threads = threads_via_occupancy(copyto_per_field_kernel!, args)
-    n_max_threads = min(threads, nitems)
-    p = linear_partition(nitems, n_max_threads)
+    args = cudaconvert((array, bc′, N))
+    p = config_via_occupancy(copyto_per_field_kernel!, nitems, args)
     auto_launch!(
         copyto_per_field_kernel!,
         args;
@@ -225,10 +212,8 @@ function DataLayouts.copyto_per_field_scalar!(
     # leverage linear indexing:
     nitems = prod(size(array))
     N = prod(size(array))
-    args = (array, bc′, N)
-    threads = threads_via_occupancy(copyto_per_field_kernel_0D!, args)
-    n_max_threads = min(threads, nitems)
-    p = linear_partition(nitems, n_max_threads)
+    args = cudaconvert((array, bc′, N))
+    p = config_via_occupancy(copyto_per_field_kernel_0D!, nitems, args)
     auto_launch!(
         copyto_per_field_kernel_0D!,
         args;
@@ -249,10 +234,8 @@ function DataLayouts.copyto_per_field_scalar!(
     # leverage linear indexing:
     nitems = prod(size(array))
     N = prod(size(array))
-    args = (array, bc′, N)
-    threads = threads_via_occupancy(copyto_per_field_kernel_0D!, args)
-    n_max_threads = min(threads, nitems)
-    p = linear_partition(nitems, n_max_threads)
+    args = cudaconvert((array, bc′, N))
+    p = config_via_occupancy(copyto_per_field_kernel_0D!, nitems, args)
     auto_launch!(
         copyto_per_field_kernel_0D!,
         args;

diff --git a/ext/cuda/data_layouts_fill.jl b/ext/cuda/data_layouts_fill.jl
@@ -26,7 +26,7 @@ function Base.fill!(dest::AbstractData, bc, to::ToCUDA, mask = NoMask())
         if !(VERSION ≥ v"1.11.0-beta") &&
            dest isa DataLayouts.EndsWithField &&
            mask isa NoMask
-            args = (dest, bc, us)
+            args = cudaconvert((dest, bc, us))
             threads = threads_via_occupancy(knl_fill_linear!, args)
             n_max_threads = min(threads, get_N(us))
             p = linear_partition(prod(size(dest)), n_max_threads)
@@ -42,7 +42,7 @@ function Base.fill!(dest::AbstractData, bc, to::ToCUDA, mask = NoMask())
             else
                 cartesian_indices_mask(us, mask)
             end
-            args = (dest, bc, us, mask, cart_inds)
+            args = cudaconvert((dest, bc, us, mask, cart_inds))
             threads = threads_via_occupancy(knl_fill!, args)
             n_max_threads = min(threads, get_N(us))
             p = if mask isa NoMask

diff --git a/ext/cuda/operators_finite_difference.jl b/ext/cuda/operators_finite_difference.jl
@@ -111,15 +111,15 @@ function Base.copyto!(
             cartesian_indices_mask(us, mask)
         end
 
-        args = (
+        args = cudaconvert((
             strip_space(out, space),
             strip_space(bc′, space),
             axes(out),
             bounds,
             us,
             mask,
             cart_inds,
-        )
+        ))
 
         threads = threads_via_occupancy(copyto_stencil_kernel!, args)
         n_max_threads = min(threads, get_N(us))

diff --git a/test/gpu/latency_benchmarks.jl b/test/gpu/latency_benchmarks.jl
@@ -27,25 +27,25 @@ import LazyBroadcast: lazy
     scalar_field_2 = fill(1.0f0, space)
     # basic expression
     # intentionally benchmark without a sync between each trial
-    # CUDA.synchronize()
-    latency = median(@benchmark $scalar_field_1 .= $scalar_field_1 .+ $scalar_field_2).time
+    CUDA.synchronize()
+    latency = minimum(@benchmark $scalar_field_1 .= $scalar_field_1 .+ $scalar_field_2).time
     # update this value if the kernel launch time changes significantly and it is expected
-    baseline_latency = 20500
-    @test latency ≈ baseline_latency atol = 4000
+    baseline_latency = 12000
+    @test latency ≈ baseline_latency atol = 2000
     percent_change_latency =
         round(Int, (latency - baseline_latency) / baseline_latency * 100)
     @info "Latency: $latency ns, Percent change from baseline: $percent_change_latency%"
 
     # repeated args expression
     CUDA.synchronize()
     latency =
-        median(
+        minimum(
             @benchmark $scalar_field_1 .=
                 $scalar_field_1 .+ $scalar_field_2 .+ $scalar_field_1 .+ $scalar_field_2
         ).time
     # update this value if the kernel launch time changes significantly and it is expected
-    baseline_latency = 22500
-    @test latency ≈ baseline_latency atol = 4000
+    baseline_latency = 13300
+    @test latency ≈ baseline_latency atol = 2000
     percent_change_latency =
         round(Int, (latency - baseline_latency) / baseline_latency * 100)
     @info "Latency: $latency ns, Percent change from baseline: $percent_change_latency%"
@@ -55,10 +55,10 @@ import LazyBroadcast: lazy
     lazy_sum_2 = @. lazy(lazy_sum_1 + lazy_sum_1)
     lazy_sum_3 = @. lazy(lazy_sum_2 + lazy_sum_2)
     CUDA.synchronize()
-    latency = median(@benchmark $scalar_field_1 .= $lazy_sum_3).time
+    latency = minimum(@benchmark $scalar_field_1 .= $lazy_sum_3).time
     # update this value if the kernel launch time changes significantly and it is expected
-    baseline_latency = 29000
-    @test latency ≈ baseline_latency atol = 4000
+    baseline_latency = 16000
+    @test latency ≈ baseline_latency atol = 2000
     percent_change_latency =
         round(Int, (latency - baseline_latency) / baseline_latency * 100)
     @info "Latency: $latency ns, Percent change from baseline: $percent_change_latency%"