diff --git a/ext/cuda/data_layouts_copyto.jl b/ext/cuda/data_layouts_copyto.jl index 5badafd8b2..58e2fc1563 100644 --- a/ext/cuda/data_layouts_copyto.jl +++ b/ext/cuda/data_layouts_copyto.jl @@ -52,14 +52,9 @@ if VERSION ≥ v"1.11.0-beta" else cartesian_indices_mask(us, mask) end - args = (dest, bc, us, mask, cart_inds) - threads = threads_via_occupancy(knl_copyto!, args) - n_max_threads = min(threads, get_N(us)) - p = if mask isa NoMask - linear_partition(prod(size(dest)), n_max_threads) - else - masked_partition(mask, n_max_threads, us) - end + args = cudaconvert((dest, bc, us, mask, cart_inds)) + nitems = length(cart_inds) + p = config_via_occupancy(knl_copyto!, nitems, args) auto_launch!( knl_copyto!, args; @@ -81,10 +76,9 @@ else bc′ = Base.Broadcast.instantiate( DataLayouts.to_non_extruded_broadcasted(bc), ) - args = (dest, bc′, us) - threads = threads_via_occupancy(knl_copyto_linear!, args) - n_max_threads = min(threads, get_N(us)) - p = linear_partition(prod(size(dest)), n_max_threads) + args = cudaconvert((dest, bc′, us)) + nitems = prod(size(dest)) + p = config_via_occupancy(knl_copyto_linear!, nitems, args) auto_launch!( knl_copyto_linear!, args; @@ -97,14 +91,9 @@ else else cartesian_indices_mask(us, mask) end - args = (dest, bc, us, mask, cart_inds) - threads = threads_via_occupancy(knl_copyto!, args) - n_max_threads = min(threads, get_N(us)) - p = if mask isa NoMask - linear_partition(prod(size(dest)), n_max_threads) - else - masked_partition(mask, n_max_threads, us) - end + args = cudaconvert((dest, bc, us, mask, cart_inds)) + nitems = length(cart_inds) + p = config_via_occupancy(knl_copyto!, nitems, args) auto_launch!( knl_copyto!, args; @@ -189,10 +178,8 @@ function DataLayouts.copyto_per_field!( # leverage linear indexing: nitems = prod(size(array)) N = prod(size(array)) - args = (array, bc′, N) - threads = threads_via_occupancy(copyto_per_field_kernel!, args) - n_max_threads = min(threads, nitems) - p = linear_partition(nitems, n_max_threads) + args = cudaconvert((array, bc′, N)) + p = config_via_occupancy(copyto_per_field_kernel!, nitems, args) auto_launch!( copyto_per_field_kernel!, args; @@ -225,10 +212,8 @@ function DataLayouts.copyto_per_field_scalar!( # leverage linear indexing: nitems = prod(size(array)) N = prod(size(array)) - args = (array, bc′, N) - threads = threads_via_occupancy(copyto_per_field_kernel_0D!, args) - n_max_threads = min(threads, nitems) - p = linear_partition(nitems, n_max_threads) + args = cudaconvert((array, bc′, N)) + p = config_via_occupancy(copyto_per_field_kernel_0D!, nitems, args) auto_launch!( copyto_per_field_kernel_0D!, args; @@ -249,10 +234,8 @@ function DataLayouts.copyto_per_field_scalar!( # leverage linear indexing: nitems = prod(size(array)) N = prod(size(array)) - args = (array, bc′, N) - threads = threads_via_occupancy(copyto_per_field_kernel_0D!, args) - n_max_threads = min(threads, nitems) - p = linear_partition(nitems, n_max_threads) + args = cudaconvert((array, bc′, N)) + p = config_via_occupancy(copyto_per_field_kernel_0D!, nitems, args) auto_launch!( copyto_per_field_kernel_0D!, args; diff --git a/ext/cuda/data_layouts_fill.jl b/ext/cuda/data_layouts_fill.jl index acaa8f3f66..b6a4567d96 100644 --- a/ext/cuda/data_layouts_fill.jl +++ b/ext/cuda/data_layouts_fill.jl @@ -26,7 +26,7 @@ function Base.fill!(dest::AbstractData, bc, to::ToCUDA, mask = NoMask()) if !(VERSION ≥ v"1.11.0-beta") && dest isa DataLayouts.EndsWithField && mask isa NoMask - args = (dest, bc, us) + args = cudaconvert((dest, bc, us)) threads = threads_via_occupancy(knl_fill_linear!, args) n_max_threads = min(threads, get_N(us)) p = linear_partition(prod(size(dest)), n_max_threads) @@ -42,7 +42,7 @@ function Base.fill!(dest::AbstractData, bc, to::ToCUDA, mask = NoMask()) else cartesian_indices_mask(us, mask) end - args = (dest, bc, us, mask, cart_inds) + args = cudaconvert((dest, bc, us, mask, cart_inds)) threads = threads_via_occupancy(knl_fill!, args) n_max_threads = min(threads, get_N(us)) p = if mask isa NoMask diff --git a/ext/cuda/operators_finite_difference.jl b/ext/cuda/operators_finite_difference.jl index b5920e3e50..d8f49ffedd 100644 --- a/ext/cuda/operators_finite_difference.jl +++ b/ext/cuda/operators_finite_difference.jl @@ -111,7 +111,7 @@ function Base.copyto!( cartesian_indices_mask(us, mask) end - args = ( + args = cudaconvert(( strip_space(out, space), strip_space(bc′, space), axes(out), @@ -119,7 +119,7 @@ function Base.copyto!( us, mask, cart_inds, - ) + )) threads = threads_via_occupancy(copyto_stencil_kernel!, args) n_max_threads = min(threads, get_N(us)) diff --git a/test/gpu/latency_benchmarks.jl b/test/gpu/latency_benchmarks.jl index 090a471f34..6bb40d4d0d 100644 --- a/test/gpu/latency_benchmarks.jl +++ b/test/gpu/latency_benchmarks.jl @@ -27,11 +27,11 @@ import LazyBroadcast: lazy scalar_field_2 = fill(1.0f0, space) # basic expression # intentionally benchmark without a sync between each trial - # CUDA.synchronize() - latency = median(@benchmark $scalar_field_1 .= $scalar_field_1 .+ $scalar_field_2).time + CUDA.synchronize() + latency = minimum(@benchmark $scalar_field_1 .= $scalar_field_1 .+ $scalar_field_2).time # update this value if the kernel launch time changes significantly and it is expected - baseline_latency = 20500 - @test latency ≈ baseline_latency atol = 4000 + baseline_latency = 12000 + @test latency ≈ baseline_latency atol = 2000 percent_change_latency = round(Int, (latency - baseline_latency) / baseline_latency * 100) @info "Latency: $latency ns, Percent change from baseline: $percent_change_latency%" @@ -39,13 +39,13 @@ import LazyBroadcast: lazy # repeated args expression CUDA.synchronize() latency = - median( + minimum( @benchmark $scalar_field_1 .= $scalar_field_1 .+ $scalar_field_2 .+ $scalar_field_1 .+ $scalar_field_2 ).time # update this value if the kernel launch time changes significantly and it is expected - baseline_latency = 22500 - @test latency ≈ baseline_latency atol = 4000 + baseline_latency = 13300 + @test latency ≈ baseline_latency atol = 2000 percent_change_latency = round(Int, (latency - baseline_latency) / baseline_latency * 100) @info "Latency: $latency ns, Percent change from baseline: $percent_change_latency%" @@ -55,10 +55,10 @@ import LazyBroadcast: lazy lazy_sum_2 = @. lazy(lazy_sum_1 + lazy_sum_1) lazy_sum_3 = @. lazy(lazy_sum_2 + lazy_sum_2) CUDA.synchronize() - latency = median(@benchmark $scalar_field_1 .= $lazy_sum_3).time + latency = minimum(@benchmark $scalar_field_1 .= $lazy_sum_3).time # update this value if the kernel launch time changes significantly and it is expected - baseline_latency = 29000 - @test latency ≈ baseline_latency atol = 4000 + baseline_latency = 16000 + @test latency ≈ baseline_latency atol = 2000 percent_change_latency = round(Int, (latency - baseline_latency) / baseline_latency * 100) @info "Latency: $latency ns, Percent change from baseline: $percent_change_latency%"