Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 15 additions & 32 deletions ext/cuda/data_layouts_copyto.jl
Original file line number Diff line number Diff line change
Expand Up @@ -52,14 +52,9 @@ if VERSION ≥ v"1.11.0-beta"
else
cartesian_indices_mask(us, mask)
end
args = (dest, bc, us, mask, cart_inds)
threads = threads_via_occupancy(knl_copyto!, args)
n_max_threads = min(threads, get_N(us))
p = if mask isa NoMask
linear_partition(prod(size(dest)), n_max_threads)
else
masked_partition(mask, n_max_threads, us)
end
args = cudaconvert((dest, bc, us, mask, cart_inds))
nitems = length(cart_inds)
p = config_via_occupancy(knl_copyto!, nitems, args)
auto_launch!(
knl_copyto!,
args;
Expand All @@ -81,10 +76,9 @@ else
bc′ = Base.Broadcast.instantiate(
DataLayouts.to_non_extruded_broadcasted(bc),
)
args = (dest, bc′, us)
threads = threads_via_occupancy(knl_copyto_linear!, args)
n_max_threads = min(threads, get_N(us))
p = linear_partition(prod(size(dest)), n_max_threads)
args = cudaconvert((dest, bc′, us))
nitems = prod(size(dest))
p = config_via_occupancy(knl_copyto_linear!, nitems, args)
auto_launch!(
knl_copyto_linear!,
args;
Expand All @@ -97,14 +91,9 @@ else
else
cartesian_indices_mask(us, mask)
end
args = (dest, bc, us, mask, cart_inds)
threads = threads_via_occupancy(knl_copyto!, args)
n_max_threads = min(threads, get_N(us))
p = if mask isa NoMask
linear_partition(prod(size(dest)), n_max_threads)
else
masked_partition(mask, n_max_threads, us)
end
args = cudaconvert((dest, bc, us, mask, cart_inds))
nitems = length(cart_inds)
p = config_via_occupancy(knl_copyto!, nitems, args)
auto_launch!(
knl_copyto!,
args;
Expand Down Expand Up @@ -189,10 +178,8 @@ function DataLayouts.copyto_per_field!(
# leverage linear indexing:
nitems = prod(size(array))
N = prod(size(array))
args = (array, bc′, N)
threads = threads_via_occupancy(copyto_per_field_kernel!, args)
n_max_threads = min(threads, nitems)
p = linear_partition(nitems, n_max_threads)
args = cudaconvert((array, bc′, N))
p = config_via_occupancy(copyto_per_field_kernel!, nitems, args)
auto_launch!(
copyto_per_field_kernel!,
args;
Expand Down Expand Up @@ -225,10 +212,8 @@ function DataLayouts.copyto_per_field_scalar!(
# leverage linear indexing:
nitems = prod(size(array))
N = prod(size(array))
args = (array, bc′, N)
threads = threads_via_occupancy(copyto_per_field_kernel_0D!, args)
n_max_threads = min(threads, nitems)
p = linear_partition(nitems, n_max_threads)
args = cudaconvert((array, bc′, N))
p = config_via_occupancy(copyto_per_field_kernel_0D!, nitems, args)
auto_launch!(
copyto_per_field_kernel_0D!,
args;
Expand All @@ -249,10 +234,8 @@ function DataLayouts.copyto_per_field_scalar!(
# leverage linear indexing:
nitems = prod(size(array))
N = prod(size(array))
args = (array, bc′, N)
threads = threads_via_occupancy(copyto_per_field_kernel_0D!, args)
n_max_threads = min(threads, nitems)
p = linear_partition(nitems, n_max_threads)
args = cudaconvert((array, bc′, N))
p = config_via_occupancy(copyto_per_field_kernel_0D!, nitems, args)
auto_launch!(
copyto_per_field_kernel_0D!,
args;
Expand Down
4 changes: 2 additions & 2 deletions ext/cuda/data_layouts_fill.jl
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ function Base.fill!(dest::AbstractData, bc, to::ToCUDA, mask = NoMask())
if !(VERSION ≥ v"1.11.0-beta") &&
dest isa DataLayouts.EndsWithField &&
mask isa NoMask
args = (dest, bc, us)
args = cudaconvert((dest, bc, us))
threads = threads_via_occupancy(knl_fill_linear!, args)
n_max_threads = min(threads, get_N(us))
p = linear_partition(prod(size(dest)), n_max_threads)
Expand All @@ -42,7 +42,7 @@ function Base.fill!(dest::AbstractData, bc, to::ToCUDA, mask = NoMask())
else
cartesian_indices_mask(us, mask)
end
args = (dest, bc, us, mask, cart_inds)
args = cudaconvert((dest, bc, us, mask, cart_inds))
threads = threads_via_occupancy(knl_fill!, args)
n_max_threads = min(threads, get_N(us))
p = if mask isa NoMask
Expand Down
4 changes: 2 additions & 2 deletions ext/cuda/operators_finite_difference.jl
Original file line number Diff line number Diff line change
Expand Up @@ -111,15 +111,15 @@ function Base.copyto!(
cartesian_indices_mask(us, mask)
end

args = (
args = cudaconvert((
strip_space(out, space),
strip_space(bc′, space),
axes(out),
bounds,
us,
mask,
cart_inds,
)
))

threads = threads_via_occupancy(copyto_stencil_kernel!, args)
n_max_threads = min(threads, get_N(us))
Expand Down
20 changes: 10 additions & 10 deletions test/gpu/latency_benchmarks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -27,25 +27,25 @@ import LazyBroadcast: lazy
scalar_field_2 = fill(1.0f0, space)
# basic expression
# intentionally benchmark without a sync between each trial
# CUDA.synchronize()
latency = median(@benchmark $scalar_field_1 .= $scalar_field_1 .+ $scalar_field_2).time
CUDA.synchronize()
latency = minimum(@benchmark $scalar_field_1 .= $scalar_field_1 .+ $scalar_field_2).time
# update this baseline if the kernel launch time changes significantly and the change is expected
baseline_latency = 20500
@test latency ≈ baseline_latency atol = 4000
baseline_latency = 12000
@test latency ≈ baseline_latency atol = 2000
percent_change_latency =
round(Int, (latency - baseline_latency) / baseline_latency * 100)
@info "Latency: $latency ns, Percent change from baseline: $percent_change_latency%"

# repeated args expression
CUDA.synchronize()
latency =
median(
minimum(
@benchmark $scalar_field_1 .=
$scalar_field_1 .+ $scalar_field_2 .+ $scalar_field_1 .+ $scalar_field_2
).time
# update this baseline if the kernel launch time changes significantly and the change is expected
baseline_latency = 22500
@test latency ≈ baseline_latency atol = 4000
baseline_latency = 13300
@test latency ≈ baseline_latency atol = 2000
percent_change_latency =
round(Int, (latency - baseline_latency) / baseline_latency * 100)
@info "Latency: $latency ns, Percent change from baseline: $percent_change_latency%"
Expand All @@ -55,10 +55,10 @@ import LazyBroadcast: lazy
lazy_sum_2 = @. lazy(lazy_sum_1 + lazy_sum_1)
lazy_sum_3 = @. lazy(lazy_sum_2 + lazy_sum_2)
CUDA.synchronize()
latency = median(@benchmark $scalar_field_1 .= $lazy_sum_3).time
latency = minimum(@benchmark $scalar_field_1 .= $lazy_sum_3).time
# update this baseline if the kernel launch time changes significantly and the change is expected
baseline_latency = 29000
@test latency ≈ baseline_latency atol = 4000
baseline_latency = 16000
@test latency ≈ baseline_latency atol = 2000
percent_change_latency =
round(Int, (latency - baseline_latency) / baseline_latency * 100)
@info "Latency: $latency ns, Percent change from baseline: $percent_change_latency%"
Expand Down
Loading