[Do not merge] Test KernelIntrinsics #688
Open
christiangnrd wants to merge 3 commits into main from
Open
Conversation
Contributor
|
Your PR requires formatting changes to meet the project's style guidelines. Click here to view the suggested changes.
diff --git a/src/MetalKernels.jl b/src/MetalKernels.jl
index 4e856194..7573c5e1 100644
--- a/src/MetalKernels.jl
+++ b/src/MetalKernels.jl
@@ -136,26 +136,26 @@ end
KI.argconvert(::MetalBackend, arg) = mtlconvert(arg)
-function KI.kernel_function(::MetalBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+function KI.kernel_function(::MetalBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
kern = mtlfunction(f, tt; name, kwargs...)
- KI.Kernel{MetalBackend, typeof(kern)}(MetalBackend(), kern)
+ return KI.Kernel{MetalBackend, typeof(kern)}(MetalBackend(), kern)
end
-function (obj::KI.Kernel{MetalBackend})(args...; numworkgroups=1, workgroupsize=1)
+function (obj::KI.Kernel{MetalBackend})(args...; numworkgroups = 1, workgroupsize = 1)
KI.check_launch_args(numworkgroups, workgroupsize)
- obj.kern(args...; threads=workgroupsize, groups=numworkgroups)
+ return obj.kern(args...; threads = workgroupsize, groups = numworkgroups)
end
-function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:MetalBackend}; max_work_items::Int=typemax(Int))::Int
- Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items))
+function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:MetalBackend}; max_work_items::Int = typemax(Int))::Int
+ return Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items))
end
function KI.max_work_group_size(::MetalBackend)::Int
- Int(device().maxThreadsPerThreadgroup.width)
+ return Int(device().maxThreadsPerThreadgroup.width)
end
function KI.multiprocessor_count(::MetalBackend)::Int
- Metal.num_gpu_cores()
+ return Metal.num_gpu_cores()
end
diff --git a/src/broadcast.jl b/src/broadcast.jl
index 5d107ec2..3455fad2 100644
--- a/src/broadcast.jl
+++ b/src/broadcast.jl
@@ -66,9 +66,9 @@ end
if _broadcast_shapes[Is] > BROADCAST_SPECIALIZATION_THRESHOLD
## COV_EXCL_START
function broadcast_cartesian_static(dest, bc, Is)
- i = KI.get_global_id().x
- stride = KI.get_global_size().x
- while 1 <= i <= length(dest)
+ i = KI.get_global_id().x
+ stride = KI.get_global_size().x
+ while 1 <= i <= length(dest)
I = @inbounds Is[i]
@inbounds dest[I] = bc[I]
i += stride
@@ -91,13 +91,13 @@ end
(isa(IndexStyle(dest), IndexLinear) && isa(IndexStyle(bc), IndexLinear))
## COV_EXCL_START
function broadcast_linear(dest, bc)
- i = KI.get_global_id().x
- stride = KI.get_global_size().x
- while 1 <= i <= length(dest)
- @inbounds dest[i] = bc[i]
- i += stride
- end
- return
+ i = KI.get_global_id().x
+ stride = KI.get_global_size().x
+ while 1 <= i <= length(dest)
+ @inbounds dest[i] = bc[i]
+ i += stride
+ end
+ return
end
## COV_EXCL_STOP
@@ -168,9 +168,9 @@ end
else
## COV_EXCL_START
function broadcast_cartesian(dest, bc)
- i = KI.get_global_id().x
- stride = KI.get_global_size().x
- while 1 <= i <= length(dest)
+ i = KI.get_global_id().x
+ stride = KI.get_global_size().x
+ while 1 <= i <= length(dest)
I = @inbounds CartesianIndices(dest)[i]
@inbounds dest[I] = bc[I]
i += stride
diff --git a/src/device/random.jl b/src/device/random.jl
index 12b053a2..edc999cd 100644
--- a/src/device/random.jl
+++ b/src/device/random.jl
@@ -89,8 +89,8 @@ end
@inbounds global_random_counters()[simdgroupId]
elseif field === :ctr2
globalId = KI.get_global_id().x +
- (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
- (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
+ (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
+ (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
globalId % UInt32
end::UInt32
end
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 7be5ef43..a737e8d0 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -224,7 +224,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
# we might not be able to launch all those threads to reduce each slice in one go.
# that's why each threads also loops across their inputs, processing multiple values
# so that we can span the entire reduction dimension using a single item group.
- kernel = KI.@kernel backend launch = false partial_mapreduce_device(f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
+ kernel = KI.@kernel backend launch = false partial_mapreduce_device(
+ f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
Val(UInt64(length(Rother))), Val(grain), Val(shuffle), R, A)
# how many threads do we want?
@@ -260,7 +261,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
# we can cover the dimensions to reduce using a single group
kernel(f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
Val(UInt64(length(Rother))), Val(grain), Val(shuffle), R, A;
- workgroupsize = threads, numworkgroups = groups)
+ workgroupsize = threads, numworkgroups = groups
+ )
else
# temporary empty array whose type will match the final partial array
partial = similar(R, ntuple(_ -> 0, Val(ndims(R)+1)))
@@ -287,7 +289,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
partial_kernel(f, op, init, Val(threads), Val(Rreduce),
Val(Rother), Val(UInt64(length(Rother))),
Val(grain), Val(shuffle), partial, A;
- numworkgroups = partial_groups, workgroupsize = partial_threads)
+ numworkgroups = partial_groups, workgroupsize = partial_threads
+ )
GPUArrays.mapreducedim!(identity, op, R, partial; init=init)
end
diff --git a/test/kernelabstractions.jl b/test/kernelabstractions.jl
index cda5b249..339fcbc8 100644
--- a/test/kernelabstractions.jl
+++ b/test/kernelabstractions.jl
@@ -7,6 +7,6 @@ Testsuite.testsuite(MetalBackend, "Metal", Metal, MtlArray, Metal.MtlDeviceArray
"Convert", # depends on https://github.com/JuliaGPU/Metal.jl/issues/69
"SpecialFunctions", # gamma and erfc not currently supported on Metal.jl
"sparse", # not supported yet
- "CPU synchronization",
- "fallback test: callable types",
+ "CPU synchronization",
+ "fallback test: callable types",
]))
diff --git a/test/runtests.jl b/test/runtests.jl
index 32b45c8c..14fcfb93 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,6 +1,6 @@
@static if VERSION < v"1.11" && get(ENV, "BUILDKITE_PIPELINE_NAME", "Metal.jl") == "Metal.jl"
using Pkg
- Pkg.add(url="https://github.com/JuliaGPU/KernelAbstractions.jl", rev="main")
+ Pkg.add(url = "https://github.com/JuliaGPU/KernelAbstractions.jl", rev = "main")
end
using Metal |
9ac3d49 to
6314372
Compare
Contributor
There was a problem hiding this comment.
Metal Benchmarks
Details
| Benchmark suite | Current: 3be3a7c | Previous: b94fd4b | Ratio |
|---|---|---|---|
array/accumulate/Float32/1d |
1082834 ns |
1098958 ns |
0.99 |
array/accumulate/Float32/dims=1 |
1756458 ns |
1554708 ns |
1.13 |
array/accumulate/Float32/dims=1L |
10573166 ns |
9848583.5 ns |
1.07 |
array/accumulate/Float32/dims=2 |
2191167 ns |
1886771 ns |
1.16 |
array/accumulate/Float32/dims=2L |
7852125 ns |
7256459 ns |
1.08 |
array/accumulate/Int64/1d |
1368750 ns |
1261958 ns |
1.08 |
array/accumulate/Int64/dims=1 |
2006875 ns |
1824291.5 ns |
1.10 |
array/accumulate/Int64/dims=1L |
12374354 ns |
11664208.5 ns |
1.06 |
array/accumulate/Int64/dims=2 |
2391083.5 ns |
2170333.5 ns |
1.10 |
array/accumulate/Int64/dims=2L |
10179771 ns |
10120062.5 ns |
1.01 |
array/broadcast |
551520.5 ns |
605916 ns |
0.91 |
array/construct |
6542 ns |
6292 ns |
1.04 |
array/permutedims/2d |
1217833 ns |
1168125 ns |
1.04 |
array/permutedims/3d |
1831458.5 ns |
1673084 ns |
1.09 |
array/permutedims/4d |
2757333 ns |
2365959 ns |
1.17 |
array/private/copy |
828291.5 ns |
545792 ns |
1.52 |
array/private/copyto!/cpu_to_gpu |
752521 ns |
802916 ns |
0.94 |
array/private/copyto!/gpu_to_cpu |
752666.5 ns |
801917 ns |
0.94 |
array/private/copyto!/gpu_to_gpu |
551708 ns |
634458 ns |
0.87 |
array/private/iteration/findall/bool |
1539041 ns |
1402750 ns |
1.10 |
array/private/iteration/findall/int |
1665709 ns |
1564021 ns |
1.07 |
array/private/iteration/findfirst/bool |
2112708 ns |
2055916 ns |
1.03 |
array/private/iteration/findfirst/int |
2194250 ns |
2064479.5 ns |
1.06 |
array/private/iteration/findmin/1d |
2674916 ns |
2499959 ns |
1.07 |
array/private/iteration/findmin/2d |
1914958 ns |
1790791 ns |
1.07 |
array/private/iteration/logical |
2782333.5 ns |
2631896 ns |
1.06 |
array/private/iteration/scalar |
3324937.5 ns |
5047625 ns |
0.66 |
array/random/rand/Float32 |
828937.5 ns |
582958 ns |
1.42 |
array/random/rand/Int64 |
901000 ns |
775667 ns |
1.16 |
array/random/rand!/Float32 |
540833 ns |
574750 ns |
0.94 |
array/random/rand!/Int64 |
532833 ns |
550792 ns |
0.97 |
array/random/randn/Float32 |
1045791 ns |
1006937.5 ns |
1.04 |
array/random/randn!/Float32 |
724125 ns |
755666 ns |
0.96 |
array/reductions/mapreduce/Float32/1d |
801458 ns |
1029500 ns |
0.78 |
array/reductions/mapreduce/Float32/dims=1 |
828708 ns |
840875 ns |
0.99 |
array/reductions/mapreduce/Float32/dims=1L |
1370146 ns |
1324000 ns |
1.03 |
array/reductions/mapreduce/Float32/dims=2 |
880958 ns |
860875 ns |
1.02 |
array/reductions/mapreduce/Float32/dims=2L |
1894104.5 ns |
1799541 ns |
1.05 |
array/reductions/mapreduce/Int64/1d |
1280208.5 ns |
1374875 ns |
0.93 |
array/reductions/mapreduce/Int64/dims=1 |
1122020.5 ns |
1097625 ns |
1.02 |
array/reductions/mapreduce/Int64/dims=1L |
1998208 ns |
2002854 ns |
1.00 |
array/reductions/mapreduce/Int64/dims=2 |
1357291.5 ns |
1145000 ns |
1.19 |
array/reductions/mapreduce/Int64/dims=2L |
4164313 ns |
3614000 ns |
1.15 |
array/reductions/reduce/Float32/1d |
810270.5 ns |
1028437.5 ns |
0.79 |
array/reductions/reduce/Float32/dims=1 |
819396 ns |
832667 ns |
0.98 |
array/reductions/reduce/Float32/dims=1L |
1371729.5 ns |
1318416.5 ns |
1.04 |
array/reductions/reduce/Float32/dims=2 |
880500 ns |
853041.5 ns |
1.03 |
array/reductions/reduce/Float32/dims=2L |
1885250 ns |
1810250 ns |
1.04 |
array/reductions/reduce/Int64/1d |
1265750 ns |
1516958 ns |
0.83 |
array/reductions/reduce/Int64/dims=1 |
1113209 ns |
1095375 ns |
1.02 |
array/reductions/reduce/Int64/dims=1L |
2066750 ns |
2023499.5 ns |
1.02 |
array/reductions/reduce/Int64/dims=2 |
1347646 ns |
1240750 ns |
1.09 |
array/reductions/reduce/Int64/dims=2L |
4256479.5 ns |
4233875 ns |
1.01 |
array/shared/copy |
215250 ns |
252417 ns |
0.85 |
array/shared/copyto!/cpu_to_gpu |
83750 ns |
80750 ns |
1.04 |
array/shared/copyto!/gpu_to_cpu |
83709 ns |
80667 ns |
1.04 |
array/shared/copyto!/gpu_to_gpu |
84500 ns |
83083 ns |
1.02 |
array/shared/iteration/findall/bool |
1543875 ns |
1427208.5 ns |
1.08 |
array/shared/iteration/findall/int |
1661625 ns |
1559875 ns |
1.07 |
array/shared/iteration/findfirst/bool |
1711792 ns |
1649000 ns |
1.04 |
array/shared/iteration/findfirst/int |
1792250 ns |
1672458 ns |
1.07 |
array/shared/iteration/findmin/1d |
2234874.5 ns |
2115583 ns |
1.06 |
array/shared/iteration/findmin/2d |
1913416 ns |
1792625 ns |
1.07 |
array/shared/iteration/logical |
2558542 ns |
2292167 ns |
1.12 |
array/shared/iteration/scalar |
206083 ns |
199958 ns |
1.03 |
integration/byval/reference |
1593645.5 ns |
1544250 ns |
1.03 |
integration/byval/slices=1 |
1600792 ns |
1560229.5 ns |
1.03 |
integration/byval/slices=2 |
2732166 ns |
2598333.5 ns |
1.05 |
integration/byval/slices=3 |
19612583 ns |
8092333 ns |
2.42 |
integration/metaldevrt |
861354.5 ns |
868125 ns |
0.99 |
kernel/indexing |
485750 ns |
592667 ns |
0.82 |
kernel/indexing_checked |
487250 ns |
598292 ns |
0.81 |
kernel/launch |
12625 ns |
11791.5 ns |
1.07 |
kernel/rand |
536250 ns |
570709 ns |
0.94 |
latency/import |
1692594000.5 ns |
1425597062.5 ns |
1.19 |
latency/precompile |
30415191770.5 ns |
25453724708 ns |
1.19 |
latency/ttfp |
2536515958.5 ns |
2341177208 ns |
1.08 |
metal/synchronization/context |
19792 ns |
19667 ns |
1.01 |
metal/synchronization/stream |
19292 ns |
18459 ns |
1.05 |
This comment was automatically generated by workflow using github-action-benchmark.
22e754e to
68db9c2
Compare
2b8dce1 to
0e76668
Compare
db9a7dc to
c802ccc
Compare
4b8f026 to
ce67b4c
Compare
ce67b4c to
03bb0dd
Compare
415079d to
90e4fb2
Compare
90e4fb2 to
ee7543a
Compare
b0fd1b3 to
865af1a
Compare
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit. This suggestion is invalid because no changes were made to the code. Suggestions cannot be applied while the pull request is closed. Suggestions cannot be applied while viewing a subset of changes. Only one suggestion per line can be applied in a batch. Add this suggestion to a batch that can be applied as a single commit. Applying suggestions on deleted lines is not supported. You must change the existing code in this line in order to create a valid suggestion. Outdated suggestions cannot be applied. This suggestion has been applied or marked resolved. Suggestions cannot be applied from pending reviews. Suggestions cannot be applied on multi-line comments. Suggestions cannot be applied while the pull request is queued to merge. Suggestion cannot be applied right now. Please check back later.
Not a draft to also run benchmarks