Skip to content

[Do not merge] Test KernelIntrinsics#688

Open
christiangnrd wants to merge 3 commits into main from kaintr
Open

[Do not merge] Test KernelIntrinsics#688
christiangnrd wants to merge 3 commits into main from kaintr

Conversation

@christiangnrd
Copy link
Copy Markdown
Member

Not a draft to also run benchmarks

@github-actions
Copy link
Copy Markdown
Contributor

github-actions Bot commented Oct 22, 2025

Your PR requires formatting changes to meet the project's style guidelines.
Please consider running Runic (git runic main) to apply these changes.

Click here to view the suggested changes.
diff --git a/src/MetalKernels.jl b/src/MetalKernels.jl
index 4e856194..7573c5e1 100644
--- a/src/MetalKernels.jl
+++ b/src/MetalKernels.jl
@@ -136,26 +136,26 @@ end
 
 KI.argconvert(::MetalBackend, arg) = mtlconvert(arg)
 
-function KI.kernel_function(::MetalBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+function KI.kernel_function(::MetalBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
     kern = mtlfunction(f, tt; name, kwargs...)
-    KI.Kernel{MetalBackend, typeof(kern)}(MetalBackend(), kern)
+    return KI.Kernel{MetalBackend, typeof(kern)}(MetalBackend(), kern)
 end
 
-function (obj::KI.Kernel{MetalBackend})(args...; numworkgroups=1, workgroupsize=1)
+function (obj::KI.Kernel{MetalBackend})(args...; numworkgroups = 1, workgroupsize = 1)
     KI.check_launch_args(numworkgroups, workgroupsize)
 
-    obj.kern(args...; threads=workgroupsize, groups=numworkgroups)
+    return obj.kern(args...; threads = workgroupsize, groups = numworkgroups)
 end
 
 
-function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:MetalBackend}; max_work_items::Int=typemax(Int))::Int
-    Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items))
+function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:MetalBackend}; max_work_items::Int = typemax(Int))::Int
+    return Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items))
 end
 function KI.max_work_group_size(::MetalBackend)::Int
-    Int(device().maxThreadsPerThreadgroup.width)
+    return Int(device().maxThreadsPerThreadgroup.width)
 end
 function KI.multiprocessor_count(::MetalBackend)::Int
-    Metal.num_gpu_cores()
+    return Metal.num_gpu_cores()
 end
 
 
diff --git a/src/broadcast.jl b/src/broadcast.jl
index 5d107ec2..3455fad2 100644
--- a/src/broadcast.jl
+++ b/src/broadcast.jl
@@ -66,9 +66,9 @@ end
     if _broadcast_shapes[Is] > BROADCAST_SPECIALIZATION_THRESHOLD
         ## COV_EXCL_START
         function broadcast_cartesian_static(dest, bc, Is)
-             i = KI.get_global_id().x
-             stride = KI.get_global_size().x
-             while 1 <= i <= length(dest)
+            i = KI.get_global_id().x
+            stride = KI.get_global_size().x
+            while 1 <= i <= length(dest)
                 I = @inbounds Is[i]
                 @inbounds dest[I] = bc[I]
                 i += stride
@@ -91,13 +91,13 @@ end
        (isa(IndexStyle(dest), IndexLinear) && isa(IndexStyle(bc), IndexLinear))
         ## COV_EXCL_START
         function broadcast_linear(dest, bc)
-             i = KI.get_global_id().x
-             stride = KI.get_global_size().x
-             while 1 <= i <= length(dest)
-                 @inbounds dest[i] = bc[i]
-                 i += stride
-             end
-             return
+            i = KI.get_global_id().x
+            stride = KI.get_global_size().x
+            while 1 <= i <= length(dest)
+                @inbounds dest[i] = bc[i]
+                i += stride
+            end
+            return
         end
         ## COV_EXCL_STOP
 
@@ -168,9 +168,9 @@ end
     else
         ## COV_EXCL_START
         function broadcast_cartesian(dest, bc)
-             i = KI.get_global_id().x
-             stride = KI.get_global_size().x
-             while 1 <= i <= length(dest)
+            i = KI.get_global_id().x
+            stride = KI.get_global_size().x
+            while 1 <= i <= length(dest)
                 I = @inbounds CartesianIndices(dest)[i]
                 @inbounds dest[I] = bc[I]
                 i += stride
diff --git a/src/device/random.jl b/src/device/random.jl
index 12b053a2..edc999cd 100644
--- a/src/device/random.jl
+++ b/src/device/random.jl
@@ -89,8 +89,8 @@ end
         @inbounds global_random_counters()[simdgroupId]
     elseif field === :ctr2
         globalId = KI.get_global_id().x +
-                   (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
-                   (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
+            (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
+            (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
         globalId % UInt32
     end::UInt32
 end
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 7be5ef43..a737e8d0 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -224,7 +224,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
     # we might not be able to launch all those threads to reduce each slice in one go.
     # that's why each threads also loops across their inputs, processing multiple values
     # so that we can span the entire reduction dimension using a single item group.
-    kernel = KI.@kernel backend launch = false partial_mapreduce_device(f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
+    kernel = KI.@kernel backend launch = false partial_mapreduce_device(
+        f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
                                                           Val(UInt64(length(Rother))), Val(grain), Val(shuffle), R, A)
 
     # how many threads do we want?
@@ -260,7 +261,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
         # we can cover the dimensions to reduce using a single group
         kernel(f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
                Val(UInt64(length(Rother))), Val(grain), Val(shuffle), R, A;
-               workgroupsize = threads, numworkgroups = groups)
+            workgroupsize = threads, numworkgroups = groups
+        )
     else
         # temporary empty array whose type will match the final partial array
 	    partial = similar(R, ntuple(_ -> 0, Val(ndims(R)+1)))
@@ -287,7 +289,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
         partial_kernel(f, op, init, Val(threads), Val(Rreduce),
                         Val(Rother), Val(UInt64(length(Rother))),
                         Val(grain), Val(shuffle), partial, A;
-                        numworkgroups = partial_groups, workgroupsize = partial_threads)
+            numworkgroups = partial_groups, workgroupsize = partial_threads
+        )
 
         GPUArrays.mapreducedim!(identity, op, R, partial; init=init)
     end
diff --git a/test/kernelabstractions.jl b/test/kernelabstractions.jl
index cda5b249..339fcbc8 100644
--- a/test/kernelabstractions.jl
+++ b/test/kernelabstractions.jl
@@ -7,6 +7,6 @@ Testsuite.testsuite(MetalBackend, "Metal", Metal, MtlArray, Metal.MtlDeviceArray
     "Convert",           # depends on https://github.com/JuliaGPU/Metal.jl/issues/69
     "SpecialFunctions",  # gamma and erfc not currently supported on Metal.jl
     "sparse",            # not supported yet
-    "CPU synchronization",
-    "fallback test: callable types",
+            "CPU synchronization",
+            "fallback test: callable types",
 ]))
diff --git a/test/runtests.jl b/test/runtests.jl
index 32b45c8c..14fcfb93 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,6 +1,6 @@
 @static if VERSION < v"1.11" && get(ENV, "BUILDKITE_PIPELINE_NAME", "Metal.jl") == "Metal.jl"
     using Pkg
-    Pkg.add(url="https://github.com/JuliaGPU/KernelAbstractions.jl", rev="main")
+    Pkg.add(url = "https://github.com/JuliaGPU/KernelAbstractions.jl", rev = "main")
 end
 
 using Metal

@christiangnrd christiangnrd force-pushed the kaintr branch 3 times, most recently from 9ac3d49 to 6314372 Compare October 22, 2025 04:31
Copy link
Copy Markdown
Contributor

@github-actions github-actions Bot left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Metal Benchmarks

Details
Benchmark suite Current: 3be3a7c Previous: b94fd4b Ratio
array/accumulate/Float32/1d 1082834 ns 1098958 ns 0.99
array/accumulate/Float32/dims=1 1756458 ns 1554708 ns 1.13
array/accumulate/Float32/dims=1L 10573166 ns 9848583.5 ns 1.07
array/accumulate/Float32/dims=2 2191167 ns 1886771 ns 1.16
array/accumulate/Float32/dims=2L 7852125 ns 7256459 ns 1.08
array/accumulate/Int64/1d 1368750 ns 1261958 ns 1.08
array/accumulate/Int64/dims=1 2006875 ns 1824291.5 ns 1.10
array/accumulate/Int64/dims=1L 12374354 ns 11664208.5 ns 1.06
array/accumulate/Int64/dims=2 2391083.5 ns 2170333.5 ns 1.10
array/accumulate/Int64/dims=2L 10179771 ns 10120062.5 ns 1.01
array/broadcast 551520.5 ns 605916 ns 0.91
array/construct 6542 ns 6292 ns 1.04
array/permutedims/2d 1217833 ns 1168125 ns 1.04
array/permutedims/3d 1831458.5 ns 1673084 ns 1.09
array/permutedims/4d 2757333 ns 2365959 ns 1.17
array/private/copy 828291.5 ns 545792 ns 1.52
array/private/copyto!/cpu_to_gpu 752521 ns 802916 ns 0.94
array/private/copyto!/gpu_to_cpu 752666.5 ns 801917 ns 0.94
array/private/copyto!/gpu_to_gpu 551708 ns 634458 ns 0.87
array/private/iteration/findall/bool 1539041 ns 1402750 ns 1.10
array/private/iteration/findall/int 1665709 ns 1564021 ns 1.07
array/private/iteration/findfirst/bool 2112708 ns 2055916 ns 1.03
array/private/iteration/findfirst/int 2194250 ns 2064479.5 ns 1.06
array/private/iteration/findmin/1d 2674916 ns 2499959 ns 1.07
array/private/iteration/findmin/2d 1914958 ns 1790791 ns 1.07
array/private/iteration/logical 2782333.5 ns 2631896 ns 1.06
array/private/iteration/scalar 3324937.5 ns 5047625 ns 0.66
array/random/rand/Float32 828937.5 ns 582958 ns 1.42
array/random/rand/Int64 901000 ns 775667 ns 1.16
array/random/rand!/Float32 540833 ns 574750 ns 0.94
array/random/rand!/Int64 532833 ns 550792 ns 0.97
array/random/randn/Float32 1045791 ns 1006937.5 ns 1.04
array/random/randn!/Float32 724125 ns 755666 ns 0.96
array/reductions/mapreduce/Float32/1d 801458 ns 1029500 ns 0.78
array/reductions/mapreduce/Float32/dims=1 828708 ns 840875 ns 0.99
array/reductions/mapreduce/Float32/dims=1L 1370146 ns 1324000 ns 1.03
array/reductions/mapreduce/Float32/dims=2 880958 ns 860875 ns 1.02
array/reductions/mapreduce/Float32/dims=2L 1894104.5 ns 1799541 ns 1.05
array/reductions/mapreduce/Int64/1d 1280208.5 ns 1374875 ns 0.93
array/reductions/mapreduce/Int64/dims=1 1122020.5 ns 1097625 ns 1.02
array/reductions/mapreduce/Int64/dims=1L 1998208 ns 2002854 ns 1.00
array/reductions/mapreduce/Int64/dims=2 1357291.5 ns 1145000 ns 1.19
array/reductions/mapreduce/Int64/dims=2L 4164313 ns 3614000 ns 1.15
array/reductions/reduce/Float32/1d 810270.5 ns 1028437.5 ns 0.79
array/reductions/reduce/Float32/dims=1 819396 ns 832667 ns 0.98
array/reductions/reduce/Float32/dims=1L 1371729.5 ns 1318416.5 ns 1.04
array/reductions/reduce/Float32/dims=2 880500 ns 853041.5 ns 1.03
array/reductions/reduce/Float32/dims=2L 1885250 ns 1810250 ns 1.04
array/reductions/reduce/Int64/1d 1265750 ns 1516958 ns 0.83
array/reductions/reduce/Int64/dims=1 1113209 ns 1095375 ns 1.02
array/reductions/reduce/Int64/dims=1L 2066750 ns 2023499.5 ns 1.02
array/reductions/reduce/Int64/dims=2 1347646 ns 1240750 ns 1.09
array/reductions/reduce/Int64/dims=2L 4256479.5 ns 4233875 ns 1.01
array/shared/copy 215250 ns 252417 ns 0.85
array/shared/copyto!/cpu_to_gpu 83750 ns 80750 ns 1.04
array/shared/copyto!/gpu_to_cpu 83709 ns 80667 ns 1.04
array/shared/copyto!/gpu_to_gpu 84500 ns 83083 ns 1.02
array/shared/iteration/findall/bool 1543875 ns 1427208.5 ns 1.08
array/shared/iteration/findall/int 1661625 ns 1559875 ns 1.07
array/shared/iteration/findfirst/bool 1711792 ns 1649000 ns 1.04
array/shared/iteration/findfirst/int 1792250 ns 1672458 ns 1.07
array/shared/iteration/findmin/1d 2234874.5 ns 2115583 ns 1.06
array/shared/iteration/findmin/2d 1913416 ns 1792625 ns 1.07
array/shared/iteration/logical 2558542 ns 2292167 ns 1.12
array/shared/iteration/scalar 206083 ns 199958 ns 1.03
integration/byval/reference 1593645.5 ns 1544250 ns 1.03
integration/byval/slices=1 1600792 ns 1560229.5 ns 1.03
integration/byval/slices=2 2732166 ns 2598333.5 ns 1.05
integration/byval/slices=3 19612583 ns 8092333 ns 2.42
integration/metaldevrt 861354.5 ns 868125 ns 0.99
kernel/indexing 485750 ns 592667 ns 0.82
kernel/indexing_checked 487250 ns 598292 ns 0.81
kernel/launch 12625 ns 11791.5 ns 1.07
kernel/rand 536250 ns 570709 ns 0.94
latency/import 1692594000.5 ns 1425597062.5 ns 1.19
latency/precompile 30415191770.5 ns 25453724708 ns 1.19
latency/ttfp 2536515958.5 ns 2341177208 ns 1.08
metal/synchronization/context 19792 ns 19667 ns 1.01
metal/synchronization/stream 19292 ns 18459 ns 1.05

This comment was automatically generated by workflow using github-action-benchmark.

@christiangnrd christiangnrd force-pushed the kaintr branch 3 times, most recently from b0fd1b3 to 865af1a Compare February 19, 2026 23:30
skip scripts tests on 1.10

Project.toml

Better workaround
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant