Skip to content

[Do not merge] Test KernelIntrinsics#688

Open
christiangnrd wants to merge 4 commits into
mainfrom
kaintr
Open

[Do not merge] Test KernelIntrinsics#688
christiangnrd wants to merge 4 commits into
mainfrom
kaintr

Conversation

@christiangnrd

Copy link
Copy Markdown
Member

Not a draft to also run benchmarks

@github-actions

github-actions Bot commented Oct 22, 2025

Copy link
Copy Markdown
Contributor

Your PR requires formatting changes to meet the project's style guidelines.
Please consider running Runic (git runic main) to apply these changes.

Click here to view the suggested changes.
diff --git a/src/MetalKernels.jl b/src/MetalKernels.jl
index 4e856194..7573c5e1 100644
--- a/src/MetalKernels.jl
+++ b/src/MetalKernels.jl
@@ -136,26 +136,26 @@ end
 
 KI.argconvert(::MetalBackend, arg) = mtlconvert(arg)
 
-function KI.kernel_function(::MetalBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+function KI.kernel_function(::MetalBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
     kern = mtlfunction(f, tt; name, kwargs...)
-    KI.Kernel{MetalBackend, typeof(kern)}(MetalBackend(), kern)
+    return KI.Kernel{MetalBackend, typeof(kern)}(MetalBackend(), kern)
 end
 
-function (obj::KI.Kernel{MetalBackend})(args...; numworkgroups=1, workgroupsize=1)
+function (obj::KI.Kernel{MetalBackend})(args...; numworkgroups = 1, workgroupsize = 1)
     KI.check_launch_args(numworkgroups, workgroupsize)
 
-    obj.kern(args...; threads=workgroupsize, groups=numworkgroups)
+    return obj.kern(args...; threads = workgroupsize, groups = numworkgroups)
 end
 
 
-function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:MetalBackend}; max_work_items::Int=typemax(Int))::Int
-    Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items))
+function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:MetalBackend}; max_work_items::Int = typemax(Int))::Int
+    return Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items))
 end
 function KI.max_work_group_size(::MetalBackend)::Int
-    Int(device().maxThreadsPerThreadgroup.width)
+    return Int(device().maxThreadsPerThreadgroup.width)
 end
 function KI.multiprocessor_count(::MetalBackend)::Int
-    Metal.num_gpu_cores()
+    return Metal.num_gpu_cores()
 end
 
 
diff --git a/src/broadcast.jl b/src/broadcast.jl
index 5d107ec2..3455fad2 100644
--- a/src/broadcast.jl
+++ b/src/broadcast.jl
@@ -66,9 +66,9 @@ end
     if _broadcast_shapes[Is] > BROADCAST_SPECIALIZATION_THRESHOLD
         ## COV_EXCL_START
         function broadcast_cartesian_static(dest, bc, Is)
-             i = KI.get_global_id().x
-             stride = KI.get_global_size().x
-             while 1 <= i <= length(dest)
+            i = KI.get_global_id().x
+            stride = KI.get_global_size().x
+            while 1 <= i <= length(dest)
                 I = @inbounds Is[i]
                 @inbounds dest[I] = bc[I]
                 i += stride
@@ -91,13 +91,13 @@ end
        (isa(IndexStyle(dest), IndexLinear) && isa(IndexStyle(bc), IndexLinear))
         ## COV_EXCL_START
         function broadcast_linear(dest, bc)
-             i = KI.get_global_id().x
-             stride = KI.get_global_size().x
-             while 1 <= i <= length(dest)
-                 @inbounds dest[i] = bc[i]
-                 i += stride
-             end
-             return
+            i = KI.get_global_id().x
+            stride = KI.get_global_size().x
+            while 1 <= i <= length(dest)
+                @inbounds dest[i] = bc[i]
+                i += stride
+            end
+            return
         end
         ## COV_EXCL_STOP
 
@@ -168,9 +168,9 @@ end
     else
         ## COV_EXCL_START
         function broadcast_cartesian(dest, bc)
-             i = KI.get_global_id().x
-             stride = KI.get_global_size().x
-             while 1 <= i <= length(dest)
+            i = KI.get_global_id().x
+            stride = KI.get_global_size().x
+            while 1 <= i <= length(dest)
                 I = @inbounds CartesianIndices(dest)[i]
                 @inbounds dest[I] = bc[I]
                 i += stride
diff --git a/src/device/random.jl b/src/device/random.jl
index 12b053a2..edc999cd 100644
--- a/src/device/random.jl
+++ b/src/device/random.jl
@@ -89,8 +89,8 @@ end
         @inbounds global_random_counters()[simdgroupId]
     elseif field === :ctr2
         globalId = KI.get_global_id().x +
-                   (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
-                   (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
+            (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
+            (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
         globalId % UInt32
     end::UInt32
 end
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 7be5ef43..a737e8d0 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -224,7 +224,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
     # we might not be able to launch all those threads to reduce each slice in one go.
     # that's why each threads also loops across their inputs, processing multiple values
     # so that we can span the entire reduction dimension using a single item group.
-    kernel = KI.@kernel backend launch = false partial_mapreduce_device(f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
+    kernel = KI.@kernel backend launch = false partial_mapreduce_device(
+        f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
                                                           Val(UInt64(length(Rother))), Val(grain), Val(shuffle), R, A)
 
     # how many threads do we want?
@@ -260,7 +261,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
         # we can cover the dimensions to reduce using a single group
         kernel(f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
                Val(UInt64(length(Rother))), Val(grain), Val(shuffle), R, A;
-               workgroupsize = threads, numworkgroups = groups)
+            workgroupsize = threads, numworkgroups = groups
+        )
     else
         # temporary empty array whose type will match the final partial array
 	    partial = similar(R, ntuple(_ -> 0, Val(ndims(R)+1)))
@@ -287,7 +289,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
         partial_kernel(f, op, init, Val(threads), Val(Rreduce),
                         Val(Rother), Val(UInt64(length(Rother))),
                         Val(grain), Val(shuffle), partial, A;
-                        numworkgroups = partial_groups, workgroupsize = partial_threads)
+            numworkgroups = partial_groups, workgroupsize = partial_threads
+        )
 
         GPUArrays.mapreducedim!(identity, op, R, partial; init=init)
     end
diff --git a/test/kernelabstractions.jl b/test/kernelabstractions.jl
index cda5b249..339fcbc8 100644
--- a/test/kernelabstractions.jl
+++ b/test/kernelabstractions.jl
@@ -7,6 +7,6 @@ Testsuite.testsuite(MetalBackend, "Metal", Metal, MtlArray, Metal.MtlDeviceArray
     "Convert",           # depends on https://github.com/JuliaGPU/Metal.jl/issues/69
     "SpecialFunctions",  # gamma and erfc not currently supported on Metal.jl
     "sparse",            # not supported yet
-    "CPU synchronization",
-    "fallback test: callable types",
+            "CPU synchronization",
+            "fallback test: callable types",
 ]))
diff --git a/test/runtests.jl b/test/runtests.jl
index 32b45c8c..14fcfb93 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,6 +1,6 @@
 @static if VERSION < v"1.11" && get(ENV, "BUILDKITE_PIPELINE_NAME", "Metal.jl") == "Metal.jl"
     using Pkg
-    Pkg.add(url="https://github.com/JuliaGPU/KernelAbstractions.jl", rev="main")
+    Pkg.add(url = "https://github.com/JuliaGPU/KernelAbstractions.jl", rev = "main")
 end
 
 using Metal

@christiangnrd christiangnrd force-pushed the kaintr branch 3 times, most recently from 9ac3d49 to 6314372 Compare October 22, 2025 04:31

@github-actions github-actions Bot left a comment

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Metal Benchmarks

Details
Benchmark suite Current: 86196fa Previous: edeab68 Ratio
array/accumulate/Float32/1d 848125 ns 830625 ns 1.02
array/accumulate/Float32/dims=1 1138917 ns 947959 ns 1.20
array/accumulate/Float32/dims=1L 10409000 ns 10226437.5 ns 1.02
array/accumulate/Float32/dims=2 1468584 ns 1276875 ns 1.15
array/accumulate/Float32/dims=2L 6251625 ns 5028542 ns 1.24
array/accumulate/Int64/1d 1038604 ns 983833 ns 1.06
array/accumulate/Int64/dims=1 1311042 ns 1143000 ns 1.15
array/accumulate/Int64/dims=1L 12204208 ns 12122375 ns 1.01
array/accumulate/Int64/dims=2 1621333 ns 1481542 ns 1.09
array/accumulate/Int64/dims=2L 9902000 ns 9577292 ns 1.03
array/broadcast 373792 ns 384666 ns 0.97
array/construct 5708 ns 5959 ns 0.96
array/permutedims/2d 575292 ns 613853.5 ns 0.94
array/permutedims/3d 1075333 ns 1146479 ns 0.94
array/permutedims/4d 2168000 ns 2015041.5 ns 1.08
array/private/copy 434479.5 ns 447500 ns 0.97
array/private/copyto!/cpu_to_gpu 381250 ns 385645.5 ns 0.99
array/private/copyto!/gpu_to_cpu 377500 ns 382833 ns 0.99
array/private/copyto!/gpu_to_gpu 350292 ns 356625 ns 0.98
array/private/iteration/findall/bool 1158229 ns 1118291 ns 1.04
array/private/iteration/findall/int 1317417 ns 1287333 ns 1.02
array/private/iteration/findfirst/bool 1520709 ns 1486146 ns 1.02
array/private/iteration/findfirst/int 1549062.5 ns 1558375 ns 0.99
array/private/iteration/findmin/1d 1675541 ns 1624750 ns 1.03
array/private/iteration/findmin/2d 1346312.5 ns 1342292 ns 1.00
array/private/iteration/logical 1836250 ns 1841208 ns 1.00
array/private/iteration/scalar 2825292 ns 2928500 ns 0.96
array/random/rand/Float32 680000 ns 650937.5 ns 1.04
array/random/rand/Int64 740292 ns 740687.5 ns 1.00
array/random/rand!/Float32 623229.5 ns 596666 ns 1.04
array/random/rand!/Int64 557833 ns 524250 ns 1.06
array/random/randn/Float32 611333 ns 619500 ns 0.99
array/random/randn!/Float32 548167 ns 541937.5 ns 1.01
array/reductions/mapreduce/Float32/1d 508145.5 ns 510416 ns 1.00
array/reductions/mapreduce/Float32/dims=1 486583 ns 526479.5 ns 0.92
array/reductions/mapreduce/Float32/dims=1L 747270.5 ns 773167 ns 0.97
array/reductions/mapreduce/Float32/dims=2 513708 ns 539791 ns 0.95
array/reductions/mapreduce/Float32/dims=2L 1401167 ns 1367333 ns 1.02
array/reductions/mapreduce/Int64/1d 938125 ns 1013500 ns 0.93
array/reductions/mapreduce/Int64/dims=1 785250 ns 866375 ns 0.91
array/reductions/mapreduce/Int64/dims=1L 1347709 ns 1427042 ns 0.94
array/reductions/mapreduce/Int64/dims=2 1056541 ns 1032708 ns 1.02
array/reductions/mapreduce/Int64/dims=2L 2309062.5 ns 2234542 ns 1.03
array/reductions/reduce/Float32/1d 503041 ns 507709 ns 0.99
array/reductions/reduce/Float32/dims=1 513167 ns 525625 ns 0.98
array/reductions/reduce/Float32/dims=1L 754541.5 ns 779333.5 ns 0.97
array/reductions/reduce/Float32/dims=2 564083 ns 528041.5 ns 1.07
array/reductions/reduce/Float32/dims=2L 1397375 ns 1364417 ns 1.02
array/reductions/reduce/Int64/1d 929125 ns 1028271 ns 0.90
array/reductions/reduce/Int64/dims=1 782833 ns 826937.5 ns 0.95
array/reductions/reduce/Int64/dims=1L 1600375 ns 1626333 ns 0.98
array/reductions/reduce/Int64/dims=2 1009208 ns 1021708.5 ns 0.99
array/reductions/reduce/Int64/dims=2L 2307750 ns 2237687 ns 1.03
array/shared/copy 239584 ns 249729 ns 0.96
array/shared/copyto!/cpu_to_gpu 39917 ns 40500 ns 0.99
array/shared/copyto!/gpu_to_cpu 41125 ns 40833 ns 1.01
array/shared/copyto!/gpu_to_gpu 41333 ns 41458 ns 1.00
array/shared/iteration/findall/bool 1167770.5 ns 1125417 ns 1.04
array/shared/iteration/findall/int 1319375 ns 1275917 ns 1.03
array/shared/iteration/findfirst/bool 1237770.5 ns 1192833 ns 1.04
array/shared/iteration/findfirst/int 1275000 ns 1274645.5 ns 1.00
array/shared/iteration/findmin/1d 1394833 ns 1350458 ns 1.03
array/shared/iteration/findmin/2d 1345458 ns 1338208 ns 1.01
array/shared/iteration/logical 1686375 ns 1732500 ns 0.97
array/shared/iteration/scalar 5833.333333333333 ns 5708.333333333333 ns 1.02
integration/byval/reference 1167375 ns 1167708 ns 1.00
integration/byval/slices=1 1169875 ns 1171875 ns 1.00
integration/byval/slices=2 2093541 ns 2097125 ns 1.00
integration/byval/slices=3 7897959 ns 12623667 ns 0.63
integration/metaldevrt 498167 ns 480458.5 ns 1.04
kernel/indexing 375875 ns 386333 ns 0.97
kernel/indexing_checked 551792 ns 566062.5 ns 0.97
kernel/launch 13459 ns 13417 ns 1.00
kernel/rand 565416.5 ns 575500 ns 0.98
latency/import 1699500188 ns 1501035416.5 ns 1.13
latency/precompile 36687602000.5 ns 31452786146 ns 1.17
latency/ttfp 2021924604.5 ns 1813457458.5 ns 1.11
metal/synchronization/context 677.60625 ns 672.2242424242424 ns 1.01
metal/synchronization/stream 444.23737373737373 ns 440.2323232323232 ns 1.01

This comment was automatically generated by workflow using github-action-benchmark.

@christiangnrd christiangnrd force-pushed the kaintr branch 3 times, most recently from b0fd1b3 to 865af1a Compare February 19, 2026 23:30
@christiangnrd christiangnrd force-pushed the kaintr branch 3 times, most recently from c2c8426 to e52f747 Compare June 4, 2026 18:07
@christiangnrd christiangnrd force-pushed the kaintr branch 6 times, most recently from 9f95616 to fb61557 Compare June 12, 2026 15:19
@christiangnrd christiangnrd force-pushed the kaintr branch 2 times, most recently from 45dfdfb to 86196fa Compare June 16, 2026 22:48
skip scripts tests on 1.10

Project.toml

Better workaround
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant