[Do not merge] Test KernelIntrinsics#688
Open
christiangnrd wants to merge 4 commits into
Open
Conversation
Contributor
|
Your PR requires formatting changes to meet the project's style guidelines. Click here to view the suggested changes.diff --git a/src/MetalKernels.jl b/src/MetalKernels.jl
index 4e856194..7573c5e1 100644
--- a/src/MetalKernels.jl
+++ b/src/MetalKernels.jl
@@ -136,26 +136,26 @@ end
KI.argconvert(::MetalBackend, arg) = mtlconvert(arg)
-function KI.kernel_function(::MetalBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+function KI.kernel_function(::MetalBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
kern = mtlfunction(f, tt; name, kwargs...)
- KI.Kernel{MetalBackend, typeof(kern)}(MetalBackend(), kern)
+ return KI.Kernel{MetalBackend, typeof(kern)}(MetalBackend(), kern)
end
-function (obj::KI.Kernel{MetalBackend})(args...; numworkgroups=1, workgroupsize=1)
+function (obj::KI.Kernel{MetalBackend})(args...; numworkgroups = 1, workgroupsize = 1)
KI.check_launch_args(numworkgroups, workgroupsize)
- obj.kern(args...; threads=workgroupsize, groups=numworkgroups)
+ return obj.kern(args...; threads = workgroupsize, groups = numworkgroups)
end
-function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:MetalBackend}; max_work_items::Int=typemax(Int))::Int
- Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items))
+function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:MetalBackend}; max_work_items::Int = typemax(Int))::Int
+ return Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items))
end
function KI.max_work_group_size(::MetalBackend)::Int
- Int(device().maxThreadsPerThreadgroup.width)
+ return Int(device().maxThreadsPerThreadgroup.width)
end
function KI.multiprocessor_count(::MetalBackend)::Int
- Metal.num_gpu_cores()
+ return Metal.num_gpu_cores()
end
diff --git a/src/broadcast.jl b/src/broadcast.jl
index 5d107ec2..3455fad2 100644
--- a/src/broadcast.jl
+++ b/src/broadcast.jl
@@ -66,9 +66,9 @@ end
if _broadcast_shapes[Is] > BROADCAST_SPECIALIZATION_THRESHOLD
## COV_EXCL_START
function broadcast_cartesian_static(dest, bc, Is)
- i = KI.get_global_id().x
- stride = KI.get_global_size().x
- while 1 <= i <= length(dest)
+ i = KI.get_global_id().x
+ stride = KI.get_global_size().x
+ while 1 <= i <= length(dest)
I = @inbounds Is[i]
@inbounds dest[I] = bc[I]
i += stride
@@ -91,13 +91,13 @@ end
(isa(IndexStyle(dest), IndexLinear) && isa(IndexStyle(bc), IndexLinear))
## COV_EXCL_START
function broadcast_linear(dest, bc)
- i = KI.get_global_id().x
- stride = KI.get_global_size().x
- while 1 <= i <= length(dest)
- @inbounds dest[i] = bc[i]
- i += stride
- end
- return
+ i = KI.get_global_id().x
+ stride = KI.get_global_size().x
+ while 1 <= i <= length(dest)
+ @inbounds dest[i] = bc[i]
+ i += stride
+ end
+ return
end
## COV_EXCL_STOP
@@ -168,9 +168,9 @@ end
else
## COV_EXCL_START
function broadcast_cartesian(dest, bc)
- i = KI.get_global_id().x
- stride = KI.get_global_size().x
- while 1 <= i <= length(dest)
+ i = KI.get_global_id().x
+ stride = KI.get_global_size().x
+ while 1 <= i <= length(dest)
I = @inbounds CartesianIndices(dest)[i]
@inbounds dest[I] = bc[I]
i += stride
diff --git a/src/device/random.jl b/src/device/random.jl
index 12b053a2..edc999cd 100644
--- a/src/device/random.jl
+++ b/src/device/random.jl
@@ -89,8 +89,8 @@ end
@inbounds global_random_counters()[simdgroupId]
elseif field === :ctr2
globalId = KI.get_global_id().x +
- (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
- (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
+ (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
+ (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
globalId % UInt32
end::UInt32
end
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 7be5ef43..a737e8d0 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -224,7 +224,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
# we might not be able to launch all those threads to reduce each slice in one go.
# that's why each threads also loops across their inputs, processing multiple values
# so that we can span the entire reduction dimension using a single item group.
- kernel = KI.@kernel backend launch = false partial_mapreduce_device(f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
+ kernel = KI.@kernel backend launch = false partial_mapreduce_device(
+ f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
Val(UInt64(length(Rother))), Val(grain), Val(shuffle), R, A)
# how many threads do we want?
@@ -260,7 +261,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
# we can cover the dimensions to reduce using a single group
kernel(f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
Val(UInt64(length(Rother))), Val(grain), Val(shuffle), R, A;
- workgroupsize = threads, numworkgroups = groups)
+ workgroupsize = threads, numworkgroups = groups
+ )
else
# temporary empty array whose type will match the final partial array
partial = similar(R, ntuple(_ -> 0, Val(ndims(R)+1)))
@@ -287,7 +289,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
partial_kernel(f, op, init, Val(threads), Val(Rreduce),
Val(Rother), Val(UInt64(length(Rother))),
Val(grain), Val(shuffle), partial, A;
- numworkgroups = partial_groups, workgroupsize = partial_threads)
+ numworkgroups = partial_groups, workgroupsize = partial_threads
+ )
GPUArrays.mapreducedim!(identity, op, R, partial; init=init)
end
diff --git a/test/kernelabstractions.jl b/test/kernelabstractions.jl
index cda5b249..339fcbc8 100644
--- a/test/kernelabstractions.jl
+++ b/test/kernelabstractions.jl
@@ -7,6 +7,6 @@ Testsuite.testsuite(MetalBackend, "Metal", Metal, MtlArray, Metal.MtlDeviceArray
"Convert", # depends on https://github.com/JuliaGPU/Metal.jl/issues/69
"SpecialFunctions", # gamma and erfc not currently supported on Metal.jl
"sparse", # not supported yet
- "CPU synchronization",
- "fallback test: callable types",
+ "CPU synchronization",
+ "fallback test: callable types",
]))
diff --git a/test/runtests.jl b/test/runtests.jl
index 32b45c8c..14fcfb93 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,6 +1,6 @@
@static if VERSION < v"1.11" && get(ENV, "BUILDKITE_PIPELINE_NAME", "Metal.jl") == "Metal.jl"
using Pkg
- Pkg.add(url="https://github.com/JuliaGPU/KernelAbstractions.jl", rev="main")
+ Pkg.add(url = "https://github.com/JuliaGPU/KernelAbstractions.jl", rev = "main")
end
using Metal |
9ac3d49 to
6314372
Compare
Contributor
There was a problem hiding this comment.
Metal Benchmarks
Details
| Benchmark suite | Current: 86196fa | Previous: edeab68 | Ratio |
|---|---|---|---|
array/accumulate/Float32/1d |
848125 ns |
830625 ns |
1.02 |
array/accumulate/Float32/dims=1 |
1138917 ns |
947959 ns |
1.20 |
array/accumulate/Float32/dims=1L |
10409000 ns |
10226437.5 ns |
1.02 |
array/accumulate/Float32/dims=2 |
1468584 ns |
1276875 ns |
1.15 |
array/accumulate/Float32/dims=2L |
6251625 ns |
5028542 ns |
1.24 |
array/accumulate/Int64/1d |
1038604 ns |
983833 ns |
1.06 |
array/accumulate/Int64/dims=1 |
1311042 ns |
1143000 ns |
1.15 |
array/accumulate/Int64/dims=1L |
12204208 ns |
12122375 ns |
1.01 |
array/accumulate/Int64/dims=2 |
1621333 ns |
1481542 ns |
1.09 |
array/accumulate/Int64/dims=2L |
9902000 ns |
9577292 ns |
1.03 |
array/broadcast |
373792 ns |
384666 ns |
0.97 |
array/construct |
5708 ns |
5959 ns |
0.96 |
array/permutedims/2d |
575292 ns |
613853.5 ns |
0.94 |
array/permutedims/3d |
1075333 ns |
1146479 ns |
0.94 |
array/permutedims/4d |
2168000 ns |
2015041.5 ns |
1.08 |
array/private/copy |
434479.5 ns |
447500 ns |
0.97 |
array/private/copyto!/cpu_to_gpu |
381250 ns |
385645.5 ns |
0.99 |
array/private/copyto!/gpu_to_cpu |
377500 ns |
382833 ns |
0.99 |
array/private/copyto!/gpu_to_gpu |
350292 ns |
356625 ns |
0.98 |
array/private/iteration/findall/bool |
1158229 ns |
1118291 ns |
1.04 |
array/private/iteration/findall/int |
1317417 ns |
1287333 ns |
1.02 |
array/private/iteration/findfirst/bool |
1520709 ns |
1486146 ns |
1.02 |
array/private/iteration/findfirst/int |
1549062.5 ns |
1558375 ns |
0.99 |
array/private/iteration/findmin/1d |
1675541 ns |
1624750 ns |
1.03 |
array/private/iteration/findmin/2d |
1346312.5 ns |
1342292 ns |
1.00 |
array/private/iteration/logical |
1836250 ns |
1841208 ns |
1.00 |
array/private/iteration/scalar |
2825292 ns |
2928500 ns |
0.96 |
array/random/rand/Float32 |
680000 ns |
650937.5 ns |
1.04 |
array/random/rand/Int64 |
740292 ns |
740687.5 ns |
1.00 |
array/random/rand!/Float32 |
623229.5 ns |
596666 ns |
1.04 |
array/random/rand!/Int64 |
557833 ns |
524250 ns |
1.06 |
array/random/randn/Float32 |
611333 ns |
619500 ns |
0.99 |
array/random/randn!/Float32 |
548167 ns |
541937.5 ns |
1.01 |
array/reductions/mapreduce/Float32/1d |
508145.5 ns |
510416 ns |
1.00 |
array/reductions/mapreduce/Float32/dims=1 |
486583 ns |
526479.5 ns |
0.92 |
array/reductions/mapreduce/Float32/dims=1L |
747270.5 ns |
773167 ns |
0.97 |
array/reductions/mapreduce/Float32/dims=2 |
513708 ns |
539791 ns |
0.95 |
array/reductions/mapreduce/Float32/dims=2L |
1401167 ns |
1367333 ns |
1.02 |
array/reductions/mapreduce/Int64/1d |
938125 ns |
1013500 ns |
0.93 |
array/reductions/mapreduce/Int64/dims=1 |
785250 ns |
866375 ns |
0.91 |
array/reductions/mapreduce/Int64/dims=1L |
1347709 ns |
1427042 ns |
0.94 |
array/reductions/mapreduce/Int64/dims=2 |
1056541 ns |
1032708 ns |
1.02 |
array/reductions/mapreduce/Int64/dims=2L |
2309062.5 ns |
2234542 ns |
1.03 |
array/reductions/reduce/Float32/1d |
503041 ns |
507709 ns |
0.99 |
array/reductions/reduce/Float32/dims=1 |
513167 ns |
525625 ns |
0.98 |
array/reductions/reduce/Float32/dims=1L |
754541.5 ns |
779333.5 ns |
0.97 |
array/reductions/reduce/Float32/dims=2 |
564083 ns |
528041.5 ns |
1.07 |
array/reductions/reduce/Float32/dims=2L |
1397375 ns |
1364417 ns |
1.02 |
array/reductions/reduce/Int64/1d |
929125 ns |
1028271 ns |
0.90 |
array/reductions/reduce/Int64/dims=1 |
782833 ns |
826937.5 ns |
0.95 |
array/reductions/reduce/Int64/dims=1L |
1600375 ns |
1626333 ns |
0.98 |
array/reductions/reduce/Int64/dims=2 |
1009208 ns |
1021708.5 ns |
0.99 |
array/reductions/reduce/Int64/dims=2L |
2307750 ns |
2237687 ns |
1.03 |
array/shared/copy |
239584 ns |
249729 ns |
0.96 |
array/shared/copyto!/cpu_to_gpu |
39917 ns |
40500 ns |
0.99 |
array/shared/copyto!/gpu_to_cpu |
41125 ns |
40833 ns |
1.01 |
array/shared/copyto!/gpu_to_gpu |
41333 ns |
41458 ns |
1.00 |
array/shared/iteration/findall/bool |
1167770.5 ns |
1125417 ns |
1.04 |
array/shared/iteration/findall/int |
1319375 ns |
1275917 ns |
1.03 |
array/shared/iteration/findfirst/bool |
1237770.5 ns |
1192833 ns |
1.04 |
array/shared/iteration/findfirst/int |
1275000 ns |
1274645.5 ns |
1.00 |
array/shared/iteration/findmin/1d |
1394833 ns |
1350458 ns |
1.03 |
array/shared/iteration/findmin/2d |
1345458 ns |
1338208 ns |
1.01 |
array/shared/iteration/logical |
1686375 ns |
1732500 ns |
0.97 |
array/shared/iteration/scalar |
5833.333333333333 ns |
5708.333333333333 ns |
1.02 |
integration/byval/reference |
1167375 ns |
1167708 ns |
1.00 |
integration/byval/slices=1 |
1169875 ns |
1171875 ns |
1.00 |
integration/byval/slices=2 |
2093541 ns |
2097125 ns |
1.00 |
integration/byval/slices=3 |
7897959 ns |
12623667 ns |
0.63 |
integration/metaldevrt |
498167 ns |
480458.5 ns |
1.04 |
kernel/indexing |
375875 ns |
386333 ns |
0.97 |
kernel/indexing_checked |
551792 ns |
566062.5 ns |
0.97 |
kernel/launch |
13459 ns |
13417 ns |
1.00 |
kernel/rand |
565416.5 ns |
575500 ns |
0.98 |
latency/import |
1699500188 ns |
1501035416.5 ns |
1.13 |
latency/precompile |
36687602000.5 ns |
31452786146 ns |
1.17 |
latency/ttfp |
2021924604.5 ns |
1813457458.5 ns |
1.11 |
metal/synchronization/context |
677.60625 ns |
672.2242424242424 ns |
1.01 |
metal/synchronization/stream |
444.23737373737373 ns |
440.2323232323232 ns |
1.01 |
This comment was automatically generated by workflow using github-action-benchmark.
22e754e to
68db9c2
Compare
2b8dce1 to
0e76668
Compare
db9a7dc to
c802ccc
Compare
4b8f026 to
ce67b4c
Compare
ce67b4c to
03bb0dd
Compare
415079d to
90e4fb2
Compare
90e4fb2 to
ee7543a
Compare
b0fd1b3 to
865af1a
Compare
c2c8426 to
e52f747
Compare
9f95616 to
fb61557
Compare
45dfdfb to
86196fa
Compare
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.This suggestion is invalid because no changes were made to the code.Suggestions cannot be applied while the pull request is closed.Suggestions cannot be applied while viewing a subset of changes.Only one suggestion per line can be applied in a batch.Add this suggestion to a batch that can be applied as a single commit.Applying suggestions on deleted lines is not supported.You must change the existing code in this line in order to create a valid suggestion.Outdated suggestions cannot be applied.This suggestion has been applied or marked resolved.Suggestions cannot be applied from pending reviews.Suggestions cannot be applied on multi-line comments.Suggestions cannot be applied while the pull request is queued to merge.Suggestion cannot be applied right now. Please check back later.
Not a draft to also run benchmarks