diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl index 239ebe0bd..a51d565f3 100644 --- a/src/compiler/execution.jl +++ b/src/compiler/execution.jl @@ -179,11 +179,12 @@ The output of this function is automatically cached, i.e. you can simply call `m in a hot path without degrading performance. New code will be generated automatically when the function changes, or when different types or keyword arguments are provided. """ -function mtlfunction(f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} +function mtlfunction(@nospecialize(f), @nospecialize(tt)=Tuple{}; name=nothing, kwargs...) dev = device() Base.@lock mtlfunction_lock begin # compile the function cache = compiler_cache(dev) + F = Core.Typeof(f) source = methodinstance(F, tt) config = compiler_config(dev; name, kwargs...)::MetalCompilerConfig pipeline = GPUCompiler.cached_compilation(cache, source, config, compile, link) @@ -197,7 +198,7 @@ function mtlfunction(f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} kernel = HostKernel{F,tt}(f, pipeline) _kernel_instances[h] = kernel end - return kernel::HostKernel{F,tt} + return kernel end end @@ -246,7 +247,7 @@ const _kernel_instances = Dict{UInt, Any}() ex end -@inline function encode_argument!(kernel, arg) +@inline function encode_argument!(@nospecialize(kernel), arg) argtyp = typeof(arg) # replace non-isbits arguments (they should be unused, or compilation @@ -263,8 +264,20 @@ end return argument_buffer end +# Thin outer callable — specializes on HostKernel{F,TT} and args, but the body +# is trivial (one function call) so it's fast to compile for new kernel types. @autoreleasepool function (kernel::HostKernel)(args...; groups=1, threads=1, queue=global_queue(device())) + _metal_launch(kernel, args, groups, threads, queue) +end + +# Heavy body — @nospecialize, compiled once during precompilation and reused for +# all kernel types. All generic MTL operations (command buffer, validation, etc.) live here. 
+function _metal_launch(@nospecialize(kernel::HostKernel), @nospecialize(args::Tuple), + groups, threads, queue) + pipeline = kernel.pipeline::MTLComputePipelineState + f = kernel.f + gs = MTLSize(groups) ts = MTLSize(threads) (gs.width>0 && gs.height>0 && gs.depth>0) || @@ -284,42 +297,60 @@ end kernel_state = KernelState(Random.rand(UInt32)) - cmdbuf = MTLCommandBuffer(queue) - cmdbuf.label = "MTLCommandBuffer($(nameof(kernel.f)))" - cce = MTLComputeCommandEncoder(cmdbuf) - argument_buffers = try - MTL.set_function!(cce, kernel.pipeline) - bufs = encode_arguments!(cce, kernel, kernel_state, kernel.f, args...) - MTL.append_current_function!(cce, gs, ts) - bufs - finally - close(cce) - end + @autoreleasepool begin + cmdbuf = MTLCommandBuffer(queue) + cmdbuf.label = "MTLCommandBuffer($(nameof(f)))" + cce = MTLComputeCommandEncoder(cmdbuf) + argument_buffers = try + MTL.set_function!(cce, pipeline) + bufs = _metal_encode(cce, kernel, kernel_state, f, args) + MTL.append_current_function!(cce, gs, ts) + bufs + finally + close(cce) + end - # the command buffer retains resources that are explicitly encoded (i.e. direct buffer - # arguments, or the buffers allocated for each other argument), but that doesn't keep - # other resources alive for which we've encoded the GPU address ourselves. since it's - # possible for buffers to go out of scope while the kernel is still running, which - # triggers validation failures, keep track of things we need to keep alive until the - # kernel has actually completed. - # - # TODO: is there a way to bind additional resources to the command buffer? - roots = [kernel.f, args] - MTL.on_completed(cmdbuf) do buf - empty!(roots) - foreach(free, argument_buffers) - - # Check for errors - # XXX: we cannot do this nicely, e.g. throwing an `error` or reporting with `@error` - # because we're not allowed to switch tasks from this contexts. 
- if buf.status == MTL.MTLCommandBufferStatusError - Core.println("ERROR: Failed to submit command buffer: $(buf.error.localizedDescription)") + # during precompilation, skip GPU submission (which hangs) but keep the + # encoding path above to cache compilation of the argument encoding pipeline + if ccall(:jl_generating_output, Cint, ()) != 0 + foreach(free, argument_buffers) + return end - end - commit!(cmdbuf) + # the command buffer retains resources that are explicitly encoded (i.e. direct buffer + # arguments, or the buffers allocated for each other argument), but that doesn't keep + # other resources alive for which we've encoded the GPU address ourselves. since it's + # possible for buffers to go out of scope while the kernel is still running, which + # triggers validation failures, keep track of things we need to keep alive until the + # kernel has actually completed. + # + # TODO: is there a way to bind additional resources to the command buffer? + # + # collect to Vector so the callback closure type doesn't + # depend on the specific kernel's argument buffer tuple type + argument_buffer_vec = collect(argument_buffers) + roots = Any[f, args] + MTL.on_completed(cmdbuf) do buf + empty!(roots) + foreach(free, argument_buffer_vec) + + # Check for errors + # XXX: we cannot do this nicely, e.g. throwing an `error` or reporting with `@error` + # because we're not allowed to switch tasks from this context. + if buf.status == MTL.MTLCommandBufferStatusError + Core.println("ERROR: Failed to submit command buffer: $(buf.error.localizedDescription)") + end + end + + commit!(cmdbuf) + end end +# Bridge: specializes on f and args types (encode_arguments! needs concrete types +# for ghost-type detection), but NOT on the full HostKernel type. +@inline _metal_encode(cce, @nospecialize(kernel), kernel_state, f, args::Tuple) = + encode_arguments!(cce, kernel, kernel_state, f, args...) 
+ ## Intra-warp Helpers """ diff --git a/src/precompile.jl b/src/precompile.jl index 296c3a4a1..49c9ef2ab 100644 --- a/src/precompile.jl +++ b/src/precompile.jl @@ -3,18 +3,43 @@ using PrecompileTools: @setup_workload, @compile_workload @setup_workload begin metallib_file = joinpath(dirname(@__DIR__), "test", "dummy.metallib") - # parsing and writing metal libraries - metallib = parse(MetalLib, metallib_file) - sprint(write, metallib) -end + @compile_workload begin + # parsing and writing metal libraries + metallib = parse(MetalLib, metallib_file) + sprint(write, metallib) + + # launch a trivial kernel to precompile the full pipeline: + # mtlfunction → GPUCompiler → LLVM IR → AIR → metallib → link → launch + # (GPU submission is skipped during precompilation, but the entire + # encoding path — encode_arguments!, command buffer setup, etc. — runs) + kernel() = return + @metal kernel() -precompile(compile, (CompilerJob,)) -precompile(Tuple{typeof(GPUCompiler.finish_ir!), GPUCompiler.CompilerJob{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, LLVM.Module, LLVM.Function}) -precompile(Tuple{typeof(GPUCompiler.finish_module!), GPUCompiler.CompilerJob{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, LLVM.Module, LLVM.Function}) -precompile(Tuple{typeof(GPUCompiler.check_ir), GPUCompiler.CompilerJob{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, LLVM.Module}) -precompile(Tuple{typeof(GPUCompiler.actual_compilation), Base.Dict{Any, Any}, Core.MethodInstance, UInt64, GPUCompiler.CompilerConfig{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, typeof(Metal.compile), typeof(Metal.link)}) + # launch a realistic kernel with array arguments + a = MtlArray(Float32[1]) + b = MtlArray(Float32[1]) + c = MtlArray(Float32[0]) + function precompile_vadd(a, b, c) + i = thread_position_in_grid().x + c[i] = a[i] + b[i] + return + end + @metal precompile_vadd(a, b, c) -# Worth the hassle -if isdefined(Base, :Compiler) && 
isdefined(Base.Compiler, :typeinf_local) - precompile(Tuple{typeof(Base.Compiler.typeinf_local), GPUCompiler.GPUInterpreter{Base.Compiler.CachedMethodTable{Base.Compiler.OverlayMethodTable}}, Base.Compiler.InferenceState, Base.Compiler.CurrentState}) + # also exercise 2D arrays (common in real workloads) + a2 = MtlArray(Float32[1 1]) + b2 = MtlArray(Float32[1 1]) + c2 = MtlArray(Float32[0 0]) + @metal precompile_vadd(a2, b2, c2) + + # precompile MtlArray → Array copy-back + Array(c) + Array(c2) + end end + +# GPUCompiler macro-expansion utilities (compiled at every @metal callsite). +# split_kwargs is called with 3 Vector{Symbol} groups (MACRO/COMPILER/LAUNCH_KWARGS). +precompile(Tuple{typeof(GPUCompiler.split_kwargs), Tuple{Expr}, Vector{Symbol}, Vector{Symbol}, Vector{Symbol}}) +precompile(Tuple{typeof(GPUCompiler.split_kwargs), Tuple{}, Vector{Symbol}, Vector{Symbol}, Vector{Symbol}}) +precompile(Tuple{typeof(GPUCompiler.assign_args!), Expr, Vector{Any}})