diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 57dbac9fec..28ca591a8c 100755 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -48,6 +48,15 @@ steps: agents: slurm_gpus: 1 + - label: "Unit: compiler stress regression" + key: unit_compiler_stress_regression + command: + - "julia --color=yes --project=.buildkite test/gpu/compiler_stress_regression.jl" + env: + CLIMACOMMS_DEVICE: "CUDA" + agents: + slurm_gpus: 1 + - group: "Unit: RecursiveApply" steps: diff --git a/ext/cuda/cuda_utils.jl b/ext/cuda/cuda_utils.jl index b9a31ec711..fb3993d43d 100644 --- a/ext/cuda/cuda_utils.jl +++ b/ext/cuda/cuda_utils.jl @@ -5,10 +5,21 @@ import ClimaCore.DataLayouts: empty_kernel_stats const reported_stats = Dict() const kernel_names = IdDict() + # Call via ClimaCore.DataLayouts.empty_kernel_stats() empty_kernel_stats(::ClimaComms.CUDADevice) = empty!(reported_stats) collect_kernel_stats() = false +function _memory_bytes(memory, key::Symbol) + if hasproperty(memory, key) + return Int(getproperty(memory, key)) + elseif memory isa NamedTuple && haskey(memory, key) + return Int(memory[key]) + else + return 0 + end +end + # Robustly parse boolean-like environment variables function _getenv_bool(var::AbstractString; default::Bool = false) raw = get(ENV, var, nothing) @@ -175,27 +186,38 @@ function auto_launch!( # CUDA.registers(kernel) > 50 || return nothing # for debugging # occursin("single_field_solve_kernel", string(nameof(F!))) || return nothing if !haskey(reported_stats, key) - @assert !isnothing(nitems) kernel = CUDA.@cuda always_inline = true launch = false f!(args...) config = CUDA.launch_configuration(kernel.fun) - threads = min(nitems, config.threads) - blocks = cld(nitems, threads) + threads = isnothing(nitems) ? nothing : min(nitems, config.threads) + blocks = isnothing(nitems) ? nothing : cld(nitems, threads) # For now, let's just collect info, later we can benchmark #! format: off s = "" s *= "Launching kernel $f! with following config:\n" - s *= " nitems: $(nitems)\n" + nitems_str = isnothing(nitems) ? "unknown" : string(nitems) + s *= " nitems: $(nitems_str)\n" isnothing(threads_s) || (s *= " threads_s: $(threads_s)\n") isnothing(blocks_s) || (s *= " blocks_s: $(blocks_s)\n") - s *= " threads: $(threads)\n" - s *= " blocks: $(blocks)\n" - isnothing(threads_s) || (s *= " Δthreads: $(threads - prod(threads_s))\n") - isnothing(blocks_s) || (s *= " Δblocks: $(blocks - prod(blocks_s))\n") + isnothing(threads) || (s *= " threads: $(threads)\n") + isnothing(blocks) || (s *= " blocks: $(blocks)\n") + (isnothing(threads_s) || isnothing(threads)) || (s *= " Δthreads: $(threads - prod(threads_s))\n") + (isnothing(blocks_s) || isnothing(blocks)) || (s *= " Δblocks: $(blocks - prod(blocks_s))\n") s *= " maxthreads: $(CUDA.maxthreads(kernel))\n" s *= " registers: $(CUDA.registers(kernel))\n" isnothing(threads_s) || ( s *= " threads_s_frac: $(prod(threads_s)/CUDA.maxthreads(kernel))\n") - s *= " memory: $(CUDA.memory(kernel))\n" + memory = CUDA.memory(kernel) + local_bytes = _memory_bytes(memory, :local) + shared_bytes = _memory_bytes(memory, :shared) + const_bytes = _memory_bytes(memory, :constant) + s *= " memory: $(memory)\n" + profile_line = + "CUDA_PROFILE: kernel=$(something(kernel_name, nameof(F!))) " * + "registers=$(CUDA.registers(kernel)) " * + "local=$(local_bytes) shared=$(shared_bytes) constant=$(const_bytes) " * + "maxthreads=$(CUDA.maxthreads(kernel))" + s *= " $(profile_line)\n" @info s + println(profile_line) #! format: on reported_stats[key] = true # error("Oops") # for debugging diff --git a/perf/stress_test_compiler.jl b/perf/stress_test_compiler.jl new file mode 100644 index 0000000000..261c32d488 --- /dev/null +++ b/perf/stress_test_compiler.jl @@ -0,0 +1,2556 @@ +""" +Stress test for the ClimaCore compiler on pointwise/broadcast operations. + +This script tests when inlining fails, CUDA performance degrades, or the +compiler fails altogether. The test suite covers: + +1. **Arithmetic operations** - varying nesting depth and argument counts + When do we fail to inline deeply chained operations? + +2. **Projection operations** - axis tensor coordinate transformations + How does projection complexity affect compilation? + +3. **Multiple arguments** - varying numbers of nonlocal field arguments + When does broadcasting with many arguments become problematic? + +4. **Function composition** - log, sqrt, and other transcendental functions + How do special functions interact with inlining? + +5. **Divergence operations** - differential operator on vector fields + How does the divergence operator compilation scale with mesh complexity? + +6. **Curl operations** - differential operator on vector fields + How does the curl operator compilation scale with mesh complexity? + +7. **Nested call operations** - helper functions that call other helper functions + How does compilation behave as the call chain depth increases? + +8. **Function-argument subexpressions** - inline `max`/`ifelse` argument logic + Does compilation remain robust when function-call arguments contain + nontrivial subexpressions instead of precomputed intermediates? + +9. **ClimaAtmos-like column broadcasts** - fused closure-wrapped microphysics-style + broadcasts with inline subexpressions, interpolation, and upwinding + How close do typical ClimaAtmos column expressions get to compiler failure? + +Each test is run in a subprocess to avoid compilation state leakage. +The Julia project is automatically instantiated before running tests. +Uses the `.buildkite` project environment for reproducibility. + +Benchmarking note: each test constructs spaces/fields and fills inputs before the +timed region. The `@benchmark` expression only times the broadcast/pointwise +operation under test. On CUDA, the profiled primary kernel may still be +`knl_fill!` for some test expressions; this reflects the kernel generated for the +operation itself, not benchmark inclusion of setup work. + +USAGE EXAMPLES: + # Run all tests on CPU (default) + julia --project=.buildkite perf/stress_test_compiler.jl + + # Run tests matching a filter on CPU + julia --project=.buildkite perf/stress_test_compiler.jl arithmetic + + # Run on CUDA with GPU reservation + CLIMACOMMS_DEVICE=CUDA srun --mpi=none --gpus=1 julia --project=.buildkite perf/stress_test_compiler.jl + + # Run a single specific test + julia --project=.buildkite perf/stress_test_compiler.jl arithmetic_depth_5 + + # Write reports to JSON and Markdown (both option forms are supported) + julia --project=.buildkite perf/stress_test_compiler.jl \ + --output-json perf/results/stress_latest.json \ + --output-markdown=perf/results/stress_latest.md + + # Filter + report outputs + julia --project=.buildkite perf/stress_test_compiler.jl curl \ + --output-json=perf/results/curl_latest.json \ + --output-markdown=perf/results/curl_latest.md + + # Run with sbatch + mkdir -p perf/logs && sbatch --job-name=cc-stress-suite \ + --output=perf/logs/stress_suite_%j.log --gpus=1 \ + --wrap='CLIMACOMMS_DEVICE=CUDA julia --project=.buildkite perf/stress_test_compiler.jl' + +NOTE: The script automatically detects and uses the `.buildkite` project directory, +so it should be run from the ClimaCore.jl root or from the perf/ directory. + +REPORT OUTPUT OPTIONS: + --output-json PATH + Write the full structured report (run metadata, summary, per-test records) + to PATH as JSON. + + --output-markdown PATH + Write a human-readable Markdown report table to PATH. + + Option syntax: + Both `--flag value` and `--flag=value` are accepted. + Relative paths are resolved from the repository root. +""" + +using Pkg +using BenchmarkTools +using Printf +using Dates +using Logging +using Statistics +using Sockets + +# Set up logging to suppress info messages during testing +disable_logging(Logging.Info) + +# Determine project root and buildkite directory. +# Use @__DIR__ so this works both when run as a script and when included from notebooks. +const PROJECT_ROOT = dirname(@__DIR__) +const PROJECT_DIR = begin + buildkite_dir = joinpath(PROJECT_ROOT, ".buildkite") + isdir(buildkite_dir) ? buildkite_dir : PROJECT_ROOT +end + +# Default to CUDA; can still be overridden explicitly by environment +const DEVICE = get(ENV, "CLIMACOMMS_DEVICE", "CUDA") + +const VALID_ANALYSIS_MODES = ("timing", "compile") + +is_compile_analysis_mode(mode::AbstractString) = mode == "compile" + +""" + initialize_project() + +Initialize the Julia project before running tests. +""" +function initialize_project() + # Change to project directory + cd(PROJECT_DIR) + + # Instantiate project if needed + if !isfile(joinpath(PROJECT_DIR, "Manifest.toml")) + @info "Instantiating project at $PROJECT_DIR" + Pkg.instantiate() + else + @info "Project manifest found, skipping instantiation" + end +end + +""" + has_cuda_env() + +Check if CUDA device is requested via environment variable. +""" +has_cuda_env() = DEVICE == "CUDA" + +""" + run_test_subprocess(test_code::String, test_name::String) -> (success::Bool, output::String, error::String) + +Run a test in a subprocess to avoid compilation state leakage. +Returns a tuple of (success, stdout, stderr). + +If already running under srun (detected via SLURM_* environment variables), +subprocesses inherit the parent's GPU allocation and don't request their own. +Otherwise, subprocesses request their own GPU allocation via srun. +""" +function run_test_subprocess( + test_code::String, + test_name::String, + analysis_mode::String, +) + tmp_file = tempname() * ".jl" + try + write(tmp_file, test_code) + + # Check if we're already in a srun environment + in_srun = !isempty(get(ENV, "SLURM_JOB_ID", "")) + + # Build command: skip srun if parent is already in srun to avoid resource contention + if in_srun + cmd = `$(Base.julia_cmd()) --startup-file=no --project=$(PROJECT_DIR) $tmp_file` + else + # Need to allocate GPU for standalone subprocess + cmd = `srun --mpi=none --gpus=1 $(Base.julia_cmd()) --startup-file=no --project=$(PROJECT_DIR) $tmp_file` + end + + stdout_buffer = IOBuffer() + stderr_buffer = IOBuffer() + proc = withenv( + "CLIMACOMMS_DEVICE" => DEVICE, + "CLIMA_COLLECT_CUDA_KERNEL_STATS" => (has_cuda_env() ? "1" : "0"), + "CLIMACORE_STRESS_ANALYSIS_MODE" => analysis_mode, + "CLIMA_NAME_CUDA_KERNELS_FROM_STACK_TRACE" => + ( + has_cuda_env() ? + get(ENV, "CLIMA_NAME_CUDA_KERNELS_FROM_STACK_TRACE", "1") : "0" + ), + ) do + run( + pipeline(ignorestatus(cmd); stdout = stdout_buffer, stderr = stderr_buffer), + ) + end + + stdout_text = String(take!(stdout_buffer)) + stderr_text = String(take!(stderr_buffer)) + combined_output = if isempty(stderr_text) + stdout_text + elseif isempty(stdout_text) + stderr_text + elseif endswith(stdout_text, '\n') + stdout_text * stderr_text + else + stdout_text * "\n" * stderr_text + end + + if success(proc) + return (true, combined_output, stderr_text) + else + error_text = + isempty(stderr_text) ? "subprocess exited with a non-zero status" : + stderr_text + return (false, combined_output, error_text) + end + finally + rm(tmp_file, force = true) + end +end + +""" + parse_timings_from_output(output::String) -> Dict{String, Float64} + +Parse timing information from test output (expected format: "TIMING: name = value_seconds"). +""" +function parse_timings_from_output(output::String) + timings = Dict{String, Float64}() + for line in split(output, '\n') + if startswith(line, "TIMING:") + parts = split(line, '=') + if length(parts) == 2 + name = strip(split(parts[1], ':')[2]) + value_str = strip(parts[2]) + if endswith(value_str, "s") + value_str = value_str[1:(end - 1)] + end + try + timings[name] = parse(Float64, value_str) + catch + end + end + end + end + return timings +end + +struct CUDAKernelProfile + kernel::String + registers::Int + local_bytes::Int + shared_bytes::Int + raw_line::String +end + +function Base.show(io::IO, p::CUDAKernelProfile) + println(io, "CUDAKernelProfile(") + println(io, " kernel = ", repr(p.kernel)) + println(io, " registers = ", p.registers) + println(io, " local_bytes = ", p.local_bytes) + println(io, " shared_bytes = ", p.shared_bytes) + print(io, ")") +end + +struct CUDAProfileSummary + primary_kernel::String + registers::Int + local_bytes::Int + shared_bytes::Int + status::String + local_memory_kernels::Int + total_kernels::Int +end + +struct LLVMAnalysisSummary + call_count::Int + invoke_count::Int + line_count::Int + status::String +end + +function Base.show(io::IO, s::LLVMAnalysisSummary) + println(io, "LLVMAnalysisSummary(") + println(io, " call_count = ", s.call_count) + println(io, " invoke_count = ", s.invoke_count) + println(io, " line_count = ", s.line_count) + print(io, " status = ", repr(s.status), ")") +end + +function _parse_llvm_analysis_line(line::AbstractString) + parts = split(line) + startswith(line, "LLVM_ANALYSIS:") || return nothing + metrics = Dict{String, String}() + for token in parts + if contains(token, '=') + key, value = split(token, '='; limit = 2) + metrics[key] = value + end + end + + call_count = _metric_int(metrics, "calls") + invoke_count = _metric_int(metrics, "invokes") + line_count = _metric_int(metrics, "lines") + status = invoke_count > 0 ? "invoke_present" : "no_invoke" + return LLVMAnalysisSummary(call_count, invoke_count, line_count, status) +end + +function parse_llvm_analysis_from_output(output::String) + for line in split(output, '\n') + if startswith(strip(line), "LLVM_ANALYSIS:") + parsed = _parse_llvm_analysis_line(strip(line)) + !isnothing(parsed) && return parsed + end + end + return nothing +end + +function format_llvm_analysis(summary::LLVMAnalysisSummary) + return "LLVM_ANALYSIS: calls=$(summary.call_count) invokes=$(summary.invoke_count) lines=$(summary.line_count) status=$(summary.status)" +end + +function Base.show(io::IO, s::CUDAProfileSummary) + println(io, "CUDAProfileSummary(") + println(io, " primary_kernel = ", repr(s.primary_kernel)) + println(io, " registers = ", s.registers) + println(io, " local_bytes = ", s.local_bytes) + println(io, " shared_bytes = ", s.shared_bytes) + println(io, " status = ", repr(s.status)) + println(io, " local_memory_kernels = ", s.local_memory_kernels, "/", s.total_kernels) + print(io, ")") +end + +function parse_cuda_profile_metrics(profile::AbstractString) + metrics = Dict{String, String}() + payload = strip(split(profile, "CUDA_PROFILE:"; limit = 2)[2]) + for token in split(payload) + if contains(token, '=') + key, value = split(token, '='; limit = 2) + metrics[key] = value + end + end + return metrics +end + +function _metric_int(metrics::Dict{String, String}, key::String) + try + return parse(Int, get(metrics, key, "0")) + catch + return 0 + end +end + +function _parse_cuda_profile_line(line::AbstractString) + metrics = parse_cuda_profile_metrics(line) + return CUDAKernelProfile( + get(metrics, "kernel", "unknown"), + _metric_int(metrics, "registers"), + _metric_int(metrics, "local"), + _metric_int(metrics, "shared"), + String(line), + ) +end + +""" + parse_cuda_profile_from_output(output::String) -> Vector{CUDAKernelProfile} + +Parse CUDA profile lines from subprocess output. +Expected format: an info-log line containing "CUDA_PROFILE: ...". +""" +function parse_cuda_profile_from_output(output::String) + profiles = CUDAKernelProfile[] + for line in split(output, '\n') + if occursin("CUDA_PROFILE:", line) + push!(profiles, _parse_cuda_profile_line(strip(line))) + end + end + return profiles +end + +function summarize_cuda_profiles(profiles::Vector{CUDAKernelProfile}) + isempty(profiles) && return nothing + + primary = first(profiles) + primary_score = (primary.registers, primary.local_bytes, primary.shared_bytes) + for candidate in profiles[2:end] + candidate_score = ( + candidate.registers, + candidate.local_bytes, + candidate.shared_bytes, + ) + if candidate_score > primary_score + primary = candidate + primary_score = candidate_score + end + end + + local_memory_count = count(p -> p.local_bytes > 0, profiles) + status = primary.local_bytes > 0 ? "local_memory_used" : "no_local_memory" + + return CUDAProfileSummary( + primary.kernel, + primary.registers, + primary.local_bytes, + primary.shared_bytes, + status, + local_memory_count, + length(profiles), + ) +end + +function format_cuda_profile(summary::CUDAProfileSummary) + return "CUDA_PROFILE: primary_kernel=$(summary.primary_kernel) registers=$(summary.registers) local=$(summary.local_bytes) shared=$(summary.shared_bytes) status=$(summary.status) local_memory_kernels=$(summary.local_memory_kernels)/$(summary.total_kernels)" +end + +function _command_output(cmd::Cmd) + try + return chomp(read(cmd, String)) + catch + return nothing + end +end + +function _command_lines(cmd::Cmd) + output = _command_output(cmd) + isnothing(output) && return String[] + isempty(output) && return String[] + return split(output, '\n') +end + +function _git_cmd(args...) + return Cmd(vcat(["git", "-C", PROJECT_ROOT], collect(args))) +end + +function collect_run_metadata(test_filter::Union{String, Nothing}, analysis_mode::String) + git_status = _command_lines(_git_cmd("status", "--porcelain")) + gpu_lines = + has_cuda_env() ? + _command_lines(`nvidia-smi --query-gpu=name,uuid --format=csv,noheader`) : + String[] + visible_gpu_env = get(ENV, "CUDA_VISIBLE_DEVICES", "") + allocated_gpu_ids = + isempty(strip(visible_gpu_env)) ? String[] : split(visible_gpu_env, ',') + + return Dict{String, Any}( + "timestamp_utc" => Dates.format(now(UTC), dateformat"yyyy-mm-ddTHH:MM:SSZ"), + "project_root" => PROJECT_ROOT, + "device" => DEVICE, + "hostname" => gethostname(), + "julia_version" => string(VERSION), + "analysis_mode" => analysis_mode, + "test_filter" => something(test_filter, "all"), + "git_commit" => + something(_command_output(_git_cmd("rev-parse", "HEAD")), "unknown"), + "git_branch" => + something( + _command_output(_git_cmd("rev-parse", "--abbrev-ref", "HEAD")), + "unknown", + ), + "git_describe" => + something( + _command_output(_git_cmd("describe", "--always", "--dirty", "--tags")), + "unknown", + ), + "git_dirty" => !isempty(git_status), + "git_status" => git_status, + "slurm_job_id" => get(ENV, "SLURM_JOB_ID", nothing), + "slurm_job_name" => get(ENV, "SLURM_JOB_NAME", nothing), + "slurm_nodelist" => get(ENV, "SLURM_JOB_NODELIST", nothing), + "allocated_gpu_count" => length(allocated_gpu_ids), + "allocated_gpu_ids" => allocated_gpu_ids, + "node_gpu_count" => length(gpu_lines), + "gpu_devices" => [ + Dict{String, Any}( + "index" => i, + "name" => strip(first(split(line, ','))), + "uuid" => strip(last(split(line, ','))), + ) for (i, line) in enumerate(gpu_lines) + ], + ) +end + +function arithmetic_expression(depth::Int) + expr = "x" + ops = ["+", "*", "/", "-"] + for i in 1:depth + op = ops[mod(i, length(ops)) + 1] + expr = "($expr $op $(i).0)" + end + return expr +end + +function functions_expression(funcs::Vector{String}, depth::Int) + expr = "x + 0.5" + for i in depth:-1:1 + func = funcs[mod1(i, length(funcs))] + if func == "log" + expr = "log(abs($expr) + 1.5)" + elseif func == "sqrt" + expr = "sqrt(abs($expr) + 1.5)" + else + expr = "$func($expr)" + end + end + return expr +end + +function nested_call_definitions(depth::Int) + defs = String[] + ops = ["+", "*", "/", "-"] + for i in 1:depth + op = ops[mod1(i, length(ops))] + rhs = if i == 1 + "x" + else + "helper_$(i - 1)(x)" + end + expr = if op == "+" + "($rhs + $(i).0)" + elseif op == "*" + "($rhs * $(i + 1).0)" + elseif op == "/" + "($rhs / $(i + 1).0)" + else + "($rhs - $(i).0)" + end + push!(defs, "helper_$i(x) = $expr") + end + return defs +end + +function nested_call_expression(depth::Int) + defs = nested_call_definitions(depth) + push!(defs, "op(x) = helper_$(depth)(x)") + push!(defs, "op.(f)") + return join(defs, "\n") +end + +function lazy_broadcast_tree_expression(depth::Int, breadth::Int) + args = ["f$i" for i in 1:breadth] + layer = "(" * join(args, " .+ ") * ") ./ $(breadth).0" + for d in 1:depth + terms = [ + "sqrt.(abs.(($layer) .+ $(i + d).0 .* f$i) .+ 1.0)" for i in 1:breadth + ] + layer = "(" * join(terms, " .+ ") * ") ./ $(breadth).0" + end + return args, layer +end + +function lazy_broadcast_tree_builder(depth::Int, breadth::Int) + fields = ["f$i" for i in 1:breadth] + tree = "Base.Broadcast.broadcasted(/, Base.Broadcast.broadcasted(+, $(join(fields, ", "))), $(breadth).0)" + lines = ["tree = $(tree)"] + for d in 1:depth + terms = [ + "Base.Broadcast.broadcasted(x -> sqrt(abs(x) + 1.0), Base.Broadcast.broadcasted(+, tree, Base.Broadcast.broadcasted(*, $(i + d).0, f$i)))" + for i in 1:breadth + ] + push!( + lines, + "tree = Base.Broadcast.broadcasted(/, Base.Broadcast.broadcasted(+, $(join(terms, ", "))), $(breadth).0)", + ) + end + push!(lines, "tree") + return join(lines, "\n ") +end + +function subexpression_args_expression(mode::String) + if mode == "bare_namedtuple" + # Passes a bare NamedTuple as the first argument inside @. + # Julia refuses to broadcast over NamedTuples, so this FAILS. + return "@. loglambda = my_get_distribution_loglambda(scheme, max(zero(rhoq_ice), rhoq_ice), max(zero(rhon_ice), rhon_ice), ifelse(iszero(rhoq_ice), zero(rhoq_ice), rhoq_rim / rhoq_ice), ifelse(iszero(rhoq_ice), zero(rhoq_ice), rhoq_rim / rhob_rim))" + elseif mode == "closure_wrapped" + # Wraps the NamedTuple in a closure so broadcast never sees it. + # This is the minimal fix for the ClimaAtmos precipitation-velocity bug. + return "fn_with_scheme = let s = scheme\n (q, n, rqi, rqb) -> log(abs(s.c1 * q + s.c2 * n) + 1) + s.c3 * (rqi - rqb)\nend\n@. loglambda = fn_with_scheme(max(zero(rhoq_ice), rhoq_ice), max(zero(rhon_ice), rhon_ice), ifelse(iszero(rhoq_ice), zero(rhoq_ice), rhoq_rim / rhoq_ice), ifelse(iszero(rhoq_ice), zero(rhoq_ice), rhoq_rim / rhob_rim))" + else # precomputed + # Precomputes all subexpressions into separate fields, then calls the + # function with plain field arguments — no inline subexpressions at all. + return "@. rhoq_ice_pos = max(zero(rhoq_ice), rhoq_ice)\n@. rhon_ice_pos = max(zero(rhon_ice), rhon_ice)\n@. rim_over_ice = ifelse(iszero(rhoq_ice), zero(rhoq_ice), rhoq_rim / rhoq_ice)\n@. rim_over_bulk = ifelse(iszero(rhoq_ice), zero(rhoq_ice), rhoq_rim / rhob_rim)\nfn_with_scheme = let s = scheme\n (q, n, rqi, rqb) -> log(abs(s.c1 * q + s.c2 * n) + 1) + s.c3 * (rqi - rqb)\nend\n@. loglambda = fn_with_scheme(rhoq_ice_pos, rhon_ice_pos, rim_over_ice, rim_over_bulk)" + end +end + +function climaatmos_column_expression(repeats::Int) + blocks = [ + "fn_with_scheme(winterp(ᶜw, max(zero(rhoq_ice), rhoq_ice + $(i).0 / 10)), winterp(ᶜn, max(zero(rhon_ice), rhon_ice + $(i).0 / 20)), upwind(ᶠv, ifelse(iszero(rhoq_ice), zero(rhoq_ice), rhoq_rim / (rhoq_ice + $(i).0 / 50))), upwind(ᶠv, ifelse(iszero(rhon_ice), zero(rhon_ice), rhob_rim / (rhon_ice + $(i).0 / 40))), interp(max(zero(rhoq_ice), rhoq_ice + $(i).0 / 30)), interp(ifelse(iszero(rhoq_ice), zero(rhoq_ice), rhoq_rim / (rhoq_ice + $(i).0 / 60))))" + for i in 1:repeats + ] + return "@. tendency = " * join(blocks, " + ") +end + +function _lazy_broadcast_params_from_test_name(name::String) + m = match(r"lazy_broadcast_d(\d+)_b(\d+)", name) + isnothing(m) && return nothing + return parse(Int, m.captures[1]), parse(Int, m.captures[2]) +end + +function render_test_expression(test) + if test.operation_type == "arithmetic" + expr = arithmetic_expression(test.complexity) + return "op(x) = $expr\nop.(f)" + elseif test.operation_type == "multiarg" + args = ["f$i" for i in 1:(test.num_args)] + sum_expr = join(args[1:(end - 1)], " + ") + op_expr = "($sum_expr) / ($(last(args)) + 1.0)" + return "op($(join(args, ", "))) = $op_expr\nop.($(join(args, ", ")))" + elseif test.operation_type == "functions" + funcs = if occursin("_log_", test.name) + ["log"] + elseif occursin("_sqrt_", test.name) + ["sqrt"] + else + ["log", "sqrt", "abs"] + end + expr = functions_expression(funcs, test.complexity) + return "op(x) = $expr\nop.(f)" + elseif test.operation_type == "nested_calls" + return nested_call_expression(test.complexity) + elseif test.operation_type == "subexpression_args" + mode = if occursin("bare_namedtuple", test.name) + "bare_namedtuple" + elseif occursin("closure_wrapped", test.name) + "closure_wrapped" + else + "precomputed" + end + return subexpression_args_expression(mode) + elseif test.operation_type == "projection" + proj_terms = join( + [ + "Geometry.project(Geometry.Covariant12Axis(), v)" for + _ in 1:(test.complexity) + ], + " .+ ", + ) + return "@. $proj_terms" + elseif test.operation_type == "divergence" + return join(["div_op.(v .* $(i).0)" for i in 1:(test.complexity)], " .+ ") + elseif test.operation_type == "curl" + return join(["curl_op.(v .* $(i).0)" for i in 1:(test.complexity)], " .+ ") + elseif test.operation_type == "interpolate" + return join(["interp.(ᶜf .* $(i).0)" for i in 1:(test.complexity)], " .+ ") + elseif test.operation_type == "weighted_interpolate" + return join(["winterp.(ᶜw, ᶜf .* $(i).0)" for i in 1:(test.complexity)], " .+ ") + elseif test.operation_type == "upwinding" + return join(["upwind.(ᶠv, ᶜf .* $(i).0)" for i in 1:(test.complexity)], " .+ ") + elseif test.operation_type == "climaatmos" + return climaatmos_column_expression(test.complexity) + elseif test.operation_type == "lazy_broadcast_tree" + params = _lazy_broadcast_params_from_test_name(test.name) + if isnothing(params) + return test.description + end + depth, breadth = params + args, expr = lazy_broadcast_tree_expression(depth, breadth) + return "$(join(args, ", ")) -> $expr" + else + return test.description + end +end + +result_profile_summary(result) = result.cuda_profile_summary +result_llvm_summary(result) = result.llvm_analysis_summary + +function result_to_record(result) + summary = result_profile_summary(result) + llvm_summary = result_llvm_summary(result) + local_memory_kernels = + isnothing(summary) ? "0/0" : + "$(summary.local_memory_kernels)/$(summary.total_kernels)" + return Dict{String, Any}( + "name" => result.test_def.name, + "description" => result.test_def.description, + "operation_type" => result.test_def.operation_type, + "complexity" => result.test_def.complexity, + "num_args" => result.test_def.num_args, + "uses_geometry" => result.test_def.uses_geometry, + "success" => result.success, + "time_seconds" => result.time_seconds, + "time_microseconds" => + isnothing(result.time_seconds) ? nothing : result.time_seconds * 1e6, + "error_msg" => result.error_msg, + "expression" => render_test_expression(result.test_def), + "primary_kernel" => isnothing(summary) ? nothing : summary.primary_kernel, + "registers" => isnothing(summary) ? nothing : summary.registers, + "local_bytes" => isnothing(summary) ? nothing : summary.local_bytes, + "shared_bytes" => isnothing(summary) ? nothing : summary.shared_bytes, + "local_memory_status" => isnothing(summary) ? nothing : summary.status, + "local_memory_kernels" => local_memory_kernels, + "cuda_profile_lines" => + isnothing(summary) ? String[] : [format_cuda_profile(summary)], + "llvm_call_count" => isnothing(llvm_summary) ? nothing : llvm_summary.call_count, + "llvm_invoke_count" => + isnothing(llvm_summary) ? nothing : llvm_summary.invoke_count, + "llvm_line_count" => isnothing(llvm_summary) ? nothing : llvm_summary.line_count, + "llvm_status" => isnothing(llvm_summary) ? nothing : llvm_summary.status, + "llvm_analysis_lines" => + isnothing(llvm_summary) ? String[] : [format_llvm_analysis(llvm_summary)], + "soft_fail" => false, + "soft_fail_reasons" => String[], + "failure_mode" => result.success ? "pass" : "hard_fail", + ) +end + +function _local_memory_fraction(value) + if !(value isa AbstractString) || !occursin('/', value) + return nothing + end + parts = split(value, '/'; limit = 2) + length(parts) == 2 || return nothing + try + used = parse(Float64, strip(parts[1])) + total = parse(Float64, strip(parts[2])) + total <= 0 && return nothing + return used / total + catch + return nothing + end +end + +function _as_int_or_nothing(x) + x === nothing && return nothing + x isa Integer && return Int(x) + x isa AbstractFloat && return Int(round(x)) + return nothing +end + +function _is_timed_success(record::Dict{String, Any}) + timed = get(record, "time_microseconds", nothing) + return get(record, "success", false) && !isnothing(timed) && timed > 0 +end + +function _llvm_soft_fail_reasons(record::Dict{String, Any}) + reasons = String[] + invokes = _as_int_or_nothing(get(record, "llvm_invoke_count", nothing)) + if !isnothing(invokes) && invokes > 0 + push!(reasons, "llvm_invoke_present(count=$(invokes))") + end + return reasons +end + +function _soft_fail_reasons( + prev::Dict{String, Any}, + cur::Dict{String, Any}; + complexity_gap::Union{Int, Nothing} = nothing, +) + reasons = String[] + + prev_regs = _as_int_or_nothing(get(prev, "registers", nothing)) + cur_regs = _as_int_or_nothing(get(cur, "registers", nothing)) + if !isnothing(prev_regs) && !isnothing(cur_regs) && prev_regs > 0 + reg_jump_abs = cur_regs - prev_regs + reg_jump_rel = cur_regs / prev_regs + # For sparse sampling (e.g. 1 -> 12), avoid over-sensitive cliff labels. + sparse_gap = !isnothing(complexity_gap) && complexity_gap > 4 + register_cliff = if sparse_gap + # For coarse complexity gaps, require both a substantial absolute and + # relative jump (or a very large relative jump on its own). + (reg_jump_abs >= 32 && reg_jump_rel >= 1.25) || reg_jump_rel >= 2.00 + else + # For neighboring points, require a moderate relative increase when + # using absolute jump checks to avoid high-register false positives. + (reg_jump_abs >= 16 && reg_jump_rel >= 1.20) || reg_jump_rel >= 1.35 + end + if register_cliff + push!( + reasons, + "register_cliff(prev=$(prev_regs), cur=$(cur_regs), jump=$(reg_jump_abs), ratio=$(@sprintf("%.2f", reg_jump_rel)))", + ) + end + end + + prev_local = _as_int_or_nothing(get(prev, "local_bytes", nothing)) + cur_local = _as_int_or_nothing(get(cur, "local_bytes", nothing)) + if !isnothing(prev_local) && !isnothing(cur_local) + if prev_local == 0 && cur_local > 0 + push!(reasons, "local_memory_appeared(prev=0, cur=$(cur_local))") + elseif prev_local > 0 && cur_local >= 2 * prev_local && + (cur_local - prev_local) >= 32 + push!( + reasons, + "local_memory_cliff(prev=$(prev_local), cur=$(cur_local), ratio=$(@sprintf("%.2f", cur_local / prev_local)))", + ) + end + end + + prev_frac = _local_memory_fraction(get(prev, "local_memory_kernels", nothing)) + cur_frac = _local_memory_fraction(get(cur, "local_memory_kernels", nothing)) + if !isnothing(prev_frac) && !isnothing(cur_frac) + if (cur_frac - prev_frac) >= 0.34 && cur_frac >= 0.50 + push!( + reasons, + "local_memory_kernel_fraction_cliff(prev=$(@sprintf("%.2f", prev_frac)), cur=$(@sprintf("%.2f", cur_frac)))", + ) + end + end + + return reasons +end + +function annotate_soft_failures!(records::Vector{Dict{String, Any}}) + for record in records + if !get(record, "success", false) + record["failure_mode"] = "hard_fail" + elseif isnothing(get(record, "time_microseconds", nothing)) + # expected-failure style pass; do not treat as hard/soft fail + record["failure_mode"] = "expected_failure" + else + record["failure_mode"] = "pass" + end + record["soft_fail"] = false + record["soft_fail_reasons"] = String[] + end + + by_op = Dict{String, Vector{Dict{String, Any}}}() + for record in records + op_type = string(get(record, "operation_type", "unknown")) + if !haskey(by_op, op_type) + by_op[op_type] = Dict{String, Any}[] + end + push!(by_op[op_type], record) + end + + for (_, op_records) in by_op + sort!( + op_records; + by = r -> ( + get(r, "complexity", typemax(Int)), + string(get(r, "name", "")), + ), + ) + + prev_timed = nothing + for record in op_records + if !_is_timed_success(record) + continue + end + + if !isnothing(prev_timed) + prev_complexity = _as_int_or_nothing(get(prev_timed, "complexity", nothing)) + cur_complexity = _as_int_or_nothing(get(record, "complexity", nothing)) + complexity_gap = + if !isnothing(prev_complexity) && !isnothing(cur_complexity) + max(0, cur_complexity - prev_complexity) + else + nothing + end + reasons = _soft_fail_reasons( + prev_timed, + record; + complexity_gap = complexity_gap, + ) + if !isempty(reasons) + record["soft_fail"] = true + record["soft_fail_reasons"] = reasons + record["failure_mode"] = "soft_fail_resource_cliff" + end + end + + prev_timed = record + end + end + + for record in records + if get(record, "success", false) + llvm_reasons = _llvm_soft_fail_reasons(record) + if !isempty(llvm_reasons) + record["soft_fail"] = true + record["soft_fail_reasons"] = + vcat(record["soft_fail_reasons"], llvm_reasons) + if record["failure_mode"] == "pass" + record["failure_mode"] = "soft_fail_inlining_signal" + end + end + end + end + + return records +end + +function build_report( + results, + test_filter::Union{String, Nothing}, + analysis_mode::String, +) + records = [result_to_record(result) for result in results] + annotate_soft_failures!(records) + + successful = filter(r -> r.success, results) + times_μs = [r.time_seconds * 1e6 for r in successful if !isnothing(r.time_seconds)] + soft_failed = count(r -> get(r, "soft_fail", false), records) + hard_failed = count(r -> !get(r, "success", false), records) + return Dict{String, Any}( + "schema_version" => 1, + "run_metadata" => collect_run_metadata(test_filter, analysis_mode), + "summary" => Dict{String, Any}( + "total_tests" => length(results), + "successful_tests" => length(successful), + "failed_tests" => length(results) - length(successful), + "hard_failed_tests" => hard_failed, + "soft_failed_tests" => soft_failed, + "minimum_time_microseconds" => + isempty(times_μs) ? nothing : minimum(times_μs), + "maximum_time_microseconds" => + isempty(times_μs) ? nothing : maximum(times_μs), + "mean_time_microseconds" => isempty(times_μs) ? nothing : mean(times_μs), + "median_time_microseconds" => + length(times_μs) < 2 ? nothing : median(times_μs), + ), + "results" => records, + ) +end + +function json_escape(s::AbstractString) + return replace( + s, + '\\' => "\\\\", + '"' => "\\\"", + '\n' => "\\n", + '\r' => "\\r", + '\t' => "\\t", + ) +end + +function _json_scalar(x) + if x === nothing + return "null" + elseif x isa Bool + return x ? "true" : "false" + elseif x isa Integer || x isa AbstractFloat + return string(x) + elseif x isa AbstractString + return '"' * json_escape(x) * '"' + else + return '"' * json_escape(string(x)) * '"' + end +end + +function to_json(x; indent_step::Int = 2) + return _to_json_pretty(x, 0, indent_step) +end + +function _to_json_pretty(x, level::Int, indent_step::Int) + indent = repeat(" ", level * indent_step) + child_indent = repeat(" ", (level + 1) * indent_step) + + if x isa AbstractDict + keys_sorted = sort!(collect(keys(x)); by = string) + isempty(keys_sorted) && return "{}" + + lines = String[] + for key in keys_sorted + key_json = _json_scalar(string(key)) + value_json = _to_json_pretty(x[key], level + 1, indent_step) + push!(lines, "$(child_indent)$(key_json): $(value_json)") + end + return "{\n" * join(lines, ",\n") * "\n" * indent * "}" + elseif x isa AbstractVector + isempty(x) && return "[]" + + lines = String[] + for value in x + value_json = _to_json_pretty(value, level + 1, indent_step) + push!(lines, "$(child_indent)$(value_json)") + end + return "[\n" * join(lines, ",\n") * "\n" * indent * "]" + else + return _json_scalar(x) + end +end + +function ensure_parent_dir(path::AbstractString) + parent = dirname(path) + isempty(parent) || mkpath(parent) + return nothing +end + +resolve_output_path(path::AbstractString) = + isabspath(path) ? path : joinpath(PROJECT_ROOT, path) + +function write_json_report(path::AbstractString, report::Dict{String, Any}) + path = resolve_output_path(path) + ensure_parent_dir(path) + open(path, "w") do io + write(io, to_json(report) * "\n") + end + return nothing +end + +function html_escape(s::AbstractString) + return replace(replace(replace(s, "&" => "&"), "<" => "<"), ">" => ">") +end + +function markdown_expression_cell(expr::AbstractString) + # Replace newlines with so the entire
element stays on one line. + # Markdown table parsers (including VS Code) treat literal newlines inside a cell + # as row breaks, which creates phantom blank rows in the rendered table. + escaped = replace(html_escape(expr), "\n" => " ") + return "
show
$(escaped)
" +end + +function markdown_table_row(record::Dict{String, Any}, comparison_by_test = nothing) + cmp = + isnothing(comparison_by_test) ? nothing : + get(comparison_by_test, record["name"], nothing) + baseline_time = "-" + delta_time = "-" + baseline_regs = "-" + baseline_local = "-" + if !isnothing(cmp) + bt = get(cmp, "baseline_time_microseconds", nothing) + dt = get(cmp, "delta_time_percent", nothing) + br = get(cmp, "baseline_registers", nothing) + bl = get(cmp, "baseline_local_bytes", nothing) + baseline_time = isnothing(bt) ? "-" : @sprintf("%.3f", bt) + delta_time = isnothing(dt) ? "-" : @sprintf("%+.2f%%", dt) + baseline_regs = isnothing(br) ? "-" : string(br) + baseline_local = isnothing(bl) ? "-" : string(bl) + end + + if record["success"] && !isnothing(record["time_microseconds"]) + time_cell = @sprintf("%.3f", record["time_microseconds"]) + soft_fail_cell = get(record, "soft_fail", false) ? "yes" : "no" + reasons = get(record, "soft_fail_reasons", String[]) + reasons_cell = isempty(reasons) ? "-" : join(reasons, "; ") + return "| $(record["name"]) | $(record["operation_type"]) | $(time_cell) | $(baseline_time) | $(delta_time) | $(something(record["primary_kernel"], "-")) | $(something(record["registers"], "-")) | $(baseline_regs) | $(something(record["local_bytes"], "-")) | $(baseline_local) | $(something(record["shared_bytes"], "-")) | $(something(record["local_memory_status"], "-")) | $(record["local_memory_kernels"]) | $(something(record["llvm_call_count"], "-")) | $(something(record["llvm_invoke_count"], "-")) | $(soft_fail_cell) | $(reasons_cell) | $(markdown_expression_cell(record["expression"])) |" + elseif record["success"] + return "| $(record["name"]) | $(record["operation_type"]) | expected failure | $(baseline_time) | $(delta_time) | - | - | $(baseline_regs) | - | $(baseline_local) | - | - | - | $(something(record["llvm_call_count"], "-")) | $(something(record["llvm_invoke_count"], "-")) | - | - | $(markdown_expression_cell(record["expression"])) |" + else + return "| $(record["name"]) | $(record["operation_type"]) | FAILED | $(baseline_time) | $(delta_time) | - | - | $(baseline_regs) | - | $(baseline_local) | - | - | - | $(something(record["llvm_call_count"], "-")) | $(something(record["llvm_invoke_count"], "-")) | - | - | $(markdown_expression_cell(record["expression"])) |" + end +end + +function write_markdown_report( + path::AbstractString, + report::Dict{String, Any}; + comparison::Union{Dict{String, Any}, Nothing} = nothing, +) + path = resolve_output_path(path) + ensure_parent_dir(path) + metadata = report["run_metadata"] + summary = report["summary"] + records = report["results"] + + lines = String[] + push!(lines, "# ClimaCore Stress Test Report") + push!(lines, "") + push!(lines, "## Run Metadata") + push!(lines, "") + push!(lines, "- Timestamp (UTC): $(metadata["timestamp_utc"])") + push!(lines, "- Git commit: $(metadata["git_commit"])") + push!(lines, "- Git branch: $(metadata["git_branch"])") + push!(lines, "- Git describe: $(metadata["git_describe"])") + push!(lines, "- Git dirty: $(metadata["git_dirty"])") + push!(lines, "- Hostname: $(metadata["hostname"])") + push!(lines, "- Julia version: $(metadata["julia_version"])") + push!(lines, "- Device backend: $(metadata["device"])") + push!(lines, "- Analysis mode: $(metadata["analysis_mode"])") + push!(lines, "- Test filter: $(metadata["test_filter"])") + push!(lines, "- Allocated GPU count: $(metadata["allocated_gpu_count"])") + if !isempty(metadata["allocated_gpu_ids"]) + push!(lines, "- Allocated GPU IDs: $(join(metadata["allocated_gpu_ids"], ", "))") + end + push!(lines, "- Node GPU inventory: $(metadata["node_gpu_count"])") + if !isempty(metadata["gpu_devices"]) + for gpu in metadata["gpu_devices"] + push!(lines, "- GPU $(gpu["index"]): $(gpu["name"]) ($(gpu["uuid"]))") + end + end + if !isnothing(metadata["slurm_job_id"]) + push!(lines, "- Slurm job ID: $(metadata["slurm_job_id"])") + end + if !isnothing(comparison) + comparable = get(comparison, "comparable", false) + push!(lines, "- Baseline comparable: $(comparable)") + end + push!(lines, "") + push!(lines, "## Summary") + push!(lines, "") + push!(lines, "- Total tests: $(summary["total_tests"])") + push!(lines, "- Successful tests: $(summary["successful_tests"])") + push!(lines, "- Failed tests: $(summary["failed_tests"])") + if haskey(summary, "hard_failed_tests") + push!(lines, "- Hard failed tests: $(summary["hard_failed_tests"])") + end + if haskey(summary, "soft_failed_tests") + push!( + lines, + "- Soft failed tests (resource cliffs): $(summary["soft_failed_tests"])", + ) + end + if !isnothing(summary["minimum_time_microseconds"]) + push!( + lines, + "- Minimum time (μs): $(@sprintf("%.3f", summary["minimum_time_microseconds"]))", + ) + push!( + lines, + "- Maximum time (μs): $(@sprintf("%.3f", summary["maximum_time_microseconds"]))", + ) + push!( + lines, + "- Mean time (μs): $(@sprintf("%.3f", summary["mean_time_microseconds"]))", + ) + end + if !isnothing(summary["median_time_microseconds"]) + push!( + lines, + "- Median time (μs): $(@sprintf("%.3f", summary["median_time_microseconds"]))", + ) + end + push!(lines, "") + push!(lines, "## Results") + push!(lines, "") + push!( + lines, + "| Test | Type | Time (μs) | Baseline (μs) | Δ Time | Primary kernel | Regs | Base Regs | Local B | Base Local B | Shared B | Local memory | Local-memory kernels | LLVM calls | LLVM invokes | Soft fail | Soft-fail signals | Expression |", + ) + push!( + lines, + "| --- | --- | ---: | ---: | ---: | --- | ---: | ---: | ---: | ---: | ---: | --- | --- | ---: | ---: | --- | --- | --- |", + ) + comparison_by_test = + isnothing(comparison) ? nothing : get(comparison, "by_test_name", nothing) + for record in records + push!(lines, markdown_table_row(record, comparison_by_test)) + end + + open(path, "w") do io + write(io, join(lines, "\n") * "\n") + end + return nothing +end + +struct CliOptions + test_filter::Union{String, Nothing} + output_json::Union{String, Nothing} + output_markdown::Union{String, Nothing} + compare_against::Union{String, Nothing} + analysis_mode::String +end + +function parse_cli_args(args::Vector{String}) + test_filter = nothing + output_json = nothing + output_markdown = nothing + compare_against = nothing + analysis_mode = "timing" + + i = 1 + while i <= length(args) + arg = args[i] + if startswith(arg, "--output-json=") + output_json = split(arg, "="; limit = 2)[2] + elseif arg == "--output-json" + i += 1 + i > length(args) && error("Missing path after --output-json") + output_json = args[i] + elseif startswith(arg, "--output-markdown=") + output_markdown = split(arg, "="; limit = 2)[2] + elseif arg == "--output-markdown" + i += 1 + i > length(args) && error("Missing path after --output-markdown") + output_markdown = args[i] + elseif startswith(arg, "--compare-against=") + compare_against = split(arg, "="; limit = 2)[2] + elseif arg == "--compare-against" + i += 1 + i > length(args) && error("Missing path after --compare-against") + compare_against = args[i] + elseif startswith(arg, "--analysis-mode=") + analysis_mode = split(arg, "="; limit = 2)[2] + elseif arg == "--analysis-mode" + i += 1 + i > length(args) && error("Missing value after --analysis-mode") + analysis_mode = args[i] + elseif startswith(arg, "--") + error("Unknown option: $arg") + elseif isnothing(test_filter) + test_filter = arg + else + error( + "Multiple positional arguments provided; expected at most one test filter", + ) + end + i += 1 + end + + analysis_mode in VALID_ANALYSIS_MODES || + error( + "Invalid --analysis-mode=$(analysis_mode). Valid values: $(join(VALID_ANALYSIS_MODES, ", "))", + ) + + return CliOptions( + test_filter, + output_json, + output_markdown, + compare_against, + analysis_mode, + ) +end + +function _skip_ws(s::AbstractString, i::Int) + while i <= lastindex(s) + c = s[i] + if c == ' ' || c == '\n' || c == '\r' || c == '\t' + i = nextind(s, i) + else + break + end + end + return i +end + +function _json_parse_string(s::AbstractString, i::Int) + @assert s[i] == '"' + i = nextind(s, i) + buf = IOBuffer() + while i <= lastindex(s) + c = s[i] + if c == '"' + i = nextind(s, i) + return String(take!(buf)), i + elseif c == '\\' + i = nextind(s, i) + i > lastindex(s) && error("Invalid JSON escape") + esc = s[i] + if esc == '"' + write(buf, '"') + elseif esc == '\\' + write(buf, '\\') + elseif esc == '/' + write(buf, '/') + elseif esc == 'b' + write(buf, '\b') + elseif esc == 'f' + write(buf, '\f') + elseif esc == 'n' + write(buf, '\n') + elseif esc == 'r' + write(buf, '\r') + elseif esc == 't' + write(buf, '\t') + elseif esc == 'u' + # Minimal unicode handling: keep literal for unsupported code-point decoding + # Our generated JSON doesn't emit \u escapes, so this keeps parser robust enough. + i = nextind(s, i) + for _ in 1:4 + i > lastindex(s) && error("Invalid JSON unicode escape") + i = nextind(s, i) + end + write(buf, '?') + continue + else + error("Unsupported JSON escape: \\$esc") + end + i = nextind(s, i) + else + write(buf, c) + i = nextind(s, i) + end + end + error("Unterminated JSON string") +end + +function _json_parse_number(s::AbstractString, i::Int) + start = i + while i <= lastindex(s) + c = s[i] + if isdigit(c) || c in ('-', '+', '.', 'e', 'E') + i = nextind(s, i) + else + break + end + end + token = s[start:prevind(s, i)] + if occursin('.', token) || occursin('e', token) || occursin('E', token) + return parse(Float64, token), i + else + return parse(Int, token), i + end +end + +function _json_parse_value(s::AbstractString, i::Int) + i = _skip_ws(s, i) + i > lastindex(s) && error("Unexpected end of JSON") + c = s[i] + if c == '"' + return _json_parse_string(s, i) + elseif c == '{' + obj = Dict{String, Any}() + i = nextind(s, i) + i = _skip_ws(s, i) + if i <= lastindex(s) && s[i] == '}' + return obj, nextind(s, i) + end + while true + key, i = _json_parse_string(s, _skip_ws(s, i)) + i = _skip_ws(s, i) + s[i] == ':' || error("Expected ':' in JSON object") + i = nextind(s, i) + value, i = _json_parse_value(s, i) + obj[key] = value + i = _skip_ws(s, i) + if s[i] == '}' + return obj, nextind(s, i) + elseif s[i] == ',' + i = nextind(s, i) + else + error("Expected ',' or '}' in JSON object") + end + end + elseif c == '[' + arr = Any[] + i = nextind(s, i) + i = _skip_ws(s, i) + if i <= lastindex(s) && s[i] == ']' + return arr, nextind(s, i) + end + while true + value, i = _json_parse_value(s, i) + push!(arr, value) + i = _skip_ws(s, i) + if s[i] == ']' + return arr, nextind(s, i) + elseif s[i] == ',' + i = nextind(s, i) + else + error("Expected ',' or ']' in JSON array") + end + end + elseif c == 't' && i + 3 <= lastindex(s) && s[i:(i + 3)] == "true" + return true, i + 4 + elseif c == 'f' && i + 4 <= lastindex(s) && s[i:(i + 4)] == "false" + return false, i + 5 + elseif c == 'n' && i + 3 <= lastindex(s) && s[i:(i + 3)] == "null" + return nothing, i + 4 + else + return _json_parse_number(s, i) + end +end + +function parse_json_text(s::AbstractString) + value, i = _json_parse_value(s, firstindex(s)) + i = _skip_ws(s, i) + i <= lastindex(s) && error("Trailing content after JSON value") + return value +end + +function read_json_report(path::AbstractString) + path = resolve_output_path(path) + text = read(path, String) + parsed = parse_json_text(text) + parsed isa Dict{String, Any} || error("Expected top-level JSON object in $(path)") + return parsed +end + +function _record_lookup(records) + table = Dict{String, Dict{String, Any}}() + for rec_any in records + rec_any isa Dict{String, Any} || continue + haskey(rec_any, "name") || continue + table[string(rec_any["name"])] = rec_any + end + return table +end + +function _to_float_or_nothing(x) + x === nothing && return nothing + x isa Integer && return Float64(x) + x isa AbstractFloat && return Float64(x) + return nothing +end + +function compare_reports(current::Dict{String, Any}, baseline::Dict{String, Any}) + current_records = _record_lookup(current["results"]) + baseline_records = _record_lookup(baseline["results"]) + + run_cur = current["run_metadata"] + run_base = baseline["run_metadata"] + comparable = + get(run_cur, "device", nothing) == get(run_base, "device", nothing) && + get(run_cur, "julia_version", nothing) == get(run_base, "julia_version", nothing) + + by_name = Dict{String, Dict{String, Any}}() + for name in sort(collect(keys(current_records))) + rec_cur = current_records[name] + rec_base = get(baseline_records, name, nothing) + cur_time = _to_float_or_nothing(get(rec_cur, "time_microseconds", nothing)) + base_time = + isnothing(rec_base) ? nothing : + _to_float_or_nothing(get(rec_base, "time_microseconds", nothing)) + delta_pct = if !isnothing(cur_time) && !isnothing(base_time) && base_time != 0 + 100.0 * (cur_time - base_time) / base_time + else + nothing + end + by_name[name] = Dict{String, Any}( + "baseline_found" => !isnothing(rec_base), + "baseline_success" => + isnothing(rec_base) ? nothing : get(rec_base, "success", false), + "baseline_time_microseconds" => base_time, + "current_time_microseconds" => cur_time, + "delta_time_percent" => delta_pct, + "baseline_registers" => + isnothing(rec_base) ? nothing : get(rec_base, "registers", nothing), + "current_registers" => get(rec_cur, "registers", nothing), + "baseline_local_bytes" => + isnothing(rec_base) ? nothing : get(rec_base, "local_bytes", nothing), + "current_local_bytes" => get(rec_cur, "local_bytes", nothing), + ) + end + + return Dict{String, Any}( + "comparable" => comparable, + "baseline_metadata" => run_base, + "current_metadata" => run_cur, + "by_test_name" => by_name, + ) +end + +# ============================================================================ +# TEST CASE GENERATION +# ============================================================================ + +""" + generate_field_test_code(test_name::String, test_impl::String) -> String + +Generate a complete test code block that can be run in a subprocess. +Includes ClimaCore setup and proper device handling. + +# Arguments +- `test_name`: Unique identifier for the test case; used in error/compile-failure + messages and as the label passed to `run_stress_kernel_test`. +- `test_impl`: Julia source code (as a string) containing the body of the test. + It is interpolated verbatim into the generated script after all imports and + boilerplate are emitted. A typical `test_impl` string: + 1. Constructs the ClimaCore space (e.g. via `create_spectral_space()` or + `create_column_space()`). + 2. Allocates and initialises the fields under test. + 3. Defines a zero-argument closure `kernel_call!()` that performs the + broadcast / operator expression to benchmark. + 4. Calls `run_stress_kernel_test(test_name, kernel_call!)` to drive timing + and LLVM analysis. +""" +function generate_field_test_code(test_name::String, test_impl::String) + device_init = if has_cuda_env() + """ + using CUDA + CUDA.allowscalar(false) + """ + else + "" + end + + return """ + import Pkg + using Printf + using BenchmarkTools + using InteractiveUtils + import ClimaComms + ClimaComms.@import_required_backends + + using ClimaCore + using ClimaCore.Fields + using ClimaCore.CommonSpaces + using ClimaCore.Grids + using ClimaCore.Spaces + using ClimaCore.Domains + using ClimaCore.Meshes + using ClimaCore.Geometry + using ClimaCore.Topologies + using ClimaCore.Quadratures + + $device_init + + # Suppress informational logging + using Logging + disable_logging(Logging.Info) + + const ANALYSIS_MODE = get(ENV, "CLIMACORE_STRESS_ANALYSIS_MODE", "timing") + analysis_compile_only() = ANALYSIS_MODE == "compile" + + function _env_truthy(var::AbstractString) + raw = get(ENV, var, nothing) + raw === nothing && return false + s = lowercase(strip(String(raw))) + if s in ("1", "true", "t", "yes", "y", "on") + return true + elseif s in ("0", "false", "f", "no", "n", "off") + return false + else + parsed = tryparse(Int, s) + return !isnothing(parsed) && parsed != 0 + end + end + + function maybe_enable_cuda_kernel_stats!() + enabled = _env_truthy("CLIMA_COLLECT_CUDA_KERNEL_STATS") + enabled || return nothing + + ext = Base.get_extension(ClimaCore, :ClimaCoreCUDAExt) + isnothing(ext) && return nothing + Core.eval(ext, :(collect_kernel_stats() = true)) + return nothing + end + + maybe_enable_cuda_kernel_stats!() + + function _count_llvm_calls(llvm_ir::AbstractString) + calls = 0 + invokes = 0 + for line in split(llvm_ir, '\n') + stripped = strip(line) + startswith(stripped, ";") && continue + occursin(" call ", stripped) && (calls += 1) + occursin(" invoke ", stripped) && (invokes += 1) + end + return calls, invokes, length(split(llvm_ir, '\n')) + end + + function emit_llvm_analysis(test_name::AbstractString, thunk::Function) + llvm_ir = sprint( + io -> code_llvm(io, thunk, Tuple{}; optimize = true, debuginfo = :none, raw = true), + ) + calls, invokes, line_count = _count_llvm_calls(llvm_ir) + println( + "LLVM_ANALYSIS: test=\$(test_name) calls=\$(calls) invokes=\$(invokes) lines=\$(line_count)", + ) + return nothing + end + + function run_stress_kernel_test(test_name::AbstractString, kernel_call!::Function) + emit_llvm_analysis(test_name, kernel_call!) + + if analysis_compile_only() + # CUDA-first compile diagnostics: execute once so device kernels are + # actually compiled and kernel profile stats are emitted. + _ = kernel_call!() + println("TIMING: \$(test_name) = 0.000000 s") + return + end + + # Warmup is intentionally skipped in compile-only mode to avoid running kernels. + _ = kernel_call!() + + trial = @benchmark \$kernel_call!() samples=10 evals=1 + time_μs = minimum(trial.times) / 1000.0 + @printf "TIMING: %s = %.6f s\\n" test_name time_μs / 1e6 + end + + try + $test_impl + catch e + println("COMPILE_FAILURE: $test_name reason=\$(sprint(showerror, e))") + @error "Test failed: \$e" + rethrow() + end + """ +end + +""" + create_spectral_space() + +Helper function to create a reusable spectral element space setup code string. +""" +function create_spectral_space() + return """ + FT = Float64 + space = CommonSpaces.RectangleXYSpace(FT; + x_min = -1.0, + x_max = 1.0, + y_min = -1.0, + y_max = 1.0, + periodic_x = true, + periodic_y = true, + n_quad_points = 4, + x_elem = 16, + y_elem = 16, + ) + """ +end + +""" + arithmetic_test(depth::Int) -> String + +Generate test code for arithmetic operations with given nesting depth. +ClimaCore automatically generates kernels from the broadcast operation. +""" +function arithmetic_test(depth::Int) + # Build expression with `depth` levels of nesting + expr = "x" + ops = ["+", "*", "/", "-"] + for i in 1:depth + op = ops[mod(i, length(ops)) + 1] + val = i + expr = "($expr $op $val.0)" + end + + test_impl = create_spectral_space() * """ + + f = Fields.Field(FT, space) + fill!(Fields.field_values(f), 1.5) + + op(x) = $expr + + _ = op(1.5) + kernel_call!() = op.(f) + run_stress_kernel_test("arithmetic_depth_$(depth)", kernel_call!) + """ + + return generate_field_test_code("arithmetic_depth_$(depth)", test_impl) +end + +""" + multiarg_test(nargs::Int) -> String + +Generate test code for operations with multiple field arguments. +ClimaCore automatically generates kernels from the broadcast operation. +""" +function multiarg_test(nargs::Int) + # Build argument list + args_decl = join( + [ + "f$i = Fields.Field(FT, space);\n fill!(Fields.field_values(f$i), $(Float64(i)))" + for i in 1:nargs + ], + "\n ", + ) + args_list = join(["f$i" for i in 1:nargs], ", ") + bench_args_list = join(["\$f$i" for i in 1:nargs], ", ") + + # Build operation: (f1 + f2 + ...) / (f_last + 1) + sum_expr = join(["f$i" for i in 1:(nargs - 1)], " + ") + op_expr = "($sum_expr) / (f$nargs + 1.0)" + + test_impl = create_spectral_space() * """ + + $args_decl + + op($args_list) = $op_expr + + _ = op($(join(["$(Float64(i))" for i in 1:nargs], ", "))) + kernel_call!() = op.($args_list) + run_stress_kernel_test("multiarg_$(nargs)_args", kernel_call!) + """ + + return generate_field_test_code("multiarg_$(nargs)_args", test_impl) +end + +""" + functions_test(funcs::Vector{String}, depth::Int) -> String + +Generate test code for composed mathematical functions. +ClimaCore automatically generates kernels from the broadcast operation. +""" +function functions_test(funcs::Vector{String}, depth::Int) + label = if funcs == ["log"] + "log" + elseif funcs == ["sqrt"] + "sqrt" + elseif funcs == ["log", "sqrt", "abs"] + "mixed" + else + join(funcs, "_") + end + + test_name = "functions_$(label)_depth_$(depth)" + + # Build nested function composition with domain-safe wrappers for real-valued log/sqrt + expr = "x + 0.5" + for i in depth:-1:1 + func = funcs[mod1(i, length(funcs))] + if func == "log" + expr = "log(abs($expr) + 1.5)" + elseif func == "sqrt" + expr = "sqrt(abs($expr) + 1.5)" + else + expr = "$func($expr)" + end + end + + test_impl = create_spectral_space() * """ + + f = Fields.Field(FT, space) + fill!(Fields.field_values(f), 1.5) + + op(x) = $expr + + _ = op(1.5) + kernel_call!() = op.(f) + run_stress_kernel_test($(repr(test_name)), kernel_call!) + """ + + return generate_field_test_code(test_name, test_impl) +end + +""" + nested_calls_test(depth::Int) -> String + +Generate test code for a chain of helper functions where each helper calls the +previous one, then broadcast the outermost function across a field. +""" +function nested_calls_test(depth::Int) + test_name = "nested_calls_depth_$(depth)" + helper_defs = join(nested_call_definitions(depth), "\n ") + + test_impl = create_spectral_space() * """ + + f = Fields.Field(FT, space) + fill!(Fields.field_values(f), 1.5) + + $helper_defs + op(x) = helper_$(depth)(x) + + _ = op(1.5) + kernel_call!() = op.(f) + run_stress_kernel_test("$(test_name)", kernel_call!) + """ + + return generate_field_test_code(test_name, test_impl) +end + +""" + subexpression_args_test(mode::String) -> String + +Generate test code comparing three strategies for passing a NamedTuple parameter +alongside field arguments inside a broadcast expression: + +- `bare_namedtuple`: passes the NamedTuple directly inside `@.`; Julia refuses to + broadcast over NamedTuples and this **fails at runtime** with + `ArgumentError: broadcasting over dictionaries and NamedTuples is reserved`. + This reproduces the original ClimaAtmos microphysics precipitation-velocity bug. + +- `closure_wrapped`: captures the NamedTuple in a `let` closure so broadcast + never sees it; all subexpressions remain inline. This is the minimal fix. + +- `precomputed`: evaluates each subexpression into a separate field first, then + calls through the same closure. Maximally explicit; fewest inline operations. +""" +function subexpression_args_test(mode::String) + mode in ("bare_namedtuple", "closure_wrapped", "precomputed") || + error("Unknown mode: $mode") + test_name = "subexpression_args_$(mode)" + + # bare_namedtuple passes scheme directly and must fail. + # The other two modes wrap it in a closure so broadcast never touches scheme. + closure_setup = if mode == "bare_namedtuple" + "" # no closure — scheme is passed directly and will error + else + """ + fn_with_scheme = let s = scheme + (q, n, rqi, rqb) -> log(abs(s.c1 * q + s.c2 * n) + 1) + s.c3 * (rqi - rqb) + end + """ + end + + bench_expr = if mode == "bare_namedtuple" + "@. loglambda = my_get_distribution_loglambda(scheme, max(zero(rhoq_ice), rhoq_ice), max(zero(rhon_ice), rhon_ice), ifelse(iszero(rhoq_ice), zero(rhoq_ice), rhoq_rim / rhoq_ice), ifelse(iszero(rhoq_ice), zero(rhoq_ice), rhoq_rim / rhob_rim))" + elseif mode == "closure_wrapped" + "@. loglambda = fn_with_scheme(max(zero(rhoq_ice), rhoq_ice), max(zero(rhon_ice), rhon_ice), ifelse(iszero(rhoq_ice), zero(rhoq_ice), rhoq_rim / rhoq_ice), ifelse(iszero(rhoq_ice), zero(rhoq_ice), rhoq_rim / rhob_rim))" + else # precomputed + "@. loglambda = fn_with_scheme(rhoq_ice_pos, rhon_ice_pos, rim_over_ice, rim_over_bulk)" + end + + precompute_block = if mode == "precomputed" + """ + @. rhoq_ice_pos = max(zero(rhoq_ice), rhoq_ice) + @. rhon_ice_pos = max(zero(rhon_ice), rhon_ice) + @. rim_over_ice = ifelse(iszero(rhoq_ice), zero(rhoq_ice), rhoq_rim / rhoq_ice) + @. rim_over_bulk = ifelse(iszero(rhoq_ice), zero(rhoq_ice), rhoq_rim / rhob_rim) + """ + else + "" + end + + # bare_namedtuple is expected to fail; catch and report rather than crashing. + bench_block = if mode == "bare_namedtuple" + """ + try + $bench_expr + @printf "TIMING: $(test_name) = 0.0 s\\n" # should not reach here + catch _e + println("EXPECTED_FAILURE: $(test_name): \$(sprint(showerror, _e))") + end + """ + else + """ + kernel_call!() = begin + $bench_expr + end + run_stress_kernel_test("$(test_name)", kernel_call!) + """ + end + + test_impl = create_spectral_space() * """ + + rhoq_ice = Fields.Field(FT, space) + rhon_ice = Fields.Field(FT, space) + rhoq_rim = Fields.Field(FT, space) + rhob_rim = Fields.Field(FT, space) + loglambda = Fields.Field(FT, space) + rhoq_ice_pos = Fields.Field(FT, space) + rhon_ice_pos = Fields.Field(FT, space) + rim_over_ice = Fields.Field(FT, space) + rim_over_bulk = Fields.Field(FT, space) + + fill!(Fields.field_values(rhoq_ice), 1.5) + fill!(Fields.field_values(rhon_ice), 0.5) + fill!(Fields.field_values(rhoq_rim), 0.75) + fill!(Fields.field_values(rhob_rim), 2.0) + fill!(Fields.field_values(loglambda), 0.0) + + scheme = (; c1 = FT(0.7), c2 = FT(1.1), c3 = FT(0.4)) + my_get_distribution_loglambda(s, q, n, rqi, rqb) = + log(abs(s.c1 * q + s.c2 * n) + 1) + s.c3 * (rqi - rqb) + + $precompute_block + $closure_setup + $bench_block + """ + + return generate_field_test_code(test_name, test_impl) +end + +""" + projection_test(complexity::Int) -> String + +Generate test code for projection operations on geometric objects. +ClimaCore automatically generates kernels from the broadcast operation. +""" +function projection_test(complexity::Int) + # Use @. macro so ClimaCore can supply LocalGeometry during the fused broadcast + proj_terms = join( + ["Geometry.project(Geometry.Covariant12Axis(), v)" for _ in 1:complexity], + " .+ ", + ) + test_name = "projection_$(complexity)x_chained" + + test_impl = create_spectral_space() * """ + + v = Fields.Field(Geometry.Covariant12Vector{FT}, space) + fill!(Fields.field_values(v), Geometry.Covariant12Vector(1.0, 2.0)) + + kernel_call!() = begin + @. $proj_terms + end + run_stress_kernel_test("$(test_name)", kernel_call!) + """ + + return generate_field_test_code(test_name, test_impl) +end + +""" + create_column_space() + +Helper function to create a vertical column (finite difference) space setup code string. +Produces both center and face spaces needed for C2F/F2C operators. +""" +function create_column_space() + return """ + FT = Float64 + center_space = CommonSpaces.ColumnSpace(FT; + z_min = 0.0, + z_max = 1.0, + z_elem = 63, + staggering = Grids.CellCenter(), + ) + face_space = CommonSpaces.ColumnSpace(FT; + z_min = 0.0, + z_max = 1.0, + z_elem = 63, + staggering = Grids.CellFace(), + ) + """ +end + +""" + div_test(n::Int) -> String + +Generate test code that packs n Divergence calls into a single broadcast expression. +Tests how many spectral-element divergences the compiler can inline before giving up. +""" +function div_test(n::Int) + warm = join(["div_op.(v .* $(i).0)" for i in 1:n], " .+ ") + bench = join(["\$div_op.(\$v .* $(i).0)" for i in 1:n], " .+ ") + + test_impl = create_spectral_space() * """ + + using ClimaCore.Operators + + div_op = Operators.Divergence() + v = Fields.Field(Geometry.Contravariant12Vector{FT}, space) + fill!(Fields.field_values(v), Geometry.Contravariant12Vector(1.0, 2.0)) + + kernel_call!() = begin + $warm + end + run_stress_kernel_test("div_$(n)_ops", kernel_call!) + """ + + return generate_field_test_code("div_$(n)_ops", test_impl) +end + +""" + curl_test(n::Int) -> String + +Generate test code that packs n Curl calls into a single broadcast expression. +Tests how many spectral-element curls the compiler can inline before giving up. +""" +function curl_test(n::Int) + warm = join(["curl_op.(v .* $(i).0)" for i in 1:n], " .+ ") + bench = join(["\$curl_op.(\$v .* $(i).0)" for i in 1:n], " .+ ") + + test_impl = create_spectral_space() * """ + + using ClimaCore.Operators + + curl_op = Operators.Curl() + v = Fields.Field(Geometry.Covariant12Vector{FT}, space) + fill!(Fields.field_values(v), Geometry.Covariant12Vector(1.0, 2.0)) + + kernel_call!() = begin + $warm + end + run_stress_kernel_test("curl_$(n)_ops", kernel_call!) + """ + + return generate_field_test_code("curl_$(n)_ops", test_impl) +end + +""" + interp_test(n::Int) -> String + +Generate test code that packs n InterpolateC2F calls into a single broadcast expression. +Tests how many center-to-face interpolations the compiler can inline before giving up. +""" +function interp_test(n::Int) + warm = join(["interp.(ᶜf .* $(i).0)" for i in 1:n], " .+ ") + bench = join(["\$interp.(\$ᶜf .* $(i).0)" for i in 1:n], " .+ ") + + test_impl = create_column_space() * """ + + using ClimaCore.Operators + + interp = Operators.InterpolateC2F( + bottom = Operators.Extrapolate(), + top = Operators.Extrapolate(), + ) + ᶜf = Fields.Field(FT, center_space) + fill!(Fields.field_values(ᶜf), 1.5) + + kernel_call!() = begin + $warm + end + run_stress_kernel_test("interp_c2f_$(n)_ops", kernel_call!) + """ + + return generate_field_test_code("interp_c2f_$(n)_ops", test_impl) +end + +""" + weighted_interp_test(n::Int) -> String + +Generate test code that packs n WeightedInterpolateC2F calls into a single broadcast expression. +Tests how many weighted center-to-face interpolations the compiler can inline before giving up. +""" +function weighted_interp_test(n::Int) + warm = join(["winterp.(ᶜw, ᶜf .* $(i).0)" for i in 1:n], " .+ ") + bench = join(["\$winterp.(\$ᶜw, \$ᶜf .* $(i).0)" for i in 1:n], " .+ ") + + test_impl = create_column_space() * """ + + using ClimaCore.Operators + + winterp = Operators.WeightedInterpolateC2F( + bottom = Operators.Extrapolate(), + top = Operators.Extrapolate(), + ) + ᶜw = Fields.Field(FT, center_space) + ᶜf = Fields.Field(FT, center_space) + fill!(Fields.field_values(ᶜw), 1.0) + fill!(Fields.field_values(ᶜf), 1.5) + + kernel_call!() = begin + $warm + end + run_stress_kernel_test("weighted_interp_c2f_$(n)_ops", kernel_call!) + """ + + return generate_field_test_code("weighted_interp_c2f_$(n)_ops", test_impl) +end + +""" + upwinding_test(n::Int) -> String + +Generate test code that packs n Upwind3rdOrderBiasedProductC2F calls into a single +broadcast expression. Tests how many 3rd-order upwind flux evaluations the compiler +can inline before giving up. +""" +function upwinding_test(n::Int) + warm = join(["upwind.(ᶠv, ᶜf .* $(i).0)" for i in 1:n], " .+ ") + bench = join(["\$upwind.(\$ᶠv, \$ᶜf .* $(i).0)" for i in 1:n], " .+ ") + + test_impl = create_column_space() * """ + + using ClimaCore.Operators + + upwind = Operators.Upwind3rdOrderBiasedProductC2F( + bottom = Operators.ThirdOrderOneSided(), + top = Operators.ThirdOrderOneSided(), + ) + ᶠv = Fields.Field(Geometry.WVector{FT}, face_space) + ᶜf = Fields.Field(FT, center_space) + fill!(Fields.field_values(ᶠv), Geometry.WVector(1.0)) + fill!(Fields.field_values(ᶜf), 1.5) + + kernel_call!() = begin + $warm + end + run_stress_kernel_test("upwinding_3rdorder_$(n)_ops", kernel_call!) + """ + + return generate_field_test_code("upwinding_3rdorder_$(n)_ops", test_impl) +end + +""" + climaatmos_column_test(repeats::Int) -> String + +Generate a ClimaAtmos-like fused column broadcast that mixes closure-wrapped +microphysics-style subexpressions with interpolation and upwinding. Each repeat +adds another heavy closure invocation to the same broadcast expression. +""" +function climaatmos_column_test(repeats::Int) + test_name = "climaatmos_column_$(repeats)x" + bench_expr = climaatmos_column_expression(repeats) + + test_impl = create_column_space() * """ + + using ClimaCore.Operators + + interp = Operators.InterpolateC2F( + bottom = Operators.Extrapolate(), + top = Operators.Extrapolate(), + ) + winterp = Operators.WeightedInterpolateC2F( + bottom = Operators.Extrapolate(), + top = Operators.Extrapolate(), + ) + upwind = Operators.Upwind3rdOrderBiasedProductC2F( + bottom = Operators.ThirdOrderOneSided(), + top = Operators.ThirdOrderOneSided(), + ) + + ᶜw = Fields.Field(FT, center_space) + ᶜn = Fields.Field(FT, center_space) + ᶠv = Fields.Field(Geometry.WVector{FT}, face_space) + rhoq_ice = Fields.Field(FT, center_space) + rhon_ice = Fields.Field(FT, center_space) + rhoq_rim = Fields.Field(FT, center_space) + rhob_rim = Fields.Field(FT, center_space) + tendency = Fields.Field(FT, face_space) + + fill!(Fields.field_values(ᶜw), 1.0) + fill!(Fields.field_values(ᶜn), 0.8) + fill!(Fields.field_values(ᶠv), Geometry.WVector(0.75)) + fill!(Fields.field_values(rhoq_ice), 1.5) + fill!(Fields.field_values(rhon_ice), 0.5) + fill!(Fields.field_values(rhoq_rim), 0.75) + fill!(Fields.field_values(rhob_rim), 2.0) + fill!(Fields.field_values(tendency), 0.0) + + scheme = (; c1 = FT(0.7), c2 = FT(1.1), c3 = FT(0.4), c4 = FT(0.2)) + fn_with_scheme = let s = scheme + (qp, np, rim1, rim2, qface, rface) -> qp + end + + kernel_call!() = begin + $bench_expr + end + + run_stress_kernel_test("$(test_name)", kernel_call!) + """ + + return generate_field_test_code(test_name, test_impl) +end + +function lazy_broadcast_tree_test(depth::Int, breadth::Int) + tree_builder = lazy_broadcast_tree_builder(depth, breadth) + fields_decl = join( + [ + "f$i = Fields.Field(FT, space)\n fill!(Fields.field_values(f$i), $(0.75 + 0.1 * i))" + for i in 1:breadth + ], + "\n ", + ) + test_name = "lazy_broadcast_d$(depth)_b$(breadth)" + + test_impl = create_spectral_space() * """ + + $fields_decl + + kernel_call!() = begin + tree = begin + $tree_builder + end + Base.copy(tree) + end + run_stress_kernel_test("$(test_name)", kernel_call!) + """ + + return generate_field_test_code(test_name, test_impl) +end + +# ============================================================================ +# TEST CATALOG: Define all test cases +# ============================================================================ + +""" + struct TestDef + +Definition of a single test case for generation and execution. +""" +struct TestDef + name::String + description::String + operation_type::String # "arithmetic", "multiarg", "functions", "nested_calls", "subexpression_args", "projection", "divergence", "curl", "interpolate", "weighted_interpolate", "upwinding", "climaatmos", "lazy_broadcast_tree" + complexity::Int # nesting depth or argument count + num_args::Int + uses_geometry::Bool + code_generator::Function # Function that generates test code +end + +# Create test definitions +const ALL_TESTS = + [ + # Arithmetic: keep one sanity case and one edge case + [ + TestDef("arithmetic_depth_$i", "Arithmetic operations with depth $i", + "arithmetic", i, 1, false, + () -> arithmetic_test(i)) for i in [1, 24] + ] + + # Multiple arguments: low and high fan-in + [ + TestDef("multiarg_$(i)_args", "Operations with $i field arguments", + "multiarg", 1, i, false, + () -> multiarg_test(i)) for i in [2, 16] + ] + + # Function compositions: low and high depth + [ + TestDef("functions_log_depth_$i", "Log function composed $i times", + "functions", i, 1, false, + () -> functions_test(["log"], i)) for i in [1, 6] + ] + [ + TestDef("functions_sqrt_depth_$i", "Sqrt function composed $i times", + "functions", i, 1, false, + () -> functions_test(["sqrt"], i)) for i in [1, 6] + ] + [ + TestDef("functions_mixed_depth_$i", "Mixed functions (log, sqrt, abs) depth $i", + "functions", i, 1, false, + () -> functions_test(["log", "sqrt", "abs"], i)) for i in [1, 4] + ] + + # Nested helper-function call chains: shallow and deep + [ + TestDef("nested_calls_depth_$i", "Helper-function call chain depth $i", + "nested_calls", i, 1, false, + () -> nested_calls_test(i)) for i in [1, 24] + ] + + # Function-call args with inline subexpressions vs precomputed intermediates + [ + TestDef("subexpression_args_bare_namedtuple", + "NamedTuple param passed bare inside @. (expected failure)", + "subexpression_args", 0, 5, false, + () -> subexpression_args_test("bare_namedtuple")) + TestDef("subexpression_args_closure_wrapped", + "NamedTuple param captured in let-closure (minimal fix)", + "subexpression_args", 1, 5, false, + () -> subexpression_args_test("closure_wrapped")) + TestDef("subexpression_args_precomputed", + "Subexpressions precomputed into separate fields", + "subexpression_args", 2, 5, false, + () -> subexpression_args_test("precomputed")) + ] + + # Projection operations: include one intermediate near edge + [ + TestDef("projection_$(i)x_chained", "Chained projection operations x$i", + "projection", i, 1, true, + () -> projection_test(i)) for i in [1, 8, 12] + ] + + # Divergence operations: add near-edge points to better resolve the cliff + [ + TestDef("div_$(i)_ops", "$i Divergence calls in one expression", + "divergence", i, 1, true, + () -> div_test(i)) for i in [1, 8, 12, 14] + ] + + # Curl operations: add near-edge points to better resolve the cliff + [ + TestDef("curl_$(i)_ops", "$i Curl calls in one expression", + "curl", i, 1, true, + () -> curl_test(i)) for i in [1, 8, 12, 14] + ] + + # C2F interpolation: include intermediate and near-edge points + [ + TestDef("interp_c2f_$(i)_ops", "$i InterpolateC2F calls in one expression", + "interpolate", i, 1, false, + () -> interp_test(i)) for i in [1, 8, 12, 14, 16] + ] + + # Weighted C2F interpolation: include intermediate and near-edge points + [ + TestDef("weighted_interp_c2f_$(i)_ops", + "$i WeightedInterpolateC2F calls in one expression", + "weighted_interpolate", i, 1, false, + () -> weighted_interp_test(i)) for i in [1, 8, 12] + ] + + # Van Leer upwinding: include intermediate and near-edge points + [ + TestDef("upwinding_3rdorder_$(i)_ops", + "$i Upwind3rdOrderBiasedProductC2F calls in one expression", + "upwinding", i, 1, false, + () -> upwinding_test(i)) for i in [1, 8] + ] + + # ClimaAtmos-like fused column broadcasts: shallow and deep + [ + TestDef("climaatmos_column_$(i)x", + "ClimaAtmos-like fused column broadcast repeated x$i", + "climaatmos", i, 6, false, + () -> climaatmos_column_test(i)) for i in [1, 6] + ] + + # Nested lazy-broadcast trees (depth = number of broadcasted layers) + # Breadth=2 series: fine-grained depth sweep to find inlining cliffs + [ + TestDef( + "lazy_broadcast_d$(d)_b2", + "Nested lazy broadcast depth=$(d), breadth=2", + "lazy_broadcast_tree", + d, + 2, + false, + () -> lazy_broadcast_tree_test(d, 2), + ) for d in [1, 4, 8] + ] + + # Breadth=4 series: moderate fan-in per lazy layer + [ + TestDef( + "lazy_broadcast_d$(d)_b4", + "Nested lazy broadcast depth=$(d), breadth=4", + "lazy_broadcast_tree", + d, + 4, + false, + () -> lazy_broadcast_tree_test(d, 4), + ) for d in [1, 4] + ] + ] |> vec + +# ============================================================================ +# EXECUTION AND REPORTING +# ============================================================================ + +""" + mutable struct TestResult + +Stores the result of a single test execution. +""" +mutable struct TestResult + test_def::TestDef + success::Bool + time_seconds::Union{Float64, Nothing} + error_msg::String + cuda_profile_summary::Union{CUDAProfileSummary, Nothing} + llvm_analysis_summary::Union{LLVMAnalysisSummary, Nothing} + + TestResult(test_def) = new(test_def, false, nothing, "", nothing, nothing) +end + +function Base.show(io::IO, r::TestResult) + println(io, "TestResult(") + println(io, " name = ", repr(r.test_def.name)) + println(io, " success = ", r.success) + if r.success && !isnothing(r.time_seconds) + @printf io " time_μs = %.3f\n" r.time_seconds * 1e6 + end + if !isempty(r.error_msg) + println(io, " error = ", repr(r.error_msg)) + end + if !isnothing(r.cuda_profile_summary) + # Indent the nested struct by prepending two spaces to each of its lines + profile_str = sprint(show, r.cuda_profile_summary) + indented = join(" " .* split(profile_str, '\n'), '\n') + println(io, " cuda_profile = ", indented) + end + if !isnothing(r.llvm_analysis_summary) + llvm_str = sprint(show, r.llvm_analysis_summary) + indented_llvm = join(" " .* split(llvm_str, '\n'), '\n') + println(io, " llvm_analysis = ", indented_llvm) + end + print(io, ")") +end + +""" + run_test(test_def::TestDef) -> TestResult + +Run a single test case in a subprocess and collect results. +""" +function run_test(test_def::TestDef, analysis_mode::String) + result = TestResult(test_def) + + # Generate test code + test_code = test_def.code_generator() + + # Run in subprocess + success, output, error = run_test_subprocess(test_code, test_def.name, analysis_mode) + + if success + result.cuda_profile_summary = + summarize_cuda_profiles(parse_cuda_profile_from_output(output)) + result.llvm_analysis_summary = parse_llvm_analysis_from_output(output) + # Parse timing from output + timings = parse_timings_from_output(output) + if haskey(timings, test_def.name) + result.time_seconds = timings[test_def.name] + result.success = true + elseif any( + startswith(l, "EXPECTED_FAILURE: $(test_def.name)") for l in split(output, '\n') + ) + # Expected failure: extract error message + for l in split(output, '\n') + if startswith(l, "EXPECTED_FAILURE: $(test_def.name)") + result.error_msg = + replace(l, "EXPECTED_FAILURE: $(test_def.name): " => "", count = 1) + break + end + end + result.success = true + result.time_seconds = nothing + else + result.error_msg = "Failed to parse timing from output" + end + else + # Subprocess failed: try to extract a structured COMPILE_FAILURE line from + # combined output before falling back to raw stderr. + compile_failure_msg = nothing + for l in split(output, '\n') + if startswith(l, "COMPILE_FAILURE: $(test_def.name) reason=") + compile_failure_msg = + replace(l, "COMPILE_FAILURE: $(test_def.name) reason=" => "", count = 1) + break + end + end + result.error_msg = + !isnothing(compile_failure_msg) ? compile_failure_msg : error + # LLVM analysis may have been emitted before the failure — capture it. + result.llvm_analysis_summary = parse_llvm_analysis_from_output(output) + end + + return result +end + +""" + print_result(result::TestResult) + +Pretty-print a test result. +""" +function print_result(result::TestResult) + test = result.test_def + if result.success && !isnothing(result.time_seconds) + time_μs = result.time_seconds * 1e6 + @printf " %-45s %10.3f μs" test.name time_μs + @printf " (depth=%d, args=%d)" test.complexity test.num_args + if test.uses_geometry + print(" [geometry]") + end + println() + if !isnothing(result.cuda_profile_summary) + println(" " * format_cuda_profile(result.cuda_profile_summary)) + end + if !isnothing(result.llvm_analysis_summary) + println(" " * format_llvm_analysis(result.llvm_analysis_summary)) + end + elseif result.success + # Expected-failure case: documented failure mode, no timing. + @printf " %-45s [expected failure: %s]\n" test.name result.error_msg + else + @printf " %-45s ERROR: %s\n" test.name result.error_msg + end +end + +""" + main(; test_filter::Union{String, Nothing}=nothing) + +Run all tests and produce a report. +""" +function main(; + test_filter::Union{String, Nothing} = nothing, + output_json::Union{String, Nothing} = nothing, + output_markdown::Union{String, Nothing} = nothing, + compare_against::Union{String, Nothing} = nothing, + analysis_mode::String = "timing", +) + println("="^90) + println("ClimaCore Compiler Stress Test Suite - Pointwise/Broadcast Operations") + println("Device: $(DEVICE)") + println("Analysis mode: $(analysis_mode)") + has_cuda_env() && println("CUDA warnings disabled to catch only actual failures") + println("="^90) + println() + + # Filter tests if requested + tests = if isnothing(test_filter) + ALL_TESTS + else + filter(t -> contains(t.name, test_filter), ALL_TESTS) + end + + if isempty(tests) + println("No tests matching filter: $test_filter") + println("\nAvailable test categories:") + for op_type in sort(unique(t.operation_type for t in ALL_TESTS)) + matching = filter(t -> t.operation_type == op_type, ALL_TESTS) + println(" $op_type: $(length(matching)) tests") + for test in matching[1:min(3, length(matching))] + println(" - $(test.name)") + end + length(matching) > 3 && println(" ... and $(length(matching) - 3) more") + end + return nothing + end + + println("Running $(length(tests)) test case(s):") + println() + + results = TestResult[] + + # Run all tests + for (i, test) in enumerate(tests) + @printf "[%2d/%2d] %-45s ... " i length(tests) test.name + flush(stdout) + + result = run_test(test, analysis_mode) + push!(results, result) + + if result.success + println("✓") + else + println("✗") + end + end + + println() + println("="^90) + println("Results") + println("="^90) + println() + + # Group by operation type + by_type = Dict{String, Vector{TestResult}}() + for result in results + op_type = result.test_def.operation_type + if !haskey(by_type, op_type) + by_type[op_type] = [] + end + push!(by_type[op_type], result) + end + + # Print results by type + for op_type in sort(collect(keys(by_type))) + type_results = by_type[op_type] + successful = count(r -> r.success, type_results) + + println("$op_type operations ($successful/$(length(type_results)) successful):") + + for result in sort(type_results, by = r -> r.test_def.complexity) + print_result(result) + end + + println() + end + + # Summary statistics + successful = filter(r -> r.success, results) + if !isempty(successful) + println("="^90) + println("Performance Summary") + println("="^90) + + times = [r.time_seconds * 1e6 for r in successful if !isnothing(r.time_seconds)] # convert to microseconds + + if isempty(times) + println("Execution times (microseconds): no timed successes") + else + println("Execution times (microseconds):") + @printf " Minimum: %.3f μs\n" minimum(times) + @printf " Maximum: %.3f μs\n" maximum(times) + @printf " Mean: %.3f μs\n" mean(times) + if length(times) >= 2 + @printf " Median: %.3f μs\n" median(times) + end + end + end + + println() + num_successful = length(successful) + num_failed = length(results) - num_successful + + if num_failed == 0 + println("✓ All $(num_successful) tests passed!") + else + println("✗ $(num_failed) test(s) failed out of $(length(results)) total") + end + + report = build_report(results, test_filter, analysis_mode) + comparison = nothing + if !isnothing(compare_against) + baseline_path = resolve_output_path(compare_against) + baseline_report = read_json_report(baseline_path) + comparison = compare_reports(report, baseline_report) + report["comparison"] = comparison + end + if !isnothing(output_json) + output_json = resolve_output_path(output_json) + write_json_report(output_json, report) + println("Wrote JSON report: $(output_json)") + end + if !isnothing(output_markdown) + output_markdown = resolve_output_path(output_markdown) + write_markdown_report(output_markdown, report; comparison) + println("Wrote markdown report: $(output_markdown)") + end + + return report +end + +# ============================================================================ +# Entry point +# ============================================================================ + +if abspath(PROGRAM_FILE) == @__FILE__ + # Initialize project first + initialize_project() + + options = parse_cli_args(ARGS) + main( + ; + test_filter = options.test_filter, + output_json = options.output_json, + output_markdown = options.output_markdown, + compare_against = options.compare_against, + analysis_mode = options.analysis_mode, + ) +end diff --git a/test/gpu/compiler_stress_regression.jl b/test/gpu/compiler_stress_regression.jl new file mode 100644 index 0000000000..75961d4339 --- /dev/null +++ b/test/gpu/compiler_stress_regression.jl @@ -0,0 +1,58 @@ +using Test +using CUDA + +# Regression tests for compiler stress behavior near known failure thresholds. +# These use compile-only mode for speed while still compiling GPU kernels. + +include(joinpath(@__DIR__, "..", "..", "perf", "stress_test_compiler.jl")) + +function _find_stress_test(name::String) + idx = findfirst(t -> t.name == name, ALL_TESTS) + @test !isnothing(idx) + return ALL_TESTS[idx] +end + +function _run_compile_mode(test_name::String) + old_slurm_job_id = get(ENV, "SLURM_JOB_ID", nothing) + # Force local subprocess execution in tests (avoid nested `srun`). + ENV["SLURM_JOB_ID"] = "climacore-test" + try + return run_test(_find_stress_test(test_name), "compile") + finally + if isnothing(old_slurm_job_id) + delete!(ENV, "SLURM_JOB_ID") + else + ENV["SLURM_JOB_ID"] = old_slurm_job_id + end + end +end + +@testset "GPU compiler stress regressions" begin + @test CUDA.functional() + + # Near-threshold pass should continue to compile. + div12 = _run_compile_mode("div_12_ops") + @test div12.success + @test !isnothing(div12.cuda_profile_summary) + @test div12.cuda_profile_summary.registers >= 48 + + # Known brink failures should remain explicit failures (not silent passes). + div14 = _run_compile_mode("div_14_ops") + @test !div14.success + + curl14 = _run_compile_mode("curl_14_ops") + @test !curl14.success + + # Nested lazy-broadcast case should still compile in compile-only mode. + # This is known to fail on Julia v1.10 and earlier, but should pass on v1.11+. + lazy_d4_b2 = _run_compile_mode("lazy_broadcast_d4_b2") + if VERSION < v"1.11" + @test_broken lazy_d4_b2.success + else + @test lazy_d4_b2.success + end + if lazy_d4_b2.success + @test !isnothing(lazy_d4_b2.llvm_analysis_summary) + @test lazy_d4_b2.llvm_analysis_summary.invoke_count == 0 + end +end diff --git a/test/runtests.jl b/test/runtests.jl index ecd1264ff7..d993837649 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -116,6 +116,7 @@ UnitTest("Vertical interpolation" ,"Remapping/interpolate_press UnitTest("Aqua" ,"aqua.jl"), UnitTest("Deprecations" ,"deprecations.jl"), UnitTest("GPU - cuda" ,"gpu/cuda.jl";meta=:gpu_only), +UnitTest("GPU - compiler stress regression" ,"gpu/compiler_stress_regression.jl";meta=:gpu_only), UnitTest("GPU - data" ,"DataLayouts/cuda.jl";meta=:gpu_only), UnitTest("Operators - spectral element CUDA" ,"Operators/spectralelement/rectilinear_cuda.jl";meta=:gpu_only), UnitTest("Operators - finite difference CUDA" ,"Operators/hybrid/unit_cuda.jl";meta=:gpu_only),