Skip to content
Draft
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
7b59f0f
Initial work on compiler profiling.
mcourteaux Mar 8, 2026
deea8f8
Refine lambda argument requirements in IRMutator and IRVisitor
alexreinking Mar 8, 2026
6bed833
Early exit in the loop-checking visitor
alexreinking Mar 8, 2026
4ea11aa
Compute last_use in-line
alexreinking Mar 8, 2026
2e9f117
Avoid redundant FindBufferUsage in For loop visitor
alexreinking Mar 8, 2026
9094ea8
fixup! Compute last_use in-line
alexreinking Mar 8, 2026
de8dc95
More profiling stuff.
mcourteaux Mar 8, 2026
def7be2
Fix build when not compiling in profiling.
mcourteaux Mar 8, 2026
dfc98d9
Disable RTTI naming when it's not enabled in the build config.
mcourteaux Mar 9, 2026
23071b2
Merge remote-tracking branch 'origin/alexreinking/inject-host-copies-…
mcourteaux Mar 9, 2026
b297402
Cleanup.
mcourteaux Mar 9, 2026
25aef21
Annotate InjectHostDevBufferCopies
mcourteaux Mar 9, 2026
cd1488a
Annotate Bounds and AddImageChecks
mcourteaux Mar 9, 2026
01cb49c
More annotating.
mcourteaux Mar 9, 2026
ec468a4
Clang-format and makefile fix, and support no RTTI.
mcourteaux Mar 9, 2026
f833bfd
Missing header in makefile.
mcourteaux Mar 9, 2026
865c601
Merge branch 'main' into compiler-profiling
mcourteaux Mar 14, 2026
2f3c0f3
Remove Profiled<...> from all mutators/visitors.
mcourteaux Mar 14, 2026
8c7aaed
Strip PerformanceCounter and use chrono instead, for simplicity.
mcourteaux Mar 14, 2026
c53b74b
Ditch profiled_xxx in favor of a simple call to operator()(...)
mcourteaux Mar 14, 2026
5738ed9
Change the main entry point of visitors and mutators to operator().
mcourteaux Mar 14, 2026
e6f9238
Clang-format
mcourteaux Mar 14, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,7 @@ Halide_feature(WITH_DOCS "Halide's Doxygen documentation" OFF)
Halide_feature(WITH_PACKAGING "Halide's CMake package install rules" TOP_LEVEL)
Halide_feature(WITH_PYTHON_BINDINGS "Halide's native Python module (not the whole pip package)" ON
DEPENDS Halide_ENABLE_EXCEPTIONS AND Halide_ENABLE_RTTI)
Halide_feature(WITH_COMPILER_PROFILING "Enable internal compiler tracing" OFF)
Halide_feature(WITH_SERIALIZATION "Include experimental Serialization/Deserialization code" ON)
Halide_feature(WITH_SERIALIZATION_JIT_ROUNDTRIP_TESTING
"Intercepting JIT compilation with a serialization roundtrip, for test only"
Expand Down
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -483,6 +483,7 @@ SOURCE_FILES = \
CodeGen_WebAssembly.cpp \
CodeGen_WebGPU_Dev.cpp \
CodeGen_X86.cpp \
CompilerProfiling.cpp \
CompilerLogger.cpp \
ConstantBounds.cpp \
ConstantInterval.cpp \
Expand Down Expand Up @@ -677,6 +678,7 @@ HEADER_FILES = \
CodeGen_PyTorch.h \
CodeGen_Targets.h \
CodeGen_WebGPU_Dev.h \
CompilerProfiling.h \
CompilerLogger.h \
ConciseCasts.h \
CPlusPlusMangle.h \
Expand Down Expand Up @@ -759,6 +761,7 @@ HEADER_FILES = \
Param.h \
Parameter.h \
PartitionLoops.h \
PerformanceCounter.h \
Pipeline.h \
Prefetch.h \
PrefetchDirective.h \
Expand Down
17 changes: 11 additions & 6 deletions src/AddImageChecks.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "AddImageChecks.h"
#include "CompilerProfiling.h"
#include "ExternFuncArgument.h"
#include "Function.h"
#include "IRMutator.h"
Expand Down Expand Up @@ -103,6 +104,7 @@ class TrimStmtToPartsThatAccessBuffers : public IRMutator {
bool touches_buffer = false;
const map<string, FindBuffers::Result> &buffers;

protected:
using IRMutator::visit;

Expr visit(const Call *op) override {
Expand Down Expand Up @@ -161,11 +163,12 @@ Stmt add_image_checks_inner(Stmt s,
const map<string, Function> &env,
const FuncValueBounds &fb,
bool will_inject_host_copies) {
ZoneScoped;

bool no_bounds_query = t.has_feature(Target::NoBoundsQuery);

// First hunt for all the referenced buffers
FindBuffers finder;
Profiled<FindBuffers> finder;
map<string, FindBuffers::Result> &bufs = finder.buffers;

// Add the output buffer(s).
Expand All @@ -185,10 +188,10 @@ Stmt add_image_checks_inner(Stmt s,

// Add the input buffer(s) and annotate which output buffers are
// used on host.
s.accept(&finder);
finder.profiled_visit(s);

Scope<Interval> empty_scope;
Stmt sub_stmt = TrimStmtToPartsThatAccessBuffers(bufs).mutate(s);
Stmt sub_stmt = Profiled<TrimStmtToPartsThatAccessBuffers>(bufs).profiled_mutate(s);
map<string, Box> boxes = boxes_touched(sub_stmt, empty_scope, fb);

// Now iterate through all the buffers, creating a list of lets
Expand Down Expand Up @@ -225,7 +228,7 @@ Stmt add_image_checks_inner(Stmt s,
string extent_name = concat_strings(name, ".extent.", i);
string stride_name = concat_strings(name, ".stride.", i);
replace_with_required[min_name] = Variable::make(Int(32), min_name + ".required");
replace_with_required[extent_name] = simplify(Variable::make(Int(32), extent_name + ".required"));
replace_with_required[extent_name] = Variable::make(Int(32), extent_name + ".required");
replace_with_required[stride_name] = Variable::make(Int(32), stride_name + ".required");
}
}
Expand Down Expand Up @@ -737,6 +740,7 @@ Stmt add_image_checks(const Stmt &s,
// Checks for images go at the marker deposited by computation
// bounds inference.
class Injector : public IRMutator {
protected:
using IRMutator::visit;

Expr visit(const Variable *op) override {
Expand Down Expand Up @@ -794,9 +798,10 @@ Stmt add_image_checks(const Stmt &s,
bool will_inject_host_copies)
: outputs(outputs), t(t), order(order), env(env), fb(fb), will_inject_host_copies(will_inject_host_copies) {
}
} injector(outputs, t, order, env, fb, will_inject_host_copies);
};
Profiled<Injector> injector(outputs, t, order, env, fb, will_inject_host_copies);

return injector.mutate(s);
return injector.profiled_mutate(s);
}

} // namespace Internal
Expand Down
3 changes: 2 additions & 1 deletion src/AddParameterChecks.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "AddParameterChecks.h"
#include "CompilerProfiling.h"
#include "IROperator.h"
#include "IRVisitor.h"
#include "Substitute.h"
Expand Down Expand Up @@ -35,7 +36,7 @@ class FindParameters : public IRGraphVisitor {
Stmt add_parameter_checks(const vector<Stmt> &preconditions, Stmt s, const Target &t) {

// First, find all the parameters
FindParameters finder;
Profiled<FindParameters> finder;
s.accept(&finder);

map<string, Expr> replace_with_constrained;
Expand Down
24 changes: 18 additions & 6 deletions src/AsyncProducers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ class NoOpCollapsingMutator : public IRMutator {
};

class GenerateProducerBody : public NoOpCollapsingMutator {
protected:
const string &func;
vector<Expr> sema;
std::set<string> producers_dropped;
Expand Down Expand Up @@ -285,6 +286,7 @@ class GenerateProducerBody : public NoOpCollapsingMutator {
};

class GenerateConsumerBody : public NoOpCollapsingMutator {
protected:
const string &func;
vector<Expr> sema;

Expand Down Expand Up @@ -342,6 +344,7 @@ class GenerateConsumerBody : public NoOpCollapsingMutator {
};

class CloneAcquire : public IRMutator {
protected:
using IRMutator::visit;

const string &old_name;
Expand Down Expand Up @@ -390,6 +393,7 @@ class CountConsumeNodes : public IRVisitor {
};

class ForkAsyncProducers : public IRMutator {
protected:
using IRMutator::visit;

const map<string, Function> &env;
Expand Down Expand Up @@ -493,6 +497,7 @@ class ForkAsyncProducers : public IRMutator {
// simple failure case, error_async_require_fail. One has not been
// written for the complex nested case yet.)
class InitializeSemaphores : public IRMutator {
protected:
using IRMutator::visit;

const Type sema_type = type_of<halide_semaphore_t *>();
Expand Down Expand Up @@ -558,6 +563,7 @@ class InitializeSemaphores : public IRMutator {
// A class to support stmt_uses_vars queries that repeatedly hit the same
// sub-stmts. Used to support TightenProducerConsumerNodes below.
class CachingStmtUsesVars : public IRMutator {
protected:
const Scope<> &query;
bool found_use = false;
std::map<Stmt, bool> cache;
Expand Down Expand Up @@ -613,6 +619,7 @@ class CachingStmtUsesVars : public IRMutator {

// Tighten the scope of consume nodes as much as possible to avoid needless synchronization.
class TightenProducerConsumerNodes : public IRMutator {
protected:
using IRMutator::visit;

Stmt make_producer_consumer(const string &name, bool is_producer, Stmt body, const Scope<> &scope, CachingStmtUsesVars &uses_vars) {
Expand Down Expand Up @@ -703,6 +710,7 @@ class TightenProducerConsumerNodes : public IRMutator {

// Update indices to add ring buffer.
class UpdateIndices : public IRMutator {
protected:
using IRMutator::visit;

Stmt visit(const Provide *op) override {
Expand Down Expand Up @@ -734,6 +742,7 @@ class UpdateIndices : public IRMutator {

// Inject ring buffering.
class InjectRingBuffering : public IRMutator {
protected:
using IRMutator::visit;

struct Loop {
Expand Down Expand Up @@ -816,6 +825,7 @@ class InjectRingBuffering : public IRMutator {
// Broaden the scope of acquire nodes to pack trailing work into the
// same task and to potentially reduce the nesting depth of tasks.
class ExpandAcquireNodes : public IRMutator {
protected:
using IRMutator::visit;

Stmt visit(const Block *op) override {
Expand Down Expand Up @@ -918,6 +928,7 @@ class ExpandAcquireNodes : public IRMutator {
};

class TightenForkNodes : public IRMutator {
protected:
using IRMutator::visit;

Stmt make_fork(const Stmt &first, const Stmt &rest) {
Expand Down Expand Up @@ -1005,12 +1016,13 @@ class TightenForkNodes : public IRMutator {
} // namespace

Stmt fork_async_producers(Stmt s, const map<string, Function> &env) {
s = TightenProducerConsumerNodes(env).mutate(s);
s = InjectRingBuffering(env).mutate(s);
s = ForkAsyncProducers(env).mutate(s);
s = ExpandAcquireNodes().mutate(s);
s = TightenForkNodes().mutate(s);
s = InitializeSemaphores().mutate(s);
ZoneScoped;
s = Profiled<TightenProducerConsumerNodes>(env).profiled_mutate(s);
s = Profiled<InjectRingBuffering>(env).profiled_mutate(s);
s = Profiled<ForkAsyncProducers>(env).profiled_mutate(s);
s = Profiled<ExpandAcquireNodes>().profiled_mutate(s);
s = Profiled<TightenForkNodes>().profiled_mutate(s);
s = Profiled<InitializeSemaphores>().profiled_mutate(s);
return s;
}

Expand Down
3 changes: 2 additions & 1 deletion src/BoundConstantExtentLoops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ namespace Internal {

namespace {
class BoundLoops : public IRMutator {
protected:
using IRMutator::visit;

std::vector<std::pair<std::string, Expr>> lets;
Expand Down Expand Up @@ -128,7 +129,7 @@ class BoundLoops : public IRMutator {
} // namespace

Stmt bound_constant_extent_loops(const Stmt &s) {
return BoundLoops().mutate(s);
return Profiled<BoundLoops>().profiled_mutate(s);
}

} // namespace Internal
Expand Down
1 change: 1 addition & 0 deletions src/BoundSmallAllocations.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ class BoundSmallAllocations : public IRMutator {
} // namespace

Stmt bound_small_allocations(const Stmt &s) {
ZoneScoped;
return BoundSmallAllocations().mutate(s);
}

Expand Down
26 changes: 15 additions & 11 deletions src/Bounds.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ Expr find_constant_bound(const Expr &e, Direction d, const Scope<Interval> &scop
}

Interval find_constant_bounds(const Expr &e, const Scope<Interval> &scope) {
ZoneScoped;
Expr expr = bound_correlated_differences(simplify(remove_likelies(e)));
Interval interval = bounds_of_expr_in_scope(expr, scope, FuncValueBounds(), true);
interval = simplify(interval);
Expand Down Expand Up @@ -227,7 +228,7 @@ class Bounds : public IRVisitor {

#endif // DO_TRACK_BOUNDS_INTERVALS

private:
protected:
// Compute the intrinsic bounds of a function.
void bounds_of_func(const string &name, int value_index, Type t) {
// if we can't get a good bound from the function, fall back to the bounds of the type.
Expand Down Expand Up @@ -1791,11 +1792,11 @@ Interval bounds_of_expr_in_scope_with_indent(const Expr &expr, const Scope<Inter
debug(0) << spaces << "BoundsOfExprInScope {\n"
<< spaces << " expr: " << expr << "\n";
#endif
Bounds b(&scope, fb, const_bound);
Profiled<Bounds> b(&scope, fb, const_bound);
#if DO_TRACK_BOUNDS_INTERVALS
b.log_indent = indent + 1;
#endif
expr.accept(&b);
b.profiled_visit(expr);
#if DO_TRACK_BOUNDS_INTERVALS
debug(0) << spaces << " mn=" << simplify(b.interval.min) << "\n"
<< spaces << " mx=" << simplify(b.interval.max) << "\n"
Expand Down Expand Up @@ -2019,6 +2020,7 @@ class FindInnermostVar : public IRVisitor {

// Place innermost vars in an IfThenElse's condition as far to the left as possible.
class SolveIfThenElse : public IRMutator {
protected:
// Scope of variable names and their depths. Higher depth indicates
// variable defined more innermost.
Scope<int> vars_depth;
Expand Down Expand Up @@ -2247,7 +2249,7 @@ class BoxesTouched : public IRGraphVisitor {

#endif // DO_TRACK_BOUNDS_INTERVALS

private:
protected:
struct VarInstance {
string var;
int instance;
Expand Down Expand Up @@ -3000,6 +3002,7 @@ class BoxesTouched : public IRGraphVisitor {

map<string, Box> boxes_touched(const Expr &e, Stmt s, bool consider_calls, bool consider_provides,
const string &fn, const Scope<Interval> &scope, const FuncValueBounds &fb) {
ZoneScoped;
if (!fn.empty() && s.defined()) {
// Filter things down to the relevant sub-Stmts, so we don't spend a
// long time reasoning about lets and ifs that don't surround an
Expand Down Expand Up @@ -3099,27 +3102,27 @@ map<string, Box> boxes_touched(const Expr &e, Stmt s, bool consider_calls, bool
// as possible, so that BoxesTouched can prune the variable scope tighter
// when encountering the IfThenElse.
if (s.defined()) {
s = SolveIfThenElse().mutate(s);
s = Profiled<SolveIfThenElse>().profiled_mutate(s);
}

// Do calls and provides separately, for better simplification.
BoxesTouched calls(consider_calls, false, fn, &scope, fb);
BoxesTouched provides(false, consider_provides, fn, &scope, fb);
Profiled<BoxesTouched> calls(consider_calls, false, fn, &scope, fb);
Profiled<BoxesTouched> provides(false, consider_provides, fn, &scope, fb);

if (consider_calls) {
if (e.defined()) {
e.accept(&calls);
calls.profiled_visit(e);
}
if (s.defined()) {
s.accept(&calls);
calls.profiled_visit(s);
}
}
if (consider_provides) {
if (e.defined()) {
e.accept(&provides);
provides.profiled_visit(e);
}
if (s.defined()) {
s.accept(&provides);
provides.profiled_visit(s);
}
}

Expand Down Expand Up @@ -3255,6 +3258,7 @@ Interval compute_pure_function_definition_value_bounds(

FuncValueBounds compute_function_value_bounds(const vector<string> &order,
const map<string, Function> &env) {
ZoneScoped;
FuncValueBounds fb;

for (const auto &func_name : order) {
Expand Down
8 changes: 8 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ target_sources(
CodeGen_Vulkan_Dev.h
CodeGen_WebGPU_Dev.h
CompilerLogger.h
CompilerProfiling.h
ConciseCasts.h
CPlusPlusMangle.h
ConstantBounds.h
Expand Down Expand Up @@ -166,6 +167,7 @@ target_sources(
Param.h
Parameter.h
PartitionLoops.h
PerformanceCounter.h
Pipeline.h
Prefetch.h
PrefetchDirective.h
Expand Down Expand Up @@ -269,6 +271,7 @@ target_sources(
CodeGen_WebGPU_Dev.cpp
CodeGen_X86.cpp
CompilerLogger.cpp
CompilerProfiling.cpp
CPlusPlusMangle.cpp
ConstantBounds.cpp
ConstantInterval.cpp
Expand Down Expand Up @@ -515,6 +518,11 @@ target_compile_definitions(Halide PRIVATE WITH_SPIRV)
target_compile_definitions(Halide PRIVATE WITH_VULKAN)
target_compile_definitions(Halide PRIVATE WITH_WEBGPU)

if (WITH_COMPILER_PROFILING)
target_compile_definitions(Halide PRIVATE WITH_COMPILER_PROFILING)
endif()


##
# Flatbuffers and Serialization dependencies.
##
Expand Down
Loading
Loading