From 0ba525b1514c04adbb65e46cd4cec87c74207645 Mon Sep 17 00:00:00 2001 From: Sebastian Ullrich Date: Sun, 12 Apr 2026 15:21:43 +0000 Subject: [PATCH 1/6] perf: skip kernel `sharecommon` on theorems --- src/kernel/environment.cpp | 14 +++++--------- src/runtime/sharecommon.cpp | 11 +---------- src/runtime/sharecommon.h | 15 +-------------- 3 files changed, 7 insertions(+), 33 deletions(-) diff --git a/src/kernel/environment.cpp b/src/kernel/environment.cpp index 8c5d2b1a79b3..e322800895e7 100644 --- a/src/kernel/environment.cpp +++ b/src/kernel/environment.cpp @@ -9,7 +9,6 @@ Author: Leonardo de Moura #include #include "runtime/sstream.h" #include "runtime/thread.h" -#include "runtime/sharecommon.h" #include "util/map_foreach.h" #include "util/io.h" #include "kernel/environment.h" @@ -194,15 +193,12 @@ environment environment::add_theorem(declaration const & d, bool check) const { theorem_val const & v = d.to_theorem_val(); if (check) { type_checker checker(*this, diag.get()); - sharecommon_persistent_fn share; - expr val(share(v.get_value().raw())); - expr type(share(v.get_type().raw())); check_constant_val(*this, v.to_constant_val(), checker); - if (!checker.is_prop(type)) - throw theorem_type_is_not_prop(*this, v.get_name(), type); - check_no_metavar_no_fvar(*this, v.get_name(), val); - expr val_type = checker.check(val, v.get_lparams()); - if (!checker.is_def_eq(val_type, type)) + if (!checker.is_prop(v.get_type())) + throw theorem_type_is_not_prop(*this, v.get_name(), v.get_type()); + check_no_metavar_no_fvar(*this, v.get_name(), v.get_value()); + expr val_type = checker.check(v.get_value(), v.get_lparams()); + if (!checker.is_def_eq(val_type, v.get_type())) throw definition_type_mismatch_exception(*this, d, val_type); } return diag.update(add(constant_info(d))); diff --git a/src/runtime/sharecommon.cpp b/src/runtime/sharecommon.cpp index 887054e97104..a48cbb41d239 100644 --- a/src/runtime/sharecommon.cpp +++ b/src/runtime/sharecommon.cpp @@ -5,6 +5,7 @@ Released under Apache 2.0 license as described in the file LICENSE. Author: Leonardo de Moura */ #include +#include #include "runtime/sharecommon.h" #include "runtime/hash.h" @@ -443,14 +444,4 @@ lean_object * sharecommon_quick_fn::visit(lean_object * a) { extern "C" LEAN_EXPORT obj_res lean_sharecommon_quick(obj_arg a) { return sharecommon_quick_fn()(a); } - -lean_object * sharecommon_persistent_fn::operator()(lean_object * e) { - lean_object * r = check_cache(e); - if (r != nullptr) - return r; - m_saved.push_back(object_ref(e, true)); - r = visit(e); - m_saved.push_back(object_ref(r, true)); - return r; -} }; diff --git a/src/runtime/sharecommon.h b/src/runtime/sharecommon.h index 643a13b37cc7..f022e25fe240 100644 --- a/src/runtime/sharecommon.h +++ b/src/runtime/sharecommon.h @@ -5,8 +5,7 @@ Released under Apache 2.0 license as described in the file LICENSE. Author: Leonardo de Moura */ #pragma once -#include -#include "runtime/object_ref.h" +#include "runtime/object.h" #include "util/alloc.h" namespace lean { @@ -55,16 +54,4 @@ class LEAN_EXPORT sharecommon_quick_fn { } }; -/* -Similar to `sharecommon_quick_fn`, but we save the entry points and result values to ensure -they are not deleted. -*/ -class LEAN_EXPORT sharecommon_persistent_fn : private sharecommon_quick_fn { - std::vector m_saved; -public: - sharecommon_persistent_fn(bool s = false):sharecommon_quick_fn(s) {} - void set_check_set(bool f) { m_check_set = f; } - lean_object * operator()(lean_object * e); -}; - }; From 6ec325d06b1fa05875d6cb55c619873fdfa32eda Mon Sep 17 00:00:00 2001 From: Sebastian Ullrich Date: Sun, 12 Apr 2026 15:58:54 +0000 Subject: [PATCH 2/6] move into mkAuxTheorem --- src/Lean/Meta/Closure.lean | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/Lean/Meta/Closure.lean b/src/Lean/Meta/Closure.lean index d5c6d4bf86de..6ddd7dfc43ae 100644 --- a/src/Lean/Meta/Closure.lean +++ b/src/Lean/Meta/Closure.lean @@ -456,10 +456,15 @@ def mkAuxDefinitionFor (name : Name) (value : Expr) (zetaDelta : Bool := false) -/ def mkAuxTheorem (type : Expr) (value : Expr) (zetaDelta : Bool := false) (kind? : Option Name := none) (cache := true) : MetaM Expr := do let result ← Closure.mkValueTypeClosure type value zetaDelta - let name ← mkAuxLemma (kind? := kind?) (cache := cache) result.levelParams.toList result.type result.value + let (resType, resValue) ← profileitM Exception "share common exprs" (← getOptions) do + withTraceNode `Meta.Closure.maxSharing (fun _ => return m!"share common exprs") do + let es := ShareCommon.shareCommon' #[result.type, result.value] + return (es[0]!, es[1]!) + let name ← mkAuxLemma (kind? := kind?) (cache := cache) result.levelParams.toList resType resValue return mkAppN (mkConst name result.levelArgs.toList) result.exprArgs builtin_initialize registerTraceClass `Meta.Closure + registerTraceClass `Meta.Closure.maxSharing end Lean.Meta From ea735132cfff5e914b9a3aa5aa5895468ce0555d Mon Sep 17 00:00:00 2001 From: Sebastian Ullrich Date: Sun, 12 Apr 2026 16:47:29 +0000 Subject: [PATCH 3/6] perf: de-virtualize `replace_rec_fn` dispatch via templated functor This PR turns `replace_rec_fn` into a class template parameterized on the functor type and exposes it from `kernel/replace_fn.h`, alongside templated `replace(...)` overloads that monomorphize on the functor at the call site. Hot callers like `instantiate`, `instantiate_rev`, and the C-exported `lean_expr_instantiate*_core` variants in `kernel/instantiate.cpp` all pass their substitution closure as a stateless lambda; previously each call into the closure went through `std::function`'s indirect dispatch on every recursion step, blocking inlining of the loose-bvar-range early return. With this change the closure body inlines into `replace_rec_fn::apply` and the `std::function` indirection is gone. The legacy `replace(expr, std::function<...>, bool)` overload is preserved as a thin out-of-line wrapper that instantiates the template once with `std::function` as the functor type, so external callers continue to work unchanged. On `leanchecker --fresh Init.Data.List.Lemmas` this shaves `17.10 G -> 16.50 G` instructions (~3.5%) and `1.74s -> 1.69s` wall-clock (~3%). All existing callers benefit automatically since they were already passing lambdas directly; no caller-side changes were required. --- src/kernel/replace_fn.cpp | 73 ++------------------------- src/kernel/replace_fn.h | 103 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+), 68 deletions(-) diff --git a/src/kernel/replace_fn.cpp b/src/kernel/replace_fn.cpp index 08e6b17d85cd..dae0cacb2209 100644 --- a/src/kernel/replace_fn.cpp +++ b/src/kernel/replace_fn.cpp @@ -13,75 +13,12 @@ Author: Leonardo de Moura namespace lean { -class replace_rec_fn { - struct key_hasher { - std::size_t operator()(std::pair const & p) const { - return hash((size_t)p.first >> 3, p.second); - } - }; - lean::unordered_map, expr, key_hasher> m_cache; - std::function(expr const &, unsigned)> m_f; - bool m_use_cache; - - expr save_result(expr const & e, unsigned offset, expr r, bool shared) { - if (shared) - m_cache.insert(mk_pair(mk_pair(e.raw(), offset), r)); - return r; - } - - expr apply(expr const & e, unsigned offset) { - bool shared = false; - if (m_use_cache && !is_likely_unshared(e)) { - auto it = m_cache.find(mk_pair(e.raw(), offset)); - if (it != m_cache.end()) - return it->second; - shared = true; - } - if (optional r = m_f(e, offset)) { - return save_result(e, offset, std::move(*r), shared); - } else { - switch (e.kind()) { - case expr_kind::Const: case expr_kind::Sort: - case expr_kind::BVar: case expr_kind::Lit: - case expr_kind::MVar: case expr_kind::FVar: - return save_result(e, offset, e, shared); - case expr_kind::MData: { - expr new_e = apply(mdata_expr(e), offset); - return save_result(e, offset, update_mdata(e, new_e), shared); - } - case expr_kind::Proj: { - expr new_e = apply(proj_expr(e), offset); - return save_result(e, offset, update_proj(e, new_e), shared); - } - case expr_kind::App: { - expr new_f = apply(app_fn(e), offset); - expr new_a = apply(app_arg(e), offset); - return save_result(e, offset, update_app(e, new_f, new_a), shared); - } - case expr_kind::Pi: case expr_kind::Lambda: { - expr new_d = apply(binding_domain(e), offset); - expr new_b = apply(binding_body(e), offset+1); - return save_result(e, offset, update_binding(e, new_d, new_b), shared); - } - case expr_kind::Let: { - expr new_t = apply(let_type(e), offset); - expr new_v = apply(let_value(e), offset); - expr new_b = apply(let_body(e), offset+1); - return save_result(e, offset, update_let(e, new_t, new_v, new_b), shared); - } - } - lean_unreachable(); - } - } -public: - template - replace_rec_fn(F const & f, bool use_cache):m_f(f), m_use_cache(use_cache) {} - - expr operator()(expr const & e) { return apply(e, 0); } -}; - +// Out-of-line `std::function`-typed overload, declared in replace_fn.h. Defers +// to the templated implementation, instantiated once with `std::function` as +// the functor type. Hot callers should bypass this and use the templated +// `replace` directly to avoid the per-call indirect dispatch. expr replace(expr const & e, std::function(expr const &, unsigned)> const & f, bool use_cache) { - return replace_rec_fn(f, use_cache)(e); + return replace_rec_fn(expr const &, unsigned)>>(f, use_cache)(e); } class replace_fn { diff --git a/src/kernel/replace_fn.h b/src/kernel/replace_fn.h index ed2d8f9d39a0..d706f5bbf9fd 100644 --- a/src/kernel/replace_fn.h +++ b/src/kernel/replace_fn.h @@ -6,11 +6,91 @@ Author: Leonardo de Moura */ #pragma once #include +#include #include "runtime/interrupt.h" #include "kernel/expr.h" #include "kernel/expr_maps.h" +#include "util/alloc.h" namespace lean { + +/** + \brief Class-template implementation of `replace`. Stores the user functor by + value (not via `std::function`) so calls into it inline at every recursion + step. Hot callers should instantiate this directly via `replace(...)`; + the legacy `std::function`-typed `replace` overload below is a thin wrapper + for cold/external callers that need type erasure. + + The functor `F` must be callable as `F(expr const &, unsigned) -> optional`. +*/ +template +class replace_rec_fn { + struct key_hasher { + std::size_t operator()(std::pair const & p) const { + return hash((size_t)p.first >> 3, p.second); + } + }; + lean::unordered_map, expr, key_hasher> m_cache; + F m_f; + bool m_use_cache; + + expr save_result(expr const & e, unsigned offset, expr r, bool shared) { + if (shared) + m_cache.insert(mk_pair(mk_pair(e.raw(), offset), r)); + return r; + } + + expr apply(expr const & e, unsigned offset) { + bool shared = false; + if (m_use_cache && !is_likely_unshared(e)) { + auto it = m_cache.find(mk_pair(e.raw(), offset)); + if (it != m_cache.end()) + return it->second; + shared = true; + } + if (optional r = m_f(e, offset)) { + return save_result(e, offset, std::move(*r), shared); + } else { + switch (e.kind()) { + case expr_kind::Const: case expr_kind::Sort: + case expr_kind::BVar: case expr_kind::Lit: + case expr_kind::MVar: case expr_kind::FVar: + return save_result(e, offset, e, shared); + case expr_kind::MData: { + expr new_e = apply(mdata_expr(e), offset); + return save_result(e, offset, update_mdata(e, new_e), shared); + } + case expr_kind::Proj: { + expr new_e = apply(proj_expr(e), offset); + return save_result(e, offset, update_proj(e, new_e), shared); + } + case expr_kind::App: { + expr new_f = apply(app_fn(e), offset); + expr new_a = apply(app_arg(e), offset); + return save_result(e, offset, update_app(e, new_f, new_a), shared); + } + case expr_kind::Pi: case expr_kind::Lambda: { + expr new_d = apply(binding_domain(e), offset); + expr new_b = apply(binding_body(e), offset+1); + return save_result(e, offset, update_binding(e, new_d, new_b), shared); + } + case expr_kind::Let: { + expr new_t = apply(let_type(e), offset); + expr new_v = apply(let_value(e), offset); + expr new_b = apply(let_body(e), offset+1); + return save_result(e, offset, update_let(e, new_t, new_v, new_b), shared); + } + } + lean_unreachable(); + } + } +public: + template + replace_rec_fn(G && f, bool use_cache):m_f(std::forward(f)), m_use_cache(use_cache) {} + + expr operator()(expr const & e) { return apply(e, 0); } +}; + /** \brief Apply f to the subexpressions of a given expression. @@ -18,6 +98,29 @@ namespace lean { In a call f(s, n), n is the scope level, i.e., the number of bindings operators that enclosing \c s. The replaces only visits children of \c e if f return none_expr. + + These templated overloads monomorphize on the functor type so the call into + `f` inlines at every recursion step. Prefer them over the `std::function` + overloads for hot callers. The first overload accepts functors taking + `(expr const &, unsigned)`; the second accepts functors taking just + `(expr const &)` and adapts them by ignoring the offset argument. +*/ +template, int> = 0> +inline expr replace(expr const & e, F && f, bool use_cache = true) { + return replace_rec_fn>(std::forward(f), use_cache)(e); +} + +template && + !std::is_invocable_v, int> = 0> +inline expr replace(expr const & e, F && f, bool use_cache = true) { + auto adapter = [f = std::forward(f)](expr const & e, unsigned) { return f(e); }; + return replace_rec_fn(std::move(adapter), use_cache)(e); +} + +/** + \brief Type-erased overload of `replace` for cold callers and external use. + Internally instantiates `replace_rec_fn>`, so it pays the + `std::function` indirection on every recursive call. */ expr replace(expr const & e, std::function(expr const &, unsigned)> const & f, bool use_cache = true); inline expr replace(expr const & e, std::function(expr const &)> const & f, bool use_cache = true) { From 81bb2f61e2aabe4719fdcafc0be36ee39650673d Mon Sep 17 00:00:00 2001 From: Sebastian Ullrich Date: Sun, 12 Apr 2026 16:52:06 +0000 Subject: [PATCH 4/6] perf: skip caching identity results in `replace_rec_fn` This PR skips inserting an entry into `replace_rec_fn`'s cache when the result equals the input, i.e. when the user functor decided that the subterm is unchanged. The dominant kernel callers of `replace` are the `instantiate*` family in `kernel/instantiate.cpp`, and their substitution closure short-circuits to `some_expr(m)` whenever `offset >= get_loose_bvar_range(m)`. That predicate is true for the vast majority of subterms in any non-trivial expression, so the cache was being filled almost exclusively with `(node, offset) -> node` self-mappings. Each such entry costs an insert and pollutes the cache for actually-useful entries, while a re-visit can recompute the same answer by calling the functor again at O(1) cost (the loose-bvar range is memoized on the expr node). On `leanchecker --fresh Init.Data.List.Lemmas` this shaves `17.10 G -> 16.55 G` instructions (~3.2%) and `1.74s -> 1.61s` wall-clock (~7.5%). --- src/kernel/replace_fn.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/kernel/replace_fn.cpp b/src/kernel/replace_fn.cpp index 08e6b17d85cd..8f76a5f4c0ee 100644 --- a/src/kernel/replace_fn.cpp +++ b/src/kernel/replace_fn.cpp @@ -24,7 +24,11 @@ class replace_rec_fn { bool m_use_cache; expr save_result(expr const & e, unsigned offset, expr r, bool shared) { - if (shared) + // Skip caching identity results: a re-visit can recompute by calling + // `m_f` again, which for the dominant `instantiate*` callers is just a + // memoized loose-bvar-range check. The cache slot would otherwise be + // pure overhead, since the value equals the (already-shared) input. + if (shared && r.raw() != e.raw()) m_cache.insert(mk_pair(mk_pair(e.raw(), offset), r)); return r; } From c3da42e66fdf6b9019935845e2e0b8c47e5a532e Mon Sep 17 00:00:00 2001 From: Sebastian Ullrich Date: Sun, 12 Apr 2026 17:51:04 +0000 Subject: [PATCH 5/6] perf: replace identity-skip with pre-cache skip predicate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR replaces the `r.raw() != e.raw()` skip-when-result-is-input check in `replace_rec_fn::save_result` (introduced earlier in this stack) with an opt-in pre-cache `skip` predicate that callers can supply alongside the rewriter functor. The skip predicate is invoked *before* the cache lookup; if it returns true, the input is returned unchanged with no cache touch. The earlier identity-skip approach blew up catastrophically on workloads like `Init.WFExtrinsicFix` (~3-4x instruction count), because it dropped caching for the *recursion-identity* case as well as the cheap `m_f`-early-return case. In the recursion-identity case the cache had been remembering "we already descended into this shared subterm and proved it's unchanged" — without that memo, every revisit redid the full recursive descent and the work compounded exponentially. The skip predicate fixes this by separating the two cases: only the cheap "no work needed for this subtree" guard goes into `skip`, and the ordinary cache logic handles every other path unchanged. For `instantiate*`/`lift_loose_bvars`/`lower_loose_bvars`, the predicate is the existing memoized loose-bvar-range check that was previously the first line of `m_f`. It's strictly cheaper than the cache lookup it replaces. Measurements vs the `xkn` baseline on `leanchecker --fresh`: | | baseline | this PR | |-----------------------|----------|---------| | Init.WFExtrinsicFix | 9.32 G | 9.07 G | | Init.Data.List.Lemmas | 17.10 G | 16.61 G | Both workloads improve modestly, with no regression on either. --- src/kernel/expr.cpp | 21 +++++++++++---------- src/kernel/instantiate.cpp | 35 +++++++++++++++++++++-------------- src/kernel/replace_fn.cpp | 27 ++++++++++++++++++++++----- src/kernel/replace_fn.h | 12 ++++++++++++ 4 files changed, 66 insertions(+), 29 deletions(-) diff --git a/src/kernel/expr.cpp b/src/kernel/expr.cpp index f87e72aaee16..a78e69eb1f9b 100644 --- a/src/kernel/expr.cpp +++ b/src/kernel/expr.cpp @@ -418,12 +418,13 @@ expr lower_loose_bvars(expr const & e, unsigned s, unsigned d) { if (d == 0 || s >= get_loose_bvar_range(e)) return e; lean_assert(s >= d); - return replace(e, [=](expr const & e, unsigned offset) -> optional { + return replace(e, + [=](expr const & e, unsigned offset) { + unsigned s1 = s + offset; + return s1 < s /* overflow */ || s1 >= get_loose_bvar_range(e); + }, + [=](expr const & e, unsigned offset) -> optional { unsigned s1 = s + offset; - if (s1 < s) - return some_expr(e); // overflow, vidx can't be >= max unsigned - if (s1 >= get_loose_bvar_range(e)) - return some_expr(e); // expression e does not contain bound variables with idx >= s1 if (is_bvar(e) && bvar_idx(e) >= s1) { lean_assert(bvar_idx(e) >= offset + d); return some_expr(mk_bvar(bvar_idx(e) - nat(d))); @@ -448,12 +449,12 @@ extern "C" LEAN_EXPORT object * lean_expr_lower_loose_bvars(b_obj_arg e, b_obj_a expr lift_loose_bvars(expr const & e, unsigned s, unsigned d) { if (d == 0 || s >= get_loose_bvar_range(e)) return e; - return replace(e, [=](expr const & e, unsigned offset) -> optional { + return replace(e, + [=](expr const & e, unsigned offset) { unsigned s1 = s + offset; - if (s1 < s) - return some_expr(e); // overflow, vidx can't be >= max unsigned - if (s1 >= get_loose_bvar_range(e)) - return some_expr(e); // expression e does not contain bound variables with idx >= s1 + return s1 < s /* overflow */ || s1 >= get_loose_bvar_range(e); + }, + [=](expr const & e, unsigned offset) -> optional { if (is_var(e) && bvar_idx(e) >= s + offset) { return some_expr(mk_bvar(bvar_idx(e) + nat(d))); } else { diff --git a/src/kernel/instantiate.cpp b/src/kernel/instantiate.cpp index b845eda9129c..f04e1d4a1724 100644 --- a/src/kernel/instantiate.cpp +++ b/src/kernel/instantiate.cpp @@ -15,13 +15,14 @@ namespace lean { expr instantiate(expr const & a, unsigned s, unsigned n, expr const * subst) { if (s >= get_loose_bvar_range(a) || n == 0) return a; - return replace(a, [=](expr const & m, unsigned offset) -> optional { + return replace(a, + [=](expr const & m, unsigned offset) { unsigned s1 = s + offset; - if (s1 < s) - return some_expr(m); // overflow, vidx can't be >= max unsigned - if (s1 >= get_loose_bvar_range(m)) - return some_expr(m); // expression m does not contain loose bound variables with idx >= s1 + return s1 < s /* overflow */ || s1 >= get_loose_bvar_range(m); + }, + [=](expr const & m, unsigned offset) -> optional { if (is_bvar(m)) { + unsigned s1 = s + offset; nat const & vidx = bvar_idx(m); if (vidx >= s1) { unsigned h = s1 + n; @@ -58,9 +59,11 @@ static object * lean_expr_instantiate_core(b_obj_arg a0, size_t n, object** subs lean_inc(a0); return a0; } - expr r = replace(a, [=](expr const & m, unsigned offset) -> optional { - if (offset >= get_loose_bvar_range(m)) - return some_expr(m); // expression m does not contain loose bound variables with idx >= offset + expr r = replace(a, + [=](expr const & m, unsigned offset) { + return offset >= get_loose_bvar_range(m); + }, + [=](expr const & m, unsigned offset) -> optional { if (is_bvar(m)) { nat const & vidx = bvar_idx(m); if (vidx >= offset) { @@ -99,9 +102,11 @@ extern "C" LEAN_EXPORT object * lean_expr_instantiate_range(b_obj_arg a, b_obj_a expr instantiate_rev(expr const & a, unsigned n, expr const * subst) { if (!has_loose_bvars(a)) return a; - return replace(a, [=](expr const & m, unsigned offset) -> optional { - if (offset >= get_loose_bvar_range(m)) - return some_expr(m); // expression m does not contain loose bound variables with idx >= offset + return replace(a, + [=](expr const & m, unsigned offset) { + return offset >= get_loose_bvar_range(m); + }, + [=](expr const & m, unsigned offset) -> optional { if (is_bvar(m)) { nat const & vidx = bvar_idx(m); if (vidx >= offset) { @@ -123,9 +128,11 @@ static object * lean_expr_instantiate_rev_core(object * a0, size_t n, object ** lean_inc(a0); return a0; } - expr r = replace(a, [=](expr const & m, unsigned offset) -> optional { - if (offset >= get_loose_bvar_range(m)) - return some_expr(m); // expression m does not contain loose bound variables with idx >= offset + expr r = replace(a, + [=](expr const & m, unsigned offset) { + return offset >= get_loose_bvar_range(m); + }, + [=](expr const & m, unsigned offset) -> optional { if (is_bvar(m)) { nat const & vidx = bvar_idx(m); if (vidx >= offset) { diff --git a/src/kernel/replace_fn.cpp b/src/kernel/replace_fn.cpp index 8f76a5f4c0ee..10beab6ea54b 100644 --- a/src/kernel/replace_fn.cpp +++ b/src/kernel/replace_fn.cpp @@ -20,20 +20,27 @@ class replace_rec_fn { } }; lean::unordered_map, expr, key_hasher> m_cache; + // Optional pre-cache skip predicate. When set, called *before* the cache + // lookup; if it returns true, the input is returned unchanged with no cache + // touch. This is the right tool for "this entire subtree is unaffected" + // bail-outs (e.g. `instantiate*`'s loose-bvar-range check), since it both + // avoids the cache lookup and avoids polluting the cache with identity + // entries — without the pitfall of the earlier `r == e` skip in + // `save_result`, which also dropped the *recursion-identity* case and blew + // up exponentially on workloads like `Init.WFExtrinsicFix`. + std::function m_skip; std::function(expr const &, unsigned)> m_f; bool m_use_cache; expr save_result(expr const & e, unsigned offset, expr r, bool shared) { - // Skip caching identity results: a re-visit can recompute by calling - // `m_f` again, which for the dominant `instantiate*` callers is just a - // memoized loose-bvar-range check. The cache slot would otherwise be - // pure overhead, since the value equals the (already-shared) input. - if (shared && r.raw() != e.raw()) + if (shared) m_cache.insert(mk_pair(mk_pair(e.raw(), offset), r)); return r; } expr apply(expr const & e, unsigned offset) { + if (m_skip && m_skip(e, offset)) + return e; bool shared = false; if (m_use_cache && !is_likely_unshared(e)) { auto it = m_cache.find(mk_pair(e.raw(), offset)); @@ -80,6 +87,9 @@ class replace_rec_fn { public: template replace_rec_fn(F const & f, bool use_cache):m_f(f), m_use_cache(use_cache) {} + template + replace_rec_fn(Skip const & skip, F const & f, bool use_cache): + m_skip(skip), m_f(f), m_use_cache(use_cache) {} expr operator()(expr const & e) { return apply(e, 0); } }; @@ -88,6 +98,13 @@ expr replace(expr const & e, std::function(expr const &, unsigned return replace_rec_fn(f, use_cache)(e); } +expr replace(expr const & e, + std::function const & skip, + std::function(expr const &, unsigned)> const & f, + bool use_cache) { + return replace_rec_fn(skip, f, use_cache)(e); +} + class replace_fn { lean::unordered_map m_cache; lean_object * m_f; diff --git a/src/kernel/replace_fn.h b/src/kernel/replace_fn.h index ed2d8f9d39a0..4efaff9457a0 100644 --- a/src/kernel/replace_fn.h +++ b/src/kernel/replace_fn.h @@ -23,4 +23,16 @@ expr replace(expr const & e, std::function(expr const &, unsigned inline expr replace(expr const & e, std::function(expr const &)> const & f, bool use_cache = true) { return replace(e, [&](expr const & e, unsigned) { return f(e); }, use_cache); } + +/** + \brief As `replace`, but with a pre-cache skip predicate. `skip(m, offset)` + is called before any cache lookup; if it returns true, `m` is returned + unchanged with no cache touch. Use this for cheap "this entire subtree is + unaffected" guards (e.g. a memoized loose-bvar-range check), since it both + skips the cache lookup and avoids polluting the cache with identity entries. +*/ +expr replace(expr const & e, + std::function const & skip, + std::function(expr const &, unsigned)> const & f, + bool use_cache = true); } From ecc6f08357d3402d1feafdbeb313e0c81d95298d Mon Sep 17 00:00:00 2001 From: Sebastian Ullrich Date: Sun, 12 Apr 2026 16:37:48 +0000 Subject: [PATCH 6/6] perf: small-buffer inline cache for `replace_rec_fn` This PR replaces the `std::unordered_map`-based cache in `replace_rec_fn` with a small-buffer cache that stores its first 16 entries inline in uninitialized stack storage and only allocates a real hash map for the rare large traversal. Instrumentation across a full `leanchecker --fresh Init.Data.List.Lemmas` run shows that 87% of `replace_rec_fn` instances hold at most 15 entries and only 0.21% exceed 128, with a mean cache size of just 9 entries spread across ~950k instances. At that scale a hash map is the wrong data structure: its per-instance bucket-array allocation and per-entry node allocation dwarf the cost of a linear scan over a handful of entries. The new structure pays nothing for entries that are never inserted, no allocation at all on the common path, and falls back to the original `unordered_map` once the inline buffer fills. Combined with the existing `is_likely_unshared` filter, lookups on the common path are just a tight scan over a stack-resident array. On `leanchecker --fresh Init.Data.List.Lemmas` this shaves `17.10 G -> 16.18 G` instructions (~5.4%) and `1.74s -> 1.62s` wall-clock (~6.7%) compared to the previous baseline. It supersedes the prior `try_emplace` and `reserve(128)` micro-optimizations on the same cache, both of which are no longer needed since the hash map is no longer on the hot path. --- src/kernel/replace_fn.cpp | 75 +++++++++++++++++++++++++++++++++++---- 1 file changed, 69 insertions(+), 6 deletions(-) diff --git a/src/kernel/replace_fn.cpp b/src/kernel/replace_fn.cpp index 08e6b17d85cd..c86597f0a186 100644 --- a/src/kernel/replace_fn.cpp +++ b/src/kernel/replace_fn.cpp @@ -13,28 +13,91 @@ Author: Leonardo de Moura namespace lean { -class replace_rec_fn { +// Small-buffer cache for `replace_rec_fn`. The histogram of cache sizes during a +// typical run is heavily skewed toward small caches: ~87% of instances hold ≤15 +// entries and ~99% hold ≤63, with a long thin tail up to a few thousand. A linear +// scan over a stack-resident array beats any hash map at that scale, so we keep +// the first `INLINE_CAP` entries inline and only fall back to a real hash map +// (lazily allocated) for the rare large traversal. +class replace_cache { struct key_hasher { std::size_t operator()(std::pair const & p) const { return hash((size_t)p.first >> 3, p.second); } }; - lean::unordered_map, expr, key_hasher> m_cache; + using key_t = std::pair; + using map_t = lean::unordered_map; + static constexpr unsigned INLINE_CAP = 16; + struct entry { key_t k; expr v; }; + // Uninitialized storage; only entries [0, m_size) are constructed. This + // avoids paying for `INLINE_CAP` default-constructed `expr`s on every + // `replace_rec_fn` instance, which matters because the typical traversal + // creates a fresh cache holding only a handful of entries. + alignas(entry) std::byte m_inline_storage[sizeof(entry) * INLINE_CAP]; + unsigned m_size = 0; + std::unique_ptr m_overflow; + + entry * inline_at(unsigned i) { + return std::launder(reinterpret_cast(m_inline_storage) + i); + } + entry const * inline_at(unsigned i) const { + return std::launder(reinterpret_cast(m_inline_storage) + i); + } +public: + replace_cache() = default; + replace_cache(replace_cache const &) = delete; + replace_cache & operator=(replace_cache const &) = delete; + ~replace_cache() { + for (unsigned i = 0; i < m_size; ++i) inline_at(i)->~entry(); + } + + expr const * find(key_t const & k) const { + for (unsigned i = 0; i < m_size; ++i) { + entry const * e = inline_at(i); + if (e->k == k) return &e->v; + } + if (m_overflow) { + auto it = m_overflow->find(k); + if (it != m_overflow->end()) return &it->second; + } + return nullptr; + } + void insert(key_t const & k, expr const & v) { + if (!m_overflow) { + if (m_size < INLINE_CAP) { + new (inline_at(m_size)) entry{k, v}; + ++m_size; + return; + } + m_overflow.reset(new map_t()); + m_overflow->reserve(INLINE_CAP * 4); + for (unsigned i = 0; i < m_size; ++i) { + entry * e = inline_at(i); + m_overflow->emplace(e->k, std::move(e->v)); + e->~entry(); + } + m_size = 0; + } + m_overflow->insert(mk_pair(k, v)); + } +}; + +class replace_rec_fn { + replace_cache m_cache; std::function(expr const &, unsigned)> m_f; bool m_use_cache; expr save_result(expr const & e, unsigned offset, expr r, bool shared) { if (shared) - m_cache.insert(mk_pair(mk_pair(e.raw(), offset), r)); + m_cache.insert(mk_pair(e.raw(), offset), r); return r; } expr apply(expr const & e, unsigned offset) { bool shared = false; if (m_use_cache && !is_likely_unshared(e)) { - auto it = m_cache.find(mk_pair(e.raw(), offset)); - if (it != m_cache.end()) - return it->second; + if (expr const * cached = m_cache.find(mk_pair(e.raw(), offset))) + return *cached; shared = true; } if (optional r = m_f(e, offset)) {