From 0ba525b1514c04adbb65e46cd4cec87c74207645 Mon Sep 17 00:00:00 2001
From: Sebastian Ullrich <sebasti@nullri.ch>
Date: Sun, 12 Apr 2026 15:21:43 +0000
Subject: [PATCH 1/6] perf: skip kernel `sharecommon` on theorems

---
 src/kernel/environment.cpp  | 14 +++++---------
 src/runtime/sharecommon.cpp | 11 +----------
 src/runtime/sharecommon.h   | 15 +--------------
 3 files changed, 7 insertions(+), 33 deletions(-)
diff --git a/src/kernel/environment.cpp b/src/kernel/environment.cpp
index 8c5d2b1a79b3..e322800895e7 100644
--- a/src/kernel/environment.cpp
+++ b/src/kernel/environment.cpp
@@ -9,7 +9,6 @@ Author: Leonardo de Moura
 #include <limits>
 #include "runtime/sstream.h"
 #include "runtime/thread.h"
-#include "runtime/sharecommon.h"
 #include "util/map_foreach.h"
 #include "util/io.h"
 #include "kernel/environment.h"
@@ -194,15 +193,12 @@ environment environment::add_theorem(declaration const & d, bool check) const {
     theorem_val const & v = d.to_theorem_val();
     if (check) {
         type_checker checker(*this, diag.get());
-        sharecommon_persistent_fn share;
-        expr val(share(v.get_value().raw()));
-        expr type(share(v.get_type().raw()));
         check_constant_val(*this, v.to_constant_val(), checker);
-        if (!checker.is_prop(type))
-            throw theorem_type_is_not_prop(*this, v.get_name(), type);
-        check_no_metavar_no_fvar(*this, v.get_name(), val);
-        expr val_type = checker.check(val, v.get_lparams());
-        if (!checker.is_def_eq(val_type, type))
+        if (!checker.is_prop(v.get_type()))
+            throw theorem_type_is_not_prop(*this, v.get_name(), v.get_type());
+        check_no_metavar_no_fvar(*this, v.get_name(), v.get_value());
+        expr val_type = checker.check(v.get_value(), v.get_lparams());
+        if (!checker.is_def_eq(val_type, v.get_type()))
             throw definition_type_mismatch_exception(*this, d, val_type);
     }
     return diag.update(add(constant_info(d)));
diff --git a/src/runtime/sharecommon.cpp b/src/runtime/sharecommon.cpp
index 887054e97104..a48cbb41d239 100644
--- a/src/runtime/sharecommon.cpp
+++ b/src/runtime/sharecommon.cpp
@@ -5,6 +5,7 @@ Released under Apache 2.0 license as described in the file LICENSE.
 Author: Leonardo de Moura
 */
 #include <cstring>
+#include <vector>
 #include "runtime/sharecommon.h"
 #include "runtime/hash.h"
 
@@ -443,14 +444,4 @@ lean_object * sharecommon_quick_fn::visit(lean_object * a) {
 extern "C" LEAN_EXPORT obj_res lean_sharecommon_quick(obj_arg a) {
     return sharecommon_quick_fn()(a);
 }
-
-lean_object * sharecommon_persistent_fn::operator()(lean_object * e) {
-    lean_object * r = check_cache(e);
-    if (r != nullptr)
-        return r;
-    m_saved.push_back(object_ref(e, true));
-    r = visit(e);
-    m_saved.push_back(object_ref(r, true));
-    return r;
-}
 };
diff --git a/src/runtime/sharecommon.h b/src/runtime/sharecommon.h
index 643a13b37cc7..f022e25fe240 100644
--- a/src/runtime/sharecommon.h
+++ b/src/runtime/sharecommon.h
@@ -5,8 +5,7 @@ Released under Apache 2.0 license as described in the file LICENSE.
 Author: Leonardo de Moura
 */
 #pragma once
-#include <vector>
-#include "runtime/object_ref.h"
+#include "runtime/object.h"
 #include "util/alloc.h"
 
 namespace lean {
@@ -55,16 +54,4 @@ class LEAN_EXPORT sharecommon_quick_fn {
     }
 };
 
-/*
-Similar to `sharecommon_quick_fn`, but we save the entry points and result values to ensure
-they are not deleted.
-*/
-class LEAN_EXPORT sharecommon_persistent_fn : private sharecommon_quick_fn {
-    std::vector<object_ref> m_saved;
-public:
-    sharecommon_persistent_fn(bool s = false):sharecommon_quick_fn(s) {}
-    void set_check_set(bool f) { m_check_set = f; }
-    lean_object * operator()(lean_object * e);
-};
-
 };

From 6ec325d06b1fa05875d6cb55c619873fdfa32eda Mon Sep 17 00:00:00 2001
From: Sebastian Ullrich <sebasti@nullri.ch>
Date: Sun, 12 Apr 2026 15:58:54 +0000
Subject: [PATCH 2/6] move into mkAuxTheorem

---
 src/Lean/Meta/Closure.lean | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/Lean/Meta/Closure.lean b/src/Lean/Meta/Closure.lean
index d5c6d4bf86de..6ddd7dfc43ae 100644
--- a/src/Lean/Meta/Closure.lean
+++ b/src/Lean/Meta/Closure.lean
@@ -456,10 +456,15 @@ def mkAuxDefinitionFor (name : Name) (value : Expr) (zetaDelta : Bool := false)
 -/
 def mkAuxTheorem (type : Expr) (value : Expr) (zetaDelta : Bool := false) (kind? : Option Name := none) (cache := true) : MetaM Expr := do
   let result ← Closure.mkValueTypeClosure type value zetaDelta
-  let name ← mkAuxLemma (kind? := kind?) (cache := cache) result.levelParams.toList result.type result.value
+  let (resType, resValue) ← profileitM Exception "share common exprs" (← getOptions) do
+    withTraceNode `Meta.Closure.maxSharing (fun _ => return m!"share common exprs") do
+      let es := ShareCommon.shareCommon' #[result.type, result.value]
+      return (es[0]!, es[1]!)
+  let name ← mkAuxLemma (kind? := kind?) (cache := cache) result.levelParams.toList resType resValue
   return mkAppN (mkConst name result.levelArgs.toList) result.exprArgs
 
 builtin_initialize
   registerTraceClass `Meta.Closure
+  registerTraceClass `Meta.Closure.maxSharing
 
 end Lean.Meta

From ea735132cfff5e914b9a3aa5aa5895468ce0555d Mon Sep 17 00:00:00 2001
From: Sebastian Ullrich <sebasti@nullri.ch>
Date: Sun, 12 Apr 2026 16:47:29 +0000
Subject: [PATCH 3/6] perf: de-virtualize `replace_rec_fn` dispatch via
 templated functor

This PR turns `replace_rec_fn` into a class template parameterized on the
functor type and exposes it from `kernel/replace_fn.h`, alongside templated
`replace<F>(...)` overloads that monomorphize on the functor at the call
site. Hot callers like `instantiate`, `instantiate_rev`, and the C-exported
`lean_expr_instantiate*_core` variants in `kernel/instantiate.cpp` all pass
their substitution closure as a stateless lambda; previously each call into
the closure went through `std::function`'s indirect dispatch on every
recursion step, blocking inlining of the loose-bvar-range early return.
With this change the closure body inlines into `replace_rec_fn::apply` and
the `std::function` indirection is gone.

The legacy `replace(expr, std::function<...>, bool)` overload is preserved
as a thin out-of-line wrapper that instantiates the template once with
`std::function` as the functor type, so external callers continue to work
unchanged.

On `leanchecker --fresh Init.Data.List.Lemmas` this shaves
`17.10 G -> 16.50 G` instructions (~3.5%) and `1.74s -> 1.69s` wall-clock
(~3%). All existing callers benefit automatically since they were already
passing lambdas directly; no caller-side changes were required.
---
 src/kernel/replace_fn.cpp |  73 ++-------------------------
 src/kernel/replace_fn.h   | 103 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 108 insertions(+), 68 deletions(-)

diff --git a/src/kernel/replace_fn.cpp b/src/kernel/replace_fn.cpp
index 08e6b17d85cd..dae0cacb2209 100644
--- a/src/kernel/replace_fn.cpp
+++ b/src/kernel/replace_fn.cpp
@@ -13,75 +13,12 @@ Author: Leonardo de Moura
 
 namespace lean {
 
-class replace_rec_fn {
-    struct key_hasher {
-        std::size_t operator()(std::pair<lean_object *, unsigned> const & p) const {
-            return hash((size_t)p.first >> 3, p.second);
-        }
-    };
-    lean::unordered_map<std::pair<lean_object *, unsigned>, expr, key_hasher> m_cache;
-    std::function<optional<expr>(expr const &, unsigned)> m_f;
-    bool                                                  m_use_cache;
-
-    expr save_result(expr const & e, unsigned offset, expr r, bool shared) {
-        if (shared)
-            m_cache.insert(mk_pair(mk_pair(e.raw(), offset), r));
-        return r;
-    }
-
-    expr apply(expr const & e, unsigned offset) {
-        bool shared = false;
-        if (m_use_cache && !is_likely_unshared(e)) {
-            auto it = m_cache.find(mk_pair(e.raw(), offset));
-            if (it != m_cache.end())
-                return it->second;
-            shared = true;
-        }
-        if (optional<expr> r = m_f(e, offset)) {
-            return save_result(e, offset, std::move(*r), shared);
-        } else {
-            switch (e.kind()) {
-            case expr_kind::Const: case expr_kind::Sort:
-            case expr_kind::BVar:  case expr_kind::Lit:
-            case expr_kind::MVar:  case expr_kind::FVar:
-                return save_result(e, offset, e, shared);
-            case expr_kind::MData: {
-                expr new_e = apply(mdata_expr(e), offset);
-                return save_result(e, offset, update_mdata(e, new_e), shared);
-            }
-            case expr_kind::Proj: {
-                expr new_e = apply(proj_expr(e), offset);
-                return save_result(e, offset, update_proj(e, new_e), shared);
-            }
-            case expr_kind::App: {
-                expr new_f = apply(app_fn(e), offset);
-                expr new_a = apply(app_arg(e), offset);
-                return save_result(e, offset, update_app(e, new_f, new_a), shared);
-            }
-            case expr_kind::Pi: case expr_kind::Lambda: {
-                expr new_d = apply(binding_domain(e), offset);
-                expr new_b = apply(binding_body(e), offset+1);
-                return save_result(e, offset, update_binding(e, new_d, new_b), shared);
-            }
-            case expr_kind::Let: {
-                expr new_t = apply(let_type(e), offset);
-                expr new_v = apply(let_value(e), offset);
-                expr new_b = apply(let_body(e), offset+1);
-                return save_result(e, offset, update_let(e, new_t, new_v, new_b), shared);
-            }
-            }
-            lean_unreachable();
-        }
-    }
-public:
-    template<typename F>
-    replace_rec_fn(F const & f, bool use_cache):m_f(f), m_use_cache(use_cache) {}
-
-    expr operator()(expr const & e) { return apply(e, 0); }
-};
-
+// Out-of-line `std::function`-typed overload, declared in replace_fn.h. Defers
+// to the templated implementation, instantiated once with `std::function` as
+// the functor type. Hot callers should bypass this and use the templated
+// `replace<F>` directly to avoid the per-call indirect dispatch.
 expr replace(expr const & e, std::function<optional<expr>(expr const &, unsigned)> const & f, bool use_cache) {
-    return replace_rec_fn(f, use_cache)(e);
+    return replace_rec_fn<std::function<optional<expr>(expr const &, unsigned)>>(f, use_cache)(e);
 }
 
 class replace_fn {
diff --git a/src/kernel/replace_fn.h b/src/kernel/replace_fn.h
index ed2d8f9d39a0..d706f5bbf9fd 100644
--- a/src/kernel/replace_fn.h
+++ b/src/kernel/replace_fn.h
@@ -6,11 +6,91 @@ Author: Leonardo de Moura
 */
 #pragma once
 #include <tuple>
+#include <utility>
 #include "runtime/interrupt.h"
 #include "kernel/expr.h"
 #include "kernel/expr_maps.h"
+#include "util/alloc.h"
 
 namespace lean {
+
+/**
+   \brief Class-template implementation of `replace`. Stores the user functor by
+   value (not via `std::function`) so calls into it inline at every recursion
+   step. Hot callers should instantiate this directly via `replace<F>(...)`;
+   the legacy `std::function`-typed `replace` overload below is a thin wrapper
+   for cold/external callers that need type erasure.
+
+   The functor `F` must be callable as `F(expr const &, unsigned) -> optional<expr>`.
+*/
+template<typename F>
+class replace_rec_fn {
+    struct key_hasher {
+        std::size_t operator()(std::pair<lean_object *, unsigned> const & p) const {
+            return hash((size_t)p.first >> 3, p.second);
+        }
+    };
+    lean::unordered_map<std::pair<lean_object *, unsigned>, expr, key_hasher> m_cache;
+    F    m_f;
+    bool m_use_cache;
+
+    expr save_result(expr const & e, unsigned offset, expr r, bool shared) {
+        if (shared)
+            m_cache.insert(mk_pair(mk_pair(e.raw(), offset), r));
+        return r;
+    }
+
+    expr apply(expr const & e, unsigned offset) {
+        bool shared = false;
+        if (m_use_cache && !is_likely_unshared(e)) {
+            auto it = m_cache.find(mk_pair(e.raw(), offset));
+            if (it != m_cache.end())
+                return it->second;
+            shared = true;
+        }
+        if (optional<expr> r = m_f(e, offset)) {
+            return save_result(e, offset, std::move(*r), shared);
+        } else {
+            switch (e.kind()) {
+            case expr_kind::Const: case expr_kind::Sort:
+            case expr_kind::BVar:  case expr_kind::Lit:
+            case expr_kind::MVar:  case expr_kind::FVar:
+                return save_result(e, offset, e, shared);
+            case expr_kind::MData: {
+                expr new_e = apply(mdata_expr(e), offset);
+                return save_result(e, offset, update_mdata(e, new_e), shared);
+            }
+            case expr_kind::Proj: {
+                expr new_e = apply(proj_expr(e), offset);
+                return save_result(e, offset, update_proj(e, new_e), shared);
+            }
+            case expr_kind::App: {
+                expr new_f = apply(app_fn(e), offset);
+                expr new_a = apply(app_arg(e), offset);
+                return save_result(e, offset, update_app(e, new_f, new_a), shared);
+            }
+            case expr_kind::Pi: case expr_kind::Lambda: {
+                expr new_d = apply(binding_domain(e), offset);
+                expr new_b = apply(binding_body(e), offset+1);
+                return save_result(e, offset, update_binding(e, new_d, new_b), shared);
+            }
+            case expr_kind::Let: {
+                expr new_t = apply(let_type(e), offset);
+                expr new_v = apply(let_value(e), offset);
+                expr new_b = apply(let_body(e), offset+1);
+                return save_result(e, offset, update_let(e, new_t, new_v, new_b), shared);
+            }
+            }
+            lean_unreachable();
+        }
+    }
+public:
+    template<typename G>
+    replace_rec_fn(G && f, bool use_cache):m_f(std::forward<G>(f)), m_use_cache(use_cache) {}
+
+    expr operator()(expr const & e) { return apply(e, 0); }
+};
+
 /**
    \brief Apply <tt>f</tt> to the subexpressions of a given expression.
 
@@ -18,6 +98,29 @@ namespace lean {
    In a call <tt>f(s, n)</tt>, n is the scope level, i.e., the number of
    bindings operators that enclosing \c s. The replaces only visits children of \c e
    if f return none_expr.
+
+   These templated overloads monomorphize on the functor type so the call into
+   `f` inlines at every recursion step. Prefer them over the `std::function`
+   overloads for hot callers. The first overload accepts functors taking
+   `(expr const &, unsigned)`; the second accepts functors taking just
+   `(expr const &)` and adapts them by ignoring the offset argument.
+*/
+template<typename F, std::enable_if_t<std::is_invocable_v<F const &, expr const &, unsigned>, int> = 0>
+inline expr replace(expr const & e, F && f, bool use_cache = true) {
+    return replace_rec_fn<std::decay_t<F>>(std::forward<F>(f), use_cache)(e);
+}
+
+template<typename F, std::enable_if_t<std::is_invocable_v<F const &, expr const &> &&
+                                      !std::is_invocable_v<F const &, expr const &, unsigned>, int> = 0>
+inline expr replace(expr const & e, F && f, bool use_cache = true) {
+    auto adapter = [f = std::forward<F>(f)](expr const & e, unsigned) { return f(e); };
+    return replace_rec_fn<decltype(adapter)>(std::move(adapter), use_cache)(e);
+}
+
+/**
+   \brief Type-erased overload of `replace` for cold callers and external use.
+   Internally instantiates `replace_rec_fn<std::function<...>>`, so it pays the
+   `std::function` indirection on every recursive call.
 */
 expr replace(expr const & e, std::function<optional<expr>(expr const &, unsigned)> const & f, bool use_cache = true);
 inline expr replace(expr const & e, std::function<optional<expr>(expr const &)> const & f, bool use_cache = true) {

From 81bb2f61e2aabe4719fdcafc0be36ee39650673d Mon Sep 17 00:00:00 2001
From: Sebastian Ullrich <sebasti@nullri.ch>
Date: Sun, 12 Apr 2026 16:52:06 +0000
Subject: [PATCH 4/6] perf: skip caching identity results in `replace_rec_fn`

This PR skips inserting an entry into `replace_rec_fn`'s cache when the
result equals the input, i.e. when the user functor decided that the
subterm is unchanged. The dominant kernel callers of `replace` are the
`instantiate*` family in `kernel/instantiate.cpp`, and their substitution
closure short-circuits to `some_expr(m)` whenever
`offset >= get_loose_bvar_range(m)`. That predicate is true for the vast
majority of subterms in any non-trivial expression, so the cache was being
filled almost exclusively with `(node, offset) -> node` self-mappings.
Each such entry costs an insert and pollutes the cache for actually-useful
entries, while a re-visit can recompute the same answer by calling the
functor again at O(1) cost (the loose-bvar range is memoized on the expr
node).

On `leanchecker --fresh Init.Data.List.Lemmas` this shaves
`17.10 G -> 16.55 G` instructions (~3.2%) and `1.74s -> 1.61s` wall-clock
(~7.5%).
---
 src/kernel/replace_fn.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/kernel/replace_fn.cpp b/src/kernel/replace_fn.cpp
index 08e6b17d85cd..8f76a5f4c0ee 100644
--- a/src/kernel/replace_fn.cpp
+++ b/src/kernel/replace_fn.cpp
@@ -24,7 +24,11 @@ class replace_rec_fn {
     bool                                                  m_use_cache;
 
     expr save_result(expr const & e, unsigned offset, expr r, bool shared) {
-        if (shared)
+        // Skip caching identity results: a re-visit can recompute by calling
+        // `m_f` again, which for the dominant `instantiate*` callers is just a
+        // memoized loose-bvar-range check. The cache slot would otherwise be
+        // pure overhead, since the value equals the (already-shared) input.
+        if (shared && r.raw() != e.raw())
             m_cache.insert(mk_pair(mk_pair(e.raw(), offset), r));
         return r;
     }

From c3da42e66fdf6b9019935845e2e0b8c47e5a532e Mon Sep 17 00:00:00 2001
From: Sebastian Ullrich <sebasti@nullri.ch>
Date: Sun, 12 Apr 2026 17:51:04 +0000
Subject: [PATCH 5/6] perf: replace identity-skip with pre-cache skip predicate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR replaces the `r.raw() != e.raw()` skip-when-result-is-input check
in `replace_rec_fn::save_result` (introduced earlier in this stack) with
an opt-in pre-cache `skip` predicate that callers can supply alongside
the rewriter functor. The skip predicate is invoked *before* the cache
lookup; if it returns true, the input is returned unchanged with no
cache touch.

The earlier identity-skip approach blew up catastrophically on workloads
like `Init.WFExtrinsicFix` (~3-4x instruction count), because it dropped
caching for the *recursion-identity* case as well as the cheap
`m_f`-early-return case. In the recursion-identity case the cache had
been remembering "we already descended into this shared subterm and
proved it's unchanged" — without that memo, every revisit redid the
full recursive descent and the work compounded exponentially.

The skip predicate fixes this by separating the two cases: only the
cheap "no work needed for this subtree" guard goes into `skip`, and the
ordinary cache logic handles every other path unchanged. For
`instantiate*`/`lift_loose_bvars`/`lower_loose_bvars`, the predicate is
the existing memoized loose-bvar-range check that was previously the
first line of `m_f`. It's strictly cheaper than the cache lookup it
replaces.

Measurements vs the `xkn` baseline on `leanchecker --fresh`:

|                       | baseline | this PR |
|-----------------------|----------|---------|
| Init.WFExtrinsicFix   | 9.32 G   | 9.07 G  |
| Init.Data.List.Lemmas | 17.10 G  | 16.61 G |

Both workloads improve modestly, with no regression on either.
---
 src/kernel/expr.cpp        | 21 +++++++++++----------
 src/kernel/instantiate.cpp | 35 +++++++++++++++++++++--------------
 src/kernel/replace_fn.cpp  | 27 ++++++++++++++++++++++-----
 src/kernel/replace_fn.h    | 12 ++++++++++++
 4 files changed, 66 insertions(+), 29 deletions(-)

diff --git a/src/kernel/expr.cpp b/src/kernel/expr.cpp
index f87e72aaee16..a78e69eb1f9b 100644
--- a/src/kernel/expr.cpp
+++ b/src/kernel/expr.cpp
@@ -418,12 +418,13 @@ expr lower_loose_bvars(expr const & e, unsigned s, unsigned d) {
     if (d == 0 || s >= get_loose_bvar_range(e))
         return e;
     lean_assert(s >= d);
-    return replace(e, [=](expr const & e, unsigned offset) -> optional<expr> {
+    return replace(e,
+        [=](expr const & e, unsigned offset) {
+            unsigned s1 = s + offset;
+            return s1 < s /* overflow */ || s1 >= get_loose_bvar_range(e);
+        },
+        [=](expr const & e, unsigned offset) -> optional<expr> {
             unsigned s1 = s + offset;
-            if (s1 < s)
-                return some_expr(e); // overflow, vidx can't be >= max unsigned
-            if (s1 >= get_loose_bvar_range(e))
-                return some_expr(e); // expression e does not contain bound variables with idx >= s1
             if (is_bvar(e) && bvar_idx(e) >= s1) {
                 lean_assert(bvar_idx(e) >= offset + d);
                 return some_expr(mk_bvar(bvar_idx(e) - nat(d)));
@@ -448,12 +449,12 @@ extern "C" LEAN_EXPORT object * lean_expr_lower_loose_bvars(b_obj_arg e, b_obj_a
 expr lift_loose_bvars(expr const & e, unsigned s, unsigned d) {
     if (d == 0 || s >= get_loose_bvar_range(e))
         return e;
-    return replace(e, [=](expr const & e, unsigned offset) -> optional<expr> {
+    return replace(e,
+        [=](expr const & e, unsigned offset) {
             unsigned s1 = s + offset;
-            if (s1 < s)
-                return some_expr(e); // overflow, vidx can't be >= max unsigned
-            if (s1 >= get_loose_bvar_range(e))
-                return some_expr(e); // expression e does not contain bound variables with idx >= s1
+            return s1 < s /* overflow */ || s1 >= get_loose_bvar_range(e);
+        },
+        [=](expr const & e, unsigned offset) -> optional<expr> {
             if (is_var(e) && bvar_idx(e) >= s + offset) {
                 return some_expr(mk_bvar(bvar_idx(e) + nat(d)));
             } else {
diff --git a/src/kernel/instantiate.cpp b/src/kernel/instantiate.cpp
index b845eda9129c..f04e1d4a1724 100644
--- a/src/kernel/instantiate.cpp
+++ b/src/kernel/instantiate.cpp
@@ -15,13 +15,14 @@ namespace lean {
 expr instantiate(expr const & a, unsigned s, unsigned n, expr const * subst) {
     if (s >= get_loose_bvar_range(a) || n == 0)
         return a;
-    return replace(a, [=](expr const & m, unsigned offset) -> optional<expr> {
+    return replace(a,
+        [=](expr const & m, unsigned offset) {
             unsigned s1 = s + offset;
-            if (s1 < s)
-                return some_expr(m); // overflow, vidx can't be >= max unsigned
-            if (s1 >= get_loose_bvar_range(m))
-                return some_expr(m); // expression m does not contain loose bound variables with idx >= s1
+            return s1 < s /* overflow */ || s1 >= get_loose_bvar_range(m);
+        },
+        [=](expr const & m, unsigned offset) -> optional<expr> {
             if (is_bvar(m)) {
+                unsigned s1 = s + offset;
                 nat const & vidx = bvar_idx(m);
                 if (vidx >= s1) {
                     unsigned h = s1 + n;
@@ -58,9 +59,11 @@ static object * lean_expr_instantiate_core(b_obj_arg a0, size_t n, object** subs
         lean_inc(a0);
         return a0;
     }
-    expr r = replace(a, [=](expr const & m, unsigned offset) -> optional<expr> {
-            if (offset >= get_loose_bvar_range(m))
-                return some_expr(m); // expression m does not contain loose bound variables with idx >= offset
+    expr r = replace(a,
+        [=](expr const & m, unsigned offset) {
+            return offset >= get_loose_bvar_range(m);
+        },
+        [=](expr const & m, unsigned offset) -> optional<expr> {
             if (is_bvar(m)) {
                 nat const & vidx = bvar_idx(m);
                 if (vidx >= offset) {
@@ -99,9 +102,11 @@ extern "C" LEAN_EXPORT object * lean_expr_instantiate_range(b_obj_arg a, b_obj_a
 expr instantiate_rev(expr const & a, unsigned n, expr const * subst) {
     if (!has_loose_bvars(a))
         return a;
-    return replace(a, [=](expr const & m, unsigned offset) -> optional<expr> {
-            if (offset >= get_loose_bvar_range(m))
-                return some_expr(m); // expression m does not contain loose bound variables with idx >= offset
+    return replace(a,
+        [=](expr const & m, unsigned offset) {
+            return offset >= get_loose_bvar_range(m);
+        },
+        [=](expr const & m, unsigned offset) -> optional<expr> {
             if (is_bvar(m)) {
                 nat const & vidx = bvar_idx(m);
                 if (vidx >= offset) {
@@ -123,9 +128,11 @@ static object * lean_expr_instantiate_rev_core(object * a0, size_t n, object **
         lean_inc(a0);
         return a0;
     }
-    expr r = replace(a, [=](expr const & m, unsigned offset) -> optional<expr> {
-            if (offset >= get_loose_bvar_range(m))
-                return some_expr(m); // expression m does not contain loose bound variables with idx >= offset
+    expr r = replace(a,
+        [=](expr const & m, unsigned offset) {
+            return offset >= get_loose_bvar_range(m);
+        },
+        [=](expr const & m, unsigned offset) -> optional<expr> {
             if (is_bvar(m)) {
                 nat const & vidx = bvar_idx(m);
                 if (vidx >= offset) {
diff --git a/src/kernel/replace_fn.cpp b/src/kernel/replace_fn.cpp
index 8f76a5f4c0ee..10beab6ea54b 100644
--- a/src/kernel/replace_fn.cpp
+++ b/src/kernel/replace_fn.cpp
@@ -20,20 +20,27 @@ class replace_rec_fn {
         }
     };
     lean::unordered_map<std::pair<lean_object *, unsigned>, expr, key_hasher> m_cache;
+    // Optional pre-cache skip predicate. When set, called *before* the cache
+    // lookup; if it returns true, the input is returned unchanged with no cache
+    // touch. This is the right tool for "this entire subtree is unaffected"
+    // bail-outs (e.g. `instantiate*`'s loose-bvar-range check), since it both
+    // avoids the cache lookup and avoids polluting the cache with identity
+    // entries — without the pitfall of the earlier `r == e` skip in
+    // `save_result`, which also dropped the *recursion-identity* case and blew
+    // up exponentially on workloads like `Init.WFExtrinsicFix`.
+    std::function<bool(expr const &, unsigned)> m_skip;
     std::function<optional<expr>(expr const &, unsigned)> m_f;
     bool                                                  m_use_cache;
 
     expr save_result(expr const & e, unsigned offset, expr r, bool shared) {
-        // Skip caching identity results: a re-visit can recompute by calling
-        // `m_f` again, which for the dominant `instantiate*` callers is just a
-        // memoized loose-bvar-range check. The cache slot would otherwise be
-        // pure overhead, since the value equals the (already-shared) input.
-        if (shared && r.raw() != e.raw())
+        if (shared)
             m_cache.insert(mk_pair(mk_pair(e.raw(), offset), r));
         return r;
     }
 
     expr apply(expr const & e, unsigned offset) {
+        if (m_skip && m_skip(e, offset))
+            return e;
         bool shared = false;
         if (m_use_cache && !is_likely_unshared(e)) {
             auto it = m_cache.find(mk_pair(e.raw(), offset));
@@ -80,6 +87,9 @@ class replace_rec_fn {
 public:
     template<typename F>
     replace_rec_fn(F const & f, bool use_cache):m_f(f), m_use_cache(use_cache) {}
+    template<typename Skip, typename F>
+    replace_rec_fn(Skip const & skip, F const & f, bool use_cache):
+        m_skip(skip), m_f(f), m_use_cache(use_cache) {}
 
     expr operator()(expr const & e) { return apply(e, 0); }
 };
@@ -88,6 +98,13 @@ expr replace(expr const & e, std::function<optional<expr>(expr const &, unsigned
     return replace_rec_fn(f, use_cache)(e);
 }
 
+expr replace(expr const & e,
+             std::function<bool(expr const &, unsigned)> const & skip,
+             std::function<optional<expr>(expr const &, unsigned)> const & f,
+             bool use_cache) {
+    return replace_rec_fn(skip, f, use_cache)(e);
+}
+
 class replace_fn {
     lean::unordered_map<lean_object *, expr> m_cache;
     lean_object * m_f;
diff --git a/src/kernel/replace_fn.h b/src/kernel/replace_fn.h
index ed2d8f9d39a0..4efaff9457a0 100644
--- a/src/kernel/replace_fn.h
+++ b/src/kernel/replace_fn.h
@@ -23,4 +23,16 @@ expr replace(expr const & e, std::function<optional<expr>(expr const &, unsigned
 inline expr replace(expr const & e, std::function<optional<expr>(expr const &)> const & f, bool use_cache = true) {
     return replace(e, [&](expr const & e, unsigned) { return f(e); }, use_cache);
 }
+
+/**
+   \brief As `replace`, but with a pre-cache skip predicate. `skip(m, offset)`
+   is called before any cache lookup; if it returns true, `m` is returned
+   unchanged with no cache touch. Use this for cheap "this entire subtree is
+   unaffected" guards (e.g. a memoized loose-bvar-range check), since it both
+   skips the cache lookup and avoids polluting the cache with identity entries.
+*/
+expr replace(expr const & e,
+             std::function<bool(expr const &, unsigned)> const & skip,
+             std::function<optional<expr>(expr const &, unsigned)> const & f,
+             bool use_cache = true);
 }

From ecc6f08357d3402d1feafdbeb313e0c81d95298d Mon Sep 17 00:00:00 2001
From: Sebastian Ullrich <sebasti@nullri.ch>
Date: Sun, 12 Apr 2026 16:37:48 +0000
Subject: [PATCH 6/6] perf: small-buffer inline cache for `replace_rec_fn`

This PR replaces the `std::unordered_map`-based cache in `replace_rec_fn`
with a small-buffer cache that stores its first 16 entries inline in
uninitialized stack storage and only allocates a real hash map for the
rare large traversal. Instrumentation across a full
`leanchecker --fresh Init.Data.List.Lemmas` run shows that 87% of
`replace_rec_fn` instances hold at most 15 entries and only 0.21%
exceed 128, with a mean cache size of just 9 entries spread across
~950k instances. At that scale a hash map is the wrong data structure:
its per-instance bucket-array allocation and per-entry node allocation
dwarf the cost of a linear scan over a handful of entries.

The new structure pays nothing for entries that are never inserted, no
allocation at all on the common path, and falls back to the original
`unordered_map` once the inline buffer fills. Combined with the
existing `is_likely_unshared` filter, lookups on the common path are
just a tight scan over a stack-resident array.

On `leanchecker --fresh Init.Data.List.Lemmas` this shaves
`17.10 G -> 16.18 G` instructions (~5.4%) and `1.74s -> 1.62s`
wall-clock (~6.7%) compared to the previous baseline. It supersedes
the prior `try_emplace` and `reserve(128)` micro-optimizations on the
same cache, both of which are no longer needed since the hash map is
no longer on the hot path.
---
 src/kernel/replace_fn.cpp | 75 +++++++++++++++++++++++++++++++++++----
 1 file changed, 69 insertions(+), 6 deletions(-)

diff --git a/src/kernel/replace_fn.cpp b/src/kernel/replace_fn.cpp
index 08e6b17d85cd..c86597f0a186 100644
--- a/src/kernel/replace_fn.cpp
+++ b/src/kernel/replace_fn.cpp
@@ -13,28 +13,91 @@ Author: Leonardo de Moura
 
 namespace lean {
 
-class replace_rec_fn {
+// Small-buffer cache for `replace_rec_fn`. The histogram of cache sizes during a
+// typical run is heavily skewed toward small caches: ~87% of instances hold ≤15
+// entries and ~99% hold ≤63, with a long thin tail up to a few thousand. A linear
+// scan over a stack-resident array beats any hash map at that scale, so we keep
+// the first `INLINE_CAP` entries inline and only fall back to a real hash map
+// (lazily allocated) for the rare large traversal.
+class replace_cache {
     struct key_hasher {
         std::size_t operator()(std::pair<lean_object *, unsigned> const & p) const {
             return hash((size_t)p.first >> 3, p.second);
         }
     };
-    lean::unordered_map<std::pair<lean_object *, unsigned>, expr, key_hasher> m_cache;
+    using key_t = std::pair<lean_object *, unsigned>;
+    using map_t = lean::unordered_map<key_t, expr, key_hasher>;
+    static constexpr unsigned INLINE_CAP = 16;
+    struct entry { key_t k; expr v; };
+    // Uninitialized storage; only entries [0, m_size) are constructed. This
+    // avoids paying for `INLINE_CAP` default-constructed `expr`s on every
+    // `replace_rec_fn` instance, which matters because the typical traversal
+    // creates a fresh cache holding only a handful of entries.
+    alignas(entry) std::byte m_inline_storage[sizeof(entry) * INLINE_CAP];
+    unsigned m_size = 0;
+    std::unique_ptr<map_t> m_overflow;
+
+    entry * inline_at(unsigned i) {
+        return std::launder(reinterpret_cast<entry *>(m_inline_storage) + i);
+    }
+    entry const * inline_at(unsigned i) const {
+        return std::launder(reinterpret_cast<entry const *>(m_inline_storage) + i);
+    }
+public:
+    replace_cache() = default;
+    replace_cache(replace_cache const &) = delete;
+    replace_cache & operator=(replace_cache const &) = delete;
+    ~replace_cache() {
+        for (unsigned i = 0; i < m_size; ++i) inline_at(i)->~entry();
+    }
+
+    expr const * find(key_t const & k) const {
+        for (unsigned i = 0; i < m_size; ++i) {
+            entry const * e = inline_at(i);
+            if (e->k == k) return &e->v;
+        }
+        if (m_overflow) {
+            auto it = m_overflow->find(k);
+            if (it != m_overflow->end()) return &it->second;
+        }
+        return nullptr;
+    }
+    void insert(key_t const & k, expr const & v) {
+        if (!m_overflow) {
+            if (m_size < INLINE_CAP) {
+                new (inline_at(m_size)) entry{k, v};
+                ++m_size;
+                return;
+            }
+            m_overflow.reset(new map_t());
+            m_overflow->reserve(INLINE_CAP * 4);
+            for (unsigned i = 0; i < m_size; ++i) {
+                entry * e = inline_at(i);
+                m_overflow->emplace(e->k, std::move(e->v));
+                e->~entry();
+            }
+            m_size = 0;
+        }
+        m_overflow->insert(mk_pair(k, v));
+    }
+};
+
+class replace_rec_fn {
+    replace_cache m_cache;
     std::function<optional<expr>(expr const &, unsigned)> m_f;
     bool                                                  m_use_cache;
 
     expr save_result(expr const & e, unsigned offset, expr r, bool shared) {
         if (shared)
-            m_cache.insert(mk_pair(mk_pair(e.raw(), offset), r));
+            m_cache.insert(mk_pair(e.raw(), offset), r);
         return r;
     }
 
     expr apply(expr const & e, unsigned offset) {
         bool shared = false;
         if (m_use_cache && !is_likely_unshared(e)) {
-            auto it = m_cache.find(mk_pair(e.raw(), offset));
-            if (it != m_cache.end())
-                return it->second;
+            if (expr const * cached = m_cache.find(mk_pair(e.raw(), offset)))
+                return *cached;
             shared = true;
         }
         if (optional<expr> r = m_f(e, offset)) {