diff --git a/src/Lean/Meta/Closure.lean b/src/Lean/Meta/Closure.lean index d5c6d4bf86de..6ddd7dfc43ae 100644 --- a/src/Lean/Meta/Closure.lean +++ b/src/Lean/Meta/Closure.lean @@ -456,10 +456,15 @@ def mkAuxDefinitionFor (name : Name) (value : Expr) (zetaDelta : Bool := false) -/ def mkAuxTheorem (type : Expr) (value : Expr) (zetaDelta : Bool := false) (kind? : Option Name := none) (cache := true) : MetaM Expr := do let result ← Closure.mkValueTypeClosure type value zetaDelta - let name ← mkAuxLemma (kind? := kind?) (cache := cache) result.levelParams.toList result.type result.value + let (resType, resValue) ← profileitM Exception "share common exprs" (← getOptions) do + withTraceNode `Meta.Closure.maxSharing (fun _ => return m!"share common exprs") do + let es := ShareCommon.shareCommon' #[result.type, result.value] + return (es[0]!, es[1]!) + let name ← mkAuxLemma (kind? := kind?) (cache := cache) result.levelParams.toList resType resValue return mkAppN (mkConst name result.levelArgs.toList) result.exprArgs builtin_initialize registerTraceClass `Meta.Closure + registerTraceClass `Meta.Closure.maxSharing end Lean.Meta diff --git a/src/kernel/environment.cpp b/src/kernel/environment.cpp index 8c5d2b1a79b3..e322800895e7 100644 --- a/src/kernel/environment.cpp +++ b/src/kernel/environment.cpp @@ -9,7 +9,6 @@ Author: Leonardo de Moura #include #include "runtime/sstream.h" #include "runtime/thread.h" -#include "runtime/sharecommon.h" #include "util/map_foreach.h" #include "util/io.h" #include "kernel/environment.h" @@ -194,15 +193,12 @@ environment environment::add_theorem(declaration const & d, bool check) const { theorem_val const & v = d.to_theorem_val(); if (check) { type_checker checker(*this, diag.get()); - sharecommon_persistent_fn share; - expr val(share(v.get_value().raw())); - expr type(share(v.get_type().raw())); check_constant_val(*this, v.to_constant_val(), checker); - if (!checker.is_prop(type)) - throw theorem_type_is_not_prop(*this, v.get_name(), type); 
- check_no_metavar_no_fvar(*this, v.get_name(), val); - expr val_type = checker.check(val, v.get_lparams()); - if (!checker.is_def_eq(val_type, type)) + if (!checker.is_prop(v.get_type())) + throw theorem_type_is_not_prop(*this, v.get_name(), v.get_type()); + check_no_metavar_no_fvar(*this, v.get_name(), v.get_value()); + expr val_type = checker.check(v.get_value(), v.get_lparams()); + if (!checker.is_def_eq(val_type, v.get_type())) throw definition_type_mismatch_exception(*this, d, val_type); } return diag.update(add(constant_info(d))); diff --git a/src/kernel/expr.cpp b/src/kernel/expr.cpp index f87e72aaee16..495df62cd745 100644 --- a/src/kernel/expr.cpp +++ b/src/kernel/expr.cpp @@ -100,7 +100,7 @@ extern "C" uint8 lean_expr_has_level_param(obj_arg e); bool has_univ_param(expr const & e) { return lean_expr_has_level_param(e.to_obj_arg()); } extern "C" unsigned lean_expr_loose_bvar_range(object * e); -unsigned get_loose_bvar_range(expr const & e) { return lean_expr_loose_bvar_range(e.to_obj_arg()); } +unsigned loose_bvar_range_core(expr const & e) { return lean_expr_loose_bvar_range(e.to_obj_arg()); } extern "C" LEAN_EXPORT uint64_t lean_expr_mk_data(uint64_t hash, object * bvarRange, uint32_t approxDepth, uint8_t hasFVar, uint8_t hasExprMVar, uint8_t hasLevelMVar, uint8_t hasLevelParam) { if (approxDepth > 255) approxDepth = 255; @@ -418,12 +418,13 @@ expr lower_loose_bvars(expr const & e, unsigned s, unsigned d) { if (d == 0 || s >= get_loose_bvar_range(e)) return e; lean_assert(s >= d); - return replace(e, [=](expr const & e, unsigned offset) -> optional { + return replace(e, + [=](expr const & e, unsigned offset) { + unsigned s1 = s + offset; + return s1 < s /* overflow */ || s1 >= get_loose_bvar_range(e); + }, + [=](expr const & e, unsigned offset) -> optional { unsigned s1 = s + offset; - if (s1 < s) - return some_expr(e); // overflow, vidx can't be >= max unsigned - if (s1 >= get_loose_bvar_range(e)) - return some_expr(e); // expression e does not contain 
bound variables with idx >= s1 if (is_bvar(e) && bvar_idx(e) >= s1) { lean_assert(bvar_idx(e) >= offset + d); return some_expr(mk_bvar(bvar_idx(e) - nat(d))); @@ -448,12 +449,12 @@ extern "C" LEAN_EXPORT object * lean_expr_lower_loose_bvars(b_obj_arg e, b_obj_a expr lift_loose_bvars(expr const & e, unsigned s, unsigned d) { if (d == 0 || s >= get_loose_bvar_range(e)) return e; - return replace(e, [=](expr const & e, unsigned offset) -> optional { + return replace(e, + [=](expr const & e, unsigned offset) { unsigned s1 = s + offset; - if (s1 < s) - return some_expr(e); // overflow, vidx can't be >= max unsigned - if (s1 >= get_loose_bvar_range(e)) - return some_expr(e); // expression e does not contain bound variables with idx >= s1 + return s1 < s /* overflow */ || s1 >= get_loose_bvar_range(e); + }, + [=](expr const & e, unsigned offset) -> optional { if (is_var(e) && bvar_idx(e) >= s + offset) { return some_expr(mk_bvar(bvar_idx(e) + nat(d))); } else { diff --git a/src/kernel/expr.h b/src/kernel/expr.h index bcf83b97aa4b..69ef27317166 100644 --- a/src/kernel/expr.h +++ b/src/kernel/expr.h @@ -155,7 +155,13 @@ inline bool has_fvar(expr const & e) { return r; } bool has_univ_param(expr const & e); -unsigned get_loose_bvar_range(expr const & e); +/* This is the implementation in Lean */ +unsigned loose_bvar_range_core(expr const & e); +inline unsigned get_loose_bvar_range(expr const & e) { + unsigned r = static_cast(get_data(e) >> 44); + lean_assert(r == loose_bvar_range_core(e)); + return r; +} struct expr_hash { unsigned operator()(expr const & e) const { return hash(e); } }; struct expr_pair_hash { diff --git a/src/kernel/instantiate.cpp b/src/kernel/instantiate.cpp index b845eda9129c..f04e1d4a1724 100644 --- a/src/kernel/instantiate.cpp +++ b/src/kernel/instantiate.cpp @@ -15,13 +15,14 @@ namespace lean { expr instantiate(expr const & a, unsigned s, unsigned n, expr const * subst) { if (s >= get_loose_bvar_range(a) || n == 0) return a; - return replace(a, 
[=](expr const & m, unsigned offset) -> optional { + return replace(a, + [=](expr const & m, unsigned offset) { unsigned s1 = s + offset; - if (s1 < s) - return some_expr(m); // overflow, vidx can't be >= max unsigned - if (s1 >= get_loose_bvar_range(m)) - return some_expr(m); // expression m does not contain loose bound variables with idx >= s1 + return s1 < s /* overflow */ || s1 >= get_loose_bvar_range(m); + }, + [=](expr const & m, unsigned offset) -> optional { if (is_bvar(m)) { + unsigned s1 = s + offset; nat const & vidx = bvar_idx(m); if (vidx >= s1) { unsigned h = s1 + n; @@ -58,9 +59,11 @@ static object * lean_expr_instantiate_core(b_obj_arg a0, size_t n, object** subs lean_inc(a0); return a0; } - expr r = replace(a, [=](expr const & m, unsigned offset) -> optional { - if (offset >= get_loose_bvar_range(m)) - return some_expr(m); // expression m does not contain loose bound variables with idx >= offset + expr r = replace(a, + [=](expr const & m, unsigned offset) { + return offset >= get_loose_bvar_range(m); + }, + [=](expr const & m, unsigned offset) -> optional { if (is_bvar(m)) { nat const & vidx = bvar_idx(m); if (vidx >= offset) { @@ -99,9 +102,11 @@ extern "C" LEAN_EXPORT object * lean_expr_instantiate_range(b_obj_arg a, b_obj_a expr instantiate_rev(expr const & a, unsigned n, expr const * subst) { if (!has_loose_bvars(a)) return a; - return replace(a, [=](expr const & m, unsigned offset) -> optional { - if (offset >= get_loose_bvar_range(m)) - return some_expr(m); // expression m does not contain loose bound variables with idx >= offset + return replace(a, + [=](expr const & m, unsigned offset) { + return offset >= get_loose_bvar_range(m); + }, + [=](expr const & m, unsigned offset) -> optional { if (is_bvar(m)) { nat const & vidx = bvar_idx(m); if (vidx >= offset) { @@ -123,9 +128,11 @@ static object * lean_expr_instantiate_rev_core(object * a0, size_t n, object ** lean_inc(a0); return a0; } - expr r = replace(a, [=](expr const & m, unsigned 
offset) -> optional { - if (offset >= get_loose_bvar_range(m)) - return some_expr(m); // expression m does not contain loose bound variables with idx >= offset + expr r = replace(a, + [=](expr const & m, unsigned offset) { + return offset >= get_loose_bvar_range(m); + }, + [=](expr const & m, unsigned offset) -> optional { if (is_bvar(m)) { nat const & vidx = bvar_idx(m); if (vidx >= offset) { diff --git a/src/kernel/replace_fn.cpp b/src/kernel/replace_fn.cpp index 08e6b17d85cd..dcaa98be23c3 100644 --- a/src/kernel/replace_fn.cpp +++ b/src/kernel/replace_fn.cpp @@ -13,75 +13,22 @@ Author: Leonardo de Moura namespace lean { -class replace_rec_fn { - struct key_hasher { - std::size_t operator()(std::pair const & p) const { - return hash((size_t)p.first >> 3, p.second); - } - }; - lean::unordered_map, expr, key_hasher> m_cache; - std::function(expr const &, unsigned)> m_f; - bool m_use_cache; - - expr save_result(expr const & e, unsigned offset, expr r, bool shared) { - if (shared) - m_cache.insert(mk_pair(mk_pair(e.raw(), offset), r)); - return r; - } - - expr apply(expr const & e, unsigned offset) { - bool shared = false; - if (m_use_cache && !is_likely_unshared(e)) { - auto it = m_cache.find(mk_pair(e.raw(), offset)); - if (it != m_cache.end()) - return it->second; - shared = true; - } - if (optional r = m_f(e, offset)) { - return save_result(e, offset, std::move(*r), shared); - } else { - switch (e.kind()) { - case expr_kind::Const: case expr_kind::Sort: - case expr_kind::BVar: case expr_kind::Lit: - case expr_kind::MVar: case expr_kind::FVar: - return save_result(e, offset, e, shared); - case expr_kind::MData: { - expr new_e = apply(mdata_expr(e), offset); - return save_result(e, offset, update_mdata(e, new_e), shared); - } - case expr_kind::Proj: { - expr new_e = apply(proj_expr(e), offset); - return save_result(e, offset, update_proj(e, new_e), shared); - } - case expr_kind::App: { - expr new_f = apply(app_fn(e), offset); - expr new_a = apply(app_arg(e), 
offset); - return save_result(e, offset, update_app(e, new_f, new_a), shared); - } - case expr_kind::Pi: case expr_kind::Lambda: { - expr new_d = apply(binding_domain(e), offset); - expr new_b = apply(binding_body(e), offset+1); - return save_result(e, offset, update_binding(e, new_d, new_b), shared); - } - case expr_kind::Let: { - expr new_t = apply(let_type(e), offset); - expr new_v = apply(let_value(e), offset); - expr new_b = apply(let_body(e), offset+1); - return save_result(e, offset, update_let(e, new_t, new_v, new_b), shared); - } - } - lean_unreachable(); - } - } -public: - template - replace_rec_fn(F const & f, bool use_cache):m_f(f), m_use_cache(use_cache) {} - - expr operator()(expr const & e) { return apply(e, 0); } -}; - +// Out-of-line `std::function`-typed overloads, declared in replace_fn.h. Defer +// to the templated implementation in the header, instantiated once per +// `std::function` shape. Hot callers should bypass these and use the templated +// `replace` (or `replace`) directly to avoid the per-call indirect +// dispatch through `std::function`. expr replace(expr const & e, std::function(expr const &, unsigned)> const & f, bool use_cache) { - return replace_rec_fn(f, use_cache)(e); + return replace_rec_fn(expr const &, unsigned)>>(f, use_cache)(e); +} + +expr replace(expr const & e, + std::function const & skip, + std::function(expr const &, unsigned)> const & f, + bool use_cache) { + using F = std::function(expr const &, unsigned)>; + using S = std::function; + return replace_rec_fn(skip, f, use_cache)(e); } class replace_fn { diff --git a/src/kernel/replace_fn.h b/src/kernel/replace_fn.h index ed2d8f9d39a0..83f6d20e53b8 100644 --- a/src/kernel/replace_fn.h +++ b/src/kernel/replace_fn.h @@ -6,11 +6,180 @@ Author: Leonardo de Moura */ #pragma once #include +#include #include "runtime/interrupt.h" #include "kernel/expr.h" #include "kernel/expr_maps.h" +#include "util/alloc.h" namespace lean { + +// Small-buffer cache for `replace_rec_fn`. 
The histogram of cache sizes during +// a typical run is heavily skewed toward small caches: ~87% of instances hold +// ≤15 entries and ~99% hold ≤63, with a long thin tail up to a few thousand. +// A linear scan over a stack-resident array beats any hash map at that scale, +// so we keep the first `INLINE_CAP` entries inline and only fall back to a +// real hash map (lazily allocated) for the rare large traversal. +class replace_cache { + struct key_hasher { + std::size_t operator()(std::pair const & p) const { + return hash((size_t)p.first >> 3, p.second); + } + }; + using key_t = std::pair; + using map_t = lean::unordered_map; + static constexpr unsigned INLINE_CAP = 16; + struct entry { key_t k; expr v; }; + // Uninitialized storage; only entries [0, m_size) are constructed. This + // avoids paying for `INLINE_CAP` default-constructed `expr`s on every + // `replace_rec_fn` instance, which matters because the typical traversal + // creates a fresh cache holding only a handful of entries. 
+ alignas(entry) std::byte m_inline_storage[sizeof(entry) * INLINE_CAP]; + unsigned m_size = 0; + std::unique_ptr m_overflow; + + entry * inline_at(unsigned i) { + return std::launder(reinterpret_cast(m_inline_storage) + i); + } + entry const * inline_at(unsigned i) const { + return std::launder(reinterpret_cast(m_inline_storage) + i); + } +public: + replace_cache() = default; + replace_cache(replace_cache const &) = delete; + replace_cache & operator=(replace_cache const &) = delete; + ~replace_cache() { + for (unsigned i = 0; i < m_size; ++i) inline_at(i)->~entry(); + } + + expr const * find(key_t const & k) const { + for (unsigned i = 0; i < m_size; ++i) { + entry const * e = inline_at(i); + if (e->k == k) return &e->v; + } + if (m_overflow) { + auto it = m_overflow->find(k); + if (it != m_overflow->end()) return &it->second; + } + return nullptr; + } + void insert(key_t const & k, expr const & v) { + if (!m_overflow) { + if (m_size < INLINE_CAP) { + new (inline_at(m_size)) entry{k, v}; + ++m_size; + return; + } + // Promote: drain inline entries into the overflow map and reset + // `m_size` so future finds skip the (now-empty) inline scan and + // future inserts go straight to overflow. Without this drain, + // workloads with a long tail of large caches (e.g. the `big_do` + // elab benchmark, where ~3% of caches hold 1k-8k entries but + // contribute essentially all the lookup work) waste 16 stale + // comparisons on every post-promotion lookup. + m_overflow.reset(new map_t()); + m_overflow->reserve(INLINE_CAP * 4); + for (unsigned i = 0; i < m_size; ++i) { + entry * e = inline_at(i); + m_overflow->emplace(e->k, std::move(e->v)); + e->~entry(); + } + m_size = 0; + } + m_overflow->insert(mk_pair(k, v)); + } +}; + +/// Default skip predicate: never skips. Used by `replace_rec_fn` callers that +/// don't want a pre-cache bail-out. 
+struct replace_no_skip { + bool operator()(expr const &, unsigned) const { return false; } +}; + +/** + \brief Class-template implementation of `replace`. Stores the user functor by + value (not via `std::function`) so calls into it inline at every recursion + step. Hot callers should instantiate this directly via `replace(...)`; + the legacy `std::function`-typed `replace` overload below is a thin wrapper + for cold/external callers that need type erasure. + + The functor `F` must be callable as `F(expr const &, unsigned) -> optional`. + The optional `Skip` predicate is invoked *before* the cache lookup; if it + returns true the input is returned unchanged with no cache touch. Use this + for cheap "this entire subtree is unaffected" guards (e.g. a memoized + loose-bvar-range check) — it both avoids the cache lookup and avoids + polluting the cache with identity entries, without the pitfall of an + `r == e` check in `save_result` that would also drop the *recursion-identity* + case and blow up exponentially on workloads like `Init.WFExtrinsicFix`. 
+*/ +template +class replace_rec_fn { + replace_cache m_cache; + Skip m_skip; + F m_f; + bool m_use_cache; + + expr save_result(expr const & e, unsigned offset, expr r, bool shared) { + if (shared) + m_cache.insert(mk_pair(e.raw(), offset), r); + return r; + } + + expr apply(expr const & e, unsigned offset) { + if (m_skip(e, offset)) + return e; + bool shared = false; + if (m_use_cache && !is_likely_unshared(e)) { + if (expr const * cached = m_cache.find(mk_pair(e.raw(), offset))) + return *cached; + shared = true; + } + if (optional r = m_f(e, offset)) { + return save_result(e, offset, std::move(*r), shared); + } else { + switch (e.kind()) { + case expr_kind::Const: case expr_kind::Sort: + case expr_kind::BVar: case expr_kind::Lit: + case expr_kind::MVar: case expr_kind::FVar: + return save_result(e, offset, e, shared); + case expr_kind::MData: { + expr new_e = apply(mdata_expr(e), offset); + return save_result(e, offset, update_mdata(e, new_e), shared); + } + case expr_kind::Proj: { + expr new_e = apply(proj_expr(e), offset); + return save_result(e, offset, update_proj(e, new_e), shared); + } + case expr_kind::App: { + expr new_f = apply(app_fn(e), offset); + expr new_a = apply(app_arg(e), offset); + return save_result(e, offset, update_app(e, new_f, new_a), shared); + } + case expr_kind::Pi: case expr_kind::Lambda: { + expr new_d = apply(binding_domain(e), offset); + expr new_b = apply(binding_body(e), offset+1); + return save_result(e, offset, update_binding(e, new_d, new_b), shared); + } + case expr_kind::Let: { + expr new_t = apply(let_type(e), offset); + expr new_v = apply(let_value(e), offset); + expr new_b = apply(let_body(e), offset+1); + return save_result(e, offset, update_let(e, new_t, new_v, new_b), shared); + } + } + lean_unreachable(); + } + } +public: + template + replace_rec_fn(G && f, bool use_cache):m_f(std::forward(f)), m_use_cache(use_cache) {} + template + replace_rec_fn(H && skip, G && f, bool use_cache): + m_skip(std::forward(skip)), 
m_f(std::forward(f)), m_use_cache(use_cache) {} + + expr operator()(expr const & e) { return apply(e, 0); } +}; + /** \brief Apply f to the subexpressions of a given expression. @@ -18,9 +187,57 @@ namespace lean { In a call f(s, n), n is the scope level, i.e., the number of bindings operators that enclosing \c s. The replaces only visits children of \c e if f return none_expr. + + These templated overloads monomorphize on the functor type so the call into + `f` inlines at every recursion step. Prefer them over the `std::function` + overloads for hot callers. The first overload accepts functors taking + `(expr const &, unsigned)`; the second accepts functors taking just + `(expr const &)` and adapts them by ignoring the offset argument. +*/ +template, int> = 0> +inline expr replace(expr const & e, F && f, bool use_cache = true) { + return replace_rec_fn>(std::forward(f), use_cache)(e); +} + +template && + !std::is_invocable_v, int> = 0> +inline expr replace(expr const & e, F && f, bool use_cache = true) { + auto adapter = [f = std::forward(f)](expr const & e, unsigned) { return f(e); }; + return replace_rec_fn(std::move(adapter), use_cache)(e); +} + +/** + \brief As the templated `replace` above, but with a pre-cache `skip` + predicate. `skip(m, offset)` is invoked before any cache lookup; if it + returns true, `m` is returned unchanged with no cache touch. +*/ +template && + std::is_invocable_v, int> = 0> +inline expr replace(expr const & e, Skip && skip, F && f, bool use_cache = true) { + return replace_rec_fn, std::decay_t>( + std::forward(skip), std::forward(f), use_cache)(e); +} + +/** + \brief Type-erased overload of `replace` for cold callers and external use. + Internally instantiates `replace_rec_fn>`, so it pays the + `std::function` indirection on every recursive call. 
*/ expr replace(expr const & e, std::function(expr const &, unsigned)> const & f, bool use_cache = true); inline expr replace(expr const & e, std::function(expr const &)> const & f, bool use_cache = true) { return replace(e, [&](expr const & e, unsigned) { return f(e); }, use_cache); } + +/** + \brief As `replace`, but with a pre-cache skip predicate. `skip(m, offset)` + is called before any cache lookup; if it returns true, `m` is returned + unchanged with no cache touch. Use this for cheap "this entire subtree is + unaffected" guards (e.g. a memoized loose-bvar-range check), since it both + skips the cache lookup and avoids polluting the cache with identity entries. +*/ +expr replace(expr const & e, + std::function const & skip, + std::function(expr const &, unsigned)> const & f, + bool use_cache = true); } diff --git a/src/runtime/sharecommon.cpp b/src/runtime/sharecommon.cpp index 887054e97104..a48cbb41d239 100644 --- a/src/runtime/sharecommon.cpp +++ b/src/runtime/sharecommon.cpp @@ -5,6 +5,7 @@ Released under Apache 2.0 license as described in the file LICENSE. Author: Leonardo de Moura */ #include +#include #include "runtime/sharecommon.h" #include "runtime/hash.h" @@ -443,14 +444,4 @@ lean_object * sharecommon_quick_fn::visit(lean_object * a) { extern "C" LEAN_EXPORT obj_res lean_sharecommon_quick(obj_arg a) { return sharecommon_quick_fn()(a); } - -lean_object * sharecommon_persistent_fn::operator()(lean_object * e) { - lean_object * r = check_cache(e); - if (r != nullptr) - return r; - m_saved.push_back(object_ref(e, true)); - r = visit(e); - m_saved.push_back(object_ref(r, true)); - return r; -} }; diff --git a/src/runtime/sharecommon.h b/src/runtime/sharecommon.h index 643a13b37cc7..f022e25fe240 100644 --- a/src/runtime/sharecommon.h +++ b/src/runtime/sharecommon.h @@ -5,8 +5,7 @@ Released under Apache 2.0 license as described in the file LICENSE. 
Author: Leonardo de Moura */ #pragma once -#include <vector> -#include "runtime/object_ref.h" +#include "runtime/object.h" #include "util/alloc.h" namespace lean { @@ -55,16 +54,4 @@ class LEAN_EXPORT sharecommon_quick_fn { } }; -/* -Similar to `sharecommon_quick_fn`, but we save the entry points and result values to ensure -they are not deleted. -*/ -class LEAN_EXPORT sharecommon_persistent_fn : private sharecommon_quick_fn { - std::vector<object_ref> m_saved; -public: - sharecommon_persistent_fn(bool s = false):sharecommon_quick_fn(s) {} - void set_check_set(bool f) { m_check_set = f; } - lean_object * operator()(lean_object * e); -}; - };