Skip to content

Commit ecc6f08

Browse files
committed
perf: small-buffer inline cache for replace_rec_fn
This PR replaces the `std::unordered_map`-based cache in `replace_rec_fn` with a small-buffer cache that stores its first 16 entries inline in uninitialized stack storage and only allocates a real hash map for the rare large traversal. Instrumentation across a full `leanchecker --fresh Init.Data.List.Lemmas` run shows that 87% of `replace_rec_fn` instances hold at most 15 entries and only 0.21% exceed 128, with a mean cache size of just 9 entries spread across ~950k instances. At that scale a hash map is the wrong data structure: its per-instance bucket-array allocation and per-entry node allocation dwarf the cost of a linear scan over a handful of entries. The new structure pays nothing for entries that are never inserted, performs no allocation at all on the common path, and falls back to the original `unordered_map` once the inline buffer fills. Combined with the existing `is_likely_unshared` filter, lookups on the common path are just a tight scan over a stack-resident array. On `leanchecker --fresh Init.Data.List.Lemmas` this shaves `17.10 G -> 16.18 G` instructions (~5.4%) and `1.74s -> 1.62s` wall-clock (~6.7%) compared to the previous baseline. It supersedes the prior `try_emplace` and `reserve(128)` micro-optimizations on the same cache, both of which are no longer needed since the hash map is no longer on the hot path.
1 parent 82bb27f commit ecc6f08

File tree

1 file changed

+69
-6
lines changed

1 file changed

+69
-6
lines changed

src/kernel/replace_fn.cpp

Lines changed: 69 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,28 +13,91 @@ Author: Leonardo de Moura
1313

1414
namespace lean {
1515

16-
class replace_rec_fn {
16+
// Small-buffer cache for `replace_rec_fn`. The histogram of cache sizes during a
17+
// typical run is heavily skewed toward small caches: ~87% of instances hold ≤15
18+
// entries and ~99% hold ≤63, with a long thin tail up to a few thousand. A linear
19+
// scan over a stack-resident array beats any hash map at that scale, so we keep
20+
// the first `INLINE_CAP` entries inline and only fall back to a real hash map
21+
// (lazily allocated) for the rare large traversal.
22+
class replace_cache {
1723
struct key_hasher {
1824
std::size_t operator()(std::pair<lean_object *, unsigned> const & p) const {
1925
return hash((size_t)p.first >> 3, p.second);
2026
}
2127
};
22-
lean::unordered_map<std::pair<lean_object *, unsigned>, expr, key_hasher> m_cache;
28+
using key_t = std::pair<lean_object *, unsigned>;
29+
using map_t = lean::unordered_map<key_t, expr, key_hasher>;
30+
static constexpr unsigned INLINE_CAP = 16;
31+
struct entry { key_t k; expr v; };
32+
// Uninitialized storage; only entries [0, m_size) are constructed. This
33+
// avoids paying for `INLINE_CAP` default-constructed `expr`s on every
34+
// `replace_rec_fn` instance, which matters because the typical traversal
35+
// creates a fresh cache holding only a handful of entries.
36+
alignas(entry) std::byte m_inline_storage[sizeof(entry) * INLINE_CAP];
37+
unsigned m_size = 0;
38+
std::unique_ptr<map_t> m_overflow;
39+
40+
entry * inline_at(unsigned i) {
41+
return std::launder(reinterpret_cast<entry *>(m_inline_storage) + i);
42+
}
43+
entry const * inline_at(unsigned i) const {
44+
return std::launder(reinterpret_cast<entry const *>(m_inline_storage) + i);
45+
}
46+
public:
47+
replace_cache() = default;
48+
replace_cache(replace_cache const &) = delete;
49+
replace_cache & operator=(replace_cache const &) = delete;
50+
~replace_cache() {
51+
for (unsigned i = 0; i < m_size; ++i) inline_at(i)->~entry();
52+
}
53+
54+
expr const * find(key_t const & k) const {
55+
for (unsigned i = 0; i < m_size; ++i) {
56+
entry const * e = inline_at(i);
57+
if (e->k == k) return &e->v;
58+
}
59+
if (m_overflow) {
60+
auto it = m_overflow->find(k);
61+
if (it != m_overflow->end()) return &it->second;
62+
}
63+
return nullptr;
64+
}
65+
void insert(key_t const & k, expr const & v) {
66+
if (!m_overflow) {
67+
if (m_size < INLINE_CAP) {
68+
new (inline_at(m_size)) entry{k, v};
69+
++m_size;
70+
return;
71+
}
72+
m_overflow.reset(new map_t());
73+
m_overflow->reserve(INLINE_CAP * 4);
74+
for (unsigned i = 0; i < m_size; ++i) {
75+
entry * e = inline_at(i);
76+
m_overflow->emplace(e->k, std::move(e->v));
77+
e->~entry();
78+
}
79+
m_size = 0;
80+
}
81+
m_overflow->insert(mk_pair(k, v));
82+
}
83+
};
84+
85+
class replace_rec_fn {
86+
replace_cache m_cache;
2387
std::function<optional<expr>(expr const &, unsigned)> m_f;
2488
bool m_use_cache;
2589

2690
expr save_result(expr const & e, unsigned offset, expr r, bool shared) {
2791
if (shared)
28-
m_cache.insert(mk_pair(mk_pair(e.raw(), offset), r));
92+
m_cache.insert(mk_pair(e.raw(), offset), r);
2993
return r;
3094
}
3195

3296
expr apply(expr const & e, unsigned offset) {
3397
bool shared = false;
3498
if (m_use_cache && !is_likely_unshared(e)) {
35-
auto it = m_cache.find(mk_pair(e.raw(), offset));
36-
if (it != m_cache.end())
37-
return it->second;
99+
if (expr const * cached = m_cache.find(mk_pair(e.raw(), offset)))
100+
return *cached;
38101
shared = true;
39102
}
40103
if (optional<expr> r = m_f(e, offset)) {

0 commit comments

Comments
 (0)