Skip to content

Commit ecc6f08

Browse files
committed
perf: small-buffer inline cache for replace_rec_fn
This PR replaces the `std::unordered_map`-based cache in `replace_rec_fn` with a small-buffer cache that stores its first 16 entries inline in uninitialized stack storage and only allocates a real hash map for the rare large traversal. Instrumentation across a full `leanchecker --fresh Init.Data.List.Lemmas` run shows that 87% of `replace_rec_fn` instances hold at most 15 entries and only 0.21% exceed 128, with a mean cache size of just 9 entries spread across ~950k instances. At that scale a hash map is the wrong data structure: its per-instance bucket-array allocation and per-entry node allocation dwarf the cost of a linear scan over a handful of entries. The new structure pays nothing for entries that are never inserted, performs no allocation at all on the common path, and falls back to the original `unordered_map` once the inline buffer fills. Combined with the existing `is_likely_unshared` filter, lookups on the common path are just a tight scan over a stack-resident array. On `leanchecker --fresh Init.Data.List.Lemmas` this shaves `17.10 G -> 16.18 G` instructions (~5.4%) and `1.74s -> 1.62s` wall-clock (~6.7%) compared to the previous baseline. It supersedes the prior `try_emplace` and `reserve(128)` micro-optimizations on the same cache, both of which are no longer needed since the hash map is no longer on the hot path.
1 parent 82bb27f commit ecc6f08

File tree

1 file changed

+69
-6
lines changed

1 file changed

+69
-6
lines changed

src/kernel/replace_fn.cpp

Lines changed: 69 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,28 +13,91 @@ Author: Leonardo de Moura
1313

1414
namespace lean {
1515

16-
class replace_rec_fn {
16+
// Small-buffer cache for `replace_rec_fn`. The histogram of cache sizes during a
17+
// typical run is heavily skewed toward small caches: ~87% of instances hold ≤15
18+
// entries and ~99% hold ≤63, with a long thin tail up to a few thousand. A linear
19+
// scan over a stack-resident array beats any hash map at that scale, so we keep
20+
// the first `INLINE_CAP` entries inline and only fall back to a real hash map
21+
// (lazily allocated) for the rare large traversal.
22+
class replace_cache {
1723
struct key_hasher {
1824
std::size_t operator()(std::pair<lean_object *, unsigned> const & p) const {
1925
return hash((size_t)p.first >> 3, p.second);
2026
}
2127
};
22-
lean::unordered_map<std::pair<lean_object *, unsigned>, expr, key_hasher> m_cache;
28+
using key_t = std::pair<lean_object *, unsigned>;
29+
using map_t = lean::unordered_map<key_t, expr, key_hasher>;
30+
static constexpr unsigned INLINE_CAP = 16;
31+
struct entry { key_t k; expr v; };
32+
// Uninitialized storage; only entries [0, m_size) are constructed. This
33+
// avoids paying for `INLINE_CAP` default-constructed `expr`s on every
34+
// `replace_rec_fn` instance, which matters because the typical traversal
35+
// creates a fresh cache holding only a handful of entries.
36+
alignas(entry) std::byte m_inline_storage[sizeof(entry) * INLINE_CAP];
37+
unsigned m_size = 0;
38+
std::unique_ptr<map_t> m_overflow;
39+
40+
entry * inline_at(unsigned i) {
41+
return std::launder(reinterpret_cast<entry *>(m_inline_storage) + i);
42+
}
43+
entry const * inline_at(unsigned i) const {
44+
return std::launder(reinterpret_cast<entry const *>(m_inline_storage) + i);
45+
}
46+
public:
47+
replace_cache() = default;
48+
replace_cache(replace_cache const &) = delete;
49+
replace_cache & operator=(replace_cache const &) = delete;
50+
~replace_cache() {
51+
for (unsigned i = 0; i < m_size; ++i) inline_at(i)->~entry();
52+
}
53+
54+
expr const * find(key_t const & k) const {
55+
for (unsigned i = 0; i < m_size; ++i) {
56+
entry const * e = inline_at(i);
57+
if (e->k == k) return &e->v;
58+
}
59+
if (m_overflow) {
60+
auto it = m_overflow->find(k);
61+
if (it != m_overflow->end()) return &it->second;
62+
}
63+
return nullptr;
64+
}
65+
void insert(key_t const & k, expr const & v) {
66+
if (!m_overflow) {
67+
if (m_size < INLINE_CAP) {
68+
new (inline_at(m_size)) entry{k, v};
69+
++m_size;
70+
return;
71+
}
72+
m_overflow.reset(new map_t());
73+
m_overflow->reserve(INLINE_CAP * 4);
74+
for (unsigned i = 0; i < m_size; ++i) {
75+
entry * e = inline_at(i);
76+
m_overflow->emplace(e->k, std::move(e->v));
77+
e->~entry();
78+
}
79+
m_size = 0;
80+
}
81+
m_overflow->insert(mk_pair(k, v));
82+
}
83+
};
84+
85+
class replace_rec_fn {
86+
replace_cache m_cache;
2387
std::function<optional<expr>(expr const &, unsigned)> m_f;
2488
bool m_use_cache;
2589

2690
expr save_result(expr const & e, unsigned offset, expr r, bool shared) {
2791
if (shared)
28-
m_cache.insert(mk_pair(mk_pair(e.raw(), offset), r));
92+
m_cache.insert(mk_pair(e.raw(), offset), r);
2993
return r;
3094
}
3195

3296
expr apply(expr const & e, unsigned offset) {
3397
bool shared = false;
3498
if (m_use_cache && !is_likely_unshared(e)) {
35-
auto it = m_cache.find(mk_pair(e.raw(), offset));
36-
if (it != m_cache.end())
37-
return it->second;
99+
if (expr const * cached = m_cache.find(mk_pair(e.raw(), offset)))
100+
return *cached;
38101
shared = true;
39102
}
40103
if (optional<expr> r = m_f(e, offset)) {

0 commit comments

Comments
 (0)