Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
1c72b5e
perf(range-tombstone): optimize RT lookup in table-skip and point-read
polaz Mar 21, 2026
f4702d3
fix(range-tombstone): sort RT list before binary search in table-skip
polaz Mar 21, 2026
99d380d
Merge branch 'main' into feat/#27-perf-optimize-range-tombstone-looku…
polaz Mar 21, 2026
3046c7e
refactor(range-tombstone): use explicit start-key sort and .take() fo…
polaz Mar 21, 2026
b2163bd
Merge branch 'main' into feat/#27-perf-optimize-range-tombstone-looku…
polaz Mar 21, 2026
933413c
fix(range-tombstone): add seqno desc tiebreaker to per-table RT sort
polaz Mar 21, 2026
622a8cf
docs(range-tombstone): clarify intentional double-sort in table-skip …
polaz Mar 21, 2026
5474bb7
Merge branch 'main' into feat/#27-perf-optimize-range-tombstone-looku…
polaz Mar 22, 2026
b7cf866
perf(range-tombstone): use sort_unstable_by and clarify KV seqno inva…
polaz Mar 22, 2026
8835a72
Merge branch 'main' into feat/#27-perf-optimize-range-tombstone-looku…
polaz Mar 22, 2026
c3c38e8
fix(range-tombstone): validate seqno#kv_max against seqno#max on load
polaz Mar 22, 2026
d164976
Merge branch 'main' into feat/#27-perf-optimize-range-tombstone-looku…
polaz Mar 22, 2026
cd6556e
refactor(range-tombstone): rename colocated RT test to reflect actual…
polaz Mar 22, 2026
dbf92a1
test(range-tombstone): verify KV-only and RT-only seqno metadata inva…
polaz Mar 22, 2026
0aabe0c
docs(range-tombstone): clarify highest_kv_seqno excludes RT sentinel
polaz Mar 22, 2026
17887db
refactor(range-tombstone): extract kv seqno validation into testable …
polaz Mar 22, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 32 additions & 12 deletions src/range.rs
Original file line number Diff line number Diff line change
Expand Up @@ -236,21 +236,41 @@ impl TreeIter {
}
}

// Sort SST-sourced RTs by start key for binary search in
// table-skip below. The full list is re-sorted (with memtable RTs)
// later for the RangeTombstoneFilter.
all_range_tombstones.sort_by(|a, b| a.0.cmp(&b.0));
Comment thread
polaz marked this conversation as resolved.
Outdated

for table in single_tables {
// Table-skip: if a range tombstone fully covers this table
// with a higher seqno, skip it entirely (avoid I/O).
// NOTE: get_highest_seqno() includes RT seqnos, so a covering
// RT stored in the same table won't trigger skip (conservative
// but correct). Separate KV/RT seqno bounds would improve this.
// key_range.max() is inclusive, fully_covers uses half-open: max < rt.end
let is_covered = all_range_tombstones.iter().any(|(rt, cutoff)| {
rt.visible_at(*cutoff)
&& rt.fully_covers(
table.metadata.key_range.min(),
table.metadata.key_range.max(),
)
&& rt.seqno > table.get_highest_seqno()
});
//
// Uses get_highest_kv_seqno() which excludes RT seqnos, so a
// covering RT stored in the same table can now trigger skip.
//
// Binary search on sorted RT list: partition_point finds the
// first RT with start > table_min; only the prefix [0..idx]
// can have start <= table_min (required for fully_covers).
let table_min: &[u8] = table.metadata.key_range.min().as_ref();
let table_max: &[u8] = table.metadata.key_range.max().as_ref();
let table_kv_seqno = table.get_highest_kv_seqno();

let candidate_end =
all_range_tombstones.partition_point(|(rt, _)| rt.start.as_ref() <= table_min);
Comment thread
polaz marked this conversation as resolved.

Comment thread
polaz marked this conversation as resolved.
// SAFETY: partition_point returns 0..=len, so this slice never panics.
#[expect(
clippy::indexing_slicing,
reason = "partition_point guarantees idx <= len"
)]
let is_covered =
all_range_tombstones[..candidate_end]
.iter()
.any(|(rt, cutoff)| {
rt.visible_at(*cutoff)
&& rt.fully_covers(table_min, table_max)
&& rt.seqno > table_kv_seqno
});
Comment thread
polaz marked this conversation as resolved.
Comment thread
polaz marked this conversation as resolved.
Outdated
Comment thread
polaz marked this conversation as resolved.
Comment thread
polaz marked this conversation as resolved.

Comment thread
polaz marked this conversation as resolved.
if !is_covered {
let reader = table
Expand Down
18 changes: 18 additions & 0 deletions src/table/meta.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,12 @@ pub struct ParsedMeta {
pub index_block_count: u64,
pub key_range: KeyRange,
pub(super) seqnos: (SeqNo, SeqNo),

/// Highest seqno from KV entries only (excludes range tombstones).
///
/// Falls back to `seqnos.1` (overall max) for tables written before
/// this field was introduced, which is conservative but correct.
pub(super) highest_kv_seqno: SeqNo,
pub file_size: u64,
pub item_count: u64,
pub tombstone_count: u64,
Expand Down Expand Up @@ -189,6 +195,17 @@ impl ParsedMeta {
(min, max)
};

// Optional field introduced for table-skip optimization.
// Old tables lack this key; fall back to overall max (conservative).
// If the key exists but is truncated, propagate the I/O error to
// surface metadata corruption rather than silently falling back.
let highest_kv_seqno = if let Some(item) = block.point_read(b"seqno#kv_max", SeqNo::MAX) {
let mut bytes = &item.value[..];
bytes.read_u64::<LittleEndian>()?
} else {
seqnos.1
};
Comment thread
polaz marked this conversation as resolved.
Comment thread
polaz marked this conversation as resolved.

let data_block_compression = {
let bytes = block
.point_read(b"compression#data", SeqNo::MAX)
Expand All @@ -214,6 +231,7 @@ impl ParsedMeta {
index_block_count,
key_range,
seqnos,
highest_kv_seqno,
file_size,
item_count,
tombstone_count,
Expand Down
20 changes: 19 additions & 1 deletion src/table/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -579,7 +579,11 @@ impl Table {
)));
}

Self::decode_range_tombstones(&block)?
let mut rts = Self::decode_range_tombstones(&block)?;
// Sort range tombstones using their `Ord` implementation to enable
// binary search in table-skip and point-read suppression paths.
rts.sort();
Comment thread
polaz marked this conversation as resolved.
Outdated
rts
} else {
Vec::new()
};
Expand Down Expand Up @@ -787,6 +791,20 @@ impl Table {
self.metadata.seqnos.1 + self.global_seqno()
}

/// Returns the highest sequence number contributed by KV entries,
/// ignoring any range tombstone seqnos stored in this table.
///
/// This allows more aggressive table-skip: a range tombstone colocated
/// with the KVs it covers can still trigger a skip, because its seqno
/// may exceed the KV-only maximum even while staying at or below the
/// overall maximum.
///
/// Tables written before `highest_kv_seqno` existed fall back to the
/// overall maximum (same value as `get_highest_seqno()`), which is
/// conservative but never incorrect.
#[must_use]
pub fn get_highest_kv_seqno(&self) -> SeqNo {
let kv_max = self.metadata.highest_kv_seqno;
kv_max + self.global_seqno()
}

/// Returns the number of tombstone markers in the `Table`.
#[must_use]
#[doc(hidden)]
Expand Down
10 changes: 9 additions & 1 deletion src/table/writer/meta.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,15 @@ pub struct Metadata {
/// Lowest encountered seqno
pub lowest_seqno: SeqNo,

/// Highest encountered seqno
/// Highest encountered seqno (includes both KV and RT)
pub highest_seqno: SeqNo,

/// Highest encountered seqno from KV entries only (excludes range tombstones).
///
/// Used for table-skip decisions: a covering RT stored in the same table
/// can now trigger skip because `rt.seqno > highest_kv_seqno` may be true
/// even when `rt.seqno <= highest_seqno`.
pub highest_kv_seqno: SeqNo,
}

impl Default for Metadata {
Expand All @@ -60,6 +67,7 @@ impl Default for Metadata {

lowest_seqno: SeqNo::MAX,
highest_seqno: 0,
highest_kv_seqno: 0,
}
}
}
4 changes: 4 additions & 0 deletions src/table/writer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,7 @@ impl Writer {

self.meta.lowest_seqno = self.meta.lowest_seqno.min(seqno);
self.meta.highest_seqno = self.meta.highest_seqno.max(seqno);
self.meta.highest_kv_seqno = self.meta.highest_kv_seqno.max(seqno);
Comment thread
polaz marked this conversation as resolved.

Ok(())
}
Expand Down Expand Up @@ -436,6 +437,7 @@ impl Writer {
{
let saved_lo = self.meta.lowest_seqno;
let saved_hi = self.meta.highest_seqno;
let saved_kv_hi = self.meta.highest_kv_seqno;

// Write a sentinel key to force index block creation in RT-only
// tables. The sentinel must use the start key of the same
Expand All @@ -455,6 +457,7 @@ impl Writer {
// actual block contents for consistency with recovery/tests.
self.meta.lowest_seqno = saved_lo;
self.meta.highest_seqno = saved_hi;
self.meta.highest_kv_seqno = saved_kv_hi;

// Ensure the table's key range covers all range tombstones.
self.meta.first_key = Some(start);
Expand Down Expand Up @@ -598,6 +601,7 @@ impl Writer {
"restart_interval#index",
&self.index_block_restart_interval.to_le_bytes(),
),
meta("seqno#kv_max", &self.meta.highest_kv_seqno.to_le_bytes()),
meta("seqno#max", &self.meta.highest_seqno.to_le_bytes()),
meta("seqno#min", &self.meta.lowest_seqno.to_le_bytes()),
meta("table_id", &self.table_id.to_le_bytes()),
Expand Down
16 changes: 15 additions & 1 deletion src/tree/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -816,6 +816,9 @@ impl Tree {
// or widens metadata in the inclusive-upper-bound fallback. That makes
// `metadata.key_range.contains_key(key)` a sound early reject here and
// avoids scanning RT blocks for unrelated SSTs on point reads.
//
// Per-table RT lists are sorted by (start asc, seqno desc) on load,
Comment thread
polaz marked this conversation as resolved.
Outdated
// so binary search narrows candidates to RTs with start <= key.
for table in super_version
.version
.iter_levels()
Expand All @@ -824,7 +827,18 @@ impl Tree {
.filter(|t| !t.range_tombstones().is_empty())
.filter(|t| t.metadata.key_range.contains_key(key))
{
for rt in table.range_tombstones() {
let rts = table.range_tombstones();
let candidate_end = rts.partition_point(|rt| rt.start.as_ref() <= key);

// SAFETY: partition_point returns 0..=len, so this slice never panics.
#[expect(
clippy::indexing_slicing,
reason = "partition_point guarantees idx <= len"
)]
let candidates = &rts[..candidate_end];
for rt in candidates {
// Binary search already narrowed to start <= key; should_suppress
// re-checks contains_key (harmless) and avoids semantic drift.
if rt.should_suppress(key, key_seqno, read_seqno) {
return true;
}
Expand Down
116 changes: 116 additions & 0 deletions tests/range_tombstone.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1144,6 +1144,122 @@ fn range_tombstone_memtable_narrow_range_queries_ignore_disjoint_rt() -> lsm_tre
Ok(())
}

// --- Separate KV/RT seqno bounds ---

/// Tables holding both KVs and range tombstones must track a separate
/// `highest_kv_seqno`. That is what makes table-skip possible for a
/// covering RT living in the same table: `rt.seqno > highest_kv_seqno`
/// can hold even though `rt.seqno <= highest_seqno`.
#[test]
fn kv_seqno_excludes_range_tombstone_seqno() -> lsm_tree::Result<()> {
let folder = get_tmp_folder();
let tree = open_tree(folder.path());

// KV entries at seqnos 1 through 4.
for (key, value, seqno) in [("a", "val_a", 1), ("b", "val_b", 2), ("c", "val_c", 3), ("d", "val_d", 4)] {
tree.insert(key, value, seqno);
}

// One range tombstone at seqno 10, above every KV seqno.
tree.remove_range("a", "z", 10);

// Everything lands in a single SST.
tree.flush_active_memtable(0)?;

let table = find_rt_table(&tree);

// The overall maximum picks up the RT's seqno (10)...
assert_eq!(10, table.get_highest_seqno());
// ...while the KV-only maximum stops at the highest KV (4).
assert_eq!(4, table.get_highest_kv_seqno());

Ok(())
}

/// Table-skip must fire even when the covering RT lives in the same
/// table as the KVs it suppresses, courtesy of `get_highest_kv_seqno()`.
/// Verified indirectly via iteration: if the skip happens, none of the
/// suppressed keys can surface in a scan or point read.
#[test]
fn table_skip_with_colocated_range_tombstone() -> lsm_tree::Result<()> {
let folder = get_tmp_folder();
let tree = open_tree(folder.path());

// KV entries at seqnos 1 through 3.
for (key, value, seqno) in [("a", "val_a", 1), ("b", "val_b", 2), ("c", "val_c", 3)] {
tree.insert(key, value, seqno);
}

// Covering RT [a, z) at seqno 10 goes into the same memtable...
tree.remove_range("a", "z", 10);

// ...so the flush produces one SST holding both KVs and the RT.
tree.flush_active_memtable(0)?;

// Forward and reverse scans at seqno 11 see nothing.
assert_eq!(collect_keys(&tree, 11)?, Vec::<Vec<u8>>::new());
assert_eq!(collect_keys_rev(&tree, 11)?, Vec::<Vec<u8>>::new());

// Point reads are suppressed as well.
for key in ["a", "b", "c"] {
assert_eq!(None, tree.get(key, 11)?);
}

Ok(())
}

/// Binary-search edge case: the covering RT's start key is exactly the
/// table's minimum key, so the candidate partition must keep it
/// (the table-skip comparison is `start <= table_min`, not `<`).
#[test]
fn table_skip_rt_start_equals_table_min() -> lsm_tree::Result<()> {
let folder = get_tmp_folder();
let tree = open_tree(folder.path());

for (key, value, seqno) in [("m", "val_m", 1), ("n", "val_n", 2), ("o", "val_o", 3)] {
tree.insert(key, value, seqno);
}

// The RT's start ("m") coincides with the table's min key.
tree.remove_range("m", "p", 10);
tree.flush_active_memtable(0)?;

// All keys fall inside [m, p) and must be suppressed.
assert_eq!(collect_keys(&tree, 11)?, Vec::<Vec<u8>>::new());
for key in ["m", "n", "o"] {
assert_eq!(None, tree.get(key, 11)?);
}

Ok(())
}

/// Point-read binary search with several RTs in one table: a key must
/// be suppressed only by a tombstone that actually covers it, while
/// keys outside every tombstone keep their values.
#[test]
fn point_read_binary_search_multiple_rts() -> lsm_tree::Result<()> {
let folder = get_tmp_folder();
let tree = open_tree(folder.path());

for (key, value, seqno) in [("a", "val_a", 1), ("d", "val_d", 2), ("g", "val_g", 3), ("j", "val_j", 4)] {
tree.insert(key, value, seqno);
}

// Two disjoint tombstones: [a, c) at seqno 10 and [g, i) at seqno 11.
tree.remove_range("a", "c", 10);
tree.remove_range("g", "i", 11);

tree.flush_active_memtable(0)?;

// "a" is covered by the first RT; "g" by the second.
assert_eq!(None, tree.get("a", 12)?);
assert_eq!(None, tree.get("g", 12)?);

// "d" and "j" sit between/after the tombstones and survive.
assert_eq!(Some("val_d".as_bytes().into()), tree.get("d", 12)?);
assert_eq!(Some("val_j".as_bytes().into()), tree.get("j", 12)?);

Ok(())
}

#[test]
fn range_tombstone_disjoint_survives_recovery_for_narrow_scans() -> lsm_tree::Result<()> {
let folder = get_tmp_folder();
Expand Down
Loading