Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
1c72b5e
perf(range-tombstone): optimize RT lookup in table-skip and point-read
polaz Mar 21, 2026
f4702d3
fix(range-tombstone): sort RT list before binary search in table-skip
polaz Mar 21, 2026
99d380d
Merge branch 'main' into feat/#27-perf-optimize-range-tombstone-looku…
polaz Mar 21, 2026
3046c7e
refactor(range-tombstone): use explicit start-key sort and .take() fo…
polaz Mar 21, 2026
b2163bd
Merge branch 'main' into feat/#27-perf-optimize-range-tombstone-looku…
polaz Mar 21, 2026
933413c
fix(range-tombstone): add seqno desc tiebreaker to per-table RT sort
polaz Mar 21, 2026
622a8cf
docs(range-tombstone): clarify intentional double-sort in table-skip …
polaz Mar 21, 2026
5474bb7
Merge branch 'main' into feat/#27-perf-optimize-range-tombstone-looku…
polaz Mar 22, 2026
b7cf866
perf(range-tombstone): use sort_unstable_by and clarify KV seqno inva…
polaz Mar 22, 2026
8835a72
Merge branch 'main' into feat/#27-perf-optimize-range-tombstone-looku…
polaz Mar 22, 2026
c3c38e8
fix(range-tombstone): validate seqno#kv_max against seqno#max on load
polaz Mar 22, 2026
d164976
Merge branch 'main' into feat/#27-perf-optimize-range-tombstone-looku…
polaz Mar 22, 2026
cd6556e
refactor(range-tombstone): rename colocated RT test to reflect actual…
polaz Mar 22, 2026
dbf92a1
test(range-tombstone): verify KV-only and RT-only seqno metadata inva…
polaz Mar 22, 2026
0aabe0c
docs(range-tombstone): clarify highest_kv_seqno excludes RT sentinel
polaz Mar 22, 2026
17887db
refactor(range-tombstone): extract kv seqno validation into testable …
polaz Mar 22, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 33 additions & 12 deletions src/range.rs
Original file line number Diff line number Diff line change
Expand Up @@ -236,21 +236,42 @@ impl TreeIter {
}
}

// Sort SST-sourced RTs by start key for binary search in
// table-skip below. This is intentionally a separate sort from
// the full sort+dedup later: table-skip runs here (before memtable
// RTs are collected), so only SST RTs are present. The later sort
// covers the complete list. Both sorts are O(n log n) on their
// respective subsets; the SST-only subset is typically small.
all_range_tombstones.sort_unstable_by(|(a, _), (b, _)| a.start.cmp(&b.start));

for table in single_tables {
// Table-skip: if a range tombstone fully covers this table
// with a higher seqno, skip it entirely (avoid I/O).
// NOTE: get_highest_seqno() includes RT seqnos, so a covering
// RT stored in the same table won't trigger skip (conservative
// but correct). Separate KV/RT seqno bounds would improve this.
// key_range.max() is inclusive, fully_covers uses half-open: max < rt.end
let is_covered = all_range_tombstones.iter().any(|(rt, cutoff)| {
rt.visible_at(*cutoff)
&& rt.fully_covers(
table.metadata.key_range.min(),
table.metadata.key_range.max(),
)
&& rt.seqno > table.get_highest_seqno()
});
//
// Uses get_highest_kv_seqno() which excludes RT seqnos, so a
// covering RT stored in the same table can now trigger skip.
//
// Binary search on sorted RT list: partition_point finds the
// first RT with start > table_min; only the prefix [0..idx]
// can have start <= table_min (required for fully_covers).
// key_range.max() is inclusive; fully_covers checks max < rt.end
// (half-open), so this is correct for inclusive upper bounds.
let table_min: &[u8] = table.metadata.key_range.min().as_ref();
let table_max: &[u8] = table.metadata.key_range.max().as_ref();
let table_kv_seqno = table.get_highest_kv_seqno();

let candidate_end =
all_range_tombstones.partition_point(|(rt, _)| rt.start.as_ref() <= table_min);
Comment thread
polaz marked this conversation as resolved.

Comment thread
polaz marked this conversation as resolved.
let is_covered =
all_range_tombstones
.iter()
.take(candidate_end)
.any(|(rt, cutoff)| {
rt.visible_at(*cutoff)
&& rt.fully_covers(table_min, table_max)
&& rt.seqno > table_kv_seqno
});
Comment thread
polaz marked this conversation as resolved.
Comment thread
polaz marked this conversation as resolved.
Comment thread
polaz marked this conversation as resolved.

Comment thread
polaz marked this conversation as resolved.
if !is_covered {
let reader = table
Expand Down
62 changes: 62 additions & 0 deletions src/table/meta.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,12 @@ pub struct ParsedMeta {
pub index_block_count: u64,
pub key_range: KeyRange,
pub(super) seqnos: (SeqNo, SeqNo),

/// Highest seqno from KV entries only (excludes range tombstones).
///
/// Falls back to `seqnos.1` (overall max) for tables written before
/// this field was introduced, which is conservative but correct.
pub(super) highest_kv_seqno: SeqNo,
pub file_size: u64,
pub item_count: u64,
pub tombstone_count: u64,
Expand Down Expand Up @@ -74,6 +80,21 @@ macro_rules! read_u64 {
}};
}

/// Checks the on-disk invariant that the KV-only max seqno (`kv_seqno`)
/// never exceeds the overall max seqno (`max_seqno`, which covers both
/// KV entries and range tombstone seqnos).
///
/// Returns `kv_seqno` unchanged when the invariant holds; otherwise
/// reports an `InvalidData` I/O error, since a KV-only value above the
/// overall max can only arise from on-disk metadata corruption.
fn validated_kv_seqno(kv_seqno: SeqNo, max_seqno: SeqNo) -> crate::Result<SeqNo> {
    if kv_seqno <= max_seqno {
        Ok(kv_seqno)
    } else {
        // Message names the metadata keys involved so the failure is
        // easy to trace back to the corrupted entry.
        Err(std::io::Error::new(
            std::io::ErrorKind::InvalidData,
            "seqno#kv_max exceeds seqno#max",
        )
        .into())
    }
}

impl ParsedMeta {
#[expect(clippy::expect_used, clippy::too_many_lines)]
pub fn load_with_handle(file: &File, handle: &BlockHandle) -> crate::Result<Self> {
Expand Down Expand Up @@ -189,6 +210,20 @@ impl ParsedMeta {
(min, max)
};

// Optional field introduced for table-skip optimization.
// Old tables lack this key; fall back to overall max seqno
// (conservative: table-skip compares rt.seqno > highest_kv_seqno,
// so falling back to the higher overall max just disables the
// optimization for legacy tables — correct but not optimal).
// If the key exists but is truncated, propagate the I/O error to
// surface metadata corruption rather than silently falling back.
let highest_kv_seqno = if let Some(item) = block.point_read(b"seqno#kv_max", SeqNo::MAX) {
let mut bytes = &item.value[..];
validated_kv_seqno(bytes.read_u64::<LittleEndian>()?, seqnos.1)?
} else {
seqnos.1
};
Comment thread
polaz marked this conversation as resolved.
Comment thread
polaz marked this conversation as resolved.

let data_block_compression = {
let bytes = block
.point_read(b"compression#data", SeqNo::MAX)
Expand All @@ -214,6 +249,7 @@ impl ParsedMeta {
index_block_count,
key_range,
seqnos,
highest_kv_seqno,
file_size,
item_count,
tombstone_count,
Expand All @@ -224,3 +260,29 @@ impl ParsedMeta {
})
}
}

#[cfg(test)]
mod tests {
    use super::*;

    // Boundary and happy-path coverage for the seqno#kv_max invariant check.

    #[test]
    fn validated_kv_seqno_within_bounds() {
        // Strictly below the overall max: value passes through unchanged.
        let result = validated_kv_seqno(5, 10);
        assert_eq!(result.unwrap(), 5);
    }

    #[test]
    fn validated_kv_seqno_equal_to_max() {
        // Equality is allowed: a table whose highest entry is a KV item
        // has kv_max == max.
        let result = validated_kv_seqno(10, 10);
        assert_eq!(result.unwrap(), 10);
    }

    #[test]
    fn validated_kv_seqno_zero() {
        // Zero (no KV entries observed) must also validate cleanly.
        let result = validated_kv_seqno(0, 10);
        assert_eq!(result.unwrap(), 0);
    }

    #[test]
    fn validated_kv_seqno_exceeds_max_returns_error() {
        // kv_max > max is impossible on a well-formed table, so the
        // loader must surface it as InvalidData rather than fall back.
        let err = validated_kv_seqno(11, 10).unwrap_err();
        assert!(matches!(err, crate::Error::Io(e) if e.kind() == std::io::ErrorKind::InvalidData));
    }
}
23 changes: 22 additions & 1 deletion src/table/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -579,7 +579,14 @@ impl Table {
)));
}

Self::decode_range_tombstones(&block)?
let mut rts = Self::decode_range_tombstones(&block)?;
// Sort range tombstones by (start asc, seqno desc) to enable
// binary search in point-read suppression paths. Uses explicit
// comparator so the partition_point invariant is independent of
// Ord changes. The seqno-desc tiebreaker ensures higher-seqno
// RTs are checked first when multiple share the same start key.
rts.sort_unstable_by(|a, b| a.start.cmp(&b.start).then_with(|| b.seqno.cmp(&a.seqno)));
rts
} else {
Vec::new()
};
Expand Down Expand Up @@ -787,6 +794,20 @@ impl Table {
self.metadata.seqnos.1 + self.global_seqno()
}

/// Returns the highest sequence number contributed by KV entries alone,
/// i.e. excluding range tombstone seqnos.
///
/// Table-skip benefits from this tighter bound: a covering range
/// tombstone stored in the same table may satisfy
/// `rt.seqno > highest_kv_seqno` even though it cannot exceed the
/// overall max, making such tables skippable.
///
/// Legacy tables (written before the `seqno#kv_max` metadata field
/// existed) load `highest_kv_seqno` as the overall max, so for them this
/// degrades to `get_highest_seqno()`-equivalent behavior — conservative
/// but correct.
#[must_use]
pub fn get_highest_kv_seqno(&self) -> SeqNo {
    // Stored seqno is table-local; shift it into the global seqno space.
    let local_max = self.metadata.highest_kv_seqno;
    local_max + self.global_seqno()
}

/// Returns the number of tombstone markers in the `Table`.
#[must_use]
#[doc(hidden)]
Expand Down
10 changes: 9 additions & 1 deletion src/table/writer/meta.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,15 @@ pub struct Metadata {
/// Lowest encountered seqno
pub lowest_seqno: SeqNo,

/// Highest encountered seqno
/// Highest encountered seqno (includes both KV and RT)
pub highest_seqno: SeqNo,

/// Highest encountered seqno from KV entries only (excludes range tombstones).
///
/// Used for table-skip decisions: a covering RT stored in the same table
/// can now trigger skip because `rt.seqno > highest_kv_seqno` may be true
/// even when `rt.seqno <= highest_seqno`.
pub highest_kv_seqno: SeqNo,
}

impl Default for Metadata {
Expand All @@ -60,6 +67,7 @@ impl Default for Metadata {

lowest_seqno: SeqNo::MAX,
highest_seqno: 0,
highest_kv_seqno: 0,
}
}
}
10 changes: 10 additions & 0 deletions src/table/writer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,13 @@ impl Writer {

self.meta.lowest_seqno = self.meta.lowest_seqno.min(seqno);
self.meta.highest_seqno = self.meta.highest_seqno.max(seqno);
// highest_kv_seqno tracks the highest seqno among user KV entries
// written via write() (values, point tombstones, weak tombstones).
// Range tombstones (via write_range_tombstone) are excluded. In
// RT-only tables, finish() writes a synthetic sentinel via write()
// but restores highest_kv_seqno afterwards, so this bound reflects
// only actual user KV items.
self.meta.highest_kv_seqno = self.meta.highest_kv_seqno.max(seqno);
Comment thread
polaz marked this conversation as resolved.

Ok(())
}
Expand Down Expand Up @@ -436,6 +443,7 @@ impl Writer {
{
let saved_lo = self.meta.lowest_seqno;
let saved_hi = self.meta.highest_seqno;
let saved_kv_hi = self.meta.highest_kv_seqno;

// Write a sentinel key to force index block creation in RT-only
// tables. The sentinel must use the start key of the same
Expand All @@ -455,6 +463,7 @@ impl Writer {
// actual block contents for consistency with recovery/tests.
self.meta.lowest_seqno = saved_lo;
self.meta.highest_seqno = saved_hi;
self.meta.highest_kv_seqno = saved_kv_hi;

// Ensure the table's key range covers all range tombstones.
self.meta.first_key = Some(start);
Expand Down Expand Up @@ -598,6 +607,7 @@ impl Writer {
"restart_interval#index",
&self.index_block_restart_interval.to_le_bytes(),
),
meta("seqno#kv_max", &self.meta.highest_kv_seqno.to_le_bytes()),
meta("seqno#max", &self.meta.highest_seqno.to_le_bytes()),
meta("seqno#min", &self.meta.lowest_seqno.to_le_bytes()),
meta("table_id", &self.table_id.to_le_bytes()),
Expand Down
10 changes: 9 additions & 1 deletion src/tree/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -816,6 +816,9 @@ impl Tree {
// or widens metadata in the inclusive-upper-bound fallback. That makes
// `metadata.key_range.contains_key(key)` a sound early reject here and
// avoids scanning RT blocks for unrelated SSTs on point reads.
//
// Per-table RT lists are sorted by start key on load,
// so binary search narrows candidates to RTs with start <= key.
for table in super_version
.version
.iter_levels()
Expand All @@ -824,7 +827,12 @@ impl Tree {
.filter(|t| !t.range_tombstones().is_empty())
.filter(|t| t.metadata.key_range.contains_key(key))
{
for rt in table.range_tombstones() {
let rts = table.range_tombstones();
let candidate_end = rts.partition_point(|rt| rt.start.as_ref() <= key);

for rt in rts.iter().take(candidate_end) {
// Binary search already narrowed to start <= key; should_suppress
// re-checks contains_key (harmless) and avoids semantic drift.
if rt.should_suppress(key, key_seqno, read_seqno) {
return true;
}
Expand Down
Loading