From 1c72b5efc6470b91b95698b0850533b92b1aea84 Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov
Date: Sat, 21 Mar 2026 23:36:12 +0200
Subject: [PATCH 01/11] perf(range-tombstone): optimize RT lookup in
 table-skip and point-read
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Separate KV/RT seqno tracking: new `seqno#kv_max` metadata field
  excludes range tombstone seqnos, enabling table-skip for covering RTs
  stored in the same table (previously impossible because
  `rt.seqno > get_highest_seqno()` was always false for co-located RTs)
- Binary search on sorted RT list for table-skip: partition_point
  narrows candidates to RTs with start <= table_min, reducing
  O(tables × rt_count) to O(tables × (log(rt_count) + k))
- Binary search for point-read suppression: per-table RT lists are
  sorted on load, enabling partition_point to skip RTs with start > key
- Backward-compatible: old tables without `seqno#kv_max` fall back to
  `seqno#max` (conservative but correct)

Closes #27
---
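The narrowing works because fully_covers can only hold for a tombstone
whose start is at or before the table's minimum key, and in a
start-sorted list those tombstones form a contiguous prefix; the k in
the complexity bound above is the length of that prefix. A minimal
standalone sketch of the skip decision (the Rt type and the simplified
visibility/seqno handling are illustrative stand-ins, not the crate's
actual definitions):

    struct Rt {
        start: Vec<u8>,
        end: Vec<u8>, // exclusive, matching fully_covers' half-open check
        seqno: u64,
    }

    fn is_table_covered(
        rts_sorted_by_start: &[Rt],
        table_min: &[u8],
        table_max: &[u8],
        table_kv_seqno: u64,
    ) -> bool {
        // Only RTs with start <= table_min can cover [table_min, table_max],
        // and in a start-sorted list they form the prefix [0..idx).
        let idx = rts_sorted_by_start
            .partition_point(|rt| rt.start.as_slice() <= table_min);
        rts_sorted_by_start[..idx].iter().any(|rt| {
            // table_max is inclusive, rt.end is exclusive:
            // require table_max < rt.end.
            table_max < rt.end.as_slice() && rt.seqno > table_kv_seqno
        })
    }

    fn main() {
        // Sorted by start: [a, z) at seqno 10, then [m, n) at seqno 12.
        let rts = vec![
            Rt { start: b"a".to_vec(), end: b"z".to_vec(), seqno: 10 },
            Rt { start: b"m".to_vec(), end: b"n".to_vec(), seqno: 12 },
        ];

        // A table spanning [b, k] whose KV-only max seqno is 4 is skippable.
        assert!(is_table_covered(&rts, b"b", b"k", 4));
        // The same table with KV entries newer than the tombstone is not.
        assert!(!is_table_covered(&rts, b"b", b"k", 11));
    }
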
 src/range.rs             |  39 +++++++++----
 src/table/meta.rs        |  16 ++++++
 src/table/mod.rs         |  20 ++++++-
 src/table/writer/meta.rs |  10 +++-
 src/table/writer/mod.rs  |   4 ++
 src/tree/mod.rs          |  16 +++++-
 tests/range_tombstone.rs | 116 +++++++++++++++++++++++++++++++++++++++
 7 files changed, 205 insertions(+), 16 deletions(-)

diff --git a/src/range.rs b/src/range.rs
index 42403536e..cb7a3149f 100644
--- a/src/range.rs
+++ b/src/range.rs
@@ -239,18 +239,33 @@ impl TreeIter {
         for table in single_tables {
             // Table-skip: if a range tombstone fully covers this table
             // with a higher seqno, skip it entirely (avoid I/O).
-            // NOTE: get_highest_seqno() includes RT seqnos, so a covering
-            // RT stored in the same table won't trigger skip (conservative
-            // but correct). Separate KV/RT seqno bounds would improve this.
-            // key_range.max() is inclusive, fully_covers uses half-open: max < rt.end
-            let is_covered = all_range_tombstones.iter().any(|(rt, cutoff)| {
-                rt.visible_at(*cutoff)
-                    && rt.fully_covers(
-                        table.metadata.key_range.min(),
-                        table.metadata.key_range.max(),
-                    )
-                    && rt.seqno > table.get_highest_seqno()
-            });
+            //
+            // Uses get_highest_kv_seqno() which excludes RT seqnos, so a
+            // covering RT stored in the same table can now trigger skip.
+            //
+            // Binary search on sorted RT list: partition_point finds the
+            // first RT with start > table_min; only the prefix [0..idx]
+            // can have start <= table_min (required for fully_covers).
+            let table_min: &[u8] = table.metadata.key_range.min().as_ref();
+            let table_max: &[u8] = table.metadata.key_range.max().as_ref();
+            let table_kv_seqno = table.get_highest_kv_seqno();
+
+            let candidate_end =
+                all_range_tombstones.partition_point(|(rt, _)| rt.start.as_ref() <= table_min);
+
+            // SAFETY: partition_point returns 0..=len, so this slice never panics.
+            #[expect(
+                clippy::indexing_slicing,
+                reason = "partition_point guarantees idx <= len"
+            )]
+            let is_covered =
+                all_range_tombstones[..candidate_end]
+                    .iter()
+                    .any(|(rt, cutoff)| {
+                        rt.visible_at(*cutoff)
+                            && rt.fully_covers(table_min, table_max)
+                            && rt.seqno > table_kv_seqno
+                    });
 
             if !is_covered {
                 let reader = table
diff --git a/src/table/meta.rs b/src/table/meta.rs
index ff1810f10..d63c53807 100644
--- a/src/table/meta.rs
+++ b/src/table/meta.rs
@@ -42,6 +42,12 @@ pub struct ParsedMeta {
     pub index_block_count: u64,
     pub key_range: KeyRange,
     pub(super) seqnos: (SeqNo, SeqNo),
+
+    /// Highest seqno from KV entries only (excludes range tombstones).
+    ///
+    /// Falls back to `seqnos.1` (overall max) for tables written before
+    /// this field was introduced, which is conservative but correct.
+    pub(super) highest_kv_seqno: SeqNo,
     pub file_size: u64,
     pub item_count: u64,
     pub tombstone_count: u64,
@@ -189,6 +195,15 @@ impl ParsedMeta {
             (min, max)
         };
 
+        // Optional field introduced for table-skip optimization.
+        // Old tables lack this key; fall back to overall max (conservative).
+        let highest_kv_seqno = if let Some(item) = block.point_read(b"seqno#kv_max", SeqNo::MAX) {
+            let mut bytes = &item.value[..];
+            bytes.read_u64::<LittleEndian>().unwrap_or(seqnos.1)
+        } else {
+            seqnos.1
+        };
+
         let data_block_compression = {
             let bytes = block
                 .point_read(b"compression#data", SeqNo::MAX)
@@ -214,6 +229,7 @@ impl ParsedMeta {
             index_block_count,
             key_range,
             seqnos,
+            highest_kv_seqno,
             file_size,
             item_count,
             tombstone_count,
diff --git a/src/table/mod.rs b/src/table/mod.rs
index a76083bd0..6770f772a 100644
--- a/src/table/mod.rs
+++ b/src/table/mod.rs
@@ -579,7 +579,11 @@ impl Table {
                 )));
             }
 
-            Self::decode_range_tombstones(&block)?
+            let mut rts = Self::decode_range_tombstones(&block)?;
+            // Sort by (start asc, seqno desc) to enable binary search in
+            // table-skip and point-read suppression paths.
+            rts.sort();
+            rts
         } else {
             Vec::new()
         };
@@ -787,6 +791,20 @@ impl Table {
         self.metadata.seqnos.1 + self.global_seqno()
     }
 
+    /// Returns the highest sequence number from KV entries only,
+    /// excluding range tombstone seqnos.
+    ///
+    /// This enables more aggressive table-skip: a covering RT stored
+    /// in the same table can trigger skip because its seqno may exceed
+    /// the KV-only max even though it doesn't exceed the overall max.
+    ///
+    /// For tables written before this field was introduced, falls back
+    /// to `get_highest_seqno()` (conservative but correct).
+    #[must_use]
+    pub fn get_highest_kv_seqno(&self) -> SeqNo {
+        self.metadata.highest_kv_seqno + self.global_seqno()
+    }
+
     /// Returns the number of tombstone markers in the `Table`.
     #[must_use]
     #[doc(hidden)]
diff --git a/src/table/writer/meta.rs b/src/table/writer/meta.rs
index f597bf39a..9b235ab30 100644
--- a/src/table/writer/meta.rs
+++ b/src/table/writer/meta.rs
@@ -38,8 +38,15 @@ pub struct Metadata {
     /// Lowest encountered seqno
     pub lowest_seqno: SeqNo,
 
-    /// Highest encountered seqno
+    /// Highest encountered seqno (includes both KV and RT)
    pub highest_seqno: SeqNo,
+
+    /// Highest encountered seqno from KV entries only (excludes range tombstones).
+    ///
+    /// Used for table-skip decisions: a covering RT stored in the same table
+    /// can now trigger skip because `rt.seqno > highest_kv_seqno` may be true
+    /// even when `rt.seqno <= highest_seqno`.
+    pub highest_kv_seqno: SeqNo,
 }
 
 impl Default for Metadata {
@@ -60,6 +67,7 @@ impl Default for Metadata {
 
             lowest_seqno: SeqNo::MAX,
             highest_seqno: 0,
+            highest_kv_seqno: 0,
         }
     }
 }
diff --git a/src/table/writer/mod.rs b/src/table/writer/mod.rs
index b0439ef80..e1bc29ad4 100644
--- a/src/table/writer/mod.rs
+++ b/src/table/writer/mod.rs
@@ -303,6 +303,7 @@ impl Writer {
 
         self.meta.lowest_seqno = self.meta.lowest_seqno.min(seqno);
         self.meta.highest_seqno = self.meta.highest_seqno.max(seqno);
+        self.meta.highest_kv_seqno = self.meta.highest_kv_seqno.max(seqno);
 
         Ok(())
     }
@@ -436,6 +437,7 @@ impl Writer {
     {
         let saved_lo = self.meta.lowest_seqno;
         let saved_hi = self.meta.highest_seqno;
+        let saved_kv_hi = self.meta.highest_kv_seqno;
 
         // Write a sentinel key to force index block creation in RT-only
         // tables. The sentinel must use the start key of the same
@@ -455,6 +457,7 @@ impl Writer {
         // actual block contents for consistency with recovery/tests.
         self.meta.lowest_seqno = saved_lo;
         self.meta.highest_seqno = saved_hi;
+        self.meta.highest_kv_seqno = saved_kv_hi;
 
         // Ensure the table's key range covers all range tombstones.
         self.meta.first_key = Some(start);
@@ -598,6 +601,7 @@ impl Writer {
             meta(
                 "restart_interval#index",
                 &self.index_block_restart_interval.to_le_bytes(),
             ),
+            meta("seqno#kv_max", &self.meta.highest_kv_seqno.to_le_bytes()),
             meta("seqno#max", &self.meta.highest_seqno.to_le_bytes()),
             meta("seqno#min", &self.meta.lowest_seqno.to_le_bytes()),
             meta("table_id", &self.table_id.to_le_bytes()),
diff --git a/src/tree/mod.rs b/src/tree/mod.rs
index 9f8979718..0ab86c8c1 100644
--- a/src/tree/mod.rs
+++ b/src/tree/mod.rs
@@ -816,6 +816,9 @@ impl Tree {
         // or widens metadata in the inclusive-upper-bound fallback. That makes
         // `metadata.key_range.contains_key(key)` a sound early reject here and
         // avoids scanning RT blocks for unrelated SSTs on point reads.
+        //
+        // Per-table RT lists are sorted by (start asc, seqno desc) on load,
+        // so binary search narrows candidates to RTs with start <= key.
         for table in super_version
             .version
             .iter_levels()
@@ -824,8 +827,17 @@ impl Tree {
             .filter(|t| !t.range_tombstones().is_empty())
             .filter(|t| t.metadata.key_range.contains_key(key))
         {
-            for rt in table.range_tombstones() {
-                if rt.should_suppress(key, key_seqno, read_seqno) {
+            let rts = table.range_tombstones();
+            let candidate_end = rts.partition_point(|rt| rt.start.as_ref() <= key);
+
+            // SAFETY: partition_point returns 0..=len, so this slice never panics.
+            #[expect(
+                clippy::indexing_slicing,
+                reason = "partition_point guarantees idx <= len"
+            )]
+            let candidates = &rts[..candidate_end];
+            for rt in candidates {
+                if rt.visible_at(read_seqno) && key < rt.end.as_ref() && key_seqno < rt.seqno {
                     return true;
                 }
             }
diff --git a/tests/range_tombstone.rs b/tests/range_tombstone.rs
index 7390250b7..c11881e16 100644
--- a/tests/range_tombstone.rs
+++ b/tests/range_tombstone.rs
@@ -1144,6 +1144,122 @@ fn range_tombstone_memtable_narrow_range_queries_ignore_disjoint_rt() -> lsm_tre
     Ok(())
 }
 
+// --- Separate KV/RT seqno bounds ---
+
+/// Tables that contain both KVs and range tombstones should track a
+/// separate `highest_kv_seqno`. This enables table-skip for a covering
+/// RT stored in the same table: `rt.seqno > highest_kv_seqno` can be
+/// true even when `rt.seqno <= highest_seqno`.
+#[test]
+fn kv_seqno_excludes_range_tombstone_seqno() -> lsm_tree::Result<()> {
+    let folder = get_tmp_folder();
+    let tree = open_tree(folder.path());
+
+    // KVs at seqnos 1..=4
+    tree.insert("a", "val_a", 1);
+    tree.insert("b", "val_b", 2);
+    tree.insert("c", "val_c", 3);
+    tree.insert("d", "val_d", 4);
+
+    // RT at seqno 10 — higher than any KV
+    tree.remove_range("a", "z", 10);
+
+    // Flush everything into a single SST
+    tree.flush_active_memtable(0)?;
+
+    let table = find_rt_table(&tree);
+
+    // highest_seqno includes RT seqno (10)
+    assert_eq!(table.get_highest_seqno(), 10);
+    // highest_kv_seqno excludes RT — only KVs (max is 4)
+    assert_eq!(table.get_highest_kv_seqno(), 4);
+
+    Ok(())
+}
+
+/// Table-skip should work when the covering RT is in the same table,
+/// thanks to `get_highest_kv_seqno()`. Verify via range iteration:
+/// if table-skip fires, the suppressed keys never appear.
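partition_point performs a binary search that assumes the predicate is
monotone over the slice (all true, then all false); on unsorted input
it returns an arbitrary boundary and candidates are silently missed. A
small standalone demonstration of the failure mode this commit fixes
(illustrative data, not the crate's types):

    fn main() {
        // Starts are NOT sorted: "m" comes before "a".
        let starts: Vec<&[u8]> = vec![b"m", b"a"];
        let table_min: &[u8] = b"b";

        // The predicate is not monotone over this slice: [false, true].
        // partition_point's binary search settles on 0...
        let idx = starts.partition_point(|s| *s <= table_min);
        assert_eq!(idx, 0);
        // ...so a covering tombstone starting at "a" is never examined.

        // After sorting, the boundary is meaningful again.
        let mut sorted = starts.clone();
        sorted.sort();
        let idx = sorted.partition_point(|s| *s <= table_min);
        assert_eq!(idx, 1); // prefix [0..1) contains b"a"
    }
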
+#[test]
+fn table_skip_with_colocated_range_tombstone() -> lsm_tree::Result<()> {
+    let folder = get_tmp_folder();
+    let tree = open_tree(folder.path());
+
+    // KVs at seqnos 1..=3
+    tree.insert("a", "val_a", 1);
+    tree.insert("b", "val_b", 2);
+    tree.insert("c", "val_c", 3);
+
+    // Covering RT [a, z) at seqno 10 — in the same memtable
+    tree.remove_range("a", "z", 10);
+
+    // Flush: both KVs and RT go into one SST
+    tree.flush_active_memtable(0)?;
+
+    // Range scan at seqno 11 — all keys suppressed
+    assert_eq!(collect_keys(&tree, 11)?, Vec::<Vec<u8>>::new());
+    // Reverse scan too
+    assert_eq!(collect_keys_rev(&tree, 11)?, Vec::<Vec<u8>>::new());
+
+    // Point reads also suppressed
+    assert_eq!(None, tree.get("a", 11)?);
+    assert_eq!(None, tree.get("b", 11)?);
+    assert_eq!(None, tree.get("c", 11)?);
+
+    Ok(())
+}
+
+/// Binary search correctness: covering RT with start exactly at table min key.
+#[test]
+fn table_skip_rt_start_equals_table_min() -> lsm_tree::Result<()> {
+    let folder = get_tmp_folder();
+    let tree = open_tree(folder.path());
+
+    tree.insert("m", "val_m", 1);
+    tree.insert("n", "val_n", 2);
+    tree.insert("o", "val_o", 3);
+
+    // RT starts exactly at "m" (table min)
+    tree.remove_range("m", "p", 10);
+    tree.flush_active_memtable(0)?;
+
+    assert_eq!(collect_keys(&tree, 11)?, Vec::<Vec<u8>>::new());
+    assert_eq!(None, tree.get("m", 11)?);
+    assert_eq!(None, tree.get("n", 11)?);
+    assert_eq!(None, tree.get("o", 11)?);
+
+    Ok(())
+}
+
+/// Point-read binary search: multiple RTs in a table, only one covers the key.
+#[test]
+fn point_read_binary_search_multiple_rts() -> lsm_tree::Result<()> {
+    let folder = get_tmp_folder();
+    let tree = open_tree(folder.path());
+
+    tree.insert("a", "val_a", 1);
+    tree.insert("d", "val_d", 2);
+    tree.insert("g", "val_g", 3);
+    tree.insert("j", "val_j", 4);
+
+    // Two disjoint RTs
+    tree.remove_range("a", "c", 10); // covers "a"
+    tree.remove_range("g", "i", 11); // covers "g"
+
+    tree.flush_active_memtable(0)?;
+
+    // "a" suppressed by first RT
+    assert_eq!(None, tree.get("a", 12)?);
+    // "d" not covered by any RT
+    assert_eq!(Some("val_d".as_bytes().into()), tree.get("d", 12)?);
+    // "g" suppressed by second RT
+    assert_eq!(None, tree.get("g", 12)?);
+    // "j" not covered
+    assert_eq!(Some("val_j".as_bytes().into()), tree.get("j", 12)?);
+
+    Ok(())
+}
+
 #[test]
 fn range_tombstone_disjoint_survives_recovery_for_narrow_scans() -> lsm_tree::Result<()> {
     let folder = get_tmp_folder();

From f4702d3fe03dfe13453d7e466dc948a820f8ee2e Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov
Date: Sun, 22 Mar 2026 00:17:30 +0200
Subject: [PATCH 02/11] fix(range-tombstone): sort RT list before binary
 search in table-skip
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Sort SST-sourced RTs before table-skip loop so partition_point
  operates on sorted data (was using unsorted list — incorrect results)
- Propagate read_u64 error for seqno#kv_max instead of silent fallback
  to surface metadata corruption
- Use should_suppress() in point-read path instead of inlined logic to
  avoid semantic drift from the canonical suppression check
- Update sort comment to reference Ord implementation
---
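iter().take(n) clamps to the iterator's length, so nothing needs to be
promised to the lint about the bound. The two forms are equivalent
whenever the bound is at most the length, which partition_point
guarantees; a quick sanity check:

    fn main() {
        let xs = [1, 2, 3, 4];
        let idx = xs.partition_point(|&x| x <= 2); // 2

        let via_slice: Vec<_> = xs[..idx].iter().copied().collect();
        let via_take: Vec<_> = xs.iter().take(idx).copied().collect();
        assert_eq!(via_slice, via_take);

        // take() also degrades gracefully if the bound ever exceeds
        // the length, whereas slicing would panic.
        assert_eq!(xs.iter().take(99).count(), 4);
    }
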
 src/range.rs      |  5 +++++
 src/table/meta.rs |  4 +++-
 src/table/mod.rs  |  4 ++--
 src/tree/mod.rs   |  4 +++-
 4 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/range.rs b/src/range.rs
index cb7a3149f..5448e7699 100644
--- a/src/range.rs
+++ b/src/range.rs
@@ -236,6 +236,11 @@ impl TreeIter {
             }
         }
 
+        // Sort SST-sourced RTs by start key for binary search in
+        // table-skip below. The full list is re-sorted (with memtable
+        // RTs) later for the RangeTombstoneFilter.
+        all_range_tombstones.sort_by(|a, b| a.0.cmp(&b.0));
+
         for table in single_tables {
             // Table-skip: if a range tombstone fully covers this table
             // with a higher seqno, skip it entirely (avoid I/O).
diff --git a/src/table/meta.rs b/src/table/meta.rs
index d63c53807..a98aa3824 100644
--- a/src/table/meta.rs
+++ b/src/table/meta.rs
@@ -197,9 +197,11 @@ impl ParsedMeta {
 
         // Optional field introduced for table-skip optimization.
         // Old tables lack this key; fall back to overall max (conservative).
+        // If the key exists but is truncated, propagate the I/O error to
+        // surface metadata corruption rather than silently falling back.
         let highest_kv_seqno = if let Some(item) = block.point_read(b"seqno#kv_max", SeqNo::MAX) {
             let mut bytes = &item.value[..];
-            bytes.read_u64::<LittleEndian>().unwrap_or(seqnos.1)
+            bytes.read_u64::<LittleEndian>()?
         } else {
             seqnos.1
         };
diff --git a/src/table/mod.rs b/src/table/mod.rs
index 6770f772a..75d36c868 100644
--- a/src/table/mod.rs
+++ b/src/table/mod.rs
@@ -580,8 +580,8 @@ impl Table {
             }
 
             let mut rts = Self::decode_range_tombstones(&block)?;
-            // Sort by (start asc, seqno desc) to enable binary search in
-            // table-skip and point-read suppression paths.
+            // Sort range tombstones using their `Ord` implementation to enable
+            // binary search in table-skip and point-read suppression paths.
             rts.sort();
             rts
         } else {
             Vec::new()
diff --git a/src/tree/mod.rs b/src/tree/mod.rs
index 0ab86c8c1..dbae84e02 100644
--- a/src/tree/mod.rs
+++ b/src/tree/mod.rs
@@ -837,7 +837,9 @@ impl Tree {
             )]
             let candidates = &rts[..candidate_end];
             for rt in candidates {
-                if rt.visible_at(read_seqno) && key < rt.end.as_ref() && key_seqno < rt.seqno {
+                // Binary search already narrowed to start <= key; should_suppress
+                // re-checks contains_key (harmless) and avoids semantic drift.
+                if rt.should_suppress(key, key_seqno, read_seqno) {
                     return true;
                 }
             }

From 3046c7e92fa8806c8decbceebbf27c8aa4d81c05 Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov
Date: Sun, 22 Mar 2026 00:50:07 +0200
Subject: [PATCH 03/11] refactor(range-tombstone): use explicit start-key sort
 and .take() for binary search

- Sort by start key explicitly instead of delegating to Ord, so the
  partition_point invariant is enforced locally and independent of
  future Ord changes
- Replace [..candidate_end] slicing with .iter().take(candidate_end)
  to avoid clippy::indexing_slicing suppressions
---
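With equal start keys the tiebreaker places the highest seqno first, so
the scan over the candidate prefix meets the most likely suppressor as
early as possible. The comparator's effect on ties, with an
illustrative stand-in type (not the crate's RangeTombstone):

    struct Rt {
        start: &'static [u8],
        seqno: u64,
    }

    fn main() {
        let mut rts = vec![
            Rt { start: b"a", seqno: 3 },
            Rt { start: b"a", seqno: 9 },
            Rt { start: b"b", seqno: 1 },
        ];
        // (start asc, seqno desc), mirroring the comparator added here.
        rts.sort_by(|x, y| x.start.cmp(y.start).then_with(|| y.seqno.cmp(&x.seqno)));

        let order: Vec<u64> = rts.iter().map(|rt| rt.seqno).collect();
        // For the shared start "a", seqno 9 now precedes seqno 3.
        assert_eq!(order, vec![9, 3, 1]);
    }
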
 src/range.rs     | 16 +++++++---------
 src/table/mod.rs |  7 ++++---
 src/tree/mod.rs  |  8 +-------
 3 files changed, 12 insertions(+), 19 deletions(-)

diff --git a/src/range.rs b/src/range.rs
index 5448e7699..b78c9a152 100644
--- a/src/range.rs
+++ b/src/range.rs
@@ -237,9 +237,11 @@ impl TreeIter {
         }
 
         // Sort SST-sourced RTs by start key for binary search in
-        // table-skip below. The full list is re-sorted (with memtable
-        // RTs) later for the RangeTombstoneFilter.
-        all_range_tombstones.sort_by(|a, b| a.0.cmp(&b.0));
+        // table-skip below. Uses explicit start-key comparator so the
+        // partition_point invariant is enforced locally, independent of
+        // RangeTombstone::Ord. The full list is re-sorted (with memtable
+        // RTs) later for the RangeTombstoneFilter.
+        all_range_tombstones.sort_by(|(a, _), (b, _)| a.start.cmp(&b.start));
 
         for table in single_tables {
             // Table-skip: if a range tombstone fully covers this table
@@ -258,14 +260,10 @@ impl TreeIter {
             let candidate_end =
                 all_range_tombstones.partition_point(|(rt, _)| rt.start.as_ref() <= table_min);
 
-            // SAFETY: partition_point returns 0..=len, so this slice never panics.
-            #[expect(
-                clippy::indexing_slicing,
-                reason = "partition_point guarantees idx <= len"
-            )]
             let is_covered =
-                all_range_tombstones[..candidate_end]
+                all_range_tombstones
                     .iter()
+                    .take(candidate_end)
                     .any(|(rt, cutoff)| {
                         rt.visible_at(*cutoff)
                             && rt.fully_covers(table_min, table_max)
diff --git a/src/table/mod.rs b/src/table/mod.rs
index 75d36c868..edf28cca0 100644
--- a/src/table/mod.rs
+++ b/src/table/mod.rs
@@ -580,9 +580,10 @@ impl Table {
             }
 
             let mut rts = Self::decode_range_tombstones(&block)?;
-            // Sort range tombstones using their `Ord` implementation to enable
-            // binary search in table-skip and point-read suppression paths.
-            rts.sort();
+            // Sort range tombstones by (start asc) to enable binary search
+            // in point-read suppression paths. Uses explicit comparator so
+            // the partition_point invariant is independent of Ord changes.
+            rts.sort_by(|a, b| a.start.cmp(&b.start));
             rts
         } else {
             Vec::new()
diff --git a/src/tree/mod.rs b/src/tree/mod.rs
index dbae84e02..7db4b713c 100644
--- a/src/tree/mod.rs
+++ b/src/tree/mod.rs
@@ -830,13 +830,7 @@ impl Tree {
             let rts = table.range_tombstones();
             let candidate_end = rts.partition_point(|rt| rt.start.as_ref() <= key);
 
-            // SAFETY: partition_point returns 0..=len, so this slice never panics.
-            #[expect(
-                clippy::indexing_slicing,
-                reason = "partition_point guarantees idx <= len"
-            )]
-            let candidates = &rts[..candidate_end];
-            for rt in candidates {
+            for rt in rts.iter().take(candidate_end) {
                 // Binary search already narrowed to start <= key; should_suppress
                 // re-checks contains_key (harmless) and avoids semantic drift.
                 if rt.should_suppress(key, key_seqno, read_seqno) {

From 933413c35e4272390878ef253c299d55cdb4dfa8 Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov
Date: Sun, 22 Mar 2026 01:21:51 +0200
Subject: [PATCH 04/11] fix(range-tombstone): add seqno desc tiebreaker to
 per-table RT sort

- Sort per-table RTs by (start asc, seqno desc) instead of start-only,
  so suppression checks short-circuit on the highest-seqno RT first
- Align sort order documentation with the actual comparator; the
  point-read comment in tree/mod.rs now states only the start-key
  ordering that path relies on
---
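For a legacy table the fallback makes the KV bound equal the overall
max, so the skip predicate rt.seqno > kv_max can never fire for a
colocated tombstone, whose seqno is itself part of that max. Both cases
reduced to bare integers:

    fn main() {
        // Colocated covering RT at seqno 10, KVs up to seqno 4.
        let rt_seqno = 10;
        let (kv_max, overall_max) = (4, 10);

        // New tables: seqno#kv_max is stored separately -> skip fires.
        assert!(rt_seqno > kv_max);

        // Legacy tables: kv_max falls back to overall max -> skip is
        // disabled (10 > 10 is false), conservative but correct.
        let legacy_kv_max = overall_max;
        assert!(!(rt_seqno > legacy_kv_max));
    }
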
 src/table/mod.rs | 10 ++++++----
 src/tree/mod.rs  |  2 +-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/table/mod.rs b/src/table/mod.rs
index edf28cca0..4a0412507 100644
--- a/src/table/mod.rs
+++ b/src/table/mod.rs
@@ -580,10 +580,12 @@ impl Table {
             }
 
             let mut rts = Self::decode_range_tombstones(&block)?;
-            // Sort range tombstones by (start asc) to enable binary search
-            // in point-read suppression paths. Uses explicit comparator so
-            // the partition_point invariant is independent of Ord changes.
-            rts.sort_by(|a, b| a.start.cmp(&b.start));
+            // Sort range tombstones by (start asc, seqno desc) to enable
+            // binary search in point-read suppression paths. Uses explicit
+            // comparator so the partition_point invariant is independent of
+            // Ord changes. The seqno desc tiebreaker lets suppression checks
+            // short-circuit on the highest-seqno RT for a given start key.
+            rts.sort_by(|a, b| a.start.cmp(&b.start).then_with(|| b.seqno.cmp(&a.seqno)));
             rts
         } else {
             Vec::new()
diff --git a/src/tree/mod.rs b/src/tree/mod.rs
index 7db4b713c..1d840e5f8 100644
--- a/src/tree/mod.rs
+++ b/src/tree/mod.rs
@@ -817,7 +817,7 @@ impl Tree {
         // `metadata.key_range.contains_key(key)` a sound early reject here and
         // avoids scanning RT blocks for unrelated SSTs on point reads.
         //
-        // Per-table RT lists are sorted by (start asc, seqno desc) on load,
+        // Per-table RT lists are sorted by start key on load,
         // so binary search narrows candidates to RTs with start <= key.
         for table in super_version
             .version

From 622a8cf89998dc41222c8735326275f7926f3064 Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov
Date: Sun, 22 Mar 2026 01:35:04 +0200
Subject: [PATCH 05/11] docs(range-tombstone): clarify intentional double-sort
 in table-skip path

---
 src/range.rs | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/range.rs b/src/range.rs
index b78c9a152..8459f63b6 100644
--- a/src/range.rs
+++ b/src/range.rs
@@ -237,10 +237,11 @@ impl TreeIter {
         }
 
         // Sort SST-sourced RTs by start key for binary search in
-        // table-skip below. Uses explicit start-key comparator so the
-        // partition_point invariant is enforced locally, independent of
-        // RangeTombstone::Ord. The full list is re-sorted (with memtable
-        // RTs) later for the RangeTombstoneFilter.
+        // table-skip below. This is intentionally a separate sort from
+        // the full sort+dedup later: table-skip runs here (before memtable
+        // RTs are collected), so only SST RTs are present. The later sort
+        // covers the complete list. Both sorts are O(n log n) on their
+        // respective subsets; the SST-only subset is typically small.
         all_range_tombstones.sort_by(|(a, _), (b, _)| a.start.cmp(&b.start));
 
         for table in single_tables {

From b7cf866b92820fbaff1b4cd4358433a4675b8c36 Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov
Date: Sun, 22 Mar 2026 02:09:52 +0200
Subject: [PATCH 06/11] perf(range-tombstone): use sort_unstable_by and
 clarify KV seqno invariant

- Switch to sort_unstable_by for RT sorts (no stability needed)
- Clarify that highest_kv_seqno tracks all data-block item types
  (values, point tombstones, weak tombstones), not just values
- Restore inclusive-max vs half-open documentation in table-skip
- Expand backward-compat fallback rationale in ParsedMeta
---
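The check must accept equality, because tables without range tombstones
store the same value in both fields; only values strictly above
seqno#max are rejected. The full decision table as plain assertions (a
local copy of the predicate, outside the crate's error plumbing):

    fn main() {
        let exceeds = |kv: u64, max: u64| kv > max;

        assert!(!exceeds(4, 10));  // mixed table: KV max below overall max -> ok
        assert!(!exceeds(10, 10)); // RT-free table: both fields equal -> ok
        assert!(!exceeds(0, 10));  // RT-only table: kv_max is 0 -> ok
        assert!(exceeds(11, 10));  // corrupt: kv_max above overall max -> error
    }
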
 src/range.rs            | 4 +++-
 src/table/meta.rs       | 5 ++++-
 src/table/mod.rs        | 2 +-
 src/table/writer/mod.rs | 3 +++
 4 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/range.rs b/src/range.rs
index 8459f63b6..2bfc06b28 100644
--- a/src/range.rs
+++ b/src/range.rs
@@ -242,7 +242,7 @@ impl TreeIter {
         // RTs are collected), so only SST RTs are present. The later sort
         // covers the complete list. Both sorts are O(n log n) on their
         // respective subsets; the SST-only subset is typically small.
-        all_range_tombstones.sort_by(|(a, _), (b, _)| a.start.cmp(&b.start));
+        all_range_tombstones.sort_unstable_by(|(a, _), (b, _)| a.start.cmp(&b.start));
 
         for table in single_tables {
             // Table-skip: if a range tombstone fully covers this table
@@ -254,6 +254,8 @@ impl TreeIter {
             // Binary search on sorted RT list: partition_point finds the
             // first RT with start > table_min; only the prefix [0..idx]
             // can have start <= table_min (required for fully_covers).
+            // key_range.max() is inclusive; fully_covers checks max < rt.end
+            // (half-open), so this is correct for inclusive upper bounds.
             let table_min: &[u8] = table.metadata.key_range.min().as_ref();
             let table_max: &[u8] = table.metadata.key_range.max().as_ref();
             let table_kv_seqno = table.get_highest_kv_seqno();
diff --git a/src/table/meta.rs b/src/table/meta.rs
index a98aa3824..b706ffab7 100644
--- a/src/table/meta.rs
+++ b/src/table/meta.rs
@@ -196,7 +196,10 @@ impl ParsedMeta {
         };
 
         // Optional field introduced for table-skip optimization.
-        // Old tables lack this key; fall back to overall max (conservative).
+        // Old tables lack this key; fall back to overall max seqno
+        // (conservative: table-skip compares rt.seqno > highest_kv_seqno,
+        // so falling back to the higher overall max just disables the
+        // optimization for legacy tables — correct but not optimal).
         // If the key exists but is truncated, propagate the I/O error to
         // surface metadata corruption rather than silently falling back.
         let highest_kv_seqno = if let Some(item) = block.point_read(b"seqno#kv_max", SeqNo::MAX) {
diff --git a/src/table/mod.rs b/src/table/mod.rs
index 4a0412507..2a8b25dcc 100644
--- a/src/table/mod.rs
+++ b/src/table/mod.rs
@@ -585,7 +585,7 @@ impl Table {
             // comparator so the partition_point invariant is independent of
             // Ord changes. The seqno desc tiebreaker lets suppression checks
             // short-circuit on the highest-seqno RT for a given start key.
-            rts.sort_by(|a, b| a.start.cmp(&b.start).then_with(|| b.seqno.cmp(&a.seqno)));
+            rts.sort_unstable_by(|a, b| a.start.cmp(&b.start).then_with(|| b.seqno.cmp(&a.seqno)));
             rts
         } else {
             Vec::new()
diff --git a/src/table/writer/mod.rs b/src/table/writer/mod.rs
index e1bc29ad4..29d89f09f 100644
--- a/src/table/writer/mod.rs
+++ b/src/table/writer/mod.rs
@@ -303,6 +303,9 @@ impl Writer {
 
         self.meta.lowest_seqno = self.meta.lowest_seqno.min(seqno);
         self.meta.highest_seqno = self.meta.highest_seqno.max(seqno);
+        // All item types (values, point tombstones, weak tombstones) are KV
+        // entries stored in data blocks. Only range tombstones (written via
+        // write_range_tombstone) are excluded from highest_kv_seqno.
        self.meta.highest_kv_seqno = self.meta.highest_kv_seqno.max(seqno);
 
         Ok(())

From c3c38e88637ae0c5383213f5b2973f579f2912d2 Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov
Date: Sun, 22 Mar 2026 02:31:32 +0200
Subject: [PATCH 07/11] fix(range-tombstone): validate seqno#kv_max against
 seqno#max on load

- Reject tables where highest_kv_seqno exceeds overall max seqno, which
  indicates on-disk metadata corruption
- Reword seqno-desc tiebreaker documentation to match actual behavior
---
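The asserted invariant also holds for the public getters, not just the
raw metadata: per the patch 01 diff, get_highest_seqno() and
get_highest_kv_seqno() add the same global_seqno() offset to their
bounds, and adding a common constant preserves the inequality:

    fn main() {
        let (kv_max, overall_max, global) = (4u64, 10u64, 100u64);
        assert!(kv_max <= overall_max);
        // Both getters add the same offset...
        let (kv_getter, overall_getter) = (kv_max + global, overall_max + global);
        // ...so the on-disk invariant carries over to the getter values.
        assert!(kv_getter <= overall_getter);
    }
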
 src/table/meta.rs | 14 +++++++++++++-
 src/table/mod.rs  |  4 ++--
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/src/table/meta.rs b/src/table/meta.rs
index b706ffab7..87876bab0 100644
--- a/src/table/meta.rs
+++ b/src/table/meta.rs
@@ -204,7 +204,19 @@ impl ParsedMeta {
         // surface metadata corruption rather than silently falling back.
         let highest_kv_seqno = if let Some(item) = block.point_read(b"seqno#kv_max", SeqNo::MAX) {
             let mut bytes = &item.value[..];
-            bytes.read_u64::<LittleEndian>()?
+            let value = bytes.read_u64::<LittleEndian>()?;
+            // KV-only seqno must not exceed the overall max (which includes
+            // both KV and RT seqnos). A value above seqnos.1 indicates
+            // on-disk corruption — surface it rather than silently using
+            // a bogus bound that could cause incorrect table-skip decisions.
+            if value > seqnos.1 {
+                return Err(std::io::Error::new(
+                    std::io::ErrorKind::InvalidData,
+                    "seqno#kv_max exceeds seqno#max",
+                )
+                .into());
+            }
+            value
         } else {
             seqnos.1
         };
diff --git a/src/table/mod.rs b/src/table/mod.rs
index 2a8b25dcc..5c5c28537 100644
--- a/src/table/mod.rs
+++ b/src/table/mod.rs
@@ -583,8 +583,8 @@ impl Table {
             // Sort range tombstones by (start asc, seqno desc) to enable
             // binary search in point-read suppression paths. Uses explicit
             // comparator so the partition_point invariant is independent of
-            // Ord changes. The seqno desc tiebreaker lets suppression checks
-            // short-circuit on the highest-seqno RT for a given start key.
+            // Ord changes. The seqno-desc tiebreaker ensures higher-seqno
+            // RTs are checked first when multiple share the same start key.
             rts.sort_unstable_by(|a, b| a.start.cmp(&b.start).then_with(|| b.seqno.cmp(&a.seqno)));
             rts
         } else {
             Vec::new()

From cd6556e1a8b498dcc01cfc89b0b64fcd73fe1c8d Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov
Date: Sun, 22 Mar 2026 02:49:20 +0200
Subject: [PATCH 08/11] refactor(range-tombstone): rename colocated RT test to
 reflect actual scope

Test verifies logical suppression (which works regardless of whether
table-skip fires), not the table-skip optimization specifically.
---
 tests/range_tombstone.rs | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tests/range_tombstone.rs b/tests/range_tombstone.rs
index c11881e16..72451ca3b 100644
--- a/tests/range_tombstone.rs
+++ b/tests/range_tombstone.rs
@@ -1177,11 +1177,12 @@ fn kv_seqno_excludes_range_tombstone_seqno() -> lsm_tree::Result<()> {
     Ok(())
 }
 
-/// Table-skip should work when the covering RT is in the same table,
-/// thanks to `get_highest_kv_seqno()`. Verify via range iteration:
-/// if table-skip fires, the suppressed keys never appear.
+/// When a covering range tombstone and its covered KVs are colocated in the
+/// same table, reads at a higher seqno should not observe those KVs.
+/// This verifies that the colocated range tombstone correctly suppresses
+/// the covered keys for range scans (forward and reverse) and point lookups.
 #[test]
-fn table_skip_with_colocated_range_tombstone() -> lsm_tree::Result<()> {
+fn colocated_range_tombstone_suppresses_keys() -> lsm_tree::Result<()> {
     let folder = get_tmp_folder();
     let tree = open_tree(folder.path());

From dbf92a137aab573fa7e571fbc89b94993a177e7a Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov
Date: Sun, 22 Mar 2026 02:57:25 +0200
Subject: [PATCH 09/11] test(range-tombstone): verify KV-only and RT-only
 seqno metadata invariants

- Tables without RTs have highest_kv_seqno == highest_seqno
- RT-only tables have highest_kv_seqno == 0 (sentinel restored)
- Assert highest_kv_seqno <= highest_seqno for all table types
---
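The save/restore around the sentinel is what keeps an RT-only table's
KV bound at zero. A simplified model of that bookkeeping, under the
assumption (from the patch 01 diff) that the sentinel goes through the
normal write() path; field names are abbreviated and the concrete seqno
values are illustrative:

    #[derive(Clone, Copy, Debug, PartialEq)]
    struct Bounds {
        lo: u64,
        hi: u64,
        kv_hi: u64,
    }

    // Mirrors Writer::write() bookkeeping: every data-block entry,
    // sentinel included, bumps all three bounds.
    fn write_entry(b: &mut Bounds, seqno: u64) {
        b.lo = b.lo.min(seqno);
        b.hi = b.hi.max(seqno);
        b.kv_hi = b.kv_hi.max(seqno);
    }

    fn main() {
        // RT-only table: only the range tombstone (seqno 10) has been
        // seen, so the KV bound is still at its default of 0.
        let mut b = Bounds { lo: 10, hi: 10, kv_hi: 0 };

        // finish() saves the bounds, writes the sentinel through the
        // normal path...
        let saved = b;
        write_entry(&mut b, 10);
        // ...and restores them, so the sentinel leaves no trace.
        b = saved;

        assert_eq!(b, Bounds { lo: 10, hi: 10, kv_hi: 0 });
    }
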
 tests/range_tombstone.rs | 51 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/tests/range_tombstone.rs b/tests/range_tombstone.rs
index 72451ca3b..1e82f90c6 100644
--- a/tests/range_tombstone.rs
+++ b/tests/range_tombstone.rs
@@ -1174,6 +1174,57 @@ fn kv_seqno_excludes_range_tombstone_seqno() -> lsm_tree::Result<()> {
     // highest_kv_seqno excludes RT — only KVs (max is 4)
     assert_eq!(table.get_highest_kv_seqno(), 4);
 
+    // Invariant: KV-only seqno must not exceed overall max
+    assert!(table.get_highest_kv_seqno() <= table.get_highest_seqno());
+
     Ok(())
 }
 
+/// Without range tombstones, highest_kv_seqno equals highest_seqno
+/// (all items are KV entries, none are RTs).
+#[test]
+fn kv_seqno_equals_overall_when_no_range_tombstones() -> lsm_tree::Result<()> {
+    let folder = get_tmp_folder();
+    let tree = open_tree(folder.path());
+
+    tree.insert("a", "val_a", 1);
+    tree.insert("b", "val_b", 2);
+    tree.insert("c", "val_c", 3);
+
+    tree.flush_active_memtable(0)?;
+
+    let table = tree
+        .current_version()
+        .iter_tables()
+        .next()
+        .expect("should have one table")
+        .clone();
+
+    assert_eq!(table.get_highest_seqno(), 3);
+    assert_eq!(table.get_highest_kv_seqno(), 3);
+
+    Ok(())
+}
+
+/// RT-only table: highest_kv_seqno is 0 because no KV items exist
+/// (only the sentinel entry, whose effect on seqno bounds is rolled back
+/// after the write).
+#[test]
+fn kv_seqno_zero_for_rt_only_table() -> lsm_tree::Result<()> {
+    let folder = get_tmp_folder();
+    let tree = open_tree(folder.path());
+
+    // Only an RT, no KV inserts
+    tree.remove_range("a", "z", 10);
+
+    tree.flush_active_memtable(0)?;
+
+    let table = find_rt_table(&tree);
+
+    // Overall seqno includes the RT
+    assert_eq!(table.get_highest_seqno(), 10);
+    // KV-only seqno is 0 — sentinel bookkeeping is restored to its
+    // pre-write state
+    assert_eq!(table.get_highest_kv_seqno(), 0);
+
+    Ok(())
+}
+

From 0aabe0c9befbf947429b446ebb79be7b4f1bc9bf Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov
Date: Sun, 22 Mar 2026 03:15:58 +0200
Subject: [PATCH 10/11] docs(range-tombstone): clarify highest_kv_seqno
 excludes RT sentinel

---
 src/table/writer/mod.rs | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/table/writer/mod.rs b/src/table/writer/mod.rs
index 29d89f09f..42ddd5833 100644
--- a/src/table/writer/mod.rs
+++ b/src/table/writer/mod.rs
@@ -303,9 +303,12 @@ impl Writer {
 
         self.meta.lowest_seqno = self.meta.lowest_seqno.min(seqno);
         self.meta.highest_seqno = self.meta.highest_seqno.max(seqno);
-        // All item types (values, point tombstones, weak tombstones) are KV
-        // entries stored in data blocks. Only range tombstones (written via
-        // write_range_tombstone) are excluded from highest_kv_seqno.
+        // highest_kv_seqno tracks the highest seqno among user KV entries
+        // written via write() (values, point tombstones, weak tombstones).
+        // Range tombstones (via write_range_tombstone) are excluded. In
+        // RT-only tables, finish() writes a synthetic sentinel via write()
+        // but restores highest_kv_seqno afterwards, so this bound reflects
+        // only actual user KV items.
        self.meta.highest_kv_seqno = self.meta.highest_kv_seqno.max(seqno);
 
         Ok(())
From 17887dbbb8e718b650973c99829f233cfbc286e7 Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov
Date: Sun, 22 Mar 2026 03:32:53 +0200
Subject: [PATCH 11/11] refactor(range-tombstone): extract kv seqno validation
 into testable function

Extract highest_kv_seqno corruption check into validated_kv_seqno()
with unit tests for valid, equal, zero, and exceeds-max cases.
---
 src/table/meta.rs | 55 ++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 42 insertions(+), 13 deletions(-)

diff --git a/src/table/meta.rs b/src/table/meta.rs
index 87876bab0..2c1ce984c 100644
--- a/src/table/meta.rs
+++ b/src/table/meta.rs
@@ -80,6 +80,21 @@ macro_rules! read_u64 {
     }};
 }
 
+/// Validates that `kv_seqno` does not exceed `max_seqno`.
+///
+/// KV-only seqno must be ≤ overall max (which includes both KV and RT seqnos).
+/// A value above `max_seqno` indicates on-disk corruption.
+fn validated_kv_seqno(kv_seqno: SeqNo, max_seqno: SeqNo) -> crate::Result<SeqNo> {
+    if kv_seqno > max_seqno {
+        return Err(std::io::Error::new(
+            std::io::ErrorKind::InvalidData,
+            "seqno#kv_max exceeds seqno#max",
+        )
+        .into());
+    }
+    Ok(kv_seqno)
+}
+
 impl ParsedMeta {
     #[expect(clippy::expect_used, clippy::too_many_lines)]
     pub fn load_with_handle(file: &File, handle: &BlockHandle) -> crate::Result<Self> {
@@ -204,19 +219,7 @@ impl ParsedMeta {
         // surface metadata corruption rather than silently falling back.
         let highest_kv_seqno = if let Some(item) = block.point_read(b"seqno#kv_max", SeqNo::MAX) {
             let mut bytes = &item.value[..];
-            let value = bytes.read_u64::<LittleEndian>()?;
-            // KV-only seqno must not exceed the overall max (which includes
-            // both KV and RT seqnos). A value above seqnos.1 indicates
-            // on-disk corruption — surface it rather than silently using
-            // a bogus bound that could cause incorrect table-skip decisions.
-            if value > seqnos.1 {
-                return Err(std::io::Error::new(
-                    std::io::ErrorKind::InvalidData,
-                    "seqno#kv_max exceeds seqno#max",
-                )
-                .into());
-            }
-            value
+            validated_kv_seqno(bytes.read_u64::<LittleEndian>()?, seqnos.1)?
         } else {
             seqnos.1
         };
@@ -257,3 +260,29 @@ impl ParsedMeta {
         })
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn validated_kv_seqno_within_bounds() {
+        assert_eq!(validated_kv_seqno(5, 10).unwrap(), 5);
+    }
+
+    #[test]
+    fn validated_kv_seqno_equal_to_max() {
+        assert_eq!(validated_kv_seqno(10, 10).unwrap(), 10);
+    }
+
+    #[test]
+    fn validated_kv_seqno_zero() {
+        assert_eq!(validated_kv_seqno(0, 10).unwrap(), 0);
+    }
+
+    #[test]
+    fn validated_kv_seqno_exceeds_max_returns_error() {
+        let err = validated_kv_seqno(11, 10).unwrap_err();
+        assert!(matches!(err, crate::Error::Io(e) if e.kind() == std::io::ErrorKind::InvalidData));
+    }
+}