From dbb4ee44e259377a5b747a781b565d7ef80866d0 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sat, 14 Mar 2026 13:53:19 +0200 Subject: [PATCH 01/30] feat: add optimized contains_prefix() method Add contains_prefix() to AbstractTree trait that checks if any key with the given prefix exists, stopping at the first match instead of materializing a full iterator. - Default implementation on AbstractTree uses prefix().next() - BlobTree overrides to delegate to index tree, avoiding value log reads - MVCC-correct: respects seqno visibility and tombstones Closes fjall-rs/lsm-tree#138 --- src/abstract_tree.rs | 38 ++++++++++ src/blob_tree/mod.rs | 11 +++ tests/tree_contains_prefix.rs | 136 ++++++++++++++++++++++++++++++++++ 3 files changed, 185 insertions(+) create mode 100644 tests/tree_contains_prefix.rs diff --git a/src/abstract_tree.rs b/src/abstract_tree.rs index 0a3f123a2..917e8ec80 100644 --- a/src/abstract_tree.rs +++ b/src/abstract_tree.rs @@ -511,6 +511,44 @@ pub trait AbstractTree { self.get(key, seqno).map(|x| x.is_some()) } + /// Returns `true` if the tree contains any key with the given prefix. + /// + /// This is more efficient than `prefix().next().is_some()` as it avoids + /// materializing the full iterator guard and can stop at the first match. + /// + /// # Examples + /// + /// ``` + /// # let folder = tempfile::tempdir()?; + /// use lsm_tree::{AbstractTree, Config, Tree}; + /// + /// let tree = Config::new(folder, Default::default(), Default::default()).open()?; + /// assert!(!tree.contains_prefix("abc", 0, None)?); + /// + /// tree.insert("abc:1", "value", 0); + /// assert!(tree.contains_prefix("abc", 1, None)?); + /// assert!(!tree.contains_prefix("xyz", 1, None)?); + /// # + /// # Ok::<(), lsm_tree::Error>(()) + /// ``` + /// + /// # Errors + /// + /// Will return `Err` if an IO error occurs. + fn contains_prefix>( + &self, + prefix: K, + seqno: SeqNo, + index: Option<(Arc, SeqNo)>, + ) -> crate::Result { + Ok(self + .prefix(prefix, seqno, index) + .next() + .map(crate::Guard::key) + .transpose()? + .is_some()) + } + /// Inserts a key-value pair into the tree. /// /// If the key already exists, the item will be overwritten. diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index 73ea1c119..b144ed915 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -555,6 +555,17 @@ impl AbstractTree for BlobTree { self.index.contains_key(key, seqno) } + // NOTE: Override the default implementation to not fetch + // data from the value log, so we get much faster prefix checks + fn contains_prefix>( + &self, + prefix: K, + seqno: SeqNo, + index: Option<(Arc, SeqNo)>, + ) -> crate::Result { + self.index.contains_prefix(prefix, seqno, index) + } + // NOTE: Override the default implementation to not fetch // data from the value log, so we get much faster scans fn len(&self, seqno: SeqNo, index: Option<(Arc, SeqNo)>) -> crate::Result { diff --git a/tests/tree_contains_prefix.rs b/tests/tree_contains_prefix.rs new file mode 100644 index 000000000..b4bc72eab --- /dev/null +++ b/tests/tree_contains_prefix.rs @@ -0,0 +1,136 @@ +use lsm_tree::{get_tmp_folder, AbstractTree, Config, SeqNo, SequenceNumberCounter}; +use test_log::test; + +#[test] +fn tree_contains_prefix_empty_tree() -> lsm_tree::Result<()> { + let folder = get_tmp_folder(); + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .open()?; + + assert!(!tree.contains_prefix("abc", SeqNo::MAX, None)?); + assert!(!tree.contains_prefix("", SeqNo::MAX, None)?); + + Ok(()) +} + +#[test] +fn tree_contains_prefix_basic() -> lsm_tree::Result<()> { + let folder = get_tmp_folder(); + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .open()?; + + tree.insert("abc:1", "value1", 0); + tree.insert("abc:2", "value2", 1); + tree.insert("def:1", "value3", 2); + + assert!(tree.contains_prefix("abc", 3, None)?); + assert!(tree.contains_prefix("def", 3, None)?); + assert!(!tree.contains_prefix("xyz", 3, None)?); + assert!(!tree.contains_prefix("ab", 0, None)?); + + Ok(()) +} + +#[test] +fn tree_contains_prefix_no_match() -> lsm_tree::Result<()> { + let folder = get_tmp_folder(); + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .open()?; + + tree.insert("abc", "value", 0); + tree.insert("abd", "value", 1); + + assert!(!tree.contains_prefix("xyz", 2, None)?); + assert!(!tree.contains_prefix("abe", 2, None)?); + assert!(!tree.contains_prefix("abca", 2, None)?); + + Ok(()) +} + +#[test] +fn tree_contains_prefix_mvcc() -> lsm_tree::Result<()> { + let folder = get_tmp_folder(); + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .open()?; + + // Insert at seqno 4 + tree.insert("abc:1", "value", 4); + + // Not visible at seqno 3 (seqno filter is item_seqno < query_seqno) + assert!(!tree.contains_prefix("abc", 3, None)?); + + // Not visible at seqno 4 (strict less-than) + assert!(!tree.contains_prefix("abc", 4, None)?); + + // Visible at seqno 5 + assert!(tree.contains_prefix("abc", 5, None)?); + + // Visible at MAX + assert!(tree.contains_prefix("abc", SeqNo::MAX, None)?); + + Ok(()) +} + +#[test] +fn tree_contains_prefix_after_delete() -> lsm_tree::Result<()> { + let folder = get_tmp_folder(); + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .open()?; + + tree.insert("abc:1", "value", 0); + tree.remove("abc:1", 1); + + // After deletion, prefix should not match + assert!(!tree.contains_prefix("abc", 2, None)?); + + // But at seqno 1 (before delete), it should still be visible + assert!(tree.contains_prefix("abc", 1, None)?); + + Ok(()) +} + +#[test] +fn tree_contains_prefix_after_flush() -> lsm_tree::Result<()> { + let folder = get_tmp_folder(); + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .open()?; + + tree.insert("abc:1", "value1", 0); + tree.insert("abc:2", "value2", 1); + tree.flush_active_memtable(0)?; + + assert!(tree.contains_prefix("abc", 2, None)?); + assert!(!tree.contains_prefix("xyz", 2, None)?); + + Ok(()) +} From 453e729991af96f7bf32da280518aa7d2593cb82 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sat, 14 Mar 2026 14:03:12 +0200 Subject: [PATCH 02/30] refactor(contains_prefix): accurate doc wording and test corrections - Doc describes convenience and error propagation, not false optimization claim - Test asserts "ab" matches "abc:*" keys at visible seqno - Add BlobTree test covering delegated index-only path --- src/abstract_tree.rs | 6 +++-- tests/tree_contains_prefix.rs | 46 +++++++++++++++++++++++++++++++++-- 2 files changed, 48 insertions(+), 4 deletions(-) diff --git a/src/abstract_tree.rs b/src/abstract_tree.rs index 917e8ec80..bddc8f453 100644 --- a/src/abstract_tree.rs +++ b/src/abstract_tree.rs @@ -513,8 +513,10 @@ pub trait AbstractTree { /// Returns `true` if the tree contains any key with the given prefix. /// - /// This is more efficient than `prefix().next().is_some()` as it avoids - /// materializing the full iterator guard and can stop at the first match. + /// This is a convenience method that checks whether the corresponding + /// prefix iterator yields at least one item, while surfacing any IO + /// errors via the `Result` return type. Implementations may override + /// this method to provide a more efficient prefix-existence check. /// /// # Examples /// diff --git a/tests/tree_contains_prefix.rs b/tests/tree_contains_prefix.rs index b4bc72eab..9edc6ce02 100644 --- a/tests/tree_contains_prefix.rs +++ b/tests/tree_contains_prefix.rs @@ -1,4 +1,6 @@ -use lsm_tree::{get_tmp_folder, AbstractTree, Config, SeqNo, SequenceNumberCounter}; +use lsm_tree::{ + get_tmp_folder, AbstractTree, Config, KvSeparationOptions, SeqNo, SequenceNumberCounter, +}; use test_log::test; #[test] @@ -36,7 +38,8 @@ fn tree_contains_prefix_basic() -> lsm_tree::Result<()> { assert!(tree.contains_prefix("abc", 3, None)?); assert!(tree.contains_prefix("def", 3, None)?); assert!(!tree.contains_prefix("xyz", 3, None)?); - assert!(!tree.contains_prefix("ab", 0, None)?); + // "ab" is a valid prefix for "abc:*" keys + assert!(tree.contains_prefix("ab", 3, None)?); Ok(()) } @@ -134,3 +137,42 @@ fn tree_contains_prefix_after_flush() -> lsm_tree::Result<()> { Ok(()) } + +#[test] +fn tree_contains_prefix_blobtree() -> lsm_tree::Result<()> { + let folder = get_tmp_folder(); + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .with_kv_separation(Some(KvSeparationOptions::default())) + .open()?; + + assert!(!tree.contains_prefix("abc", SeqNo::MAX, None)?); + + tree.insert("abc:1", "value1", 0); + tree.insert("abc:2", "value2", 1); + tree.insert("def:1", "value3", 2); + + assert!(tree.contains_prefix("abc", 3, None)?); + assert!(tree.contains_prefix("def", 3, None)?); + assert!(!tree.contains_prefix("xyz", 3, None)?); + + // MVCC visibility + assert!(!tree.contains_prefix("abc", 0, None)?); + assert!(tree.contains_prefix("abc", 1, None)?); + + // After delete + tree.remove("abc:1", 3); + tree.remove("abc:2", 4); + assert!(!tree.contains_prefix("abc", 5, None)?); + + // After flush + tree.insert("ghi:1", "value", 5); + tree.flush_active_memtable(0)?; + assert!(tree.contains_prefix("ghi", 6, None)?); + + Ok(()) +} From c25e6931f6023ae1f61a50b536b7699d706a07a5 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sat, 14 Mar 2026 14:13:23 +0200 Subject: [PATCH 03/30] refactor(blob_tree): accurate contains_prefix override note Delegate to index tree avoids BlobGuard construction overhead, not value-log reads (key() never resolves blob indirections). --- src/blob_tree/mod.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index b144ed915..79ea14f50 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -555,8 +555,9 @@ impl AbstractTree for BlobTree { self.index.contains_key(key, seqno) } - // NOTE: Override the default implementation to not fetch - // data from the value log, so we get much faster prefix checks + // NOTE: Override the default implementation to delegate directly + // to the index tree, avoiding extra iterator/guard overhead for + // prefix checks fn contains_prefix>( &self, prefix: K, From 1962eb52f3d25990f2f3d5cb8ff32a45c2128b0a Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sat, 14 Mar 2026 14:42:02 +0200 Subject: [PATCH 04/30] perf: seqno-aware seek in data block point reads Exploit internal key ordering (user_key ASC, seqno DESC) to include seqno in the binary search predicate. This skips entire restart intervals containing only versions newer than the target snapshot, reducing O(versions) linear scan to O(restart_interval) for keys with many MVCC versions. Closes fjall-rs/lsm-tree#237 --- src/table/data_block/iter.rs | 13 +++- src/table/data_block/mod.rs | 130 +++++++++++++++++++++++++++++++++-- 2 files changed, 135 insertions(+), 8 deletions(-) diff --git a/src/table/data_block/iter.rs b/src/table/data_block/iter.rs index e8c605ba4..977844f7b 100644 --- a/src/table/data_block/iter.rs +++ b/src/table/data_block/iter.rs @@ -8,7 +8,7 @@ use crate::{ block::{Decoder, ParsedItem}, data_block::DataBlockParsedItem, }, - InternalValue, + InternalValue, SeqNo, }; /// The data block iterator handles double-ended scans over a data block @@ -34,6 +34,17 @@ impl<'a> Iter<'a> { true } + /// Seeks to the restart interval containing the target (needle, seqno) pair. + /// + /// Exploits internal key ordering (user_key ASC, seqno DESC) to skip + /// restart intervals containing only versions newer than the target seqno. + pub fn seek_to_key_seqno(&mut self, needle: &[u8], seqno: SeqNo) -> bool { + self.decoder.inner_mut().seek( + |head_key, head_seqno| head_key < needle || (head_key == needle && head_seqno >= seqno), + false, + ) + } + pub fn seek(&mut self, needle: &[u8]) -> bool { // Find the restart interval whose head key is the last one strictly below `needle`. // The decoder then performs a linear scan within that interval; we stop as soon as we diff --git a/src/table/data_block/mod.rs b/src/table/data_block/mod.rs index 7104ccfaa..00a1121f1 100644 --- a/src/table/data_block/mod.rs +++ b/src/table/data_block/mod.rs @@ -407,7 +407,6 @@ impl DataBlock { .map(|reader| reader.bucket_count()) } - // TODO: handle seqno more nicely (make Key generic, so we can do binary search over (key, seqno)) #[must_use] pub fn point_read(&self, needle: &[u8], seqno: SeqNo) -> Option { let iter = if let Some(hash_index_reader) = self.get_hash_index_reader() { @@ -416,10 +415,10 @@ impl DataBlock { return None; } MARKER_CONFLICT => { - // NOTE: Fallback to binary search + // NOTE: Fallback to seqno-aware binary search let mut iter = self.iter(); - if !iter.seek(needle) { + if !iter.seek_to_key_seqno(needle, seqno) { return None; } @@ -437,8 +436,9 @@ impl DataBlock { } else { let mut iter = self.iter(); - // NOTE: Fallback to binary search - if !iter.seek(needle) { + // NOTE: Seqno-aware binary search skips restart intervals + // containing only versions newer than the target seqno + if !iter.seek_to_key_seqno(needle, seqno) { return None; } @@ -449,14 +449,14 @@ impl DataBlock { for item in iter { match item.compare_key(needle, &self.inner.data) { std::cmp::Ordering::Greater => { - // We are before our searched key/seqno + // We are past our searched key return None; } std::cmp::Ordering::Equal => { // If key is same as needle, check sequence number } std::cmp::Ordering::Less => { - // We are past our searched key + // We are before our searched key continue; } } @@ -1233,4 +1233,120 @@ mod tests { Ok(()) } + + #[test] + fn data_block_point_read_seqno_aware_seek() -> crate::Result<()> { + // Key "a" with seqno 5,4,3,2,1 — point_read("a", seqno=3) should return v3 + let items = [ + InternalValue::from_components(b"a", b"a5", 5, Value), + InternalValue::from_components(b"a", b"a4", 4, Value), + InternalValue::from_components(b"a", b"a3", 3, Value), + InternalValue::from_components(b"a", b"a2", 2, Value), + InternalValue::from_components(b"a", b"a1", 1, Value), + ]; + + // With restart_interval=1, every item is a restart head, + // so seqno-aware binary search can skip directly to the target version + for restart_interval in 1..=4 { + let bytes = DataBlock::encode_into_vec(&items, restart_interval, 0.0)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + block_type: BlockType::Data, + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + }, + }); + + // seqno=4 → should see version with seqno=3 (first with seqno < 4) + assert_eq!( + Some(items[2].clone()), + data_block.point_read(b"a", 4), + "restart_interval={restart_interval}: seqno=4 should return v3", + ); + + // seqno=3 → should see version with seqno=2 + assert_eq!( + Some(items[3].clone()), + data_block.point_read(b"a", 3), + "restart_interval={restart_interval}: seqno=3 should return v2", + ); + + // seqno=6 → should see latest version (seqno=5) + assert_eq!( + Some(items[0].clone()), + data_block.point_read(b"a", 6), + "restart_interval={restart_interval}: seqno=6 should return v5", + ); + + // seqno=1 → no visible version (all seqno >= 1) + assert!( + data_block.point_read(b"a", 1).is_none(), + "restart_interval={restart_interval}: seqno=1 should return None", + ); + + // Non-existent key + assert!( + data_block.point_read(b"b", SeqNo::MAX).is_none(), + "restart_interval={restart_interval}: key 'b' should not exist", + ); + } + + Ok(()) + } + + #[test] + fn data_block_point_read_seqno_aware_seek_mixed_keys() -> crate::Result<()> { + // Multiple keys with multiple versions + let items = [ + InternalValue::from_components(b"a", b"a3", 3, Value), + InternalValue::from_components(b"a", b"a2", 2, Value), + InternalValue::from_components(b"a", b"a1", 1, Value), + InternalValue::from_components(b"b", b"b5", 5, Value), + InternalValue::from_components(b"b", b"b4", 4, Value), + InternalValue::from_components(b"b", b"b3", 3, Value), + InternalValue::from_components(b"b", b"b2", 2, Value), + InternalValue::from_components(b"b", b"b1", 1, Value), + InternalValue::from_components(b"c", b"c1", 1, Value), + ]; + + for restart_interval in 1..=4 { + let bytes = DataBlock::encode_into_vec(&items, restart_interval, 0.0)?; + + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + block_type: BlockType::Data, + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + }, + }); + + // Read "b" at seqno=4 → should return version with seqno=3 + assert_eq!( + Some(items[5].clone()), + data_block.point_read(b"b", 4), + "restart_interval={restart_interval}: b@4 should return b3", + ); + + // Read "a" at seqno=2 → should return version with seqno=1 + assert_eq!( + Some(items[2].clone()), + data_block.point_read(b"a", 2), + "restart_interval={restart_interval}: a@2 should return a1", + ); + + // Read "c" at seqno=2 → should return version with seqno=1 + assert_eq!( + Some(items[8].clone()), + data_block.point_read(b"c", 2), + "restart_interval={restart_interval}: c@2 should return c1", + ); + } + + Ok(()) + } } From c52ec805d985727e95a4c95ce24d8f806ea95c56 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sat, 14 Mar 2026 14:50:43 +0200 Subject: [PATCH 05/30] docs(test): clarify seqno snapshot visibility in test comment --- src/table/data_block/mod.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/table/data_block/mod.rs b/src/table/data_block/mod.rs index 00a1121f1..33b9fb8c9 100644 --- a/src/table/data_block/mod.rs +++ b/src/table/data_block/mod.rs @@ -1236,7 +1236,8 @@ mod tests { #[test] fn data_block_point_read_seqno_aware_seek() -> crate::Result<()> { - // Key "a" with seqno 5,4,3,2,1 — point_read("a", seqno=3) should return v3 + // Key "a" with seqno 5,4,3,2,1 — point_read("a", seqno=3) + // returns the first version with seqno < 3, i.e., v2 ("a2") let items = [ InternalValue::from_components(b"a", b"a5", 5, Value), InternalValue::from_components(b"a", b"a4", 4, Value), From 0513f336e08e8aa6573374a5592242c5a7486a6c Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sat, 14 Mar 2026 15:11:15 +0200 Subject: [PATCH 06/30] docs(data_block): precise seek_to_key_seqno guarantees --- src/table/data_block/iter.rs | 12 +++++++++--- src/table/data_block/mod.rs | 4 ++-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/table/data_block/iter.rs b/src/table/data_block/iter.rs index 977844f7b..429499327 100644 --- a/src/table/data_block/iter.rs +++ b/src/table/data_block/iter.rs @@ -34,10 +34,16 @@ impl<'a> Iter<'a> { true } - /// Seeks to the restart interval containing the target (needle, seqno) pair. + /// Seeks to the last restart interval whose head key is strictly below the + /// target `needle`, or equal to it with a seqno that is at least the given + /// snapshot boundary. /// - /// Exploits internal key ordering (user_key ASC, seqno DESC) to skip - /// restart intervals containing only versions newer than the target seqno. + /// Here `seqno` is a snapshot boundary: point reads return the first item + /// with `item.seqno < seqno`. Using the internal key ordering + /// (user_key ASC, seqno DESC), this skips restart intervals that can only + /// contain versions newer than the snapshot, so any visible version for + /// `needle` will be found within roughly one restart interval of the + /// resulting position. pub fn seek_to_key_seqno(&mut self, needle: &[u8], seqno: SeqNo) -> bool { self.decoder.inner_mut().seek( |head_key, head_seqno| head_key < needle || (head_key == needle && head_seqno >= seqno), diff --git a/src/table/data_block/mod.rs b/src/table/data_block/mod.rs index 33b9fb8c9..c9e6bff1c 100644 --- a/src/table/data_block/mod.rs +++ b/src/table/data_block/mod.rs @@ -436,8 +436,8 @@ impl DataBlock { } else { let mut iter = self.iter(); - // NOTE: Seqno-aware binary search skips restart intervals - // containing only versions newer than the target seqno + // NOTE: Seqno-aware binary search reduces linear scanning by skipping most + // restart intervals that contain only versions newer than the target seqno if !iter.seek_to_key_seqno(needle, seqno) { return None; } From 42d2c642be43b7506692f64f2d000ed269bb8876 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sat, 14 Mar 2026 15:20:58 +0200 Subject: [PATCH 07/30] perf(data_block): single cmp in seek_to_key_seqno predicate --- src/table/data_block/iter.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/table/data_block/iter.rs b/src/table/data_block/iter.rs index 429499327..16bfd92a4 100644 --- a/src/table/data_block/iter.rs +++ b/src/table/data_block/iter.rs @@ -46,7 +46,11 @@ impl<'a> Iter<'a> { /// resulting position. pub fn seek_to_key_seqno(&mut self, needle: &[u8], seqno: SeqNo) -> bool { self.decoder.inner_mut().seek( - |head_key, head_seqno| head_key < needle || (head_key == needle && head_seqno >= seqno), + |head_key, head_seqno| match head_key.cmp(needle) { + std::cmp::Ordering::Less => true, + std::cmp::Ordering::Equal => head_seqno >= seqno, + std::cmp::Ordering::Greater => false, + }, false, ) } From cbf88d396ac09a625bc7b7c65be3d88e8a6a955a Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sat, 14 Mar 2026 15:41:30 +0200 Subject: [PATCH 08/30] docs(test): describe restart_interval loop coverage --- src/table/data_block/mod.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/table/data_block/mod.rs b/src/table/data_block/mod.rs index c9e6bff1c..18da4451c 100644 --- a/src/table/data_block/mod.rs +++ b/src/table/data_block/mod.rs @@ -1246,8 +1246,9 @@ mod tests { InternalValue::from_components(b"a", b"a1", 1, Value), ]; - // With restart_interval=1, every item is a restart head, - // so seqno-aware binary search can skip directly to the target version + // Test across various restart intervals: at restart_interval=1 every item + // is a restart head so binary search lands exactly; at larger intervals it + // may scan within the restart range but must still return the correct version. for restart_interval in 1..=4 { let bytes = DataBlock::encode_into_vec(&items, restart_interval, 0.0)?; From 1fddda00d80f98065f899400b3acf7ebe113b108 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 15 Mar 2026 11:09:39 +0200 Subject: [PATCH 09/30] perf(data_block): seqno-aware seek for iterator bounds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Forward seeks (seek, seek_exclusive) use seqno in restart-interval binary search predicate, matching index_block pattern - Backward seeks (seek_upper, seek_upper_exclusive) accept seqno for API uniformity but cannot narrow the binary search — backward iteration visits lower indices only, so a tighter predicate would miss intervals containing the visible version - Wire seqno through OwnedDataBlockIter wrappers, removing all TODOs - Add tests for seqno-aware forward/backward seeks with mixed keys Ref #237 --- src/table/data_block/iter.rs | 62 +++++---- src/table/data_block/iter_test.rs | 207 +++++++++++++++++++++++------- src/table/iter.rs | 16 +-- 3 files changed, 207 insertions(+), 78 deletions(-) diff --git a/src/table/data_block/iter.rs b/src/table/data_block/iter.rs index 16bfd92a4..a2345c396 100644 --- a/src/table/data_block/iter.rs +++ b/src/table/data_block/iter.rs @@ -55,15 +55,19 @@ impl<'a> Iter<'a> { ) } - pub fn seek(&mut self, needle: &[u8]) -> bool { - // Find the restart interval whose head key is the last one strictly below `needle`. - // The decoder then performs a linear scan within that interval; we stop as soon as we - // reach a key ≥ needle. This minimizes parsing work while preserving correctness. - if !self - .decoder - .inner_mut() - .seek(|head_key, _| head_key < needle, false) - { + pub fn seek(&mut self, needle: &[u8], seqno: SeqNo) -> bool { + // Find the last restart interval whose head precedes (needle, seqno) in + // internal key order (user_key ASC, seqno DESC). This lets us skip + // restart intervals that contain only versions newer than the snapshot, + // reducing the subsequent linear scan. + if !self.decoder.inner_mut().seek( + |head_key, head_seqno| match head_key.cmp(needle) { + std::cmp::Ordering::Less => true, + std::cmp::Ordering::Equal => head_seqno >= seqno, + std::cmp::Ordering::Greater => false, + }, + false, + ) { return false; } @@ -96,9 +100,16 @@ impl<'a> Iter<'a> { } } - pub fn seek_upper(&mut self, needle: &[u8]) -> bool { - // Reverse-bound seek: position the high scanner at the first restart whose head key is - // ≤ needle, then walk backwards inside the interval until we find a key ≤ needle. + pub fn seek_upper(&mut self, needle: &[u8], _seqno: SeqNo) -> bool { + // Reverse-bound seek: position the high scanner at the last restart whose + // head key is ≤ needle, then walk backwards inside the interval until we + // find a key ≤ needle. + // + // Note: seqno cannot narrow the backward binary search. Backward + // iteration visits intervals from the selected one toward index 0, so a + // tighter predicate would cause later intervals (higher index, older + // versions of the same key) to be skipped entirely — potentially missing + // the visible version. if !self .decoder .inner_mut() @@ -133,15 +144,18 @@ impl<'a> Iter<'a> { } } - pub fn seek_exclusive(&mut self, needle: &[u8]) -> bool { - // Exclusive lower bound: identical to `seek`, except we must not yield entries equal to - // `needle`. We therefore keep consuming while keys compare equal and only stop once we - // observe a strictly greater key. - if !self - .decoder - .inner_mut() - .seek(|head_key, _| head_key < needle, false) - { + pub fn seek_exclusive(&mut self, needle: &[u8], seqno: SeqNo) -> bool { + // Exclusive lower bound: identical to `seek`, except we must not yield + // entries equal to `needle`. The seqno-aware binary search still helps + // by landing closer to the target position in the restart index. + if !self.decoder.inner_mut().seek( + |head_key, head_seqno| match head_key.cmp(needle) { + std::cmp::Ordering::Less => true, + std::cmp::Ordering::Equal => head_seqno >= seqno, + std::cmp::Ordering::Greater => false, + }, + false, + ) { return false; } @@ -165,9 +179,9 @@ impl<'a> Iter<'a> { } } - pub fn seek_upper_exclusive(&mut self, needle: &[u8]) -> bool { - // Exclusive upper bound: mirror of `seek_upper`. We must not include entries equal to - // `needle`, so we consume equals from the high end until we see a strictly smaller key. + pub fn seek_upper_exclusive(&mut self, needle: &[u8], _seqno: SeqNo) -> bool { + // Exclusive upper bound: mirror of `seek_upper`. Same backward-search + // limitation applies — seqno cannot narrow the binary search here. if !self .decoder .inner_mut() diff --git a/src/table/data_block/iter_test.rs b/src/table/data_block/iter_test.rs index 8ff8fcdc0..f5cc81b00 100644 --- a/src/table/data_block/iter_test.rs +++ b/src/table/data_block/iter_test.rs @@ -5,7 +5,7 @@ mod tests { block::{BlockType, Header, ParsedItem}, Block, DataBlock, }, - Checksum, InternalValue, Slice, + Checksum, InternalValue, SeqNo, Slice, ValueType::{Tombstone, Value}, }; use test_log::test; @@ -71,8 +71,8 @@ mod tests { { let mut iter = data_block.iter(); - iter.seek(&10u64.to_be_bytes()); - iter.seek_upper(&110u64.to_be_bytes()); + iter.seek(&10u64.to_be_bytes(), SeqNo::MAX); + iter.seek_upper(&110u64.to_be_bytes(), SeqNo::MAX); let iter = iter.map(|x| x.materialize(data_block.as_slice())); assert_eq!( @@ -83,8 +83,8 @@ mod tests { { let mut iter: crate::table::data_block::Iter<'_> = data_block.iter(); - iter.seek(&10u64.to_be_bytes()); - iter.seek_upper(&110u64.to_be_bytes()); + iter.seek(&10u64.to_be_bytes(), SeqNo::MAX); + iter.seek_upper(&110u64.to_be_bytes(), SeqNo::MAX); let iter = iter.map(|x| x.materialize(data_block.as_slice())); assert_eq!( @@ -95,8 +95,8 @@ mod tests { { let mut iter = data_block.iter(); - iter.seek(&10u64.to_be_bytes()); - iter.seek_upper(&110u64.to_be_bytes()); + iter.seek(&10u64.to_be_bytes(), SeqNo::MAX); + iter.seek_upper(&110u64.to_be_bytes(), SeqNo::MAX); let mut iter = iter.map(|item| item.materialize(&data_block.inner.data)); let mut count = 0; @@ -145,8 +145,8 @@ mod tests { { let mut iter = data_block.iter(); - iter.seek(&10u64.to_be_bytes()); - iter.seek_upper(&109u64.to_be_bytes()); + iter.seek(&10u64.to_be_bytes(), SeqNo::MAX); + iter.seek_upper(&109u64.to_be_bytes(), SeqNo::MAX); let iter = iter.map(|x| x.materialize(data_block.as_slice())); assert_eq!( @@ -157,8 +157,8 @@ mod tests { { let mut iter: crate::table::data_block::Iter<'_> = data_block.iter(); - iter.seek(&10u64.to_be_bytes()); - iter.seek_upper(&109u64.to_be_bytes()); + iter.seek(&10u64.to_be_bytes(), SeqNo::MAX); + iter.seek_upper(&109u64.to_be_bytes(), SeqNo::MAX); let iter = iter.map(|x| x.materialize(data_block.as_slice())); assert_eq!( @@ -169,8 +169,8 @@ mod tests { { let mut iter = data_block.iter(); - iter.seek(&10u64.to_be_bytes()); - iter.seek_upper(&109u64.to_be_bytes()); + iter.seek(&10u64.to_be_bytes(), SeqNo::MAX); + iter.seek_upper(&109u64.to_be_bytes(), SeqNo::MAX); let mut iter = iter.map(|item| item.materialize(&data_block.inner.data)); let mut count = 0; @@ -218,8 +218,8 @@ mod tests { }); let mut iter = data_block.iter(); - iter.seek(&5u64.to_be_bytes()); - iter.seek_upper(&9u64.to_be_bytes()); + iter.seek(&5u64.to_be_bytes(), SeqNo::MAX); + iter.seek_upper(&9u64.to_be_bytes(), SeqNo::MAX); let mut iter = iter.map(|item| item.materialize(&data_block.inner.data)); let mut count = 0; @@ -345,7 +345,7 @@ mod tests { let mut iter = data_block.iter(); - assert!(iter.seek_upper(b"d"), "should seek"); + assert!(iter.seek_upper(b"d", SeqNo::MAX), "should seek"); let iter = iter.map(|item| item.materialize(&data_block.inner.data)); @@ -386,7 +386,7 @@ mod tests { { let mut iter = data_block.iter(); - assert!(!iter.seek(b"a"), "should not seek"); + assert!(!iter.seek(b"a", SeqNo::MAX), "should not seek"); let iter = iter.map(|item| item.materialize(&data_block.inner.data)); @@ -398,7 +398,7 @@ mod tests { { let mut iter = data_block.iter(); - assert!(!iter.seek_upper(b"g"), "should not seek"); + assert!(!iter.seek_upper(b"g", SeqNo::MAX), "should not seek"); let iter = iter.map(|item| item.materialize(&data_block.inner.data)); @@ -410,7 +410,7 @@ mod tests { { let mut iter = data_block.iter(); - assert!(iter.seek_upper(b"b"), "should seek"); + assert!(iter.seek_upper(b"b", SeqNo::MAX), "should seek"); let iter = iter.map(|item| item.materialize(&data_block.inner.data)); @@ -425,7 +425,7 @@ mod tests { { let mut iter = data_block.iter(); - assert!(iter.seek(b"f"), "should seek"); + assert!(iter.seek(b"f", SeqNo::MAX), "should seek"); let iter = iter.map(|item| item.materialize(&data_block.inner.data)); @@ -466,8 +466,8 @@ mod tests { let mut iter = data_block.iter(); - assert!(iter.seek(b"c"), "should seek"); - assert!(iter.seek_upper(b"d"), "should seek"); + assert!(iter.seek(b"c", SeqNo::MAX), "should seek"); + assert!(iter.seek_upper(b"d", SeqNo::MAX), "should seek"); let iter = iter.map(|item| item.materialize(&data_block.inner.data)); @@ -507,7 +507,7 @@ mod tests { let mut iter = data_block.iter(); - assert!(iter.seek_upper(b"b"), "should seek"); + assert!(iter.seek_upper(b"b", SeqNo::MAX), "should seek"); let iter = iter.map(|item| item.materialize(&data_block.inner.data)); @@ -548,8 +548,8 @@ mod tests { { let mut iter = data_block.iter(); - assert!(iter.seek(b"d"), "should seek"); - assert!(iter.seek_upper(b"d"), "should seek"); + assert!(iter.seek(b"d", SeqNo::MAX), "should seek"); + assert!(iter.seek_upper(b"d", SeqNo::MAX), "should seek"); let iter = iter.map(|item| item.materialize(&data_block.inner.data)); @@ -564,8 +564,8 @@ mod tests { { let mut iter = data_block.iter(); - assert!(iter.seek_upper(b"d"), "should seek"); - assert!(iter.seek(b"d"), "should seek"); + assert!(iter.seek_upper(b"d", SeqNo::MAX), "should seek"); + assert!(iter.seek(b"d", SeqNo::MAX), "should seek"); let iter = iter.map(|item| item.materialize(&data_block.inner.data)); @@ -580,8 +580,8 @@ mod tests { { let mut iter = data_block.iter(); - assert!(iter.seek(b"d"), "should seek"); - assert!(iter.seek_upper(b"d"), "should seek"); + assert!(iter.seek(b"d", SeqNo::MAX), "should seek"); + assert!(iter.seek_upper(b"d", SeqNo::MAX), "should seek"); let iter = iter.map(|item| item.materialize(&data_block.inner.data)); @@ -602,8 +602,8 @@ mod tests { { let mut iter = data_block.iter(); - assert!(iter.seek_upper(b"d"), "should seek"); - assert!(iter.seek(b"d"), "should seek"); + assert!(iter.seek_upper(b"d", SeqNo::MAX), "should seek"); + assert!(iter.seek(b"d", SeqNo::MAX), "should seek"); let iter = iter.map(|item| item.materialize(&data_block.inner.data)); @@ -651,8 +651,8 @@ mod tests { { let mut iter = data_block.iter(); - assert!(iter.seek(b"f"), "should seek"); - iter.seek_upper(b"e"); + assert!(iter.seek(b"f", SeqNo::MAX), "should seek"); + iter.seek_upper(b"e", SeqNo::MAX); let mut iter = iter.map(|item| item.materialize(&data_block.inner.data)); @@ -662,8 +662,8 @@ mod tests { { let mut iter = data_block.iter(); - assert!(iter.seek(b"f"), "should seek"); - iter.seek_upper(b"e"); + assert!(iter.seek(b"f", SeqNo::MAX), "should seek"); + iter.seek_upper(b"e", SeqNo::MAX); let mut iter = iter.map(|item| item.materialize(&data_block.inner.data)); @@ -673,8 +673,8 @@ mod tests { { let mut iter = data_block.iter(); - assert!(iter.seek_upper(b"e"), "should seek"); - iter.seek(b"f"); + assert!(iter.seek_upper(b"e", SeqNo::MAX), "should seek"); + iter.seek(b"f", SeqNo::MAX); let mut iter = iter.map(|item| item.materialize(&data_block.inner.data)); @@ -684,8 +684,8 @@ mod tests { { let mut iter = data_block.iter(); - assert!(iter.seek_upper(b"e"), "should seek"); - iter.seek(b"f"); + assert!(iter.seek_upper(b"e", SeqNo::MAX), "should seek"); + iter.seek(b"f", SeqNo::MAX); let mut iter = iter.map(|item| item.materialize(&data_block.inner.data)); @@ -721,7 +721,7 @@ mod tests { let mut iter = data_block.iter(); - assert!(iter.seek(b"b"), "should seek correctly"); + assert!(iter.seek(b"b", SeqNo::MAX), "should seek correctly"); let iter = iter.map(|item| item.materialize(&data_block.inner.data)); @@ -758,7 +758,7 @@ mod tests { let mut iter = data_block.iter(); - assert!(iter.seek(b"d"), "should seek correctly"); + assert!(iter.seek(b"d", SeqNo::MAX), "should seek correctly"); let iter = iter.map(|item| item.materialize(&data_block.inner.data)); @@ -798,7 +798,7 @@ mod tests { let mut iter = data_block.iter(); - assert!(iter.seek(b"f"), "should seek correctly"); + assert!(iter.seek(b"f", SeqNo::MAX), "should seek correctly"); let iter = iter.map(|item| item.materialize(&data_block.inner.data)); @@ -838,7 +838,7 @@ mod tests { let mut iter = data_block.iter(); - assert!(!iter.seek(b"a"), "should not find exact match"); + assert!(!iter.seek(b"a", SeqNo::MAX), "should not find exact match"); let iter = iter.map(|item| item.materialize(&data_block.inner.data)); @@ -875,7 +875,7 @@ mod tests { let mut iter = data_block.iter(); - assert!(!iter.seek(b"g"), "should not find exact match"); + assert!(!iter.seek(b"g", SeqNo::MAX), "should not find exact match"); assert!(iter.next().is_none(), "should not collect any items"); } @@ -1270,11 +1270,126 @@ mod tests { assert_eq!(data_block.iter().count(), items.len()); let mut iter = data_block.iter(); - iter.seek(&[0]); - iter.seek_upper(&[0]); + iter.seek(&[0], SeqNo::MAX); + iter.seek_upper(&[0], SeqNo::MAX); assert_eq!(0, iter.count()); Ok(()) } + + /// Verifies that `seek(needle, seqno)` with a seqno-aware predicate still + /// positions the iterator correctly when a key has many versions spanning + /// multiple restart intervals. + #[test] + fn data_block_seek_seqno_aware() -> crate::Result<()> { + // Build a block where key "b" has 10 versions (seqno 10..1) with + // restart_interval=2, so versions span 5 restart intervals. + let mut items = Vec::new(); + for seqno in (1..=10).rev() { + items.push(InternalValue::from_components(b"b", b"", seqno, Value)); + } + + for restart_interval in [1, 2, 3, 5] { + let bytes = DataBlock::encode_into_vec(&items, restart_interval, 0.0)?; + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + block_type: BlockType::Data, + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + }, + }); + + // With SeqNo::MAX, seek behaves like key-only (no seqno filtering). + { + let mut iter = data_block.iter(); + assert!( + iter.seek(b"b", SeqNo::MAX), + "should find key with MAX seqno" + ); + let entry = iter.next().expect("should have entry"); + let materialized = entry.materialize(&data_block.inner.data); + assert_eq!(materialized.key.user_key.as_ref(), b"b"); + // First version returned is the newest (seqno 10). + assert_eq!(materialized.key.seqno, 10); + } + + // With a specific snapshot seqno, the binary search skips restart + // intervals that only contain newer versions, but the linear scan + // still finds the first entry with key == needle. + { + let mut iter = data_block.iter(); + assert!(iter.seek(b"b", 5), "should find key with snapshot seqno 5"); + let entry = iter.next().expect("should have entry"); + let materialized = entry.materialize(&data_block.inner.data); + assert_eq!(materialized.key.user_key.as_ref(), b"b"); + // seek returns the first entry with key >= needle; that's still + // the newest version in the landing interval. The seqno-aware + // predicate only narrows which restart interval we land on. + } + } + + Ok(()) + } + + /// Verifies that `seek` with seqno still works correctly when the block + /// contains multiple distinct keys with versions. + #[test] + fn data_block_seek_seqno_aware_mixed_keys() -> crate::Result<()> { + let items = vec![ + InternalValue::from_components(b"a", b"", 10, Value), + InternalValue::from_components(b"a", b"", 5, Value), + InternalValue::from_components(b"b", b"", 10, Value), + InternalValue::from_components(b"b", b"", 7, Value), + InternalValue::from_components(b"b", b"", 3, Value), + InternalValue::from_components(b"c", b"", 10, Value), + ]; + + for restart_interval in [1, 2, 3] { + let bytes = DataBlock::encode_into_vec(&items, restart_interval, 0.0)?; + let data_block = DataBlock::new(Block { + data: bytes.into(), + header: Header { + block_type: BlockType::Data, + checksum: Checksum::from_raw(0), + data_length: 0, + uncompressed_length: 0, + }, + }); + + // Forward seek with seqno narrows restart interval selection. + { + let mut iter = data_block.iter(); + assert!(iter.seek(b"b", 5), "should find b at snapshot 5"); + let entry = iter.next().expect("should have entry"); + let mat = entry.materialize(&data_block.inner.data); + assert_eq!(mat.key.user_key.as_ref(), b"b"); + } + + // Exclusive forward seek with seqno. + { + let mut iter = data_block.iter(); + assert!( + iter.seek_exclusive(b"b", 5), + "should find entry > b at snapshot 5" + ); + let entry = iter.next().expect("should have entry"); + let mat = entry.materialize(&data_block.inner.data); + assert_eq!(mat.key.user_key.as_ref(), b"c"); + } + + // Upper seek still works with seqno (predicate unchanged for backward). + { + let mut iter = data_block.iter(); + assert!(iter.seek_upper(b"b", 5), "should find upper bound b"); + let entry = iter.next_back().expect("should have entry"); + let mat = entry.materialize(&data_block.inner.data); + assert_eq!(mat.key.user_key.as_ref(), b"b"); + } + } + + Ok(()) + } } diff --git a/src/table/iter.rs b/src/table/iter.rs index b03b69da1..3809caa9b 100644 --- a/src/table/iter.rs +++ b/src/table/iter.rs @@ -37,20 +37,20 @@ self_cell!( ); impl OwnedDataBlockIter { - fn seek_lower_inclusive(&mut self, needle: &[u8], _seqno: SeqNo) -> bool { - self.with_dependent_mut(|_, m| m.seek(needle /* TODO: , seqno */)) + fn seek_lower_inclusive(&mut self, needle: &[u8], seqno: SeqNo) -> bool { + self.with_dependent_mut(|_, m| m.seek(needle, seqno)) } - fn seek_upper_inclusive(&mut self, needle: &[u8], _seqno: SeqNo) -> bool { - self.with_dependent_mut(|_, m| m.seek_upper(needle /* TODO: , seqno */)) + fn seek_upper_inclusive(&mut self, needle: &[u8], seqno: SeqNo) -> bool { + self.with_dependent_mut(|_, m| m.seek_upper(needle, seqno)) } - fn seek_lower_exclusive(&mut self, needle: &[u8], _seqno: SeqNo) -> bool { - self.with_dependent_mut(|_, m| m.seek_exclusive(needle /* TODO: , seqno */)) + fn seek_lower_exclusive(&mut self, needle: &[u8], seqno: SeqNo) -> bool { + self.with_dependent_mut(|_, m| m.seek_exclusive(needle, seqno)) } - fn seek_upper_exclusive(&mut self, needle: &[u8], _seqno: SeqNo) -> bool { - self.with_dependent_mut(|_, m| m.seek_upper_exclusive(needle /* TODO: , seqno */)) + fn seek_upper_exclusive(&mut self, needle: &[u8], seqno: SeqNo) -> bool { + self.with_dependent_mut(|_, m| m.seek_upper_exclusive(needle, seqno)) } pub fn seek_lower_bound(&mut self, bound: &Bound, seqno: SeqNo) -> bool { From 2b0b26576af908563379445e28161abd78149dd3 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 15 Mar 2026 11:24:03 +0200 Subject: [PATCH 10/30] refactor(data_block): dedup seek predicate, harden seqno tests - seek and seek_exclusive now delegate binary search to seek_to_key_seqno, eliminating predicate duplication - Tests assert landing seqno >= snapshot boundary; with restart_interval=1, exact seqno match proves the optimization distinguishes from key-only seek --- src/table/data_block/iter.rs | 29 +++++------------------- src/table/data_block/iter_test.rs | 37 ++++++++++++++++++++++++++----- 2 files changed, 37 insertions(+), 29 deletions(-) diff --git a/src/table/data_block/iter.rs b/src/table/data_block/iter.rs index a2345c396..f5e372123 100644 --- a/src/table/data_block/iter.rs +++ b/src/table/data_block/iter.rs @@ -56,18 +56,9 @@ impl<'a> Iter<'a> { } pub fn seek(&mut self, needle: &[u8], seqno: SeqNo) -> bool { - // Find the last restart interval whose head precedes (needle, seqno) in - // internal key order (user_key ASC, seqno DESC). This lets us skip - // restart intervals that contain only versions newer than the snapshot, - // reducing the subsequent linear scan. - if !self.decoder.inner_mut().seek( - |head_key, head_seqno| match head_key.cmp(needle) { - std::cmp::Ordering::Less => true, - std::cmp::Ordering::Equal => head_seqno >= seqno, - std::cmp::Ordering::Greater => false, - }, - false, - ) { + // Reuse the seqno-aware binary search from `seek_to_key_seqno`, then + // follow up with a linear scan to position at the exact key. + if !self.seek_to_key_seqno(needle, seqno) { return false; } @@ -145,17 +136,9 @@ impl<'a> Iter<'a> { } pub fn seek_exclusive(&mut self, needle: &[u8], seqno: SeqNo) -> bool { - // Exclusive lower bound: identical to `seek`, except we must not yield - // entries equal to `needle`. The seqno-aware binary search still helps - // by landing closer to the target position in the restart index. - if !self.decoder.inner_mut().seek( - |head_key, head_seqno| match head_key.cmp(needle) { - std::cmp::Ordering::Less => true, - std::cmp::Ordering::Equal => head_seqno >= seqno, - std::cmp::Ordering::Greater => false, - }, - false, - ) { + // Exclusive lower bound: same seqno-aware binary search, but the linear + // scan below skips entries equal to `needle`. + if !self.seek_to_key_seqno(needle, seqno) { return false; } diff --git a/src/table/data_block/iter_test.rs b/src/table/data_block/iter_test.rs index f5cc81b00..1086d7b65 100644 --- a/src/table/data_block/iter_test.rs +++ b/src/table/data_block/iter_test.rs @@ -1316,18 +1316,31 @@ mod tests { assert_eq!(materialized.key.seqno, 10); } - // With a specific snapshot seqno, the binary search skips restart - // intervals that only contain newer versions, but the linear scan - // still finds the first entry with key == needle. + // With a specific snapshot seqno, the binary search lands on the + // restart interval containing (or nearest to) the target seqno. + // The first entry returned is the head of that interval. { let mut iter = data_block.iter(); assert!(iter.seek(b"b", 5), "should find key with snapshot seqno 5"); let entry = iter.next().expect("should have entry"); let materialized = entry.materialize(&data_block.inner.data); assert_eq!(materialized.key.user_key.as_ref(), b"b"); - // seek returns the first entry with key >= needle; that's still - // the newest version in the landing interval. The seqno-aware - // predicate only narrows which restart interval we land on. + // The landing entry's seqno must be >= the snapshot boundary, + // proving the seqno-aware predicate skipped past older intervals. + assert!( + materialized.key.seqno >= 5, + "restart_interval={restart_interval}: landing seqno {} should be >= snapshot 5", + materialized.key.seqno, + ); + // With restart_interval=1 each entry is its own interval, so + // the predicate lands exactly on the target seqno — a key-only + // seek would land on seqno 10 instead. + if restart_interval == 1 { + assert_eq!( + materialized.key.seqno, 5, + "with restart_interval=1, seqno-aware seek must land exactly on target" + ); + } } } @@ -1366,6 +1379,18 @@ mod tests { let entry = iter.next().expect("should have entry"); let mat = entry.materialize(&data_block.inner.data); assert_eq!(mat.key.user_key.as_ref(), b"b"); + // Landing seqno must be >= snapshot boundary. + assert!( + mat.key.seqno >= 5, + "restart_interval={restart_interval}: seqno {} should be >= 5", + mat.key.seqno, + ); + // With restart_interval=1, seqno-aware seek lands on (b,7) — + // the last head with seqno >= 5 — whereas key-only would land + // on (b,10). + if restart_interval == 1 { + assert_eq!(mat.key.seqno, 7); + } } // Exclusive forward seek with seqno. From 95ae8abda8a0b1b654a965001dc22fb0b05bbbb1 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 15 Mar 2026 14:08:49 +0200 Subject: [PATCH 11/30] fix(docs): add backticks around identifiers in seek_to_key_seqno doc Fixes clippy::doc_markdown warning for bare user_key and seqno identifiers in documentation comment. --- src/table/data_block/iter.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/table/data_block/iter.rs b/src/table/data_block/iter.rs index f5e372123..f41ab543d 100644 --- a/src/table/data_block/iter.rs +++ b/src/table/data_block/iter.rs @@ -40,7 +40,7 @@ impl<'a> Iter<'a> { /// /// Here `seqno` is a snapshot boundary: point reads return the first item /// with `item.seqno < seqno`. Using the internal key ordering - /// (user_key ASC, seqno DESC), this skips restart intervals that can only + /// (`user_key` ASC, `seqno` DESC), this skips restart intervals that can only /// contain versions newer than the snapshot, so any visible version for /// `needle` will be found within roughly one restart interval of the /// resulting position. From a03b0de8680945e6073cad92a15108c0a089a657 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 15 Mar 2026 14:16:08 +0200 Subject: [PATCH 12/30] ci: add CoordiNode CI and upstream monitor workflows --- .github/workflows/coordinode-ci.yml | 96 ++++++++++++++++++ .github/workflows/upstream-monitor.yml | 130 +++++++++++++++++++++++++ 2 files changed, 226 insertions(+) create mode 100644 .github/workflows/coordinode-ci.yml create mode 100644 .github/workflows/upstream-monitor.yml diff --git a/.github/workflows/coordinode-ci.yml b/.github/workflows/coordinode-ci.yml new file mode 100644 index 000000000..9d4e9c4ab --- /dev/null +++ b/.github/workflows/coordinode-ci.yml @@ -0,0 +1,96 @@ +name: CoordiNode CI + +on: + push: + branches: + - main + - "feat/#*" + - "fix/#*" + pull_request: + branches: + - main + +env: + CARGO_TERM_COLOR: always + +jobs: + test: + timeout-minutes: 20 + strategy: + matrix: + rust_version: + - stable + - "1.90.0" + os: + - ubuntu-latest + - windows-latest + - macos-latest + runs-on: ${{ matrix.os }} + steps: + - name: Checkout + uses: actions/checkout@v6 + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + with: + toolchain: ${{ matrix.rust_version }} + components: rustfmt, clippy + + - name: Set up cargo cache + uses: Swatinem/rust-cache@v2 + with: + prefix-key: ${{ runner.os }}-cargo + + - name: Install nextest + uses: taiki-e/install-action@nextest + + - name: Format check + run: cargo fmt --all -- --check + + - name: Clippy (strict) + run: cargo clippy --all-features -- -D warnings + + - name: Run tests + run: cargo nextest run --all-features + + - name: Run doc tests + run: cargo test --doc --features lz4 + + codecov: + timeout-minutes: 20 + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v6 + + - name: Install Rust nightly + uses: dtolnay/rust-toolchain@nightly + with: + components: llvm-tools-preview + + - name: Set up cargo cache + uses: Swatinem/rust-cache@v2 + with: + prefix-key: ${{ runner.os }}-cargo + + - name: Install cargo-llvm-cov + uses: taiki-e/install-action@cargo-llvm-cov + + - name: Install nextest + uses: taiki-e/install-action@nextest + + - name: Run tests with coverage + run: cargo llvm-cov --no-report nextest --all-features + + - name: Run doc tests with coverage + run: cargo llvm-cov --no-report --doc --features lz4 + + - name: Create coverage report + run: cargo llvm-cov report --doctests --lcov --output-path lcov.info + + - name: Upload to Codecov + uses: codecov/codecov-action@v5 + with: + files: lcov.info + env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} diff --git a/.github/workflows/upstream-monitor.yml b/.github/workflows/upstream-monitor.yml new file mode 100644 index 000000000..8def494b2 --- /dev/null +++ b/.github/workflows/upstream-monitor.yml @@ -0,0 +1,130 @@ +name: Upstream Monitor + +on: + schedule: + - cron: "0 8 * * 1,4" + workflow_dispatch: + +permissions: + contents: write + pull-requests: write + +jobs: + check-upstream: + timeout-minutes: 10 + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: Add upstream remote + run: git remote add upstream https://github.com/fjall-rs/lsm-tree.git + + - name: Fetch upstream and origin + run: | + git fetch upstream main + git fetch origin main + + - name: Check for new upstream commits + id: check + run: | + BEHIND=$(git rev-list origin/main..upstream/main --count) + echo "behind=$BEHIND" >> "$GITHUB_OUTPUT" + echo "Commits behind upstream: $BEHIND" + + - name: Try merge and create PR or issue + if: steps.check.outputs.behind > 0 + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + BEHIND: ${{ steps.check.outputs.behind }} + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + + SYNC_BRANCH="chore/upstream-sync-$(date +%Y%m%d)" + git checkout -b "$SYNC_BRANCH" origin/main + + if git merge --no-commit --no-ff upstream/main 2>&1; then + git commit -m "chore: sync upstream ($BEHIND new commits)" + git push origin "$SYNC_BRANCH" + + gh pr create \ + --title "chore: sync upstream ($BEHIND new commits)" \ + --body "$(cat <<'EOF' + ## Upstream Sync + + Automated sync from [fjall-rs/lsm-tree](https://github.com/fjall-rs/lsm-tree) main branch. + + **Commits behind:** ${{ steps.check.outputs.behind }} + + ### Review checklist + - [ ] Review upstream changes for breaking modifications + - [ ] Verify our patches still apply cleanly + - [ ] Run full test suite + EOF + )" \ + --base main \ + --head "$SYNC_BRANCH" + else + CONFLICTS=$(git diff --name-only --diff-filter=U 2>/dev/null || true) + git merge --abort + + gh issue create \ + --title "Upstream sync conflict ($BEHIND new commits)" \ + --body "$(cat </dev/null; then + echo "Branch '$BRANCH' is fully merged into upstream/main" + + EXISTING=$(gh issue list --search "Upstream merged: $BRANCH" --state open --json number --jq 'length') + if [ "$EXISTING" = "0" ]; then + gh issue create \ + --title "Upstream merged: $BRANCH" \ + --body "$(cat < Date: Sun, 15 Mar 2026 14:24:14 +0200 Subject: [PATCH 13/30] docs: add maintained fork notice and support section --- README.md | 19 +++++++++++++++++-- assets/usdt-qr.svg | 1 + 2 files changed, 18 insertions(+), 2 deletions(-) create mode 100644 assets/usdt-qr.svg diff --git a/README.md b/README.md index 026dadeec..182126455 100644 --- a/README.md +++ b/README.md @@ -2,11 +2,14 @@

-[![CI](https://github.com/fjall-rs/lsm-tree/actions/workflows/test.yml/badge.svg)](https://github.com/fjall-rs/lsm-tree/actions/workflows/test.yml) +[![CI](https://github.com/structured-world/lsm-tree/actions/workflows/coordinode-ci.yml/badge.svg)](https://github.com/structured-world/lsm-tree/actions/workflows/coordinode-ci.yml) +[![Upstream CI](https://github.com/fjall-rs/lsm-tree/actions/workflows/test.yml/badge.svg)](https://github.com/fjall-rs/lsm-tree/actions/workflows/test.yml) [![docs.rs](https://img.shields.io/docsrs/lsm-tree?color=green)](https://docs.rs/lsm-tree) [![Crates.io](https://img.shields.io/crates/v/lsm-tree?color=blue)](https://crates.io/crates/lsm-tree) ![MSRV](https://img.shields.io/badge/MSRV-1.90.0-blue) -[![dependency status](https://deps.rs/repo/github/fjall-rs/lsm-tree/status.svg)](https://deps.rs/repo/github/fjall-rs/lsm-tree) + +> **Maintained fork** by [Structured World Foundation](https://sw.foundation) for the [CoordiNode](https://github.com/structured-world/coordinode) database engine. +> Based on [fjall-rs/lsm-tree](https://github.com/fjall-rs/lsm-tree). We contribute patches upstream and maintain additional features needed for CoordiNode (zstd compression, custom sequence number generators, batch get, intra-L0 compaction, security hardening). A K.I.S.S. implementation of log-structured merge trees (LSM-trees/LSMTs) in Rust. @@ -68,12 +71,24 @@ Uses [`bytes`](https://github.com/tokio-rs/bytes) as the underlying `Slice` type cargo bench --features lz4 ``` +## Support the Project + +
+ +![USDT TRC-20 Donation QR Code](assets/usdt-qr.svg) + +USDT (TRC-20): `TFDsezHa1cBkoeZT5q2T49Wp66K8t2DmdA` + +
+ ## License All source code is licensed under MIT OR Apache-2.0. All contributions are to be licensed as MIT OR Apache-2.0. +Original project by [fjall-rs](https://github.com/fjall-rs/lsm-tree). This fork is maintained by [Structured World Foundation](https://sw.foundation). + ## Footnotes [1] https://rocksdb.org/blog/2017/05/12/partitioned-index-filter.html diff --git a/assets/usdt-qr.svg b/assets/usdt-qr.svg new file mode 100644 index 000000000..a8dd3dcf5 --- /dev/null +++ b/assets/usdt-qr.svg @@ -0,0 +1 @@ + From d456379787a8208d74ef04fa0802125a99e4ddf4 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 15 Mar 2026 14:32:05 +0200 Subject: [PATCH 14/30] ci: add dependabot configuration for cargo and actions --- .github/dependabot.yml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 .github/dependabot.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 000000000..a45d101ac --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,22 @@ +version: 2 +updates: + - package-ecosystem: "cargo" + directory: "/" + schedule: + interval: "weekly" + open-pull-requests-limit: 10 + groups: + minor-and-patch: + patterns: + - "*" + update-types: + - "minor" + - "patch" + commit-message: + prefix: "chore(deps)" + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" + commit-message: + prefix: "ci(deps)" From 68faa567361dcfb575534f8fa1dddf757d9ef7f2 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 15 Mar 2026 14:59:44 +0200 Subject: [PATCH 15/30] ci: add release-plz workflow for automated changelog and releases --- .github/workflows/coordinode-release.yml | 31 ++++++++++++++++++++++++ .release-plz.toml | 7 ++++++ 2 files changed, 38 insertions(+) create mode 100644 .github/workflows/coordinode-release.yml create mode 100644 .release-plz.toml diff --git a/.github/workflows/coordinode-release.yml b/.github/workflows/coordinode-release.yml new file mode 100644 index 000000000..0e15c138f --- /dev/null +++ b/.github/workflows/coordinode-release.yml @@ -0,0 +1,31 @@ +name: Release + +on: + push: + branches: + - main + +permissions: + contents: write + pull-requests: write + +jobs: + release-plz: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + + - name: Run release-plz + uses: release-plz/action@v0.5 + with: + command: release-pr + config: .release-plz.toml + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} diff --git a/.release-plz.toml b/.release-plz.toml new file mode 100644 index 000000000..2a55084af --- /dev/null +++ b/.release-plz.toml @@ -0,0 +1,7 @@ +[workspace] +# Don't publish to crates.io — we're a maintained fork using git dependencies +publish = false +# Create GitHub releases with changelog +git_release_enable = true +# Use conventional commits for changelog +changelog_update = true From 9bf3cf8542598cbdad7d49f34d4b8ede0121148a Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 15 Mar 2026 15:23:34 +0200 Subject: [PATCH 16/30] ci: split PR checks from full matrix, reduce PR to lint + ubuntu test --- .github/workflows/coordinode-ci.yml | 104 +++++++++++++--------------- 1 file changed, 50 insertions(+), 54 deletions(-) diff --git a/.github/workflows/coordinode-ci.yml b/.github/workflows/coordinode-ci.yml index 9d4e9c4ab..eb0ca6396 100644 --- a/.github/workflows/coordinode-ci.yml +++ b/.github/workflows/coordinode-ci.yml @@ -14,82 +14,78 @@ env: CARGO_TERM_COLOR: always jobs: + lint: + # Fast gate — runs on every push and PR + timeout-minutes: 10 + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - uses: dtolnay/rust-toolchain@stable + with: + components: rustfmt, clippy + - uses: Swatinem/rust-cache@v2 + - name: Format + run: cargo fmt --all -- --check + - name: Clippy (strict) + run: cargo clippy --all-features -- -D warnings + test: + # Full matrix — only on push to main/feature branches (not on PRs) + if: github.event_name == 'push' + needs: lint timeout-minutes: 20 strategy: matrix: - rust_version: - - stable - - "1.90.0" - os: - - ubuntu-latest - - windows-latest - - macos-latest + rust_version: [stable, "1.90.0"] + os: [ubuntu-latest, windows-latest, macos-latest] runs-on: ${{ matrix.os }} steps: - - name: Checkout - uses: actions/checkout@v6 - - - name: Install Rust toolchain - uses: dtolnay/rust-toolchain@stable + - uses: actions/checkout@v6 + - uses: dtolnay/rust-toolchain@stable with: toolchain: ${{ matrix.rust_version }} - components: rustfmt, clippy - - - name: Set up cargo cache - uses: Swatinem/rust-cache@v2 + - uses: Swatinem/rust-cache@v2 with: prefix-key: ${{ runner.os }}-cargo - - - name: Install nextest - uses: taiki-e/install-action@nextest - - - name: Format check - run: cargo fmt --all -- --check - - - name: Clippy (strict) - run: cargo clippy --all-features -- -D warnings - + - uses: taiki-e/install-action@nextest - name: Run tests run: cargo nextest run --all-features + - name: Run doc tests + run: cargo test --doc --features lz4 + test-pr: + # Lightweight — only on PRs (ubuntu stable) + if: github.event_name == 'pull_request' + needs: lint + timeout-minutes: 15 + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - uses: dtolnay/rust-toolchain@stable + - uses: Swatinem/rust-cache@v2 + - uses: taiki-e/install-action@nextest + - name: Run tests + run: cargo nextest run --all-features - name: Run doc tests run: cargo test --doc --features lz4 codecov: + if: github.event_name == 'push' + needs: lint timeout-minutes: 20 runs-on: ubuntu-latest steps: - - name: Checkout - uses: actions/checkout@v6 - - - name: Install Rust nightly - uses: dtolnay/rust-toolchain@nightly + - uses: actions/checkout@v6 + - uses: dtolnay/rust-toolchain@nightly with: components: llvm-tools-preview - - - name: Set up cargo cache - uses: Swatinem/rust-cache@v2 - with: - prefix-key: ${{ runner.os }}-cargo - - - name: Install cargo-llvm-cov - uses: taiki-e/install-action@cargo-llvm-cov - - - name: Install nextest - uses: taiki-e/install-action@nextest - - - name: Run tests with coverage - run: cargo llvm-cov --no-report nextest --all-features - - - name: Run doc tests with coverage - run: cargo llvm-cov --no-report --doc --features lz4 - - - name: Create coverage report - run: cargo llvm-cov report --doctests --lcov --output-path lcov.info - - - name: Upload to Codecov - uses: codecov/codecov-action@v5 + - uses: Swatinem/rust-cache@v2 + - uses: taiki-e/install-action@cargo-llvm-cov + - uses: taiki-e/install-action@nextest + - run: cargo llvm-cov --no-report nextest --all-features + - run: cargo llvm-cov --no-report --doc --features lz4 + - run: cargo llvm-cov report --doctests --lcov --output-path lcov.info + - uses: codecov/codecov-action@v5 with: files: lcov.info env: From 994436c7bf287e6cfd6beb4bcf4592dce2cb3b3f Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 15 Mar 2026 16:06:01 +0200 Subject: [PATCH 17/30] fix: resolve all clippy warnings for strict -D warnings CI - Replace unsafe Slice::builder_unzeroed with safe vec![0u8; N] in lz4 decompression paths (block and blob reader) - Add #[expect] annotations for expect_used, cast_possible_truncation, too_many_lines, too_many_arguments, significant_drop_tightening - Add #[allow(dead_code)] for unused fields/structs - Add #[allow(unreachable_patterns)] for cfg-gated match arms - Add #[must_use] on builder_unzeroed - Add clippy.toml with allowed-duplicate-crates for hashbrown Closes #2 --- clippy.toml | 1 + src/blob_tree/mod.rs | 1 + src/compaction/leveled/mod.rs | 23 ++++++++++++++++++++--- src/compaction/worker.rs | 4 ++++ src/manifest.rs | 1 + src/slice/slice_bytes/mod.rs | 9 +++++++++ src/table/block/mod.rs | 20 ++++++++++---------- src/table/filter/mod.rs | 4 +++- src/table/iter.rs | 4 ++++ src/table/mod.rs | 4 ++-- src/table/util.rs | 7 ++++++- src/tree/mod.rs | 4 ++++ src/version/run.rs | 5 +++++ src/vlog/blob_file/reader.rs | 17 +++++++++++++---- src/vlog/mod.rs | 4 ++++ 15 files changed, 87 insertions(+), 21 deletions(-) create mode 100644 clippy.toml diff --git a/clippy.toml b/clippy.toml new file mode 100644 index 000000000..519df83ba --- /dev/null +++ b/clippy.toml @@ -0,0 +1 @@ +allowed-duplicate-crates = ["hashbrown"] diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index 73ea1c119..111632dbd 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -332,6 +332,7 @@ impl AbstractTree for BlobTree { self.index.get_flush_lock() } + #[expect(clippy::too_many_lines, reason = "flush logic is inherently complex")] fn flush_to_tables( &self, stream: impl Iterator>, diff --git a/src/compaction/leveled/mod.rs b/src/compaction/leveled/mod.rs index f4bd190ab..213707d27 100644 --- a/src/compaction/leveled/mod.rs +++ b/src/compaction/leveled/mod.rs @@ -279,18 +279,31 @@ impl CompactionStrategy for Strategy { // Trivial move into Lmax 'trivial_lmax: { + #[expect( + clippy::expect_used, + reason = "level 0 is guaranteed to exist in a valid version" + )] let l0 = version.level(0).expect("first level should exist"); if !l0.is_empty() && l0.is_disjoint() { let lmax_index = version.level_count() - 1; - if (1..lmax_index) - .any(|idx| !version.level(idx).expect("level should exist").is_empty()) - { + if (1..lmax_index).any(|idx| { + #[expect( + clippy::expect_used, + reason = "levels within level_count are guaranteed to exist" + )] + let level = version.level(idx).expect("level should exist"); + !level.is_empty() + }) { // There are intermediary levels with data, cannot trivially move to Lmax break 'trivial_lmax; } + #[expect( + clippy::expect_used, + reason = "lmax_index is derived from level_count so level is guaranteed to exist" + )] let lmax = version.level(lmax_index).expect("last level should exist"); if !lmax @@ -299,6 +312,10 @@ impl CompactionStrategy for Strategy { { return Choice::Move(CompactionInput { table_ids: l0.list_ids(), + #[expect( + clippy::cast_possible_truncation, + reason = "level count is at most 7, fits in u8" + )] dest_level: lmax_index as u8, canonical_level: 1, target_size: self.target_size, diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index b1f7052b4..35ddccf11 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -181,6 +181,10 @@ fn create_compaction_stream<'a>( }) } +#[expect( + clippy::significant_drop_tightening, + reason = "version_history_lock must be held across upgrade_version and maintenance" +)] fn move_tables( compaction_state: &MutexGuard<'_, CompactionState>, opts: &Options, diff --git a/src/manifest.rs b/src/manifest.rs index 38ae44ec3..c0a9b7733 100644 --- a/src/manifest.rs +++ b/src/manifest.rs @@ -8,6 +8,7 @@ use std::{io::Read, path::Path}; pub struct Manifest { pub version: FormatVersion, + #[allow(dead_code)] // used during deserialization/validation, retained for future use pub tree_type: TreeType, pub level_count: u8, } diff --git a/src/slice/slice_bytes/mod.rs b/src/slice/slice_bytes/mod.rs index 1872460cb..a229e3adb 100644 --- a/src/slice/slice_bytes/mod.rs +++ b/src/slice/slice_bytes/mod.rs @@ -26,6 +26,7 @@ impl Slice { } #[doc(hidden)] + #[must_use] pub unsafe fn builder_unzeroed(len: usize) -> Builder { // Use `with_capacity` & `set_len`` to avoid zeroing the buffer let mut builder = Builder::with_capacity(len); @@ -57,7 +58,15 @@ impl Slice { { let mut writer = &mut builder[..]; + #[expect( + clippy::expect_used, + reason = "writing into a pre-allocated buffer of exact size cannot fail" + )] writer.write_all(left).expect("should write"); + #[expect( + clippy::expect_used, + reason = "writing into a pre-allocated buffer of exact size cannot fail" + )] writer.write_all(right).expect("should write"); } diff --git a/src/table/block/mod.rs b/src/table/block/mod.rs index c7f8e3c66..39be12c1d 100644 --- a/src/table/block/mod.rs +++ b/src/table/block/mod.rs @@ -106,14 +106,14 @@ impl Block { #[cfg(feature = "lz4")] CompressionType::Lz4 => { - #[warn(unsafe_code)] - let mut builder = - unsafe { Slice::builder_unzeroed(header.uncompressed_length as usize) }; + let mut buf = vec![0u8; header.uncompressed_length as usize]; - lz4_flex::decompress_into(&raw_data, &mut builder) + let bytes_written = lz4_flex::decompress_into(&raw_data, &mut buf) .map_err(|_| crate::Error::Decompress(compression))?; - builder.freeze().into() + debug_assert_eq!(bytes_written, header.uncompressed_length as usize); + + Slice::from(buf) } }; @@ -167,14 +167,14 @@ impl Block { #[expect(clippy::indexing_slicing)] let raw_data = &buf[Header::serialized_len()..]; - #[warn(unsafe_code)] - let mut builder = - unsafe { Slice::builder_unzeroed(header.uncompressed_length as usize) }; + let mut decompressed = vec![0u8; header.uncompressed_length as usize]; - lz4_flex::decompress_into(raw_data, &mut builder) + let bytes_written = lz4_flex::decompress_into(raw_data, &mut decompressed) .map_err(|_| crate::Error::Decompress(compression))?; - builder.freeze().into() + debug_assert_eq!(bytes_written, header.uncompressed_length as usize); + + Slice::from(decompressed) } }; diff --git a/src/table/filter/mod.rs b/src/table/filter/mod.rs index fdff471e4..ca4b0c3c2 100644 --- a/src/table/filter/mod.rs +++ b/src/table/filter/mod.rs @@ -45,7 +45,9 @@ impl BloomConstructionPolicy { pub fn estimated_filter_size(&self, n: usize) -> usize { #[expect( clippy::cast_precision_loss, - reason = "truncation is fine because this is an estimation" + clippy::cast_possible_truncation, + clippy::cast_sign_loss, + reason = "truncation and precision loss are fine because this is an estimation" )] match self { Self::BitsPerKey(bpk) => (*bpk * (n as f32)) as usize / 8, diff --git a/src/table/iter.rs b/src/table/iter.rs index b03b69da1..cb6462849 100644 --- a/src/table/iter.rs +++ b/src/table/iter.rs @@ -119,6 +119,10 @@ pub struct Iter { } impl Iter { + #[expect( + clippy::too_many_arguments, + reason = "iterator requires full context for block loading" + )] pub fn new( table_id: GlobalTableId, global_seqno: SeqNo, diff --git a/src/table/mod.rs b/src/table/mod.rs index 4b6e73648..f8bc98a7f 100644 --- a/src/table/mod.rs +++ b/src/table/mod.rs @@ -440,10 +440,10 @@ impl Table { } /// Tries to recover a table from a file. - #[warn( + #[expect( clippy::too_many_arguments, clippy::too_many_lines, - reason = "TODO: refactor" + reason = "recovery requires many context parameters and is inherently complex" )] pub fn recover( file_path: PathBuf, diff --git a/src/table/util.rs b/src/table/util.rs index 45862d152..61d2eed92 100644 --- a/src/table/util.rs +++ b/src/table/util.rs @@ -28,7 +28,10 @@ pub struct SliceIndexes(pub usize, pub usize); /// Loads a block from disk or block cache, if cached. /// /// Also handles file descriptor opening and caching. -#[warn(clippy::too_many_arguments)] +#[expect( + clippy::too_many_arguments, + reason = "block loading requires many context parameters" +)] pub fn load_block( table_id: GlobalTableId, path: &Path, @@ -56,6 +59,7 @@ pub fn load_block( BlockType::Data | BlockType::Meta => { metrics.data_block_load_cached.fetch_add(1, Relaxed); } + #[allow(unreachable_patterns)] _ => {} } @@ -108,6 +112,7 @@ pub fn load_block( .data_block_io_requested .fetch_add(handle.size().into(), Relaxed); } + #[allow(unreachable_patterns)] _ => {} } diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 453e9891c..290b5bc00 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -1012,6 +1012,10 @@ impl Tree { } /// Recovers the level manifest, loading all tables from disk. + #[expect( + clippy::too_many_lines, + reason = "recovery logic is inherently complex" + )] fn recover_levels>( tree_path: P, tree_id: TreeId, diff --git a/src/version/run.rs b/src/version/run.rs index 9318f45fe..224e6f0d6 100644 --- a/src/version/run.rs +++ b/src/version/run.rs @@ -12,6 +12,7 @@ pub trait Ranged { /// Item inside a run /// /// May point to an interval [min, max] of tables in the next run. +#[allow(dead_code)] // struct is part of planned cascading index optimization pub struct Indexed { inner: T, // cascade_indexes: (u32, u32), @@ -138,6 +139,10 @@ impl Run { // find last index where pred holds let end = s.iter().rposition(&pred).map_or(start, |i| i + 1); + #[expect( + clippy::expect_used, + reason = "start..end are derived from position/rposition on the same slice" + )] s.get(start..end).expect("should be in range") } diff --git a/src/vlog/blob_file/reader.rs b/src/vlog/blob_file/reader.rs index c1e3e8739..916e67ce3 100644 --- a/src/vlog/blob_file/reader.rs +++ b/src/vlog/blob_file/reader.rs @@ -31,6 +31,10 @@ impl<'a> Reader<'a> { let add_size = (BLOB_HEADER_LEN as u64) + (key.len() as u64); + #[expect( + clippy::cast_possible_truncation, + reason = "blob sizes are bounded well within usize on supported 64-bit platforms" + )] let value = crate::file::read_exact( self.file, vhandle.offset, @@ -58,6 +62,10 @@ impl<'a> Reader<'a> { reader.seek(std::io::SeekFrom::Current(key_len.into()))?; + #[expect( + clippy::cast_possible_truncation, + reason = "add_size is header_len + key_len, both bounded well within usize" + )] let raw_data = value.slice((add_size as usize)..); { @@ -86,13 +94,14 @@ impl<'a> Reader<'a> { #[cfg(feature = "lz4")] CompressionType::Lz4 => { - #[warn(unsafe_code)] - let mut builder = unsafe { UserValue::builder_unzeroed(real_val_len as usize) }; + let mut buf = vec![0u8; real_val_len]; - lz4_flex::decompress_into(&raw_data, &mut builder) + let bytes_written = lz4_flex::decompress_into(&raw_data, &mut buf) .map_err(|_| crate::Error::Decompress(self.blob_file.0.meta.compression))?; - builder.freeze().into() + debug_assert_eq!(bytes_written, real_val_len); + + UserValue::from(buf) } }; diff --git a/src/vlog/mod.rs b/src/vlog/mod.rs index 5f59a0a7d..c0d4c90af 100644 --- a/src/vlog/mod.rs +++ b/src/vlog/mod.rs @@ -90,6 +90,10 @@ pub fn recover_blob_files( log::error!("meta section in blob file #{blob_file_id} is missing - maybe the file is corrupted?"); })?; + #[expect( + clippy::cast_possible_truncation, + reason = "metadata section length is bounded well within usize on 64-bit platforms" + )] let metadata_slice = crate::file::read_exact( &file, metadata_section.pos(), From c21d2726b31d7cf95cfa2fc4456b2e7c7b41d6a3 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 15 Mar 2026 16:37:17 +0200 Subject: [PATCH 18/30] fix(decompress): use runtime validation instead of debug_assert for byte count Replace debug_assert_eq!(bytes_written, expected_len) with runtime if-check that returns Error::Decompress on mismatch. debug_assert is stripped in release builds, leaving corrupted data undetected. Applies to lz4 decompression in Block::from_reader, Block::from_file, and blob file Reader::get. --- src/table/block/mod.rs | 17 ++++++++--------- src/vlog/blob_file/reader.rs | 5 ++++- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/table/block/mod.rs b/src/table/block/mod.rs index 39be12c1d..ca1ada35c 100644 --- a/src/table/block/mod.rs +++ b/src/table/block/mod.rs @@ -111,19 +111,15 @@ impl Block { let bytes_written = lz4_flex::decompress_into(&raw_data, &mut buf) .map_err(|_| crate::Error::Decompress(compression))?; - debug_assert_eq!(bytes_written, header.uncompressed_length as usize); + // Runtime validation: corrupted data may decompress to fewer bytes + if bytes_written != header.uncompressed_length as usize { + return Err(crate::Error::Decompress(compression)); + } Slice::from(buf) } }; - debug_assert_eq!(header.uncompressed_length, { - #[expect(clippy::cast_possible_truncation, reason = "values are u32 length max")] - { - data.len() as u32 - } - }); - Ok(Self { header, data }) } @@ -172,7 +168,10 @@ impl Block { let bytes_written = lz4_flex::decompress_into(raw_data, &mut decompressed) .map_err(|_| crate::Error::Decompress(compression))?; - debug_assert_eq!(bytes_written, header.uncompressed_length as usize); + // Runtime validation: corrupted data may decompress to fewer bytes + if bytes_written != header.uncompressed_length as usize { + return Err(crate::Error::Decompress(compression)); + } Slice::from(decompressed) } diff --git a/src/vlog/blob_file/reader.rs b/src/vlog/blob_file/reader.rs index 916e67ce3..42316795d 100644 --- a/src/vlog/blob_file/reader.rs +++ b/src/vlog/blob_file/reader.rs @@ -99,7 +99,10 @@ impl<'a> Reader<'a> { let bytes_written = lz4_flex::decompress_into(&raw_data, &mut buf) .map_err(|_| crate::Error::Decompress(self.blob_file.0.meta.compression))?; - debug_assert_eq!(bytes_written, real_val_len); + // Runtime validation: corrupted data may decompress to fewer bytes + if bytes_written != real_val_len { + return Err(crate::Error::Decompress(self.blob_file.0.meta.compression)); + } UserValue::from(buf) } From cb85fd47f94328f731d02bcc995d7a53d3b106a3 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 15 Mar 2026 16:57:43 +0200 Subject: [PATCH 19/30] test(block): add corruption test for lz4 byte count validation Construct a block with tampered uncompressed_length header field and verify that Block::from_reader returns Error::Decompress instead of silently producing a partially-filled buffer. --- src/table/block/mod.rs | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/src/table/block/mod.rs b/src/table/block/mod.rs index ca1ada35c..60929148b 100644 --- a/src/table/block/mod.rs +++ b/src/table/block/mod.rs @@ -227,4 +227,42 @@ mod tests { Ok(()) } + + #[test] + #[cfg(feature = "lz4")] + fn lz4_corrupted_uncompressed_length_triggers_decompress_error() { + use crate::coding::Encode; + use std::io::Cursor; + + let payload: &[u8] = b"hello world"; + + // Compress with lz4 using the block format + let compressed = lz4_flex::compress_prepend_size(payload); + + // Build a header with corrupted uncompressed_length (1 byte too large) + let data_length = compressed.len() as u32; + let uncompressed_length_correct = payload.len() as u32; + let uncompressed_length_corrupted = uncompressed_length_correct + 1; + + let checksum = Checksum::from_raw(crate::hash::hash128(&compressed)); + + let header = Header { + data_length, + uncompressed_length: uncompressed_length_corrupted, + checksum, + block_type: BlockType::Data, + }; + + let mut buf = header.encode_into_vec(); + buf.extend_from_slice(&compressed); + + let mut cursor = Cursor::new(buf); + let result = Block::from_reader(&mut cursor, CompressionType::Lz4); + + match result { + Err(crate::Error::Decompress(CompressionType::Lz4)) => { /* expected */ } + Ok(_) => panic!("expected Error::Decompress, but got Ok(Block)"), + Err(other) => panic!("expected Error::Decompress, got different error: {other:?}"), + } + } } From a6a675ac3b5d60b779d3e3775b9487e0c6d04308 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 15 Mar 2026 17:01:28 +0200 Subject: [PATCH 20/30] test(vlog): add corruption test for lz4 blob reader byte count validation Tamper real_val_len in blob header (not covered by checksum) and verify that Reader::get returns Error::Decompress when decompressed byte count mismatches the declared length. --- src/vlog/blob_file/reader.rs | 45 ++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/src/vlog/blob_file/reader.rs b/src/vlog/blob_file/reader.rs index 42316795d..01333a6e3 100644 --- a/src/vlog/blob_file/reader.rs +++ b/src/vlog/blob_file/reader.rs @@ -164,4 +164,49 @@ mod tests { Ok(()) } + + #[test] + #[cfg(feature = "lz4")] + fn blob_reader_lz4_corrupted_real_val_len_triggers_decompress_error() -> crate::Result<()> { + use byteorder::WriteBytesExt; + + let id_generator = SequenceNumberCounter::default(); + + let folder = tempfile::tempdir()?; + let mut writer = crate::vlog::BlobFileWriter::new(id_generator, folder.path(), 0, None)? + .use_target_size(u64::MAX) + .use_compression(CompressionType::Lz4); + + let handle = writer.write(b"a", 0, b"abcdef")?; + + let blob_file = writer.finish()?; + let blob_file = blob_file.first().unwrap(); + + // Tamper the real_val_len field in the blob file. + // Header layout: MAGIC(4) + Checksum(16) + SeqNo(8) + KeyLen(2) + RealValLen(4) + ... + // RealValLen is at offset 30 from the blob start. + let real_val_len_offset = handle.offset + 4 + 16 + 8 + 2; + + { + use std::io::{Seek, Write}; + let mut file = std::fs::OpenOptions::new() + .write(true) + .open(&blob_file.0.path)?; + file.seek(std::io::SeekFrom::Start(real_val_len_offset))?; + // Write a corrupted value: original len + 1 + file.write_u32::(b"abcdef".len() as u32 + 1)?; + file.flush()?; + } + + let file = File::open(&blob_file.0.path)?; + let reader = Reader::new(blob_file, &file); + + match reader.get(b"a", &handle) { + Err(crate::Error::Decompress(_)) => { /* expected */ } + Ok(_) => panic!("expected Error::Decompress, but got Ok"), + Err(other) => panic!("expected Error::Decompress, got: {other:?}"), + } + + Ok(()) + } } From 8f8a154b78ff913fafc3117298ce33910b24fbcb Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 15 Mar 2026 17:04:22 +0200 Subject: [PATCH 21/30] fix(filter,vlog): guard zero-key division and use checked cast - Add early return for n==0 in estimated_filter_size to prevent division by zero panic in FalsePositiveRate path - Replace truncating `as usize` cast with `usize::try_from()` for metadata section length (safe on 32-bit platforms) --- src/table/filter/mod.rs | 4 ++++ src/vlog/mod.rs | 13 ++++--------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/table/filter/mod.rs b/src/table/filter/mod.rs index ca4b0c3c2..89d829174 100644 --- a/src/table/filter/mod.rs +++ b/src/table/filter/mod.rs @@ -43,6 +43,10 @@ impl BloomConstructionPolicy { /// Returns the estimated filter size in bytes. #[must_use] pub fn estimated_filter_size(&self, n: usize) -> usize { + if n == 0 { + return 0; + } + #[expect( clippy::cast_precision_loss, clippy::cast_possible_truncation, diff --git a/src/vlog/mod.rs b/src/vlog/mod.rs index c0d4c90af..c0ecb725c 100644 --- a/src/vlog/mod.rs +++ b/src/vlog/mod.rs @@ -90,15 +90,10 @@ pub fn recover_blob_files( log::error!("meta section in blob file #{blob_file_id} is missing - maybe the file is corrupted?"); })?; - #[expect( - clippy::cast_possible_truncation, - reason = "metadata section length is bounded well within usize on 64-bit platforms" - )] - let metadata_slice = crate::file::read_exact( - &file, - metadata_section.pos(), - metadata_section.len() as usize, - )?; + let metadata_len = usize::try_from(metadata_section.len()) + .map_err(|_| crate::Error::Unrecoverable)?; + let metadata_slice = + crate::file::read_exact(&file, metadata_section.pos(), metadata_len)?; Metadata::from_slice(&metadata_slice)? }; From 56072590517c2e234496cc156b9c14bcb4f7dc2e Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 15 Mar 2026 17:07:52 +0200 Subject: [PATCH 22/30] fix(test): use lz4_flex::compress instead of compress_prepend_size compress_prepend_size adds a 4-byte size prefix that decompress_into does not expect. Use compress() to match production decompression format. --- src/table/block/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/table/block/mod.rs b/src/table/block/mod.rs index 60929148b..75c0dc856 100644 --- a/src/table/block/mod.rs +++ b/src/table/block/mod.rs @@ -237,7 +237,7 @@ mod tests { let payload: &[u8] = b"hello world"; // Compress with lz4 using the block format - let compressed = lz4_flex::compress_prepend_size(payload); + let compressed = lz4_flex::compress(payload); // Build a header with corrupted uncompressed_length (1 byte too large) let data_length = compressed.len() as u32; From 0376989fd1e8ef55cdcfc00ef80361f03678dd23 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 15 Mar 2026 17:17:10 +0200 Subject: [PATCH 23/30] docs: add Copilot review instructions with scope and issue-suggestion rules --- .github/copilot-instructions.md | 70 +++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 .github/copilot-instructions.md diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 000000000..87712678c --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1,70 @@ +# GitHub Copilot Instructions for lsm-tree (structured-world fork) + +## Project Overview + +This is a **maintained fork** of [fjall-rs/lsm-tree](https://github.com/fjall-rs/lsm-tree) — a K.I.S.S. LSM-tree implementation in Rust. We maintain additional features and hardening for the [CoordiNode](https://github.com/structured-world/coordinode) database engine while contributing patches upstream. + +## Review Scope Rules (CRITICAL) + +**Review ONLY code within the PR's diff.** For issues found in code outside the diff: +- Do NOT suggest inline code fixes for unchanged lines +- Instead, suggest creating a **separate issue** with the finding (e.g., "Consider opening an issue to add size validation in `from_reader` — this is outside the scope of this clippy-fix PR") + +**Each PR has a defined scope in its description.** Read the "out of scope" section before reviewing. If something is listed as out of scope, do not flag it — it is tracked in another PR. + +**Cross-PR awareness:** This fork has multiple feature branches in parallel. If a hardening or feature seems missing, check whether it exists in another open PR before suggesting it. Reference the other PR number if known. + +**Prefer issue suggestions over code suggestions for out-of-scope findings.** This keeps PRs focused and reviewable. + +## Rust Code Standards + +- **Unsafe code:** Prefer safe alternatives. If `unsafe` is required, it must have a `// SAFETY:` comment explaining the invariant. +- **Error handling:** No `unwrap()` or `expect()` on I/O paths. Use `Result` propagation. `expect()` is acceptable for programmer invariants (e.g., lock poisoning) with `#[expect(clippy::expect_used, reason = "...")]`. +- **Clippy:** Code must pass `cargo clippy --all-features -- -D warnings`. Use `#[expect(...)]` (not `#[allow(...)]`) for justified suppressions — `#[expect]` warns if the suppression becomes unnecessary. +- **Casts:** Prefer `TryFrom`/`TryInto` for fallible conversions. `as` casts are acceptable for infallible cases (e.g., `u32` to `u64`) with `#[expect(clippy::cast_possible_truncation)]` and a reason. +- **Feature gates:** Code behind `#[cfg(feature = "...")]` must compile with any combination of features. Variables used only in feature-gated branches must also be feature-gated. + +## Testing Standards + +- **Corruption tests:** When adding validation for on-disk data, add a test that tampers the relevant field and asserts the error. Use the same serialization path as production (e.g., `lz4_flex::compress` not `compress_prepend_size`). +- **No mocks for storage:** Tests use real on-disk files via `tempfile::tempdir()`. +- **Test naming:** `fn __()` — e.g., `fn lz4_corrupted_header_triggers_decompress_error()`. + +## Commit Message Format + +``` +(scope): + +- Detail 1 +- Detail 2 +``` + +Types: `feat`, `fix`, `refactor`, `test`, `docs`, `style`, `chore`, `perf`, `ci`, `build`, `revert` + +**Forbidden patterns:** "address review", "fix PR comments", "WIP", "temporary" + +## Build and Test + +```bash +cargo clippy --all-features -- -D warnings # Lint (strict) +cargo test --features lz4 # Tests with lz4 +cargo test --all-features # Tests with all features +cargo fmt --all -- --check # Format check +``` + +## Feature Flags + +| Flag | Description | +|------|-------------| +| `lz4` | LZ4 compression (enabled by default in fjall) | +| `zstd` | Zstd compression (PR #1) | +| `bytes_1` | Use `bytes` crate for Slice type | +| `metrics` | Expose prometheus metrics | + +## Architecture Notes + +- `src/table/block/` — On-disk block format (header + compressed payload) +- `src/vlog/blob_file/` — Value log for large values (separate from LSM blocks) +- `src/compaction/` — Compaction strategies (leveled, FIFO, tiered) +- `src/seqno.rs` — Sequence number generator (MVCC versioning) +- Compression is pluggable via `CompressionType` enum with `#[cfg(feature)]` variants From b22f937f4a92287681c7debdb8e7af59d37a1a57 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 15 Mar 2026 17:59:43 +0200 Subject: [PATCH 24/30] ci: add Copilot code review instructions with scope rules --- .../instructions/code-review.instructions.md | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 .github/instructions/code-review.instructions.md diff --git a/.github/instructions/code-review.instructions.md b/.github/instructions/code-review.instructions.md new file mode 100644 index 000000000..bff87e21f --- /dev/null +++ b/.github/instructions/code-review.instructions.md @@ -0,0 +1,27 @@ +--- +applyTo: "**/*.rs" +--- + +# Code Review Instructions for lsm-tree (structured-world fork) + +## Scope Rules (CRITICAL) + +- **Review ONLY code within the PR's diff.** Do not suggest inline fixes for unchanged lines. +- For issues in code **outside the diff**, suggest creating a **separate issue** instead of proposing code changes. Example: "Consider opening an issue to add size validation here — this is outside this PR's scope." +- **Read the PR description carefully.** If the PR body has an "out of scope" section listing items handled by other PRs, do NOT flag those items. +- This fork has **multiple feature branches in parallel**. A hardening that seems missing in one PR may already exist in another open PR. Check the "out of scope" section for cross-references. + +## Rust Standards + +- `unsafe` blocks require `// SAFETY:` comments explaining the invariant +- Prefer `#[expect(lint)]` over `#[allow(lint)]` — `#[expect]` warns when suppression becomes unnecessary +- Use `TryFrom`/`TryInto` for fallible conversions; `as` casts need `#[expect(clippy::cast_possible_truncation)]` with reason +- No `unwrap()` / `expect()` on I/O paths — use `Result` propagation +- `expect()` is acceptable for programmer invariants (lock poisoning) with `#[expect(clippy::expect_used, reason = "...")]` +- Code must pass `cargo clippy --all-features -- -D warnings` + +## Testing + +- Corruption/validation tests: tamper the relevant on-disk field and assert the error +- Use same serialization as production (e.g., `lz4_flex::compress` not `compress_prepend_size`) +- Test naming: `fn __()` From dbb763a0e3ca4640b6abeb6a06581c6edb61d219 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 15 Mar 2026 18:46:39 +0200 Subject: [PATCH 25/30] refactor: upgrade #[allow] to #[expect] with reasons on all suppressions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - dead_code: Indexed struct, Manifest::tree_type → #[expect] with reason - unreachable_patterns: BlockType wildcard arms → #[expect] referencing issue #13 --- src/manifest.rs | 5 ++++- src/table/util.rs | 5 ++++- src/version/run.rs | 2 +- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/manifest.rs b/src/manifest.rs index c0a9b7733..f5a9886b8 100644 --- a/src/manifest.rs +++ b/src/manifest.rs @@ -8,7 +8,10 @@ use std::{io::Read, path::Path}; pub struct Manifest { pub version: FormatVersion, - #[allow(dead_code)] // used during deserialization/validation, retained for future use + #[expect( + dead_code, + reason = "deserialized from on-disk manifest, retained for validation" + )] pub tree_type: TreeType, pub level_count: u8, } diff --git a/src/table/util.rs b/src/table/util.rs index 61d2eed92..ab9c94447 100644 --- a/src/table/util.rs +++ b/src/table/util.rs @@ -59,7 +59,10 @@ pub fn load_block( BlockType::Data | BlockType::Meta => { metrics.data_block_load_cached.fetch_add(1, Relaxed); } - #[allow(unreachable_patterns)] + #[expect( + unreachable_patterns, + reason = "Filter variant has no metrics counter yet, see issue #13" + )] _ => {} } diff --git a/src/version/run.rs b/src/version/run.rs index 224e6f0d6..6f81a3bc0 100644 --- a/src/version/run.rs +++ b/src/version/run.rs @@ -12,7 +12,7 @@ pub trait Ranged { /// Item inside a run /// /// May point to an interval [min, max] of tables in the next run. -#[allow(dead_code)] // struct is part of planned cascading index optimization +#[expect(dead_code, reason = "planned for cascading index optimization")] pub struct Indexed { inner: T, // cascade_indexes: (u32, u32), From 5a0575e700af080e57eb0999fef8952c84d262a9 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 15 Mar 2026 19:17:43 +0200 Subject: [PATCH 26/30] docs(table): expand get_highest_seqno docstring, add mixed insert+ingest test - Document flush vs ingestion seqno semantics in get_highest_seqno - Add test covering regular inserts followed by bulk ingestion, verifying persisted seqno reflects global offset and data is visible --- src/table/mod.rs | 10 +++++++- tests/ingestion_seqno.rs | 52 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/src/table/mod.rs b/src/table/mod.rs index 42e6bc7dc..69182266b 100644 --- a/src/table/mod.rs +++ b/src/table/mod.rs @@ -617,7 +617,15 @@ impl Table { self.metadata.key_range.overlaps_with_bounds(bounds) } - /// Returns the highest sequence number in the table. + /// Returns the highest effective sequence number in the table. + /// + /// For tables produced by flush/compaction (`global_seqno == 0`), this + /// returns the highest item seqno directly. + /// + /// For tables produced by bulk ingestion (`global_seqno > 0`), items + /// are written with local seqno 0 and the table carries a global offset. + /// The effective seqno of each item is `global_seqno + local_seqno`, + /// which mirrors the translation in [`Table::get`]. #[must_use] pub fn get_highest_seqno(&self) -> SeqNo { self.metadata.seqnos.1 + self.global_seqno() diff --git a/tests/ingestion_seqno.rs b/tests/ingestion_seqno.rs index 105965ed9..87e36abd6 100644 --- a/tests/ingestion_seqno.rs +++ b/tests/ingestion_seqno.rs @@ -1,4 +1,4 @@ -use lsm_tree::{get_tmp_folder, AbstractTree, Config, SequenceNumberCounter}; +use lsm_tree::{get_tmp_folder, AbstractTree, Config, SeqNo, SequenceNumberCounter}; #[test] fn ingestion_persisted_seqno() -> lsm_tree::Result<()> { @@ -23,3 +23,53 @@ fn ingestion_persisted_seqno() -> lsm_tree::Result<()> { Ok(()) } + +/// Verify that get_highest_persisted_seqno reflects the global offset +/// after mixed insert + ingest, and that ingested data is visible. +#[test] +fn ingestion_seqno_after_regular_inserts() -> lsm_tree::Result<()> { + let folder = get_tmp_folder(); + + let seqno = SequenceNumberCounter::default(); + let visible_seqno = SequenceNumberCounter::default(); + + let tree = Config::new(&folder, seqno.clone(), visible_seqno.clone()).open()?; + + // Regular inserts advance the seqno counter + let s0 = seqno.next(); + tree.insert("x", "x0", s0); + visible_seqno.fetch_max(s0 + 1); + + let s1 = seqno.next(); + tree.insert("y", "y0", s1); + visible_seqno.fetch_max(s1 + 1); + + tree.flush_active_memtable(0)?; + assert_eq!(tree.get_highest_persisted_seqno(), Some(s1)); + + // Capture counter before ingestion — ingestion allocates this + // value as global_seqno via seqno.next() + let ingest_global_seqno = seqno.get(); + + // Bulk-ingest: items get local seqno 0 but the table carries + // a global_seqno offset + let mut ingestion = tree.ingestion()?; + ingestion.write("a", "a0")?; + ingestion.write("b", "b0")?; + ingestion.finish()?; + + // effective = global_seqno + local_max (0) + let expected_seqno = ingest_global_seqno; + + assert_eq!( + tree.get_highest_persisted_seqno(), + Some(expected_seqno), + "ingested table must report effective seqno (global_seqno + local_seqno)" + ); + + // Verify data is visible + assert!(tree.get("a", SeqNo::MAX)?.is_some()); + assert!(tree.get("b", SeqNo::MAX)?.is_some()); + + Ok(()) +} From 3f65399c3b7813d905f060f276c244eeb6b93ca9 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 15 Mar 2026 19:45:34 +0200 Subject: [PATCH 27/30] refactor: compute add_size as usize, remove unreachable wildcard arms MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - blob/reader.rs: compute add_size in usize directly (BLOB_HEADER_LEN + key.len()), eliminating two #[expect(cast_possible_truncation)] annotations - table/util.rs: remove _ => {} wildcard arms — BlockType match is already exhaustive with Filter/Index/Data|Meta covering all 4 variants --- src/table/util.rs | 7 ------- src/vlog/blob_file/reader.rs | 21 +++++---------------- 2 files changed, 5 insertions(+), 23 deletions(-) diff --git a/src/table/util.rs b/src/table/util.rs index ab9c94447..a85b31e2e 100644 --- a/src/table/util.rs +++ b/src/table/util.rs @@ -59,11 +59,6 @@ pub fn load_block( BlockType::Data | BlockType::Meta => { metrics.data_block_load_cached.fetch_add(1, Relaxed); } - #[expect( - unreachable_patterns, - reason = "Filter variant has no metrics counter yet, see issue #13" - )] - _ => {} } return Ok(block); @@ -115,8 +110,6 @@ pub fn load_block( .data_block_io_requested .fetch_add(handle.size().into(), Relaxed); } - #[allow(unreachable_patterns)] - _ => {} } // Cache FD diff --git a/src/vlog/blob_file/reader.rs b/src/vlog/blob_file/reader.rs index 01333a6e3..3857c69e5 100644 --- a/src/vlog/blob_file/reader.rs +++ b/src/vlog/blob_file/reader.rs @@ -29,17 +29,10 @@ impl<'a> Reader<'a> { pub fn get(&self, key: &'a [u8], vhandle: &'a ValueHandle) -> crate::Result { debug_assert_eq!(vhandle.blob_file_id, self.blob_file.id()); - let add_size = (BLOB_HEADER_LEN as u64) + (key.len() as u64); - - #[expect( - clippy::cast_possible_truncation, - reason = "blob sizes are bounded well within usize on supported 64-bit platforms" - )] - let value = crate::file::read_exact( - self.file, - vhandle.offset, - (u64::from(vhandle.on_disk_size) + add_size) as usize, - )?; + let add_size = BLOB_HEADER_LEN + key.len(); + let read_len = vhandle.on_disk_size as usize + add_size; + + let value = crate::file::read_exact(self.file, vhandle.offset, read_len)?; let mut reader = Cursor::new(&value[..]); @@ -62,11 +55,7 @@ impl<'a> Reader<'a> { reader.seek(std::io::SeekFrom::Current(key_len.into()))?; - #[expect( - clippy::cast_possible_truncation, - reason = "add_size is header_len + key_len, both bounded well within usize" - )] - let raw_data = value.slice((add_size as usize)..); + let raw_data = value.slice(add_size..); { let checksum = { From 1a7995a58a754a8224e95ce92297ebf20417dd0f Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 15 Mar 2026 21:34:25 +0200 Subject: [PATCH 28/30] fix(blob,block): use checked_add for read_len, document size cap scope - blob reader: checked_add for on_disk_size + add_size (32-bit overflow) - block/blob: add NOTE comments clarifying size cap is in separate branch --- src/table/block/mod.rs | 3 +++ src/vlog/blob_file/reader.rs | 6 +++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/table/block/mod.rs b/src/table/block/mod.rs index 75c0dc856..fb42423b6 100644 --- a/src/table/block/mod.rs +++ b/src/table/block/mod.rs @@ -106,6 +106,8 @@ impl Block { #[cfg(feature = "lz4")] CompressionType::Lz4 => { + // NOTE: size cap validation for uncompressed_length is in PR #7 + // (feat/#258-security-validate-uncompressedlength-before-decomp) let mut buf = vec![0u8; header.uncompressed_length as usize]; let bytes_written = lz4_flex::decompress_into(&raw_data, &mut buf) @@ -163,6 +165,7 @@ impl Block { #[expect(clippy::indexing_slicing)] let raw_data = &buf[Header::serialized_len()..]; + // NOTE: size cap validation for uncompressed_length is in PR #7 let mut decompressed = vec![0u8; header.uncompressed_length as usize]; let bytes_written = lz4_flex::decompress_into(raw_data, &mut decompressed) diff --git a/src/vlog/blob_file/reader.rs b/src/vlog/blob_file/reader.rs index 3857c69e5..43563acda 100644 --- a/src/vlog/blob_file/reader.rs +++ b/src/vlog/blob_file/reader.rs @@ -30,7 +30,9 @@ impl<'a> Reader<'a> { debug_assert_eq!(vhandle.blob_file_id, self.blob_file.id()); let add_size = BLOB_HEADER_LEN + key.len(); - let read_len = vhandle.on_disk_size as usize + add_size; + let read_len = (vhandle.on_disk_size as usize) + .checked_add(add_size) + .ok_or(crate::Error::Unrecoverable)?; let value = crate::file::read_exact(self.file, vhandle.offset, read_len)?; @@ -83,6 +85,8 @@ impl<'a> Reader<'a> { #[cfg(feature = "lz4")] CompressionType::Lz4 => { + // NOTE: size cap validation for real_val_len is in PR #7 + // (feat/#258-security-validate-uncompressedlength-before-decomp) let mut buf = vec![0u8; real_val_len]; let bytes_written = lz4_flex::decompress_into(&raw_data, &mut buf) From 4d71fb1c6ecb12423b0e8b69ed6aac9a9dbc735e Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 15 Mar 2026 22:20:27 +0200 Subject: [PATCH 29/30] fix: address review feedback on contains_prefix - Simplify iterator chain to match/guard pattern for clarity - Add empty-prefix edge case test on non-empty tree (MVCC + deletes) --- src/abstract_tree.rs | 10 ++++------ tests/tree_contains_prefix.rs | 29 +++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/src/abstract_tree.rs b/src/abstract_tree.rs index bddc8f453..691332283 100644 --- a/src/abstract_tree.rs +++ b/src/abstract_tree.rs @@ -543,12 +543,10 @@ pub trait AbstractTree { seqno: SeqNo, index: Option<(Arc, SeqNo)>, ) -> crate::Result { - Ok(self - .prefix(prefix, seqno, index) - .next() - .map(crate::Guard::key) - .transpose()? - .is_some()) + match self.prefix(prefix, seqno, index).next() { + Some(guard) => guard.key().map(|_| true), + None => Ok(false), + } } /// Inserts a key-value pair into the tree. diff --git a/tests/tree_contains_prefix.rs b/tests/tree_contains_prefix.rs index 9edc6ce02..a9c486661 100644 --- a/tests/tree_contains_prefix.rs +++ b/tests/tree_contains_prefix.rs @@ -20,6 +20,35 @@ fn tree_contains_prefix_empty_tree() -> lsm_tree::Result<()> { Ok(()) } +#[test] +fn tree_contains_prefix_empty_prefix_nonempty_tree() -> lsm_tree::Result<()> { + let folder = get_tmp_folder(); + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .open()?; + + tree.insert("abc:1", "value1", 0); + tree.insert("def:1", "value2", 1); + + // Empty prefix matches all keys + assert!(tree.contains_prefix("", 2, None)?); + + // Still respects MVCC visibility + assert!(!tree.contains_prefix("", 0, None)?); + assert!(tree.contains_prefix("", 1, None)?); + + // After deleting all keys, empty prefix should not match + tree.remove("abc:1", 2); + tree.remove("def:1", 3); + assert!(!tree.contains_prefix("", 4, None)?); + + Ok(()) +} + #[test] fn tree_contains_prefix_basic() -> lsm_tree::Result<()> { let folder = get_tmp_folder(); From 2590eb9d7b7253ee3d865826de75058a48711b43 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 15 Mar 2026 22:26:44 +0200 Subject: [PATCH 30/30] docs(data_block): document why reverse seeks accept but ignore seqno Add doc-comments to `seek_upper` and `seek_upper_exclusive` explaining that seqno is accepted for API uniformity with forward seek methods but is intentionally unused because backward binary search cannot leverage it without risking skipping the visible version. --- src/table/data_block/iter.rs | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/table/data_block/iter.rs b/src/table/data_block/iter.rs index f41ab543d..c8b4ba663 100644 --- a/src/table/data_block/iter.rs +++ b/src/table/data_block/iter.rs @@ -91,16 +91,14 @@ impl<'a> Iter<'a> { } } + /// Reverse inclusive seek: position at the last key `<= needle`. + /// + /// `seqno` is accepted for API uniformity with the forward seek methods + /// ([`seek`], [`seek_exclusive`]) but is **intentionally unused** here. + /// Backward binary search cannot leverage seqno because intervals are + /// visited from the selected one toward index 0 — a tighter predicate + /// would skip intervals that may contain the visible version. pub fn seek_upper(&mut self, needle: &[u8], _seqno: SeqNo) -> bool { - // Reverse-bound seek: position the high scanner at the last restart whose - // head key is ≤ needle, then walk backwards inside the interval until we - // find a key ≤ needle. - // - // Note: seqno cannot narrow the backward binary search. Backward - // iteration visits intervals from the selected one toward index 0, so a - // tighter predicate would cause later intervals (higher index, older - // versions of the same key) to be skipped entirely — potentially missing - // the visible version. if !self .decoder .inner_mut() @@ -162,9 +160,11 @@ impl<'a> Iter<'a> { } } + /// Reverse exclusive seek: position at the last key `< needle`. + /// + /// See [`seek_upper`] for why `seqno` is accepted but unused in reverse + /// seeks. pub fn seek_upper_exclusive(&mut self, needle: &[u8], _seqno: SeqNo) -> bool { - // Exclusive upper bound: mirror of `seek_upper`. Same backward-search - // limitation applies — seqno cannot narrow the binary search here. if !self .decoder .inner_mut()