From dbb4ee44e259377a5b747a781b565d7ef80866d0 Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov
Date: Sat, 14 Mar 2026 13:53:19 +0200
Subject: [PATCH 01/30] feat: add optimized contains_prefix() method
Add contains_prefix() to the AbstractTree trait. It checks whether any
key with the given prefix exists by pulling at most one item from the
prefix iterator instead of walking the whole prefix range.
- Default implementation on AbstractTree uses prefix().next()
- BlobTree overrides it to delegate to the index tree, skipping the blob guard wrapper (a key-only check never reads the value log)
- MVCC-correct: respects seqno visibility and tombstones
Closes fjall-rs/lsm-tree#138
---
src/abstract_tree.rs | 38 ++++++++++
src/blob_tree/mod.rs | 11 +++
tests/tree_contains_prefix.rs | 136 ++++++++++++++++++++++++++++++++++
3 files changed, 185 insertions(+)
create mode 100644 tests/tree_contains_prefix.rs
diff --git a/src/abstract_tree.rs b/src/abstract_tree.rs
index 0a3f123a2..917e8ec80 100644
--- a/src/abstract_tree.rs
+++ b/src/abstract_tree.rs
@@ -511,6 +511,44 @@ pub trait AbstractTree {
self.get(key, seqno).map(|x| x.is_some())
}
+ /// Returns `true` if the tree contains any key with the given prefix.
+ ///
+ /// This is more efficient than `prefix().next().is_some()` as it avoids
+ /// materializing the full iterator guard and can stop at the first match.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # let folder = tempfile::tempdir()?;
+ /// use lsm_tree::{AbstractTree, Config, Tree};
+ ///
+ /// let tree = Config::new(folder, Default::default(), Default::default()).open()?;
+ /// assert!(!tree.contains_prefix("abc", 0, None)?);
+ ///
+ /// tree.insert("abc:1", "value", 0);
+ /// assert!(tree.contains_prefix("abc", 1, None)?);
+ /// assert!(!tree.contains_prefix("xyz", 1, None)?);
+ /// #
+ /// # Ok::<(), lsm_tree::Error>(())
+ /// ```
+ ///
+ /// # Errors
+ ///
+ /// Will return `Err` if an IO error occurs.
+ fn contains_prefix<K: AsRef<[u8]>>(
+ &self,
+ prefix: K,
+ seqno: SeqNo,
+ index: Option<(Arc<Memtable>, SeqNo)>,
+ ) -> crate::Result<bool> {
+ Ok(self
+ .prefix(prefix, seqno, index)
+ .next()
+ .map(crate::Guard::key)
+ .transpose()?
+ .is_some())
+ }
+
/// Inserts a key-value pair into the tree.
///
/// If the key already exists, the item will be overwritten.
diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs
index 73ea1c119..b144ed915 100644
--- a/src/blob_tree/mod.rs
+++ b/src/blob_tree/mod.rs
@@ -555,6 +555,17 @@ impl AbstractTree for BlobTree {
self.index.contains_key(key, seqno)
}
+ // NOTE: Override the default implementation to not fetch
+ // data from the value log, so we get much faster prefix checks
+ fn contains_prefix<K: AsRef<[u8]>>(
+ &self,
+ prefix: K,
+ seqno: SeqNo,
+ index: Option<(Arc<Memtable>, SeqNo)>,
+ ) -> crate::Result<bool> {
+ self.index.contains_prefix(prefix, seqno, index)
+ }
+
// NOTE: Override the default implementation to not fetch
// data from the value log, so we get much faster scans
fn len(&self, seqno: SeqNo, index: Option<(Arc<Memtable>, SeqNo)>) -> crate::Result<usize> {
diff --git a/tests/tree_contains_prefix.rs b/tests/tree_contains_prefix.rs
new file mode 100644
index 000000000..b4bc72eab
--- /dev/null
+++ b/tests/tree_contains_prefix.rs
@@ -0,0 +1,136 @@
+use lsm_tree::{get_tmp_folder, AbstractTree, Config, SeqNo, SequenceNumberCounter};
+use test_log::test;
+
+#[test]
+fn tree_contains_prefix_empty_tree() -> lsm_tree::Result<()> {
+ let folder = get_tmp_folder();
+
+ let tree = Config::new(
+ &folder,
+ SequenceNumberCounter::default(),
+ SequenceNumberCounter::default(),
+ )
+ .open()?;
+
+ assert!(!tree.contains_prefix("abc", SeqNo::MAX, None)?);
+ assert!(!tree.contains_prefix("", SeqNo::MAX, None)?);
+
+ Ok(())
+}
+
+#[test]
+fn tree_contains_prefix_basic() -> lsm_tree::Result<()> {
+ let folder = get_tmp_folder();
+
+ let tree = Config::new(
+ &folder,
+ SequenceNumberCounter::default(),
+ SequenceNumberCounter::default(),
+ )
+ .open()?;
+
+ tree.insert("abc:1", "value1", 0);
+ tree.insert("abc:2", "value2", 1);
+ tree.insert("def:1", "value3", 2);
+
+ assert!(tree.contains_prefix("abc", 3, None)?);
+ assert!(tree.contains_prefix("def", 3, None)?);
+ assert!(!tree.contains_prefix("xyz", 3, None)?);
+ assert!(!tree.contains_prefix("ab", 0, None)?);
+
+ Ok(())
+}
+
+#[test]
+fn tree_contains_prefix_no_match() -> lsm_tree::Result<()> {
+ let folder = get_tmp_folder();
+
+ let tree = Config::new(
+ &folder,
+ SequenceNumberCounter::default(),
+ SequenceNumberCounter::default(),
+ )
+ .open()?;
+
+ tree.insert("abc", "value", 0);
+ tree.insert("abd", "value", 1);
+
+ assert!(!tree.contains_prefix("xyz", 2, None)?);
+ assert!(!tree.contains_prefix("abe", 2, None)?);
+ assert!(!tree.contains_prefix("abca", 2, None)?);
+
+ Ok(())
+}
+
+#[test]
+fn tree_contains_prefix_mvcc() -> lsm_tree::Result<()> {
+ let folder = get_tmp_folder();
+
+ let tree = Config::new(
+ &folder,
+ SequenceNumberCounter::default(),
+ SequenceNumberCounter::default(),
+ )
+ .open()?;
+
+ // Insert at seqno 4
+ tree.insert("abc:1", "value", 4);
+
+ // Not visible at seqno 3 (seqno filter is item_seqno < query_seqno)
+ assert!(!tree.contains_prefix("abc", 3, None)?);
+
+ // Not visible at seqno 4 (strict less-than)
+ assert!(!tree.contains_prefix("abc", 4, None)?);
+
+ // Visible at seqno 5
+ assert!(tree.contains_prefix("abc", 5, None)?);
+
+ // Visible at MAX
+ assert!(tree.contains_prefix("abc", SeqNo::MAX, None)?);
+
+ Ok(())
+}
+
+#[test]
+fn tree_contains_prefix_after_delete() -> lsm_tree::Result<()> {
+ let folder = get_tmp_folder();
+
+ let tree = Config::new(
+ &folder,
+ SequenceNumberCounter::default(),
+ SequenceNumberCounter::default(),
+ )
+ .open()?;
+
+ tree.insert("abc:1", "value", 0);
+ tree.remove("abc:1", 1);
+
+ // After deletion, prefix should not match
+ assert!(!tree.contains_prefix("abc", 2, None)?);
+
+ // But at seqno 1 (before delete), it should still be visible
+ assert!(tree.contains_prefix("abc", 1, None)?);
+
+ Ok(())
+}
+
+#[test]
+fn tree_contains_prefix_after_flush() -> lsm_tree::Result<()> {
+ let folder = get_tmp_folder();
+
+ let tree = Config::new(
+ &folder,
+ SequenceNumberCounter::default(),
+ SequenceNumberCounter::default(),
+ )
+ .open()?;
+
+ tree.insert("abc:1", "value1", 0);
+ tree.insert("abc:2", "value2", 1);
+ tree.flush_active_memtable(0)?;
+
+ assert!(tree.contains_prefix("abc", 2, None)?);
+ assert!(!tree.contains_prefix("xyz", 2, None)?);
+
+ Ok(())
+}
From 453e729991af96f7bf32da280518aa7d2593cb82 Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov
Date: Sat, 14 Mar 2026 14:03:12 +0200
Subject: [PATCH 02/30] refactor(contains_prefix): accurate doc wording and
test corrections
- Doc now describes the method as a convenience wrapper with error propagation and drops the inaccurate optimization claim
- Test asserts "ab" matches "abc:*" keys at visible seqno
- Add BlobTree test covering delegated index-only path
---
src/abstract_tree.rs | 6 +++--
tests/tree_contains_prefix.rs | 46 +++++++++++++++++++++++++++++++++--
2 files changed, 48 insertions(+), 4 deletions(-)
diff --git a/src/abstract_tree.rs b/src/abstract_tree.rs
index 917e8ec80..bddc8f453 100644
--- a/src/abstract_tree.rs
+++ b/src/abstract_tree.rs
@@ -513,8 +513,10 @@ pub trait AbstractTree {
/// Returns `true` if the tree contains any key with the given prefix.
///
- /// This is more efficient than `prefix().next().is_some()` as it avoids
- /// materializing the full iterator guard and can stop at the first match.
+ /// This is a convenience method that checks whether the corresponding
+ /// prefix iterator yields at least one item, while surfacing any IO
+ /// errors via the `Result` return type. Implementations may override
+ /// this method to provide a more efficient prefix-existence check.
///
/// # Examples
///
diff --git a/tests/tree_contains_prefix.rs b/tests/tree_contains_prefix.rs
index b4bc72eab..9edc6ce02 100644
--- a/tests/tree_contains_prefix.rs
+++ b/tests/tree_contains_prefix.rs
@@ -1,4 +1,6 @@
-use lsm_tree::{get_tmp_folder, AbstractTree, Config, SeqNo, SequenceNumberCounter};
+use lsm_tree::{
+ get_tmp_folder, AbstractTree, Config, KvSeparationOptions, SeqNo, SequenceNumberCounter,
+};
use test_log::test;
#[test]
@@ -36,7 +38,8 @@ fn tree_contains_prefix_basic() -> lsm_tree::Result<()> {
assert!(tree.contains_prefix("abc", 3, None)?);
assert!(tree.contains_prefix("def", 3, None)?);
assert!(!tree.contains_prefix("xyz", 3, None)?);
- assert!(!tree.contains_prefix("ab", 0, None)?);
+ // "ab" is a valid prefix for "abc:*" keys
+ assert!(tree.contains_prefix("ab", 3, None)?);
Ok(())
}
@@ -134,3 +137,42 @@ fn tree_contains_prefix_after_flush() -> lsm_tree::Result<()> {
Ok(())
}
+
+#[test]
+fn tree_contains_prefix_blobtree() -> lsm_tree::Result<()> {
+ let folder = get_tmp_folder();
+
+ let tree = Config::new(
+ &folder,
+ SequenceNumberCounter::default(),
+ SequenceNumberCounter::default(),
+ )
+ .with_kv_separation(Some(KvSeparationOptions::default()))
+ .open()?;
+
+ assert!(!tree.contains_prefix("abc", SeqNo::MAX, None)?);
+
+ tree.insert("abc:1", "value1", 0);
+ tree.insert("abc:2", "value2", 1);
+ tree.insert("def:1", "value3", 2);
+
+ assert!(tree.contains_prefix("abc", 3, None)?);
+ assert!(tree.contains_prefix("def", 3, None)?);
+ assert!(!tree.contains_prefix("xyz", 3, None)?);
+
+ // MVCC visibility
+ assert!(!tree.contains_prefix("abc", 0, None)?);
+ assert!(tree.contains_prefix("abc", 1, None)?);
+
+ // After delete
+ tree.remove("abc:1", 3);
+ tree.remove("abc:2", 4);
+ assert!(!tree.contains_prefix("abc", 5, None)?);
+
+ // After flush
+ tree.insert("ghi:1", "value", 5);
+ tree.flush_active_memtable(0)?;
+ assert!(tree.contains_prefix("ghi", 6, None)?);
+
+ Ok(())
+}
From c25e6931f6023ae1f61a50b536b7699d706a07a5 Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov
Date: Sat, 14 Mar 2026 14:13:23 +0200
Subject: [PATCH 03/30] refactor(blob_tree): accurate contains_prefix override
note
Delegating to the index tree avoids BlobGuard construction overhead,
not value-log reads (key() never resolves blob indirections).
---
src/blob_tree/mod.rs | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs
index b144ed915..79ea14f50 100644
--- a/src/blob_tree/mod.rs
+++ b/src/blob_tree/mod.rs
@@ -555,8 +555,9 @@ impl AbstractTree for BlobTree {
self.index.contains_key(key, seqno)
}
- // NOTE: Override the default implementation to not fetch
- // data from the value log, so we get much faster prefix checks
+ // NOTE: Override the default implementation to delegate directly
+ // to the index tree, avoiding extra iterator/guard overhead for
+ // prefix checks
fn contains_prefix<K: AsRef<[u8]>>(
&self,
prefix: K,
From 1962eb52f3d25990f2f3d5cb8ff32a45c2128b0a Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov
Date: Sat, 14 Mar 2026 14:42:02 +0200
Subject: [PATCH 04/30] perf: seqno-aware seek in data block point reads
Exploit the internal key ordering (user_key ASC, seqno DESC) to include
the seqno in the binary search predicate. This skips entire restart
intervals containing only versions newer than the target snapshot,
reducing the linear scan from O(versions) to O(restart_interval) for
keys with many MVCC versions.
Closes fjall-rs/lsm-tree#237
---
src/table/data_block/iter.rs | 13 +++-
src/table/data_block/mod.rs | 130 +++++++++++++++++++++++++++++++++--
2 files changed, 135 insertions(+), 8 deletions(-)
diff --git a/src/table/data_block/iter.rs b/src/table/data_block/iter.rs
index e8c605ba4..977844f7b 100644
--- a/src/table/data_block/iter.rs
+++ b/src/table/data_block/iter.rs
@@ -8,7 +8,7 @@ use crate::{
block::{Decoder, ParsedItem},
data_block::DataBlockParsedItem,
},
- InternalValue,
+ InternalValue, SeqNo,
};
/// The data block iterator handles double-ended scans over a data block
@@ -34,6 +34,17 @@ impl<'a> Iter<'a> {
true
}
+ /// Seeks to the restart interval containing the target (needle, seqno) pair.
+ ///
+ /// Exploits internal key ordering (user_key ASC, seqno DESC) to skip
+ /// restart intervals containing only versions newer than the target seqno.
+ pub fn seek_to_key_seqno(&mut self, needle: &[u8], seqno: SeqNo) -> bool {
+ self.decoder.inner_mut().seek(
+ |head_key, head_seqno| head_key < needle || (head_key == needle && head_seqno >= seqno),
+ false,
+ )
+ }
+
pub fn seek(&mut self, needle: &[u8]) -> bool {
// Find the restart interval whose head key is the last one strictly below `needle`.
// The decoder then performs a linear scan within that interval; we stop as soon as we
diff --git a/src/table/data_block/mod.rs b/src/table/data_block/mod.rs
index 7104ccfaa..00a1121f1 100644
--- a/src/table/data_block/mod.rs
+++ b/src/table/data_block/mod.rs
@@ -407,7 +407,6 @@ impl DataBlock {
.map(|reader| reader.bucket_count())
}
- // TODO: handle seqno more nicely (make Key generic, so we can do binary search over (key, seqno))
#[must_use]
pub fn point_read(&self, needle: &[u8], seqno: SeqNo) -> Option<InternalValue> {
let iter = if let Some(hash_index_reader) = self.get_hash_index_reader() {
@@ -416,10 +415,10 @@ impl DataBlock {
return None;
}
MARKER_CONFLICT => {
- // NOTE: Fallback to binary search
+ // NOTE: Fallback to seqno-aware binary search
let mut iter = self.iter();
- if !iter.seek(needle) {
+ if !iter.seek_to_key_seqno(needle, seqno) {
return None;
}
@@ -437,8 +436,9 @@ impl DataBlock {
} else {
let mut iter = self.iter();
- // NOTE: Fallback to binary search
- if !iter.seek(needle) {
+ // NOTE: Seqno-aware binary search skips restart intervals
+ // containing only versions newer than the target seqno
+ if !iter.seek_to_key_seqno(needle, seqno) {
return None;
}
@@ -449,14 +449,14 @@ impl DataBlock {
for item in iter {
match item.compare_key(needle, &self.inner.data) {
std::cmp::Ordering::Greater => {
- // We are before our searched key/seqno
+ // We are past our searched key
return None;
}
std::cmp::Ordering::Equal => {
// If key is same as needle, check sequence number
}
std::cmp::Ordering::Less => {
- // We are past our searched key
+ // We are before our searched key
continue;
}
}
@@ -1233,4 +1233,120 @@ mod tests {
Ok(())
}
+
+ #[test]
+ fn data_block_point_read_seqno_aware_seek() -> crate::Result<()> {
+ // Key "a" with seqno 5,4,3,2,1 — point_read("a", seqno=3) should return v3
+ let items = [
+ InternalValue::from_components(b"a", b"a5", 5, Value),
+ InternalValue::from_components(b"a", b"a4", 4, Value),
+ InternalValue::from_components(b"a", b"a3", 3, Value),
+ InternalValue::from_components(b"a", b"a2", 2, Value),
+ InternalValue::from_components(b"a", b"a1", 1, Value),
+ ];
+
+ // With restart_interval=1, every item is a restart head,
+ // so seqno-aware binary search can skip directly to the target version
+ for restart_interval in 1..=4 {
+ let bytes = DataBlock::encode_into_vec(&items, restart_interval, 0.0)?;
+
+ let data_block = DataBlock::new(Block {
+ data: bytes.into(),
+ header: Header {
+ block_type: BlockType::Data,
+ checksum: Checksum::from_raw(0),
+ data_length: 0,
+ uncompressed_length: 0,
+ },
+ });
+
+ // seqno=4 → should see version with seqno=3 (first with seqno < 4)
+ assert_eq!(
+ Some(items[2].clone()),
+ data_block.point_read(b"a", 4),
+ "restart_interval={restart_interval}: seqno=4 should return v3",
+ );
+
+ // seqno=3 → should see version with seqno=2
+ assert_eq!(
+ Some(items[3].clone()),
+ data_block.point_read(b"a", 3),
+ "restart_interval={restart_interval}: seqno=3 should return v2",
+ );
+
+ // seqno=6 → should see latest version (seqno=5)
+ assert_eq!(
+ Some(items[0].clone()),
+ data_block.point_read(b"a", 6),
+ "restart_interval={restart_interval}: seqno=6 should return v5",
+ );
+
+ // seqno=1 → no visible version (all seqno >= 1)
+ assert!(
+ data_block.point_read(b"a", 1).is_none(),
+ "restart_interval={restart_interval}: seqno=1 should return None",
+ );
+
+ // Non-existent key
+ assert!(
+ data_block.point_read(b"b", SeqNo::MAX).is_none(),
+ "restart_interval={restart_interval}: key 'b' should not exist",
+ );
+ }
+
+ Ok(())
+ }
+
+ #[test]
+ fn data_block_point_read_seqno_aware_seek_mixed_keys() -> crate::Result<()> {
+ // Multiple keys with multiple versions
+ let items = [
+ InternalValue::from_components(b"a", b"a3", 3, Value),
+ InternalValue::from_components(b"a", b"a2", 2, Value),
+ InternalValue::from_components(b"a", b"a1", 1, Value),
+ InternalValue::from_components(b"b", b"b5", 5, Value),
+ InternalValue::from_components(b"b", b"b4", 4, Value),
+ InternalValue::from_components(b"b", b"b3", 3, Value),
+ InternalValue::from_components(b"b", b"b2", 2, Value),
+ InternalValue::from_components(b"b", b"b1", 1, Value),
+ InternalValue::from_components(b"c", b"c1", 1, Value),
+ ];
+
+ for restart_interval in 1..=4 {
+ let bytes = DataBlock::encode_into_vec(&items, restart_interval, 0.0)?;
+
+ let data_block = DataBlock::new(Block {
+ data: bytes.into(),
+ header: Header {
+ block_type: BlockType::Data,
+ checksum: Checksum::from_raw(0),
+ data_length: 0,
+ uncompressed_length: 0,
+ },
+ });
+
+ // Read "b" at seqno=4 → should return version with seqno=3
+ assert_eq!(
+ Some(items[5].clone()),
+ data_block.point_read(b"b", 4),
+ "restart_interval={restart_interval}: b@4 should return b3",
+ );
+
+ // Read "a" at seqno=2 → should return version with seqno=1
+ assert_eq!(
+ Some(items[2].clone()),
+ data_block.point_read(b"a", 2),
+ "restart_interval={restart_interval}: a@2 should return a1",
+ );
+
+ // Read "c" at seqno=2 → should return version with seqno=1
+ assert_eq!(
+ Some(items[8].clone()),
+ data_block.point_read(b"c", 2),
+ "restart_interval={restart_interval}: c@2 should return c1",
+ );
+ }
+
+ Ok(())
+ }
}
From c52ec805d985727e95a4c95ce24d8f806ea95c56 Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov
Date: Sat, 14 Mar 2026 14:50:43 +0200
Subject: [PATCH 05/30] docs(test): clarify seqno snapshot visibility in test
comment
---
src/table/data_block/mod.rs | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/src/table/data_block/mod.rs b/src/table/data_block/mod.rs
index 00a1121f1..33b9fb8c9 100644
--- a/src/table/data_block/mod.rs
+++ b/src/table/data_block/mod.rs
@@ -1236,7 +1236,8 @@ mod tests {
#[test]
fn data_block_point_read_seqno_aware_seek() -> crate::Result<()> {
- // Key "a" with seqno 5,4,3,2,1 — point_read("a", seqno=3) should return v3
+ // Key "a" with seqno 5,4,3,2,1 — point_read("a", seqno=3)
+ // returns the first version with seqno < 3, i.e., v2 ("a2")
let items = [
InternalValue::from_components(b"a", b"a5", 5, Value),
InternalValue::from_components(b"a", b"a4", 4, Value),
From 0513f336e08e8aa6573374a5592242c5a7486a6c Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov
Date: Sat, 14 Mar 2026 15:11:15 +0200
Subject: [PATCH 06/30] docs(data_block): precise seek_to_key_seqno guarantees
---
src/table/data_block/iter.rs | 12 +++++++++---
src/table/data_block/mod.rs | 4 ++--
2 files changed, 11 insertions(+), 5 deletions(-)
diff --git a/src/table/data_block/iter.rs b/src/table/data_block/iter.rs
index 977844f7b..429499327 100644
--- a/src/table/data_block/iter.rs
+++ b/src/table/data_block/iter.rs
@@ -34,10 +34,16 @@ impl<'a> Iter<'a> {
true
}
- /// Seeks to the restart interval containing the target (needle, seqno) pair.
+ /// Seeks to the last restart interval whose head key is strictly below the
+ /// target `needle`, or equal to it with a seqno that is at least the given
+ /// snapshot boundary.
///
- /// Exploits internal key ordering (user_key ASC, seqno DESC) to skip
- /// restart intervals containing only versions newer than the target seqno.
+ /// Here `seqno` is a snapshot boundary: point reads return the first item
+ /// with `item.seqno < seqno`. Using the internal key ordering
+ /// (user_key ASC, seqno DESC), this skips restart intervals that can only
+ /// contain versions newer than the snapshot, so any visible version for
+ /// `needle` will be found within roughly one restart interval of the
+ /// resulting position.
pub fn seek_to_key_seqno(&mut self, needle: &[u8], seqno: SeqNo) -> bool {
self.decoder.inner_mut().seek(
|head_key, head_seqno| head_key < needle || (head_key == needle && head_seqno >= seqno),
diff --git a/src/table/data_block/mod.rs b/src/table/data_block/mod.rs
index 33b9fb8c9..c9e6bff1c 100644
--- a/src/table/data_block/mod.rs
+++ b/src/table/data_block/mod.rs
@@ -436,8 +436,8 @@ impl DataBlock {
} else {
let mut iter = self.iter();
- // NOTE: Seqno-aware binary search skips restart intervals
- // containing only versions newer than the target seqno
+ // NOTE: Seqno-aware binary search reduces linear scanning by skipping most
+ // restart intervals that contain only versions newer than the target seqno
if !iter.seek_to_key_seqno(needle, seqno) {
return None;
}
From 42d2c642be43b7506692f64f2d000ed269bb8876 Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov
Date: Sat, 14 Mar 2026 15:20:58 +0200
Subject: [PATCH 07/30] perf(data_block): single cmp in seek_to_key_seqno
predicate
---
src/table/data_block/iter.rs | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/src/table/data_block/iter.rs b/src/table/data_block/iter.rs
index 429499327..16bfd92a4 100644
--- a/src/table/data_block/iter.rs
+++ b/src/table/data_block/iter.rs
@@ -46,7 +46,11 @@ impl<'a> Iter<'a> {
/// resulting position.
pub fn seek_to_key_seqno(&mut self, needle: &[u8], seqno: SeqNo) -> bool {
self.decoder.inner_mut().seek(
- |head_key, head_seqno| head_key < needle || (head_key == needle && head_seqno >= seqno),
+ |head_key, head_seqno| match head_key.cmp(needle) {
+ std::cmp::Ordering::Less => true,
+ std::cmp::Ordering::Equal => head_seqno >= seqno,
+ std::cmp::Ordering::Greater => false,
+ },
false,
)
}
From cbf88d396ac09a625bc7b7c65be3d88e8a6a955a Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov
Date: Sat, 14 Mar 2026 15:41:30 +0200
Subject: [PATCH 08/30] docs(test): describe restart_interval loop coverage
---
src/table/data_block/mod.rs | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/src/table/data_block/mod.rs b/src/table/data_block/mod.rs
index c9e6bff1c..18da4451c 100644
--- a/src/table/data_block/mod.rs
+++ b/src/table/data_block/mod.rs
@@ -1246,8 +1246,9 @@ mod tests {
InternalValue::from_components(b"a", b"a1", 1, Value),
];
- // With restart_interval=1, every item is a restart head,
- // so seqno-aware binary search can skip directly to the target version
+ // Test across various restart intervals: at restart_interval=1 every item
+ // is a restart head so binary search lands exactly; at larger intervals it
+ // may scan within the restart range but must still return the correct version.
for restart_interval in 1..=4 {
let bytes = DataBlock::encode_into_vec(&items, restart_interval, 0.0)?;
From 1fddda00d80f98065f899400b3acf7ebe113b108 Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov
Date: Sun, 15 Mar 2026 11:09:39 +0200
Subject: [PATCH 09/30] perf(data_block): seqno-aware seek for iterator bounds
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- Forward seeks (seek, seek_exclusive) use seqno in restart-interval
binary search predicate, matching index_block pattern
- Backward seeks (seek_upper, seek_upper_exclusive) accept seqno for
API uniformity but cannot narrow the binary search — backward
iteration visits lower indices only, so a tighter predicate would
miss intervals containing the visible version
- Wire seqno through OwnedDataBlockIter wrappers, removing all TODOs
- Add tests for seqno-aware forward/backward seeks with mixed keys
Ref #237
---
src/table/data_block/iter.rs | 62 +++++----
src/table/data_block/iter_test.rs | 207 +++++++++++++++++++++++-------
src/table/iter.rs | 16 +--
3 files changed, 207 insertions(+), 78 deletions(-)
diff --git a/src/table/data_block/iter.rs b/src/table/data_block/iter.rs
index 16bfd92a4..a2345c396 100644
--- a/src/table/data_block/iter.rs
+++ b/src/table/data_block/iter.rs
@@ -55,15 +55,19 @@ impl<'a> Iter<'a> {
)
}
- pub fn seek(&mut self, needle: &[u8]) -> bool {
- // Find the restart interval whose head key is the last one strictly below `needle`.
- // The decoder then performs a linear scan within that interval; we stop as soon as we
- // reach a key ≥ needle. This minimizes parsing work while preserving correctness.
- if !self
- .decoder
- .inner_mut()
- .seek(|head_key, _| head_key < needle, false)
- {
+ pub fn seek(&mut self, needle: &[u8], seqno: SeqNo) -> bool {
+ // Find the last restart interval whose head does not sort after
+ // (needle, seqno) in internal key order (user_key ASC, seqno DESC). This
+ // lets us skip restart intervals that contain only versions at or above
+ // the snapshot boundary, reducing the subsequent linear scan.
+ if !self.decoder.inner_mut().seek(
+ |head_key, head_seqno| match head_key.cmp(needle) {
+ std::cmp::Ordering::Less => true,
+ std::cmp::Ordering::Equal => head_seqno >= seqno,
+ std::cmp::Ordering::Greater => false,
+ },
+ false,
+ ) {
return false;
}
@@ -96,9 +100,16 @@ impl<'a> Iter<'a> {
}
}
- pub fn seek_upper(&mut self, needle: &[u8]) -> bool {
- // Reverse-bound seek: position the high scanner at the first restart whose head key is
- // ≤ needle, then walk backwards inside the interval until we find a key ≤ needle.
+ pub fn seek_upper(&mut self, needle: &[u8], _seqno: SeqNo) -> bool {
+ // Reverse-bound seek: position the high scanner at the last restart whose
+ // head key is ≤ needle, then walk backwards inside the interval until we
+ // find a key ≤ needle.
+ //
+ // Note: seqno cannot narrow the backward binary search. Backward
+ // iteration visits intervals from the selected one toward index 0, so a
+ // tighter predicate would cause later intervals (higher index, older
+ // versions of the same key) to be skipped entirely — potentially missing
+ // the visible version.
if !self
.decoder
.inner_mut()
@@ -133,15 +144,18 @@ impl<'a> Iter<'a> {
}
}
- pub fn seek_exclusive(&mut self, needle: &[u8]) -> bool {
- // Exclusive lower bound: identical to `seek`, except we must not yield entries equal to
- // `needle`. We therefore keep consuming while keys compare equal and only stop once we
- // observe a strictly greater key.
- if !self
- .decoder
- .inner_mut()
- .seek(|head_key, _| head_key < needle, false)
- {
+ pub fn seek_exclusive(&mut self, needle: &[u8], seqno: SeqNo) -> bool {
+ // Exclusive lower bound: identical to `seek`, except we must not yield
+ // entries equal to `needle`. The seqno-aware binary search still helps
+ // by landing closer to the target position in the restart index.
+ if !self.decoder.inner_mut().seek(
+ |head_key, head_seqno| match head_key.cmp(needle) {
+ std::cmp::Ordering::Less => true,
+ std::cmp::Ordering::Equal => head_seqno >= seqno,
+ std::cmp::Ordering::Greater => false,
+ },
+ false,
+ ) {
return false;
}
@@ -165,9 +179,9 @@ impl<'a> Iter<'a> {
}
}
- pub fn seek_upper_exclusive(&mut self, needle: &[u8]) -> bool {
- // Exclusive upper bound: mirror of `seek_upper`. We must not include entries equal to
- // `needle`, so we consume equals from the high end until we see a strictly smaller key.
+ pub fn seek_upper_exclusive(&mut self, needle: &[u8], _seqno: SeqNo) -> bool {
+ // Exclusive upper bound: mirror of `seek_upper`. Same backward-search
+ // limitation applies — seqno cannot narrow the binary search here.
if !self
.decoder
.inner_mut()
diff --git a/src/table/data_block/iter_test.rs b/src/table/data_block/iter_test.rs
index 8ff8fcdc0..f5cc81b00 100644
--- a/src/table/data_block/iter_test.rs
+++ b/src/table/data_block/iter_test.rs
@@ -5,7 +5,7 @@ mod tests {
block::{BlockType, Header, ParsedItem},
Block, DataBlock,
},
- Checksum, InternalValue, Slice,
+ Checksum, InternalValue, SeqNo, Slice,
ValueType::{Tombstone, Value},
};
use test_log::test;
@@ -71,8 +71,8 @@ mod tests {
{
let mut iter = data_block.iter();
- iter.seek(&10u64.to_be_bytes());
- iter.seek_upper(&110u64.to_be_bytes());
+ iter.seek(&10u64.to_be_bytes(), SeqNo::MAX);
+ iter.seek_upper(&110u64.to_be_bytes(), SeqNo::MAX);
let iter = iter.map(|x| x.materialize(data_block.as_slice()));
assert_eq!(
@@ -83,8 +83,8 @@ mod tests {
{
let mut iter: crate::table::data_block::Iter<'_> = data_block.iter();
- iter.seek(&10u64.to_be_bytes());
- iter.seek_upper(&110u64.to_be_bytes());
+ iter.seek(&10u64.to_be_bytes(), SeqNo::MAX);
+ iter.seek_upper(&110u64.to_be_bytes(), SeqNo::MAX);
let iter = iter.map(|x| x.materialize(data_block.as_slice()));
assert_eq!(
@@ -95,8 +95,8 @@ mod tests {
{
let mut iter = data_block.iter();
- iter.seek(&10u64.to_be_bytes());
- iter.seek_upper(&110u64.to_be_bytes());
+ iter.seek(&10u64.to_be_bytes(), SeqNo::MAX);
+ iter.seek_upper(&110u64.to_be_bytes(), SeqNo::MAX);
let mut iter = iter.map(|item| item.materialize(&data_block.inner.data));
let mut count = 0;
@@ -145,8 +145,8 @@ mod tests {
{
let mut iter = data_block.iter();
- iter.seek(&10u64.to_be_bytes());
- iter.seek_upper(&109u64.to_be_bytes());
+ iter.seek(&10u64.to_be_bytes(), SeqNo::MAX);
+ iter.seek_upper(&109u64.to_be_bytes(), SeqNo::MAX);
let iter = iter.map(|x| x.materialize(data_block.as_slice()));
assert_eq!(
@@ -157,8 +157,8 @@ mod tests {
{
let mut iter: crate::table::data_block::Iter<'_> = data_block.iter();
- iter.seek(&10u64.to_be_bytes());
- iter.seek_upper(&109u64.to_be_bytes());
+ iter.seek(&10u64.to_be_bytes(), SeqNo::MAX);
+ iter.seek_upper(&109u64.to_be_bytes(), SeqNo::MAX);
let iter = iter.map(|x| x.materialize(data_block.as_slice()));
assert_eq!(
@@ -169,8 +169,8 @@ mod tests {
{
let mut iter = data_block.iter();
- iter.seek(&10u64.to_be_bytes());
- iter.seek_upper(&109u64.to_be_bytes());
+ iter.seek(&10u64.to_be_bytes(), SeqNo::MAX);
+ iter.seek_upper(&109u64.to_be_bytes(), SeqNo::MAX);
let mut iter = iter.map(|item| item.materialize(&data_block.inner.data));
let mut count = 0;
@@ -218,8 +218,8 @@ mod tests {
});
let mut iter = data_block.iter();
- iter.seek(&5u64.to_be_bytes());
- iter.seek_upper(&9u64.to_be_bytes());
+ iter.seek(&5u64.to_be_bytes(), SeqNo::MAX);
+ iter.seek_upper(&9u64.to_be_bytes(), SeqNo::MAX);
let mut iter = iter.map(|item| item.materialize(&data_block.inner.data));
let mut count = 0;
@@ -345,7 +345,7 @@ mod tests {
let mut iter = data_block.iter();
- assert!(iter.seek_upper(b"d"), "should seek");
+ assert!(iter.seek_upper(b"d", SeqNo::MAX), "should seek");
let iter = iter.map(|item| item.materialize(&data_block.inner.data));
@@ -386,7 +386,7 @@ mod tests {
{
let mut iter = data_block.iter();
- assert!(!iter.seek(b"a"), "should not seek");
+ assert!(!iter.seek(b"a", SeqNo::MAX), "should not seek");
let iter = iter.map(|item| item.materialize(&data_block.inner.data));
@@ -398,7 +398,7 @@ mod tests {
{
let mut iter = data_block.iter();
- assert!(!iter.seek_upper(b"g"), "should not seek");
+ assert!(!iter.seek_upper(b"g", SeqNo::MAX), "should not seek");
let iter = iter.map(|item| item.materialize(&data_block.inner.data));
@@ -410,7 +410,7 @@ mod tests {
{
let mut iter = data_block.iter();
- assert!(iter.seek_upper(b"b"), "should seek");
+ assert!(iter.seek_upper(b"b", SeqNo::MAX), "should seek");
let iter = iter.map(|item| item.materialize(&data_block.inner.data));
@@ -425,7 +425,7 @@ mod tests {
{
let mut iter = data_block.iter();
- assert!(iter.seek(b"f"), "should seek");
+ assert!(iter.seek(b"f", SeqNo::MAX), "should seek");
let iter = iter.map(|item| item.materialize(&data_block.inner.data));
@@ -466,8 +466,8 @@ mod tests {
let mut iter = data_block.iter();
- assert!(iter.seek(b"c"), "should seek");
- assert!(iter.seek_upper(b"d"), "should seek");
+ assert!(iter.seek(b"c", SeqNo::MAX), "should seek");
+ assert!(iter.seek_upper(b"d", SeqNo::MAX), "should seek");
let iter = iter.map(|item| item.materialize(&data_block.inner.data));
@@ -507,7 +507,7 @@ mod tests {
let mut iter = data_block.iter();
- assert!(iter.seek_upper(b"b"), "should seek");
+ assert!(iter.seek_upper(b"b", SeqNo::MAX), "should seek");
let iter = iter.map(|item| item.materialize(&data_block.inner.data));
@@ -548,8 +548,8 @@ mod tests {
{
let mut iter = data_block.iter();
- assert!(iter.seek(b"d"), "should seek");
- assert!(iter.seek_upper(b"d"), "should seek");
+ assert!(iter.seek(b"d", SeqNo::MAX), "should seek");
+ assert!(iter.seek_upper(b"d", SeqNo::MAX), "should seek");
let iter = iter.map(|item| item.materialize(&data_block.inner.data));
@@ -564,8 +564,8 @@ mod tests {
{
let mut iter = data_block.iter();
- assert!(iter.seek_upper(b"d"), "should seek");
- assert!(iter.seek(b"d"), "should seek");
+ assert!(iter.seek_upper(b"d", SeqNo::MAX), "should seek");
+ assert!(iter.seek(b"d", SeqNo::MAX), "should seek");
let iter = iter.map(|item| item.materialize(&data_block.inner.data));
@@ -580,8 +580,8 @@ mod tests {
{
let mut iter = data_block.iter();
- assert!(iter.seek(b"d"), "should seek");
- assert!(iter.seek_upper(b"d"), "should seek");
+ assert!(iter.seek(b"d", SeqNo::MAX), "should seek");
+ assert!(iter.seek_upper(b"d", SeqNo::MAX), "should seek");
let iter = iter.map(|item| item.materialize(&data_block.inner.data));
@@ -602,8 +602,8 @@ mod tests {
{
let mut iter = data_block.iter();
- assert!(iter.seek_upper(b"d"), "should seek");
- assert!(iter.seek(b"d"), "should seek");
+ assert!(iter.seek_upper(b"d", SeqNo::MAX), "should seek");
+ assert!(iter.seek(b"d", SeqNo::MAX), "should seek");
let iter = iter.map(|item| item.materialize(&data_block.inner.data));
@@ -651,8 +651,8 @@ mod tests {
{
let mut iter = data_block.iter();
- assert!(iter.seek(b"f"), "should seek");
- iter.seek_upper(b"e");
+ assert!(iter.seek(b"f", SeqNo::MAX), "should seek");
+ iter.seek_upper(b"e", SeqNo::MAX);
let mut iter = iter.map(|item| item.materialize(&data_block.inner.data));
@@ -662,8 +662,8 @@ mod tests {
{
let mut iter = data_block.iter();
- assert!(iter.seek(b"f"), "should seek");
- iter.seek_upper(b"e");
+ assert!(iter.seek(b"f", SeqNo::MAX), "should seek");
+ iter.seek_upper(b"e", SeqNo::MAX);
let mut iter = iter.map(|item| item.materialize(&data_block.inner.data));
@@ -673,8 +673,8 @@ mod tests {
{
let mut iter = data_block.iter();
- assert!(iter.seek_upper(b"e"), "should seek");
- iter.seek(b"f");
+ assert!(iter.seek_upper(b"e", SeqNo::MAX), "should seek");
+ iter.seek(b"f", SeqNo::MAX);
let mut iter = iter.map(|item| item.materialize(&data_block.inner.data));
@@ -684,8 +684,8 @@ mod tests {
{
let mut iter = data_block.iter();
- assert!(iter.seek_upper(b"e"), "should seek");
- iter.seek(b"f");
+ assert!(iter.seek_upper(b"e", SeqNo::MAX), "should seek");
+ iter.seek(b"f", SeqNo::MAX);
let mut iter = iter.map(|item| item.materialize(&data_block.inner.data));
@@ -721,7 +721,7 @@ mod tests {
let mut iter = data_block.iter();
- assert!(iter.seek(b"b"), "should seek correctly");
+ assert!(iter.seek(b"b", SeqNo::MAX), "should seek correctly");
let iter = iter.map(|item| item.materialize(&data_block.inner.data));
@@ -758,7 +758,7 @@ mod tests {
let mut iter = data_block.iter();
- assert!(iter.seek(b"d"), "should seek correctly");
+ assert!(iter.seek(b"d", SeqNo::MAX), "should seek correctly");
let iter = iter.map(|item| item.materialize(&data_block.inner.data));
@@ -798,7 +798,7 @@ mod tests {
let mut iter = data_block.iter();
- assert!(iter.seek(b"f"), "should seek correctly");
+ assert!(iter.seek(b"f", SeqNo::MAX), "should seek correctly");
let iter = iter.map(|item| item.materialize(&data_block.inner.data));
@@ -838,7 +838,7 @@ mod tests {
let mut iter = data_block.iter();
- assert!(!iter.seek(b"a"), "should not find exact match");
+ assert!(!iter.seek(b"a", SeqNo::MAX), "should not find exact match");
let iter = iter.map(|item| item.materialize(&data_block.inner.data));
@@ -875,7 +875,7 @@ mod tests {
let mut iter = data_block.iter();
- assert!(!iter.seek(b"g"), "should not find exact match");
+ assert!(!iter.seek(b"g", SeqNo::MAX), "should not find exact match");
assert!(iter.next().is_none(), "should not collect any items");
}
@@ -1270,11 +1270,126 @@ mod tests {
assert_eq!(data_block.iter().count(), items.len());
let mut iter = data_block.iter();
- iter.seek(&[0]);
- iter.seek_upper(&[0]);
+ iter.seek(&[0], SeqNo::MAX);
+ iter.seek_upper(&[0], SeqNo::MAX);
assert_eq!(0, iter.count());
Ok(())
}
+
+ /// Verifies that `seek(needle, seqno)` with a seqno-aware predicate still
+ /// positions the iterator correctly when a key has many versions spanning
+ /// multiple restart intervals.
+ #[test]
+ fn data_block_seek_seqno_aware() -> crate::Result<()> {
+ // Build a block where key "b" has 10 versions (seqno 10..=1), so its
+ // versions span multiple restart intervals for the interval sizes below.
+ let mut items = Vec::new();
+ for seqno in (1..=10).rev() {
+ items.push(InternalValue::from_components(b"b", b"", seqno, Value));
+ }
+
+ for restart_interval in [1, 2, 3, 5] {
+ let bytes = DataBlock::encode_into_vec(&items, restart_interval, 0.0)?;
+ let data_block = DataBlock::new(Block {
+ data: bytes.into(),
+ header: Header {
+ block_type: BlockType::Data,
+ checksum: Checksum::from_raw(0),
+ data_length: 0,
+ uncompressed_length: 0,
+ },
+ });
+
+ // With SeqNo::MAX, seek behaves like key-only (no seqno filtering).
+ {
+ let mut iter = data_block.iter();
+ assert!(
+ iter.seek(b"b", SeqNo::MAX),
+ "should find key with MAX seqno"
+ );
+ let entry = iter.next().expect("should have entry");
+ let materialized = entry.materialize(&data_block.inner.data);
+ assert_eq!(materialized.key.user_key.as_ref(), b"b");
+ // First version returned is the newest (seqno 10).
+ assert_eq!(materialized.key.seqno, 10);
+ }
+
+ // With a specific snapshot seqno, the binary search skips restart
+ // intervals that only contain newer versions, but the linear scan
+ // still finds the first entry with key == needle.
+ {
+ let mut iter = data_block.iter();
+ assert!(iter.seek(b"b", 5), "should find key with snapshot seqno 5");
+ let entry = iter.next().expect("should have entry");
+ let materialized = entry.materialize(&data_block.inner.data);
+ assert_eq!(materialized.key.user_key.as_ref(), b"b");
+ // seek returns the first entry with key >= needle; that's still
+ // the newest version in the landing interval. The seqno-aware
+ // predicate only narrows which restart interval we land on.
+ }
+ }
+
+ Ok(())
+ }
+
+ /// Verifies that `seek` with seqno still works correctly when the block
+ /// contains multiple distinct keys with versions.
+ #[test]
+ fn data_block_seek_seqno_aware_mixed_keys() -> crate::Result<()> {
+ let items = vec![
+ InternalValue::from_components(b"a", b"", 10, Value),
+ InternalValue::from_components(b"a", b"", 5, Value),
+ InternalValue::from_components(b"b", b"", 10, Value),
+ InternalValue::from_components(b"b", b"", 7, Value),
+ InternalValue::from_components(b"b", b"", 3, Value),
+ InternalValue::from_components(b"c", b"", 10, Value),
+ ];
+
+ for restart_interval in [1, 2, 3] {
+ let bytes = DataBlock::encode_into_vec(&items, restart_interval, 0.0)?;
+ let data_block = DataBlock::new(Block {
+ data: bytes.into(),
+ header: Header {
+ block_type: BlockType::Data,
+ checksum: Checksum::from_raw(0),
+ data_length: 0,
+ uncompressed_length: 0,
+ },
+ });
+
+ // Forward seek with seqno narrows restart interval selection.
+ {
+ let mut iter = data_block.iter();
+ assert!(iter.seek(b"b", 5), "should find b at snapshot 5");
+ let entry = iter.next().expect("should have entry");
+ let mat = entry.materialize(&data_block.inner.data);
+ assert_eq!(mat.key.user_key.as_ref(), b"b");
+ }
+
+ // Exclusive forward seek with seqno.
+ {
+ let mut iter = data_block.iter();
+ assert!(
+ iter.seek_exclusive(b"b", 5),
+ "should find entry > b at snapshot 5"
+ );
+ let entry = iter.next().expect("should have entry");
+ let mat = entry.materialize(&data_block.inner.data);
+ assert_eq!(mat.key.user_key.as_ref(), b"c");
+ }
+
+ // Upper seek still works with seqno (predicate unchanged for backward).
+ {
+ let mut iter = data_block.iter();
+ assert!(iter.seek_upper(b"b", 5), "should find upper bound b");
+ let entry = iter.next_back().expect("should have entry");
+ let mat = entry.materialize(&data_block.inner.data);
+ assert_eq!(mat.key.user_key.as_ref(), b"b");
+ }
+ }
+
+ Ok(())
+ }
}
diff --git a/src/table/iter.rs b/src/table/iter.rs
index b03b69da1..3809caa9b 100644
--- a/src/table/iter.rs
+++ b/src/table/iter.rs
@@ -37,20 +37,20 @@ self_cell!(
);
impl OwnedDataBlockIter {
- fn seek_lower_inclusive(&mut self, needle: &[u8], _seqno: SeqNo) -> bool {
- self.with_dependent_mut(|_, m| m.seek(needle /* TODO: , seqno */))
+ fn seek_lower_inclusive(&mut self, needle: &[u8], seqno: SeqNo) -> bool {
+ self.with_dependent_mut(|_, m| m.seek(needle, seqno))
}
- fn seek_upper_inclusive(&mut self, needle: &[u8], _seqno: SeqNo) -> bool {
- self.with_dependent_mut(|_, m| m.seek_upper(needle /* TODO: , seqno */))
+ fn seek_upper_inclusive(&mut self, needle: &[u8], seqno: SeqNo) -> bool {
+ self.with_dependent_mut(|_, m| m.seek_upper(needle, seqno))
}
- fn seek_lower_exclusive(&mut self, needle: &[u8], _seqno: SeqNo) -> bool {
- self.with_dependent_mut(|_, m| m.seek_exclusive(needle /* TODO: , seqno */))
+ fn seek_lower_exclusive(&mut self, needle: &[u8], seqno: SeqNo) -> bool {
+ self.with_dependent_mut(|_, m| m.seek_exclusive(needle, seqno))
}
- fn seek_upper_exclusive(&mut self, needle: &[u8], _seqno: SeqNo) -> bool {
- self.with_dependent_mut(|_, m| m.seek_upper_exclusive(needle /* TODO: , seqno */))
+ fn seek_upper_exclusive(&mut self, needle: &[u8], seqno: SeqNo) -> bool {
+ self.with_dependent_mut(|_, m| m.seek_upper_exclusive(needle, seqno))
}
pub fn seek_lower_bound(&mut self, bound: &Bound, seqno: SeqNo) -> bool {
From 2b0b26576af908563379445e28161abd78149dd3 Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov
Date: Sun, 15 Mar 2026 11:24:03 +0200
Subject: [PATCH 10/30] refactor(data_block): dedup seek predicate, harden
seqno tests
- seek and seek_exclusive now delegate binary search to
seek_to_key_seqno, eliminating predicate duplication
- Tests assert landing seqno >= snapshot boundary; with
restart_interval=1, exact seqno match proves the optimization
distinguishes from key-only seek
---
src/table/data_block/iter.rs | 29 +++++-------------------
src/table/data_block/iter_test.rs | 37 ++++++++++++++++++++++++++-----
2 files changed, 37 insertions(+), 29 deletions(-)
diff --git a/src/table/data_block/iter.rs b/src/table/data_block/iter.rs
index a2345c396..f5e372123 100644
--- a/src/table/data_block/iter.rs
+++ b/src/table/data_block/iter.rs
@@ -56,18 +56,9 @@ impl<'a> Iter<'a> {
}
pub fn seek(&mut self, needle: &[u8], seqno: SeqNo) -> bool {
- // Find the last restart interval whose head precedes (needle, seqno) in
- // internal key order (user_key ASC, seqno DESC). This lets us skip
- // restart intervals that contain only versions newer than the snapshot,
- // reducing the subsequent linear scan.
- if !self.decoder.inner_mut().seek(
- |head_key, head_seqno| match head_key.cmp(needle) {
- std::cmp::Ordering::Less => true,
- std::cmp::Ordering::Equal => head_seqno >= seqno,
- std::cmp::Ordering::Greater => false,
- },
- false,
- ) {
+ // Reuse the seqno-aware binary search from `seek_to_key_seqno`, then
+ // follow up with a linear scan to position at the exact key.
+ if !self.seek_to_key_seqno(needle, seqno) {
return false;
}
@@ -145,17 +136,9 @@ impl<'a> Iter<'a> {
}
pub fn seek_exclusive(&mut self, needle: &[u8], seqno: SeqNo) -> bool {
- // Exclusive lower bound: identical to `seek`, except we must not yield
- // entries equal to `needle`. The seqno-aware binary search still helps
- // by landing closer to the target position in the restart index.
- if !self.decoder.inner_mut().seek(
- |head_key, head_seqno| match head_key.cmp(needle) {
- std::cmp::Ordering::Less => true,
- std::cmp::Ordering::Equal => head_seqno >= seqno,
- std::cmp::Ordering::Greater => false,
- },
- false,
- ) {
+ // Exclusive lower bound: same seqno-aware binary search, but the linear
+ // scan below skips entries equal to `needle`.
+ if !self.seek_to_key_seqno(needle, seqno) {
return false;
}
diff --git a/src/table/data_block/iter_test.rs b/src/table/data_block/iter_test.rs
index f5cc81b00..1086d7b65 100644
--- a/src/table/data_block/iter_test.rs
+++ b/src/table/data_block/iter_test.rs
@@ -1316,18 +1316,31 @@ mod tests {
assert_eq!(materialized.key.seqno, 10);
}
- // With a specific snapshot seqno, the binary search skips restart
- // intervals that only contain newer versions, but the linear scan
- // still finds the first entry with key == needle.
+ // With a specific snapshot seqno, the binary search lands on the
+ // restart interval containing (or nearest to) the target seqno.
+ // The first entry returned is the head of that interval.
{
let mut iter = data_block.iter();
assert!(iter.seek(b"b", 5), "should find key with snapshot seqno 5");
let entry = iter.next().expect("should have entry");
let materialized = entry.materialize(&data_block.inner.data);
assert_eq!(materialized.key.user_key.as_ref(), b"b");
- // seek returns the first entry with key >= needle; that's still
- // the newest version in the landing interval. The seqno-aware
- // predicate only narrows which restart interval we land on.
+ // The landing entry's seqno must be >= the snapshot boundary: the
+ // seqno-aware predicate must never overshoot into older-only intervals.
+ assert!(
+ materialized.key.seqno >= 5,
+ "restart_interval={restart_interval}: landing seqno {} should be >= snapshot 5",
+ materialized.key.seqno,
+ );
+ // With restart_interval=1 each entry is its own interval, so
+ // the predicate lands exactly on the target seqno — a key-only
+ // seek would land on seqno 10 instead.
+ if restart_interval == 1 {
+ assert_eq!(
+ materialized.key.seqno, 5,
+ "with restart_interval=1, seqno-aware seek must land exactly on target"
+ );
+ }
}
}
@@ -1366,6 +1379,18 @@ mod tests {
let entry = iter.next().expect("should have entry");
let mat = entry.materialize(&data_block.inner.data);
assert_eq!(mat.key.user_key.as_ref(), b"b");
+ // Landing seqno must be >= snapshot boundary.
+ assert!(
+ mat.key.seqno >= 5,
+ "restart_interval={restart_interval}: seqno {} should be >= 5",
+ mat.key.seqno,
+ );
+ // With restart_interval=1, seqno-aware seek lands on (b,7) —
+ // the last head with seqno >= 5 — whereas key-only would land
+ // on (b,10).
+ if restart_interval == 1 {
+ assert_eq!(mat.key.seqno, 7);
+ }
}
// Exclusive forward seek with seqno.
From 95ae8abda8a0b1b654a965001dc22fb0b05bbbb1 Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov
Date: Sun, 15 Mar 2026 14:08:49 +0200
Subject: [PATCH 11/30] fix(docs): add backticks around identifiers in
seek_to_key_seqno doc
Fixes clippy::doc_markdown warning for bare user_key and seqno
identifiers in documentation comment.
---
src/table/data_block/iter.rs | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/table/data_block/iter.rs b/src/table/data_block/iter.rs
index f5e372123..f41ab543d 100644
--- a/src/table/data_block/iter.rs
+++ b/src/table/data_block/iter.rs
@@ -40,7 +40,7 @@ impl<'a> Iter<'a> {
///
/// Here `seqno` is a snapshot boundary: point reads return the first item
/// with `item.seqno < seqno`. Using the internal key ordering
- /// (user_key ASC, seqno DESC), this skips restart intervals that can only
+ /// (`user_key` ASC, `seqno` DESC), this skips restart intervals that can only
/// contain versions newer than the snapshot, so any visible version for
/// `needle` will be found within roughly one restart interval of the
/// resulting position.
From a03b0de8680945e6073cad92a15108c0a089a657 Mon Sep 17 00:00:00 2001
From: Dmitry Prudnikov
Date: Sun, 15 Mar 2026 14:16:08 +0200
Subject: [PATCH 12/30] ci: add CoordiNode CI and upstream monitor workflows
---
.github/workflows/coordinode-ci.yml | 96 ++++++++++++++++++
.github/workflows/upstream-monitor.yml | 130 +++++++++++++++++++++++++
2 files changed, 226 insertions(+)
create mode 100644 .github/workflows/coordinode-ci.yml
create mode 100644 .github/workflows/upstream-monitor.yml
diff --git a/.github/workflows/coordinode-ci.yml b/.github/workflows/coordinode-ci.yml
new file mode 100644
index 000000000..9d4e9c4ab
--- /dev/null
+++ b/.github/workflows/coordinode-ci.yml
@@ -0,0 +1,96 @@
+name: CoordiNode CI
+
+on:
+ push:
+ branches:
+ - main
+ - "feat/#*"
+ - "fix/#*"
+ pull_request:
+ branches:
+ - main
+
+env:
+ CARGO_TERM_COLOR: always
+
+jobs:
+ test:
+ timeout-minutes: 20
+ strategy:
+ matrix:
+ rust_version:
+ - stable
+ - "1.90.0"
+ os:
+ - ubuntu-latest
+ - windows-latest
+ - macos-latest
+ runs-on: ${{ matrix.os }}
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v6
+
+ - name: Install Rust toolchain
+ uses: dtolnay/rust-toolchain@stable
+ with:
+ toolchain: ${{ matrix.rust_version }}
+ components: rustfmt, clippy
+
+ - name: Set up cargo cache
+ uses: Swatinem/rust-cache@v2
+ with:
+ prefix-key: ${{ runner.os }}-cargo
+
+ - name: Install nextest
+ uses: taiki-e/install-action@nextest
+
+ - name: Format check
+ run: cargo fmt --all -- --check
+
+ - name: Clippy (strict)
+ run: cargo clippy --all-features -- -D warnings
+
+ - name: Run tests
+ run: cargo nextest run --all-features
+
+ - name: Run doc tests
+ run: cargo test --doc --features lz4
+
+ codecov:
+ timeout-minutes: 20
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v6
+
+ - name: Install Rust nightly
+ uses: dtolnay/rust-toolchain@nightly
+ with:
+ components: llvm-tools-preview
+
+ - name: Set up cargo cache
+ uses: Swatinem/rust-cache@v2
+ with:
+ prefix-key: ${{ runner.os }}-cargo
+
+ - name: Install cargo-llvm-cov
+ uses: taiki-e/install-action@cargo-llvm-cov
+
+ - name: Install nextest
+ uses: taiki-e/install-action@nextest
+
+ - name: Run tests with coverage
+ run: cargo llvm-cov --no-report nextest --all-features
+
+ - name: Run doc tests with coverage
+ run: cargo llvm-cov --no-report --doc --features lz4
+
+ - name: Create coverage report
+ run: cargo llvm-cov report --doctests --lcov --output-path lcov.info
+
+ - name: Upload to Codecov
+ uses: codecov/codecov-action@v5
+ with:
+ files: lcov.info
+ env:
+ CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
diff --git a/.github/workflows/upstream-monitor.yml b/.github/workflows/upstream-monitor.yml
new file mode 100644
index 000000000..8def494b2
--- /dev/null
+++ b/.github/workflows/upstream-monitor.yml
@@ -0,0 +1,130 @@
+name: Upstream Monitor
+
+on:
+ schedule:
+ - cron: "0 8 * * 1,4"
+ workflow_dispatch:
+
+permissions:
+ contents: write
+ pull-requests: write
+
+jobs:
+ check-upstream:
+ timeout-minutes: 10
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v6
+ with:
+ fetch-depth: 0
+
+ - name: Add upstream remote
+ run: git remote add upstream https://github.com/fjall-rs/lsm-tree.git
+
+ - name: Fetch upstream and origin
+ run: |
+ git fetch upstream main
+ git fetch origin main
+
+ - name: Check for new upstream commits
+ id: check
+ run: |
+ BEHIND=$(git rev-list origin/main..upstream/main --count)
+ echo "behind=$BEHIND" >> "$GITHUB_OUTPUT"
+ echo "Commits behind upstream: $BEHIND"
+
+ - name: Try merge and create PR or issue
+ if: steps.check.outputs.behind > 0
+ env:
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ BEHIND: ${{ steps.check.outputs.behind }}
+ run: |
+ git config user.name "github-actions[bot]"
+ git config user.email "github-actions[bot]@users.noreply.github.com"
+
+ SYNC_BRANCH="chore/upstream-sync-$(date +%Y%m%d)"
+ git checkout -b "$SYNC_BRANCH" origin/main
+
+ if git merge --no-commit --no-ff upstream/main 2>&1; then
+ git commit -m "chore: sync upstream ($BEHIND new commits)"
+ git push origin "$SYNC_BRANCH"
+
+ gh pr create \
+ --title "chore: sync upstream ($BEHIND new commits)" \
+ --body "$(cat <<'EOF'
+ ## Upstream Sync
+
+ Automated sync from [fjall-rs/lsm-tree](https://github.com/fjall-rs/lsm-tree) main branch.
+
+ **Commits behind:** ${{ steps.check.outputs.behind }}
+
+ ### Review checklist
+ - [ ] Review upstream changes for breaking modifications
+ - [ ] Verify our patches still apply cleanly
+ - [ ] Run full test suite
+ EOF
+ )" \
+ --base main \
+ --head "$SYNC_BRANCH"
+ else
+ CONFLICTS=$(git diff --name-only --diff-filter=U 2>/dev/null || true)
+ git merge --abort
+
+ gh issue create \
+ --title "Upstream sync conflict ($BEHIND new commits)" \
+ --body "$(cat </dev/null; then
+ echo "Branch '$BRANCH' is fully merged into upstream/main"
+
+ EXISTING=$(gh issue list --search "Upstream merged: $BRANCH" --state open --json number --jq 'length')
+ if [ "$EXISTING" = "0" ]; then
+ gh issue create \
+ --title "Upstream merged: $BRANCH" \
+ --body "$(cat <
Date: Sun, 15 Mar 2026 14:24:14 +0200
Subject: [PATCH 13/30] docs: add maintained fork notice and support section
---
README.md | 19 +++++++++++++++++--
assets/usdt-qr.svg | 1 +
2 files changed, 18 insertions(+), 2 deletions(-)
create mode 100644 assets/usdt-qr.svg
diff --git a/README.md b/README.md
index 026dadeec..182126455 100644
--- a/README.md
+++ b/README.md
@@ -2,11 +2,14 @@
-[](https://github.com/fjall-rs/lsm-tree/actions/workflows/test.yml)
+[](https://github.com/structured-world/lsm-tree/actions/workflows/coordinode-ci.yml)
+[](https://github.com/fjall-rs/lsm-tree/actions/workflows/test.yml)
[](https://docs.rs/lsm-tree)
[](https://crates.io/crates/lsm-tree)

-[](https://deps.rs/repo/github/fjall-rs/lsm-tree)
+
+> **Maintained fork** by [Structured World Foundation](https://sw.foundation) for the [CoordiNode](https://github.com/structured-world/coordinode) database engine.
+> Based on [fjall-rs/lsm-tree](https://github.com/fjall-rs/lsm-tree). We contribute patches upstream and maintain additional features needed for CoordiNode (zstd compression, custom sequence number generators, batch get, intra-L0 compaction, security hardening).
A K.I.S.S. implementation of log-structured merge trees (LSM-trees/LSMTs) in Rust.
@@ -68,12 +71,24 @@ Uses [`bytes`](https://github.com/tokio-rs/bytes) as the underlying `Slice` type
cargo bench --features lz4
```
+## Support the Project
+
+