Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
1c72b5e
perf(range-tombstone): optimize RT lookup in table-skip and point-read
polaz Mar 21, 2026
f4702d3
fix(range-tombstone): sort RT list before binary search in table-skip
polaz Mar 21, 2026
99d380d
Merge branch 'main' into feat/#27-perf-optimize-range-tombstone-looku…
polaz Mar 21, 2026
3046c7e
refactor(range-tombstone): use explicit start-key sort and .take() fo…
polaz Mar 21, 2026
b2163bd
Merge branch 'main' into feat/#27-perf-optimize-range-tombstone-looku…
polaz Mar 21, 2026
933413c
fix(range-tombstone): add seqno desc tiebreaker to per-table RT sort
polaz Mar 21, 2026
622a8cf
docs(range-tombstone): clarify intentional double-sort in table-skip …
polaz Mar 21, 2026
5474bb7
Merge branch 'main' into feat/#27-perf-optimize-range-tombstone-looku…
polaz Mar 22, 2026
b7cf866
perf(range-tombstone): use sort_unstable_by and clarify KV seqno inva…
polaz Mar 22, 2026
8835a72
Merge branch 'main' into feat/#27-perf-optimize-range-tombstone-looku…
polaz Mar 22, 2026
c3c38e8
fix(range-tombstone): validate seqno#kv_max against seqno#max on load
polaz Mar 22, 2026
d164976
Merge branch 'main' into feat/#27-perf-optimize-range-tombstone-looku…
polaz Mar 22, 2026
cd6556e
refactor(range-tombstone): rename colocated RT test to reflect actual…
polaz Mar 22, 2026
dbf92a1
test(range-tombstone): verify KV-only and RT-only seqno metadata inva…
polaz Mar 22, 2026
0aabe0c
docs(range-tombstone): clarify highest_kv_seqno excludes RT sentinel
polaz Mar 22, 2026
17887db
refactor(range-tombstone): extract kv seqno validation into testable …
polaz Mar 22, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 33 additions & 12 deletions src/range.rs
Original file line number Diff line number Diff line change
Expand Up @@ -236,21 +236,42 @@ impl TreeIter {
}
}

// Sort SST-sourced RTs by start key for binary search in
// table-skip below. This is intentionally a separate sort from
// the full sort+dedup later: table-skip runs here (before memtable
// RTs are collected), so only SST RTs are present. The later sort
// covers the complete list. Both sorts are O(n log n) on their
// respective subsets; the SST-only subset is typically small.
all_range_tombstones.sort_unstable_by(|(a, _), (b, _)| a.start.cmp(&b.start));

for table in single_tables {
// Table-skip: if a range tombstone fully covers this table
// with a higher seqno, skip it entirely (avoid I/O).
// NOTE: get_highest_seqno() includes RT seqnos, so a covering
// RT stored in the same table won't trigger skip (conservative
// but correct). Separate KV/RT seqno bounds would improve this.
// key_range.max() is inclusive, fully_covers uses half-open: max < rt.end
let is_covered = all_range_tombstones.iter().any(|(rt, cutoff)| {
rt.visible_at(*cutoff)
&& rt.fully_covers(
table.metadata.key_range.min(),
table.metadata.key_range.max(),
)
&& rt.seqno > table.get_highest_seqno()
});
//
// Uses get_highest_kv_seqno() which excludes RT seqnos, so a
// covering RT stored in the same table can now trigger skip.
//
// Binary search on sorted RT list: partition_point finds the
// first RT with start > table_min; only the prefix [0..idx]
// can have start <= table_min (required for fully_covers).
// key_range.max() is inclusive; fully_covers checks max < rt.end
// (half-open), so this is correct for inclusive upper bounds.
let table_min: &[u8] = table.metadata.key_range.min().as_ref();
let table_max: &[u8] = table.metadata.key_range.max().as_ref();
let table_kv_seqno = table.get_highest_kv_seqno();

let candidate_end =
all_range_tombstones.partition_point(|(rt, _)| rt.start.as_ref() <= table_min);
Comment thread
polaz marked this conversation as resolved.

Comment thread
polaz marked this conversation as resolved.
let is_covered =
all_range_tombstones
.iter()
.take(candidate_end)
.any(|(rt, cutoff)| {
rt.visible_at(*cutoff)
&& rt.fully_covers(table_min, table_max)
&& rt.seqno > table_kv_seqno
});
Comment thread
polaz marked this conversation as resolved.
Comment thread
polaz marked this conversation as resolved.
Comment thread
polaz marked this conversation as resolved.

Comment thread
polaz marked this conversation as resolved.
if !is_covered {
let reader = table
Expand Down
62 changes: 62 additions & 0 deletions src/table/meta.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,12 @@ pub struct ParsedMeta {
pub index_block_count: u64,
pub key_range: KeyRange,
pub(super) seqnos: (SeqNo, SeqNo),

/// Highest seqno from KV entries only (excludes range tombstones).
///
/// Falls back to `seqnos.1` (overall max) for tables written before
/// this field was introduced, which is conservative but correct.
pub(super) highest_kv_seqno: SeqNo,
pub file_size: u64,
pub item_count: u64,
pub tombstone_count: u64,
Expand Down Expand Up @@ -74,6 +80,21 @@ macro_rules! read_u64 {
}};
}

/// Checks the on-disk invariant that the KV-only max seqno (`kv_seqno`)
/// never exceeds the overall max seqno (`max_seqno`, which covers both
/// KV entries and range tombstone seqnos).
///
/// Returns `kv_seqno` unchanged when the invariant holds; otherwise
/// reports an `InvalidData` I/O error, since a KV-only value above the
/// overall max can only arise from on-disk metadata corruption.
fn validated_kv_seqno(kv_seqno: SeqNo, max_seqno: SeqNo) -> crate::Result<SeqNo> {
    if kv_seqno <= max_seqno {
        Ok(kv_seqno)
    } else {
        // Message names the metadata keys involved so the failure is
        // easy to trace back to the corrupted entry.
        Err(std::io::Error::new(
            std::io::ErrorKind::InvalidData,
            "seqno#kv_max exceeds seqno#max",
        )
        .into())
    }
}

impl ParsedMeta {
#[expect(clippy::expect_used, clippy::too_many_lines)]
pub fn load_with_handle(file: &File, handle: &BlockHandle) -> crate::Result<Self> {
Expand Down Expand Up @@ -189,6 +210,20 @@ impl ParsedMeta {
(min, max)
};

// Optional field introduced for table-skip optimization.
// Old tables lack this key; fall back to overall max seqno
// (conservative: table-skip compares rt.seqno > highest_kv_seqno,
// so falling back to the higher overall max just disables the
// optimization for legacy tables — correct but not optimal).
// If the key exists but is truncated, propagate the I/O error to
// surface metadata corruption rather than silently falling back.
let highest_kv_seqno = if let Some(item) = block.point_read(b"seqno#kv_max", SeqNo::MAX) {
let mut bytes = &item.value[..];
validated_kv_seqno(bytes.read_u64::<LittleEndian>()?, seqnos.1)?
} else {
seqnos.1
};
Comment thread
polaz marked this conversation as resolved.
Comment thread
polaz marked this conversation as resolved.

let data_block_compression = {
let bytes = block
.point_read(b"compression#data", SeqNo::MAX)
Expand All @@ -214,6 +249,7 @@ impl ParsedMeta {
index_block_count,
key_range,
seqnos,
highest_kv_seqno,
file_size,
item_count,
tombstone_count,
Expand All @@ -224,3 +260,29 @@ impl ParsedMeta {
})
}
}

#[cfg(test)]
mod tests {
    use super::*;

    // Boundary and happy-path coverage for the seqno#kv_max invariant check.

    #[test]
    fn validated_kv_seqno_within_bounds() {
        // Strictly below the overall max: value passes through unchanged.
        let result = validated_kv_seqno(5, 10);
        assert_eq!(result.unwrap(), 5);
    }

    #[test]
    fn validated_kv_seqno_equal_to_max() {
        // Equality is allowed: a table whose highest entry is a KV item
        // has kv_max == max.
        let result = validated_kv_seqno(10, 10);
        assert_eq!(result.unwrap(), 10);
    }

    #[test]
    fn validated_kv_seqno_zero() {
        // Zero (no KV entries observed) must also validate cleanly.
        let result = validated_kv_seqno(0, 10);
        assert_eq!(result.unwrap(), 0);
    }

    #[test]
    fn validated_kv_seqno_exceeds_max_returns_error() {
        // kv_max > max is impossible on a well-formed table, so the
        // loader must surface it as InvalidData rather than fall back.
        let err = validated_kv_seqno(11, 10).unwrap_err();
        assert!(matches!(err, crate::Error::Io(e) if e.kind() == std::io::ErrorKind::InvalidData));
    }
}
23 changes: 22 additions & 1 deletion src/table/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -579,7 +579,14 @@ impl Table {
)));
}

Self::decode_range_tombstones(&block)?
let mut rts = Self::decode_range_tombstones(&block)?;
// Sort range tombstones by (start asc, seqno desc) to enable
// binary search in point-read suppression paths. Uses explicit
// comparator so the partition_point invariant is independent of
// Ord changes. The seqno-desc tiebreaker ensures higher-seqno
// RTs are checked first when multiple share the same start key.
rts.sort_unstable_by(|a, b| a.start.cmp(&b.start).then_with(|| b.seqno.cmp(&a.seqno)));
rts
} else {
Vec::new()
};
Expand Down Expand Up @@ -787,6 +794,20 @@ impl Table {
self.metadata.seqnos.1 + self.global_seqno()
}

/// Returns the highest sequence number contributed by KV entries alone,
/// i.e. excluding range tombstone seqnos.
///
/// Table-skip benefits from this tighter bound: a covering range
/// tombstone stored in the same table may satisfy
/// `rt.seqno > highest_kv_seqno` even though it cannot exceed the
/// overall max, making such tables skippable.
///
/// Legacy tables (written before the `seqno#kv_max` metadata field
/// existed) load `highest_kv_seqno` as the overall max, so for them this
/// degrades to `get_highest_seqno()`-equivalent behavior — conservative
/// but correct.
#[must_use]
pub fn get_highest_kv_seqno(&self) -> SeqNo {
    // Stored seqno is table-local; shift it into the global seqno space.
    let local_max = self.metadata.highest_kv_seqno;
    local_max + self.global_seqno()
}

/// Returns the number of tombstone markers in the `Table`.
#[must_use]
#[doc(hidden)]
Expand Down
10 changes: 9 additions & 1 deletion src/table/writer/meta.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,15 @@ pub struct Metadata {
/// Lowest encountered seqno
pub lowest_seqno: SeqNo,

/// Highest encountered seqno
/// Highest encountered seqno (includes both KV and RT)
pub highest_seqno: SeqNo,

/// Highest encountered seqno from KV entries only (excludes range tombstones).
///
/// Used for table-skip decisions: a covering RT stored in the same table
/// can now trigger skip because `rt.seqno > highest_kv_seqno` may be true
/// even when `rt.seqno <= highest_seqno`.
pub highest_kv_seqno: SeqNo,
}

impl Default for Metadata {
Expand All @@ -60,6 +67,7 @@ impl Default for Metadata {

lowest_seqno: SeqNo::MAX,
highest_seqno: 0,
highest_kv_seqno: 0,
}
}
}
10 changes: 10 additions & 0 deletions src/table/writer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,13 @@ impl Writer {

self.meta.lowest_seqno = self.meta.lowest_seqno.min(seqno);
self.meta.highest_seqno = self.meta.highest_seqno.max(seqno);
// highest_kv_seqno tracks the highest seqno among user KV entries
// written via write() (values, point tombstones, weak tombstones).
// Range tombstones (via write_range_tombstone) are excluded. In
// RT-only tables, finish() writes a synthetic sentinel via write()
// but restores highest_kv_seqno afterwards, so this bound reflects
// only actual user KV items.
self.meta.highest_kv_seqno = self.meta.highest_kv_seqno.max(seqno);
Comment thread
polaz marked this conversation as resolved.

Ok(())
}
Expand Down Expand Up @@ -436,6 +443,7 @@ impl Writer {
{
let saved_lo = self.meta.lowest_seqno;
let saved_hi = self.meta.highest_seqno;
let saved_kv_hi = self.meta.highest_kv_seqno;

// Write a sentinel key to force index block creation in RT-only
// tables. The sentinel must use the start key of the same
Expand All @@ -455,6 +463,7 @@ impl Writer {
// actual block contents for consistency with recovery/tests.
self.meta.lowest_seqno = saved_lo;
self.meta.highest_seqno = saved_hi;
self.meta.highest_kv_seqno = saved_kv_hi;

// Ensure the table's key range covers all range tombstones.
self.meta.first_key = Some(start);
Expand Down Expand Up @@ -598,6 +607,7 @@ impl Writer {
"restart_interval#index",
&self.index_block_restart_interval.to_le_bytes(),
),
meta("seqno#kv_max", &self.meta.highest_kv_seqno.to_le_bytes()),
meta("seqno#max", &self.meta.highest_seqno.to_le_bytes()),
meta("seqno#min", &self.meta.lowest_seqno.to_le_bytes()),
meta("table_id", &self.table_id.to_le_bytes()),
Expand Down
10 changes: 9 additions & 1 deletion src/tree/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -816,6 +816,9 @@ impl Tree {
// or widens metadata in the inclusive-upper-bound fallback. That makes
// `metadata.key_range.contains_key(key)` a sound early reject here and
// avoids scanning RT blocks for unrelated SSTs on point reads.
//
// Per-table RT lists are sorted by start key on load,
// so binary search narrows candidates to RTs with start <= key.
for table in super_version
.version
.iter_levels()
Expand All @@ -824,7 +827,12 @@ impl Tree {
.filter(|t| !t.range_tombstones().is_empty())
.filter(|t| t.metadata.key_range.contains_key(key))
{
for rt in table.range_tombstones() {
let rts = table.range_tombstones();
let candidate_end = rts.partition_point(|rt| rt.start.as_ref() <= key);

for rt in rts.iter().take(candidate_end) {
// Binary search already narrowed to start <= key; should_suppress
// re-checks contains_key (harmless) and avoids semantic drift.
if rt.should_suppress(key, key_seqno, read_seqno) {
return true;
}
Expand Down
Loading