diff --git a/UNSAFE.md b/UNSAFE.md index facaef443..0f4f70cfe 100644 --- a/UNSAFE.md +++ b/UNSAFE.md @@ -6,24 +6,29 @@ ```bash cd fuzz/data_block -mkdir in +mkdir -p in cat /dev/random | head -n 100 > in/input cargo afl build && cargo afl fuzz -i in -o out target/debug/data_block cd fuzz/index_block -mkdir in +mkdir -p in cat /dev/random | head -n 100 > in/input cargo afl build && cargo afl fuzz -i in -o out target/debug/index_block cd fuzz/table_read -mkdir in +mkdir -p in cat /dev/random | head -n 100 > in/input cargo afl build && cargo afl fuzz -i in -o out target/debug/table_read cd fuzz/compare_prefixed_slice -mkdir in +mkdir -p in cat /dev/random | head -n 100 > in/input cargo afl build && cargo afl fuzz -i in -o out target/debug/compare_prefixed_slice + +cd fuzz/prefix_filter +mkdir -p in +cat /dev/random | head -n 100 > in/input +cargo afl build && cargo afl fuzz -i in -o out target/debug/prefix_filter ``` ## Run mutation testing diff --git a/benches/run_reader.rs b/benches/run_reader.rs new file mode 100644 index 000000000..6f15d6ace --- /dev/null +++ b/benches/run_reader.rs @@ -0,0 +1,267 @@ +use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; +use lsm_tree::prefix::FixedPrefixExtractor; +use lsm_tree::{AbstractTree, Config}; +use std::sync::Arc; +use std::time::Instant; +use tempfile::TempDir; + +fn create_tree_with_segments( + segment_count: usize, + with_prefix_extractor: bool, +) -> (TempDir, lsm_tree::Tree) { + let tempdir = tempfile::tempdir().unwrap(); + + let mut config = Config::new(&tempdir); + if with_prefix_extractor { + config = config.prefix_extractor(Arc::new(FixedPrefixExtractor::new(8))); + } + + let tree = config.open().unwrap(); + + // Create segments with distinct prefixes + for segment_idx in 0..segment_count { + let prefix = format!("seg{:04}", segment_idx); + + // Add 100 keys per segment + for key_idx in 0..100 { + let key = format!("{}_{:04}", prefix, key_idx); + tree.insert(key.as_bytes(), 
vec![0u8; 100], 0); + } + + // Flush to create a segment + tree.flush_active_memtable(0).unwrap(); + } + + (tempdir, tree) +} + +fn benchmark_range_query(c: &mut Criterion) { + let mut group = c.benchmark_group("range_query"); + + // Test different segment counts + for segment_count in [10, 100, 500, 1000] { + // Benchmark without prefix extractor + group.bench_with_input( + BenchmarkId::new("no_prefix", segment_count), + &segment_count, + |b, &count| { + let (_tempdir, tree) = create_tree_with_segments(count, false); + + b.iter(|| { + // Query for a range that doesn't exist + let start: &[u8] = b"zzz_0000"; + let end: &[u8] = b"zzz_9999"; + let iter = tree.range(start..=end, 0, None); + // Force evaluation by counting + let count = iter.count(); + black_box(count); + }); + }, + ); + + // Benchmark with prefix extractor + group.bench_with_input( + BenchmarkId::new("with_prefix", segment_count), + &segment_count, + |b, &count| { + let (_tempdir, tree) = create_tree_with_segments(count, true); + + b.iter(|| { + // Query for a range that doesn't exist (will check filters) + let start: &[u8] = b"zzz_0000"; + let end: &[u8] = b"zzz_9999"; + let iter = tree.range(start..=end, 0, None); + // Force evaluation by counting + let count = iter.count(); + black_box(count); + }); + }, + ); + + // Benchmark with prefix extractor - existing prefix + group.bench_with_input( + BenchmarkId::new("with_prefix_exists", segment_count), + &segment_count, + |b, &count| { + let (_tempdir, tree) = create_tree_with_segments(count, true); + + b.iter(|| { + // Query for a range that exists in the middle + let mid = count / 2; + let prefix = format!("seg{:04}", mid); + let start_str = format!("{}_0000", prefix); + let end_str = format!("{}_0099", prefix); + let start: &[u8] = start_str.as_bytes(); + let end: &[u8] = end_str.as_bytes(); + let iter = tree.range(start..=end, 0, None); + // Force evaluation by counting + let count = iter.count(); + black_box(count); + }); + }, + ); + } + + 
group.finish(); +} + +fn benchmark_timing_comparison(_c: &mut Criterion) { + println!("\n=== RunReader Performance Benchmark ==="); + println!("Testing impact of prefix filter checks on large runs\n"); + + for segment_count in [100, 500, 1000] { + println!("\n--- Testing with {} segments ---", segment_count); + + // Test without prefix extractor + let (_tempdir_no_prefix, tree_no_prefix) = create_tree_with_segments(segment_count, false); + + let start = Instant::now(); + for _ in 0..100 { + let start_key: &[u8] = b"zzz_0000"; + let end_key: &[u8] = b"zzz_9999"; + let iter = tree_no_prefix.range(start_key..=end_key, 0, None); + let _ = iter.count(); + } + let no_prefix_time = start.elapsed(); + let avg_no_prefix = no_prefix_time.as_nanos() / 100; + + println!(" Without prefix extractor: {:>8} ns/query", avg_no_prefix); + + // Test with prefix extractor + let (_tempdir_with_prefix, tree_with_prefix) = + create_tree_with_segments(segment_count, true); + + let start = Instant::now(); + for _ in 0..100 { + let start_key: &[u8] = b"zzz_0000"; + let end_key: &[u8] = b"zzz_9999"; + let iter = tree_with_prefix.range(start_key..=end_key, 0, None); + let _ = iter.count(); + } + let with_prefix_time = start.elapsed(); + let avg_with_prefix = with_prefix_time.as_nanos() / 100; + + println!( + " With prefix extractor: {:>8} ns/query", + avg_with_prefix + ); + + if avg_with_prefix > avg_no_prefix { + let overhead = avg_with_prefix - avg_no_prefix; + println!( + " Overhead: {} ns ({:.1}%)", + overhead, + (overhead as f64 / avg_no_prefix as f64) * 100.0 + ); + } else { + let savings = avg_no_prefix - avg_with_prefix; + println!( + " Savings: {} ns ({:.1}%)", + savings, + (savings as f64 / avg_no_prefix as f64) * 100.0 + ); + } + + // Check CPU cost per segment + if segment_count > 0 { + let per_segment_overhead = if avg_with_prefix > avg_no_prefix { + (avg_with_prefix - avg_no_prefix) / segment_count as u128 + } else { + 0 + }; + println!(" Per-segment overhead: ~{} ns", 
per_segment_overhead); + } + } + + println!("\n=== Summary ==="); + println!("MAX_UPFRONT_CHECKS optimization limits overhead to checking at most 10 segments."); + println!( + "For runs with >10 segments, remaining segments are filtered lazily during iteration.\n" + ); +} + +fn run_timing_benchmark() { + println!("\n=== RunReader Performance Benchmark ==="); + println!("Testing impact of prefix filter checks on large runs\n"); + + for segment_count in [100, 500, 1000] { + println!("\n--- Testing with {} segments ---", segment_count); + + // Test without prefix extractor + let (_tempdir_no_prefix, tree_no_prefix) = create_tree_with_segments(segment_count, false); + + let start = Instant::now(); + for _ in 0..100 { + let start_key: &[u8] = b"zzz_0000"; + let end_key: &[u8] = b"zzz_9999"; + let iter = tree_no_prefix.range(start_key..=end_key, 0, None); + let _ = iter.count(); + } + let no_prefix_time = start.elapsed(); + let avg_no_prefix = no_prefix_time.as_nanos() / 100; + + println!(" Without prefix extractor: {:>8} ns/query", avg_no_prefix); + + // Test with prefix extractor + let (_tempdir_with_prefix, tree_with_prefix) = + create_tree_with_segments(segment_count, true); + + let start = Instant::now(); + for _ in 0..100 { + let start_key: &[u8] = b"zzz_0000"; + let end_key: &[u8] = b"zzz_9999"; + let iter = tree_with_prefix.range(start_key..=end_key, 0, None); + let _ = iter.count(); + } + let with_prefix_time = start.elapsed(); + let avg_with_prefix = with_prefix_time.as_nanos() / 100; + + println!( + " With prefix extractor: {:>8} ns/query", + avg_with_prefix + ); + + if avg_with_prefix > avg_no_prefix { + let overhead = avg_with_prefix - avg_no_prefix; + println!( + " Overhead: {} ns ({:.1}%)", + overhead, + (overhead as f64 / avg_no_prefix as f64) * 100.0 + ); + } else { + let savings = avg_no_prefix - avg_with_prefix; + println!( + " Savings: {} ns ({:.1}%)", + savings, + (savings as f64 / avg_no_prefix as f64) * 100.0 + ); + } + + // Check CPU cost per 
segment + if segment_count > 0 { + let per_segment_overhead = if avg_with_prefix > avg_no_prefix { + (avg_with_prefix - avg_no_prefix) / segment_count as u128 + } else { + 0 + }; + println!(" Per-segment overhead: ~{} ns", per_segment_overhead); + } + } + + println!("\n=== Summary ==="); + println!("MAX_UPFRONT_CHECKS optimization limits overhead to checking at most 10 segments."); + println!( + "For runs with >10 segments, remaining segments are filtered lazily during iteration.\n" + ); +} + +fn benchmark_all(c: &mut Criterion) { + // Run standard benchmarks + benchmark_range_query(c); + + // Run the detailed timing comparison + run_timing_benchmark(); +} + +criterion_group!(benches, benchmark_range_query); +criterion_main!(benches); diff --git a/fuzz/prefix_filter/.gitignore b/fuzz/prefix_filter/.gitignore new file mode 100644 index 000000000..f4ee534d3 --- /dev/null +++ b/fuzz/prefix_filter/.gitignore @@ -0,0 +1,2 @@ +in* +out* diff --git a/fuzz/prefix_filter/Cargo.toml b/fuzz/prefix_filter/Cargo.toml new file mode 100644 index 000000000..b51e7b2d5 --- /dev/null +++ b/fuzz/prefix_filter/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "prefix_filter" +version = "0.1.0" +edition = "2024" + +[dependencies] +afl = "*" +arbitrary = { version = "1", features = ["derive"] } +lsm-tree = { path = "../.." 
} +tempfile = "3.23.0" diff --git a/fuzz/prefix_filter/src/main.rs b/fuzz/prefix_filter/src/main.rs new file mode 100644 index 000000000..8438a6d9a --- /dev/null +++ b/fuzz/prefix_filter/src/main.rs @@ -0,0 +1,759 @@ +#[macro_use] +extern crate afl; + +use arbitrary::{Arbitrary, Unstructured}; +use lsm_tree::config::{BloomConstructionPolicy, FilterPolicy, FilterPolicyEntry, PinningPolicy}; +use lsm_tree::prefix::{ + FixedLengthExtractor, FixedPrefixExtractor, FullKeyExtractor, PrefixExtractor, + SharedPrefixExtractor, +}; +use lsm_tree::{AbstractTree, AnyTree, Guard, IterGuardImpl, SequenceNumberCounter}; +use std::sync::Arc; + +// --------------------------------------------------------------------------- +// Structured input derived from AFL's raw bytes via Arbitrary +// --------------------------------------------------------------------------- + +/// Multi-prefix extractor that returns both a 2-byte and a 4-byte prefix +/// for keys >= 4 bytes, only a 2-byte prefix for keys 2-3 bytes, and nothing +/// for shorter keys. This exercises the interleaved hash dedup path where +/// consecutive keys produce [hash(2-byte), hash(4-byte), hash(2-byte), ...]. 
+struct HierarchicalExtractor; + +impl PrefixExtractor for HierarchicalExtractor { + fn extract<'a>(&self, key: &'a [u8]) -> Box + 'a> { + if key.len() >= 4 { + Box::new(vec![&key[..2], &key[..4]].into_iter()) + } else if key.len() >= 2 { + Box::new(std::iter::once(&key[..2])) + } else { + Box::new(std::iter::empty()) + } + } + + fn name(&self) -> &str { + "hierarchical:2:4" + } +} + +#[derive(Arbitrary, Debug, Clone)] +enum ExtractorChoice { + FixedLength1, + FixedLength2, + FixedLength3, + FixedLength4, + FixedPrefix1, + FixedPrefix2, + FixedPrefix3, + FixedPrefix4, + FullKey, + Hierarchical, +} + +impl ExtractorChoice { + fn into_extractor(&self) -> SharedPrefixExtractor { + match self { + Self::FixedLength1 => Arc::new(FixedLengthExtractor::new(1)), + Self::FixedLength2 => Arc::new(FixedLengthExtractor::new(2)), + Self::FixedLength3 => Arc::new(FixedLengthExtractor::new(3)), + Self::FixedLength4 => Arc::new(FixedLengthExtractor::new(4)), + Self::FixedPrefix1 => Arc::new(FixedPrefixExtractor::new(1)), + Self::FixedPrefix2 => Arc::new(FixedPrefixExtractor::new(2)), + Self::FixedPrefix3 => Arc::new(FixedPrefixExtractor::new(3)), + Self::FixedPrefix4 => Arc::new(FixedPrefixExtractor::new(4)), + Self::FullKey => Arc::new(FullKeyExtractor), + Self::Hierarchical => Arc::new(HierarchicalExtractor), + } + } +} + +#[derive(Arbitrary, Debug, Clone)] +enum BpkChoice { + Low, + Default, + High, +} + +impl BpkChoice { + fn value(&self) -> f32 { + match self { + Self::Low => 1.0, + Self::Default => 10.0, + Self::High => 50.0, + } + } +} + +/// Whether filter blocks are partitioned (two-level index) or full (single block). +/// The bug we found (empty tli_handles panic) was specifically in the partitioned +/// writer, so we want AFL to control this directly. +#[derive(Arbitrary, Debug, Clone)] +enum FilterPartitioningChoice { + /// Default policy: full on L0-L2, partitioned on L3+. + Default, + /// Partitioned on ALL levels — forces partitioned writer even at flush time. 
+ AllPartitioned, + /// Never partitioned — full filter on all levels. + NeverPartitioned, +} + +// --------------------------------------------------------------------------- +// Clustered key/prefix types — small alphabet, bounded length +// --------------------------------------------------------------------------- + +/// A key with first byte drawn from a small alphabet (0..8) and bounded +/// length (1..=9). This ensures keys cluster into a small number of prefix +/// groups, so prefix scans and filter lookups frequently hit real data. +/// +/// With extractors up to length 4 and keys as short as 1 byte, AFL naturally +/// explores both in-domain and out-of-domain keys. +#[derive(Debug, Clone)] +struct ClusteredKey(Vec); + +impl<'a> Arbitrary<'a> for ClusteredKey { + fn arbitrary(u: &mut Unstructured<'a>) -> arbitrary::Result { + let len: usize = u.int_in_range(1..=9)?; + let first_byte: u8 = u.int_in_range(0..=7)?; + let mut key = Vec::with_capacity(len); + key.push(first_byte); + for _ in 1..len { + key.push(u8::arbitrary(u)?); + } + Ok(ClusteredKey(key)) + } +} + +/// A prefix with length 0..=5, covering shorter-than, equal-to, and +/// longer-than all extractor lengths (extractors go up to 4). +/// Each byte is drawn from a slightly wider alphabet (0..=9) so that +/// values 8 and 9 produce prefixes that don't match any key's first byte +/// (keys use 0..=7), exercising the "absent prefix" filter path. 
+#[derive(Debug, Clone)] +struct ClusteredPrefix(Vec); + +impl<'a> Arbitrary<'a> for ClusteredPrefix { + fn arbitrary(u: &mut Unstructured<'a>) -> arbitrary::Result { + let len: usize = u.int_in_range(0..=5)?; + let mut prefix = Vec::with_capacity(len); + for _ in 0..len { + prefix.push(u.int_in_range(0..=9)?); + } + Ok(ClusteredPrefix(prefix)) + } +} + +// --------------------------------------------------------------------------- +// Operations +// --------------------------------------------------------------------------- + +#[derive(Arbitrary, Debug, Clone)] +enum Op { + // --- Writes --- + Insert { + key: ClusteredKey, + value_len: u8, + value_seed: u8, + }, + Delete { + key: ClusteredKey, + }, + /// Weak tombstone: marks a key as weakly deleted. During compaction GC, + /// a weak tombstone paired with a value below the GC watermark causes the + /// weak tombstone to be dropped. Tests that prefix filter correctness is + /// preserved after weak tombstone GC. + WeakDelete { + key: ClusteredKey, + }, + + // --- Structure ops --- + Flush, + Compact, + MajorCompact { + /// Target table size for major compaction. Small values produce many + /// tables per run, increasing the chance of exercising RunReader's + /// lazy per-table filter skip (which requires 3+ tables). + small_tables: bool, + }, + /// Clean close + reopen with the same extractor. + Reopen, + /// Close + reopen with a DIFFERENT extractor. Tests the + /// `prefix_filter_allowed()` compatibility gating: old tables keep their + /// old extractor metadata, new flushes use the new extractor. + ReopenNewExtractor { + new_extractor: ExtractorChoice, + }, + + // --- Point reads --- + Get { + key: ClusteredKey, + }, + ContainsKey { + key: ClusteredKey, + }, + + // --- Scans --- + PrefixScan { + prefix: ClusteredPrefix, + }, + PrefixScanRev { + prefix: ClusteredPrefix, + }, + /// Bidirectional iterator stepping on a prefix scan. + /// Each bool in `directions` controls: true = next_back, false = next. 
+ PrefixPingPong { + prefix: ClusteredPrefix, + directions: Vec, + }, + RangeScan { + start: ClusteredKey, + end: ClusteredKey, + }, + RangeScanRev { + start: ClusteredKey, + end: ClusteredKey, + }, + /// Unbounded iteration endpoints. Tests first_key_value / last_key_value + /// which use `range(..)` internally — prefix filters should not interfere + /// with unbounded scans. + FirstKV, + LastKV, + + /// Prefix scan using the first N bytes of a previously inserted key. + /// This guarantees the prefix overlaps real data, forcing the filter + /// to make a meaningful decision rather than trivially matching nothing. + PrefixScanExistingKey { + prefix_len: u8, + }, + + /// Composite operation: flush + compact with a small target size to + /// produce many small tables, then reopen with a different extractor + /// and compact again. This creates the specific structural condition + /// (mixed-extractor multi-table run) needed to exercise the lazy + /// per-table filter skip in RunReader. + FlushCompactReopenCompact { + new_extractor: ExtractorChoice, + }, + + // --- MVCC --- + /// Capture the current visibility seqno as a snapshot. Subsequent + /// SnapshotGet / SnapshotPrefixScan ops read at this frozen point + /// while newer writes continue advancing the seqno. + TakeSnapshot, + /// Point read at the most recently taken snapshot seqno. + SnapshotGet { + key: ClusteredKey, + }, + /// Prefix scan at the most recently taken snapshot seqno. 
+ SnapshotPrefixScan { + prefix: ClusteredPrefix, + }, +} + +#[derive(Arbitrary, Debug)] +struct FuzzInput { + extractor: ExtractorChoice, + bpk: BpkChoice, + filter_partitioning: FilterPartitioningChoice, + ops: Vec, +} + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +fn collect_kv(iter: impl Iterator) -> Vec<(Vec, Vec)> { + iter.map(|g| { + let (k, v) = g.into_inner().expect("iterator item should be readable"); + (k.to_vec(), v.to_vec()) + }) + .collect() +} + +fn open_tree( + dir: &tempfile::TempDir, + seqno: &SequenceNumberCounter, + vis: &SequenceNumberCounter, + extractor: Option, + bloom_bpk: f32, + filter_partitioning: &FilterPartitioningChoice, +) -> AnyTree { + let mut config = + lsm_tree::Config::new(dir, seqno.clone(), vis.clone()).filter_policy(FilterPolicy::all( + FilterPolicyEntry::Bloom(BloomConstructionPolicy::BitsPerKey(bloom_bpk)), + )); + if let Some(ex) = extractor { + config = config.prefix_extractor(ex); + } + config = match filter_partitioning { + FilterPartitioningChoice::Default => config, + FilterPartitioningChoice::AllPartitioned => { + config.filter_block_partitioning_policy(PinningPolicy::all(true)) + } + FilterPartitioningChoice::NeverPartitioned => { + config.filter_block_partitioning_policy(PinningPolicy::all(false)) + } + }; + config.open().unwrap() +} + +fn ordered_range<'a>(a: &'a [u8], b: &'a [u8]) -> (&'a [u8], &'a [u8]) { + if a <= b { + (a, b) + } else { + (b, a) + } +} + +fn make_value(len: u8, seed: u8) -> Vec { + (0..len).map(|i| seed.wrapping_add(i)).collect() +} + +// --------------------------------------------------------------------------- +// Oracle test: tree with prefix extractor vs tree without +// +// Both trees receive identical operations. Any read that returns different +// results means the prefix filter wrongly excluded data = silent data loss. 
+// --------------------------------------------------------------------------- + +fn run_oracle_test( + initial_extractor: &ExtractorChoice, + bloom_bpk: f32, + filter_partitioning: &FilterPartitioningChoice, + ops: &[Op], +) { + let dir_with = tempfile::tempdir().unwrap(); + let dir_without = tempfile::tempdir().unwrap(); + + let seqno_with = SequenceNumberCounter::default(); + let seqno_without = SequenceNumberCounter::default(); + let vis_with = SequenceNumberCounter::default(); + let vis_without = SequenceNumberCounter::default(); + + // Current extractor for tree_with (may change via ReopenNewExtractor). + let mut current_extractor = initial_extractor.into_extractor(); + + let mut tree_with = open_tree( + &dir_with, + &seqno_with, + &vis_with, + Some(current_extractor.clone()), + bloom_bpk, + filter_partitioning, + ); + let mut tree_without = open_tree( + &dir_without, + &seqno_without, + &vis_without, + None, + bloom_bpk, + filter_partitioning, + ); + + let compaction_strategy = Arc::new(lsm_tree::compaction::Leveled::default()); + + // MVCC snapshot seqnos. `None` until TakeSnapshot is hit. + let mut snapshot_seqno_with: Option = None; + let mut snapshot_seqno_without: Option = None; + + // Track inserted keys so PrefixScanExistingKey can derive prefixes + // from real data. 
+ let mut inserted_keys: Vec> = Vec::new(); + + for (i, op) in ops.iter().enumerate() { + match op { + // ----- Writes ----- + Op::Insert { + key, + value_len, + value_seed, + } => { + let key = &key.0; + let value = make_value(*value_len, *value_seed); + let s1 = seqno_with.next(); + let s2 = seqno_without.next(); + tree_with.insert(key.clone(), value.clone(), s1); + tree_without.insert(key.clone(), value.clone(), s2); + vis_with.fetch_max(s1 + 1); + vis_without.fetch_max(s2 + 1); + if inserted_keys.len() < 100 { + inserted_keys.push(key.clone()); + } + } + + Op::Delete { key } => { + let key = &key.0; + let s1 = seqno_with.next(); + let s2 = seqno_without.next(); + tree_with.remove(key.as_slice(), s1); + tree_without.remove(key.as_slice(), s2); + vis_with.fetch_max(s1 + 1); + vis_without.fetch_max(s2 + 1); + } + + Op::WeakDelete { key } => { + let key = &key.0; + let s1 = seqno_with.next(); + let s2 = seqno_without.next(); + tree_with.remove_weak(key.clone(), s1); + tree_without.remove_weak(key.clone(), s2); + vis_with.fetch_max(s1 + 1); + vis_without.fetch_max(s2 + 1); + } + + // ----- Structure ops ----- + Op::Flush => { + tree_with.flush_active_memtable(0).unwrap(); + tree_without.flush_active_memtable(0).unwrap(); + } + + Op::Compact => { + let s1 = vis_with.get(); + let s2 = vis_without.get(); + let _ = tree_with.compact(compaction_strategy.clone(), s1); + let _ = tree_without.compact(compaction_strategy.clone(), s2); + // Compaction may GC old super versions, invalidating the + // raw snapshot seqno we captured (we don't pin it via the + // tree's snapshot API, so the version can be purged). 
+ snapshot_seqno_with = None; + snapshot_seqno_without = None; + } + + Op::MajorCompact { small_tables } => { + let target = if *small_tables { 128 } else { 4_096 }; + let s1 = vis_with.get(); + let s2 = vis_without.get(); + let _ = tree_with.major_compact(target, s1); + let _ = tree_without.major_compact(target, s2); + snapshot_seqno_with = None; + snapshot_seqno_without = None; + } + + Op::Reopen => { + drop(tree_with); + drop(tree_without); + tree_with = open_tree( + &dir_with, + &seqno_with, + &vis_with, + Some(current_extractor.clone()), + bloom_bpk, + filter_partitioning, + ); + tree_without = open_tree( + &dir_without, + &seqno_without, + &vis_without, + None, + bloom_bpk, + filter_partitioning, + ); + // Reopen rebuilds the super version list from scratch; + // old snapshot seqnos are no longer valid. + snapshot_seqno_with = None; + snapshot_seqno_without = None; + } + + Op::ReopenNewExtractor { new_extractor } => { + current_extractor = new_extractor.into_extractor(); + drop(tree_with); + drop(tree_without); + tree_with = open_tree( + &dir_with, + &seqno_with, + &vis_with, + Some(current_extractor.clone()), + bloom_bpk, + filter_partitioning, + ); + tree_without = open_tree( + &dir_without, + &seqno_without, + &vis_without, + None, + bloom_bpk, + filter_partitioning, + ); + snapshot_seqno_with = None; + snapshot_seqno_without = None; + } + + // ----- Point reads ----- + Op::Get { key } => { + let key = &key.0; + let s1 = vis_with.get(); + let s2 = vis_without.get(); + let r1 = tree_with.get(key.as_slice(), s1).unwrap(); + let r2 = tree_without.get(key.as_slice(), s2).unwrap(); + assert_eq!(r1, r2, "op {i}: point read mismatch for key {key:?}"); + } + + Op::ContainsKey { key } => { + let key = &key.0; + let s1 = vis_with.get(); + let s2 = vis_without.get(); + let r1 = tree_with.contains_key(key.as_slice(), s1).unwrap(); + let r2 = tree_without.contains_key(key.as_slice(), s2).unwrap(); + assert_eq!(r1, r2, "op {i}: contains_key mismatch for key {key:?}"); + 
} + + // ----- Scans ----- + Op::PrefixScan { prefix } => { + let prefix = &prefix.0; + let s1 = vis_with.get(); + let s2 = vis_without.get(); + let a = collect_kv(tree_with.prefix(prefix.clone(), s1, None)); + let b = collect_kv(tree_without.prefix(prefix.clone(), s2, None)); + assert_eq!(a, b, "op {i}: prefix scan mismatch for {prefix:?}"); + } + + Op::PrefixScanRev { prefix } => { + let prefix = &prefix.0; + let s1 = vis_with.get(); + let s2 = vis_without.get(); + let a = collect_kv(tree_with.prefix(prefix.clone(), s1, None).rev()); + let b = collect_kv(tree_without.prefix(prefix.clone(), s2, None).rev()); + assert_eq!(a, b, "op {i}: reverse prefix scan mismatch for {prefix:?}"); + } + + Op::PrefixPingPong { prefix, directions } => { + let prefix = &prefix.0; + let s1 = vis_with.get(); + let s2 = vis_without.get(); + let mut iter_with = tree_with.prefix(prefix.clone(), s1, None); + let mut iter_without = tree_without.prefix(prefix.clone(), s2, None); + + for (j, &go_back) in directions.iter().enumerate() { + let item_with = if go_back { + iter_with.next_back() + } else { + iter_with.next() + }; + let item_without = if go_back { + iter_without.next_back() + } else { + iter_without.next() + }; + + let kv_with = + item_with.map(|g| g.into_inner().expect("iter item should be readable")); + let kv_without = + item_without.map(|g| g.into_inner().expect("iter item should be readable")); + + match (&kv_with, &kv_without) { + (Some((k1, v1)), Some((k2, v2))) => { + assert_eq!( + (k1.as_ref(), v1.as_ref()), + (k2.as_ref(), v2.as_ref()), + "op {i} step {j}: ping-pong mismatch for prefix {prefix:?}, go_back={go_back}", + ); + } + (None, None) => {} + _ => { + panic!( + "op {i} step {j}: ping-pong length mismatch for prefix {prefix:?}: \ + with={kv_with:?}, without={kv_without:?}", + ); + } + } + } + } + + Op::RangeScan { start, end } => { + let (lo, hi) = ordered_range(&start.0, &end.0); + if lo == hi { + continue; + } + let s1 = vis_with.get(); + let s2 = 
vis_without.get(); + let a = collect_kv(tree_with.range(lo..hi, s1, None)); + let b = collect_kv(tree_without.range(lo..hi, s2, None)); + assert_eq!(a, b, "op {i}: range scan mismatch for {lo:?}..{hi:?}"); + } + + Op::RangeScanRev { start, end } => { + let (lo, hi) = ordered_range(&start.0, &end.0); + if lo == hi { + continue; + } + let s1 = vis_with.get(); + let s2 = vis_without.get(); + let a = collect_kv(tree_with.range(lo..hi, s1, None).rev()); + let b = collect_kv(tree_without.range(lo..hi, s2, None).rev()); + assert_eq!( + a, b, + "op {i}: reverse range scan mismatch for {lo:?}..{hi:?}" + ); + } + + Op::FirstKV => { + let s1 = vis_with.get(); + let s2 = vis_without.get(); + let r1 = tree_with.first_key_value(s1, None).map(|g| { + g.into_inner() + .expect("first_key_value item should be readable") + }); + let r2 = tree_without.first_key_value(s2, None).map(|g| { + g.into_inner() + .expect("first_key_value item should be readable") + }); + match (&r1, &r2) { + (Some((k1, v1)), Some((k2, v2))) => { + assert_eq!( + (k1.as_ref(), v1.as_ref()), + (k2.as_ref(), v2.as_ref()), + "op {i}: first_key_value mismatch" + ); + } + (None, None) => {} + _ => panic!( + "op {i}: first_key_value presence mismatch: with={r1:?}, without={r2:?}" + ), + } + } + + Op::LastKV => { + let s1 = vis_with.get(); + let s2 = vis_without.get(); + let r1 = tree_with.last_key_value(s1, None).map(|g| { + g.into_inner() + .expect("last_key_value item should be readable") + }); + let r2 = tree_without.last_key_value(s2, None).map(|g| { + g.into_inner() + .expect("last_key_value item should be readable") + }); + match (&r1, &r2) { + (Some((k1, v1)), Some((k2, v2))) => { + assert_eq!( + (k1.as_ref(), v1.as_ref()), + (k2.as_ref(), v2.as_ref()), + "op {i}: last_key_value mismatch" + ); + } + (None, None) => {} + _ => panic!( + "op {i}: last_key_value presence mismatch: with={r1:?}, without={r2:?}" + ), + } + } + + Op::PrefixScanExistingKey { prefix_len } => { + if inserted_keys.is_empty() { + continue; 
+ } + // Pick a key deterministically from the inserted set + let key = &inserted_keys[i % inserted_keys.len()]; + let plen = (*prefix_len as usize).min(key.len()); + let prefix = &key[..plen]; + + let s1 = vis_with.get(); + let s2 = vis_without.get(); + let a = collect_kv(tree_with.prefix(prefix.to_vec(), s1, None)); + let b = collect_kv(tree_without.prefix(prefix.to_vec(), s2, None)); + assert_eq!( + a, b, + "op {i}: existing-key prefix scan mismatch for prefix {prefix:?} (from key {key:?})" + ); + } + + Op::FlushCompactReopenCompact { new_extractor } => { + // Phase 1: flush and compact current data with small target + // to produce many tables + tree_with.flush_active_memtable(0).unwrap(); + tree_without.flush_active_memtable(0).unwrap(); + let s1 = vis_with.get(); + let s2 = vis_without.get(); + let _ = tree_with.major_compact(128, s1); + let _ = tree_without.major_compact(128, s2); + + // Phase 2: reopen with different extractor + current_extractor = new_extractor.into_extractor(); + drop(tree_with); + drop(tree_without); + tree_with = open_tree( + &dir_with, + &seqno_with, + &vis_with, + Some(current_extractor.clone()), + bloom_bpk, + filter_partitioning, + ); + tree_without = open_tree( + &dir_without, + &seqno_without, + &vis_without, + None, + bloom_bpk, + filter_partitioning, + ); + snapshot_seqno_with = None; + snapshot_seqno_without = None; + } + + // ----- MVCC ----- + Op::TakeSnapshot => { + snapshot_seqno_with = Some(vis_with.get()); + snapshot_seqno_without = Some(vis_without.get()); + } + + Op::SnapshotGet { key } => { + let (Some(sw), Some(so)) = (snapshot_seqno_with, snapshot_seqno_without) else { + continue; // No snapshot taken yet, skip + }; + let key = &key.0; + let r1 = tree_with.get(key.as_slice(), sw).unwrap(); + let r2 = tree_without.get(key.as_slice(), so).unwrap(); + assert_eq!( + r1, r2, + "op {i}: snapshot point read mismatch for key {key:?} at seqno ({sw}, {so})" + ); + } + + Op::SnapshotPrefixScan { prefix } => { + let (Some(sw), 
Some(so)) = (snapshot_seqno_with, snapshot_seqno_without) else { + continue; // No snapshot taken yet, skip + }; + let prefix = &prefix.0; + let a = collect_kv(tree_with.prefix(prefix.clone(), sw, None)); + let b = collect_kv(tree_without.prefix(prefix.clone(), so, None)); + assert_eq!( + a, b, + "op {i}: snapshot prefix scan mismatch for {prefix:?} at seqno ({sw}, {so})" + ); + } + } + } +} + +// --------------------------------------------------------------------------- +// AFL entry point +// --------------------------------------------------------------------------- + +fn main() { + fuzz!(|data: &[u8]| { + let mut u = Unstructured::new(data); + let Ok(input) = FuzzInput::arbitrary(&mut u) else { + return; + }; + + // Limit op count so each iteration stays fast for AFL. + if input.ops.is_empty() || input.ops.len() > 200 { + return; + } + // Cap PrefixPingPong directions to avoid very slow iterations. + for op in &input.ops { + if let Op::PrefixPingPong { directions, .. } = op { + if directions.len() > 50 { + return; + } + } + } + + run_oracle_test( + &input.extractor, + input.bpk.value(), + &input.filter_partitioning, + &input.ops, + ); + }); +} diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index 73ea1c119..c68c83b3b 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -216,18 +216,25 @@ impl AbstractTree for BlobTree { let super_version = self.index.get_version_for_snapshot(seqno); let tree = self.clone(); - let range = prefix_to_range(prefix.as_ref()); + let prefix_bytes = prefix.as_ref(); + let range = prefix_to_range(prefix_bytes); Box::new( - crate::Tree::create_internal_range(super_version.clone(), &range, seqno, index).map( - move |kv| { - IterGuardImpl::Blob(Guard { - tree: tree.clone(), - version: super_version.version.clone(), - kv, - }) - }, - ), + crate::Tree::create_internal_range( + super_version.clone(), + &range, + seqno, + index, + self.index.config.prefix_extractor.clone(), + Some(prefix_bytes), + ) + .map(move |kv| { + 
IterGuardImpl::Blob(Guard { + tree: tree.clone(), + version: super_version.version.clone(), + kv, + }) + }), ) } @@ -241,15 +248,21 @@ impl AbstractTree for BlobTree { let tree = self.clone(); Box::new( - crate::Tree::create_internal_range(super_version.clone(), &range, seqno, index).map( - move |kv| { - IterGuardImpl::Blob(Guard { - tree: tree.clone(), - version: super_version.version.clone(), - kv, - }) - }, - ), + crate::Tree::create_internal_range( + super_version.clone(), + &range, + seqno, + index, + self.index.config.prefix_extractor.clone(), + None, + ) + .map(move |kv| { + IterGuardImpl::Blob(Guard { + tree: tree.clone(), + version: super_version.version.clone(), + kv, + }) + }), ) } @@ -384,7 +397,12 @@ impl AbstractTree for BlobTree { Bloom(policy) => policy, None => BloomConstructionPolicy::BitsPerKey(0.0), } - }); + }) + // Ensure tables built during blob tree flush carry the configured extractor. + // This lets writers register prefixes and persist the extractor name in metadata + // for compatibility checks at read time. + .use_prefix_extractor(self.index.config.prefix_extractor.clone()) + .use_whole_key_filtering(self.index.config.whole_key_filtering); if index_partitioning { table_writer = table_writer.use_partitioned_index(); @@ -594,7 +612,12 @@ impl AbstractTree for BlobTree { .expect("lock is poisoned") .get_version_for_snapshot(seqno); - let Some(item) = crate::Tree::get_internal_entry_from_version(&super_version, key, seqno)? + let Some(item) = crate::Tree::get_internal_entry_from_version( + &super_version, + key, + seqno, + &self.index.config, + )? 
else { return Ok(None); }; diff --git a/src/compaction/flavour.rs b/src/compaction/flavour.rs index 9a5b34723..e70fbfd46 100644 --- a/src/compaction/flavour.rs +++ b/src/compaction/flavour.rs @@ -113,7 +113,9 @@ pub(super) fn prepare_table_writer( None => BloomConstructionPolicy::BitsPerKey(0.0), } } - })) + }) + .use_prefix_extractor(opts.config.prefix_extractor.clone()) + .use_whole_key_filtering(opts.config.whole_key_filtering)) } // TODO: find a better name diff --git a/src/config/mod.rs b/src/config/mod.rs index 8b8aac8dd..44adcdfcb 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -20,8 +20,9 @@ pub use restart_interval::RestartIntervalPolicy; pub type PartitioningPolicy = PinningPolicy; use crate::{ - compaction::filter::Factory, path::absolute_path, version::DEFAULT_LEVEL_COUNT, AnyTree, - BlobTree, Cache, CompressionType, DescriptorTable, SequenceNumberCounter, Tree, + compaction::filter::Factory, path::absolute_path, prefix::SharedPrefixExtractor, + version::DEFAULT_LEVEL_COUNT, AnyTree, BlobTree, Cache, CompressionType, DescriptorTable, + SequenceNumberCounter, Tree, }; use std::{ path::{Path, PathBuf}, @@ -229,6 +230,19 @@ pub struct Config { /// Compaction filter factory pub compaction_filter_factory: Option>, + /// Optional prefix extractor used to construct prefix-aware filters. + /// When set, the table writer will add extracted prefixes (instead of full keys) + /// to filters and persist the extractor name in table metadata for compatibility checks. + pub prefix_extractor: Option, + + /// When true (the default), full-key hashes are added to the filter alongside + /// prefix hashes. This allows point reads to use the full-key Bloom for precise + /// filtering, while prefix scans use the coarser prefix filter. + /// + /// Set to false for workloads that only perform prefix scans and never point + /// lookups, saving filter space by omitting full-key hashes. 
+ pub whole_key_filtering: bool, + #[doc(hidden)] pub kv_separation_opts: Option, @@ -290,6 +304,8 @@ impl Default for Config { )), compaction_filter_factory: None, + prefix_extractor: None, + whole_key_filtering: true, expect_point_read_hits: false, @@ -313,6 +329,30 @@ impl Config { } } + /// Sets the prefix extractor for building prefix-aware filters. + /// If set, extracted prefixes are added to filters and the extractor name is stored in table metadata. + #[must_use] + pub fn prefix_extractor(mut self, extractor: SharedPrefixExtractor) -> Self { + self.prefix_extractor = Some(extractor); + self + } + + /// Controls whether full-key hashes are added to the filter alongside + /// prefix hashes when a prefix extractor is configured. + /// + /// Defaults to `true`. When enabled, point reads benefit from precise + /// full-key Bloom filtering in addition to coarse prefix filtering. + /// + /// Set to `false` for seek-only workloads that never perform point lookups, + /// reducing filter size by omitting full-key hashes. + /// + /// Has no effect when no prefix extractor is configured. + #[must_use] + pub fn whole_key_filtering(mut self, enabled: bool) -> Self { + self.whole_key_filtering = enabled; + self + } + /// Sets the global cache. /// /// You can create a global [`Cache`] and share it between multiple diff --git a/src/lib.rs b/src/lib.rs index d429d71c5..760193043 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -122,6 +122,9 @@ pub mod mvcc_stream; mod path; +/// Prefix extraction for filters +pub mod prefix; + #[doc(hidden)] pub mod range; diff --git a/src/prefix.rs b/src/prefix.rs new file mode 100644 index 000000000..705004769 --- /dev/null +++ b/src/prefix.rs @@ -0,0 +1,376 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +use std::sync::Arc; + +/// Trait for extracting prefixes from keys for prefix filters. 
+/// +/// A prefix extractor allows the filter to index prefixes of keys +/// instead of (or in addition to) the full keys. +/// This enables efficient filtering for prefix-based queries. +/// +/// # Examples +/// +/// ## Simple fixed-length +/// +/// ``` +/// use lsm_tree::prefix::{PrefixExtractor, FixedPrefixExtractor}; +/// +/// let ex = FixedPrefixExtractor::new(3); +/// assert_eq!(ex.extract_first(b"abcdef"), Some(b"abc".as_ref())); +/// assert_eq!(ex.extract_first(b"ab"), None); // shorter than prefix length +/// ``` +/// +/// ## Segmented prefixes (e.g., `account_id#user_id)` +/// +/// ``` +/// use lsm_tree::prefix::PrefixExtractor; +/// +/// struct SegmentedPrefixExtractor; +/// +/// impl PrefixExtractor for SegmentedPrefixExtractor { +/// fn extract<'a>(&self, key: &'a [u8]) -> Box + 'a> { +/// let mut prefixes = vec![]; +/// let mut end = 0; +/// for (i, &byte) in key.iter().enumerate() { +/// if byte == b'#' { +/// prefixes.push(&key[0..i]); +/// end = i; +/// } +/// } +/// if end < key.len() { +/// prefixes.push(key); +/// } +/// Box::new(prefixes.into_iter()) +/// } +/// +/// fn name(&self) -> &str { +/// "segmented_prefix" +/// } +/// } +/// +/// let ex = SegmentedPrefixExtractor; +/// assert_eq!(ex.name(), "segmented_prefix"); +/// let prefixes: Vec<_> = ex.extract(b"acc#usr#data").collect(); +/// assert_eq!(prefixes, vec![b"acc".as_ref(), b"acc#usr", b"acc#usr#data"]); +/// let prefixes: Vec<_> = ex.extract(b"plain_key").collect(); +/// assert_eq!(prefixes, vec![b"plain_key".as_ref()]); +/// ``` +pub trait PrefixExtractor: + Send + Sync + std::panic::UnwindSafe + std::panic::RefUnwindSafe +{ + /// Extracts zero or more prefixes from a key. + /// + /// All prefixes will be added to the filter during table construction. + /// + /// An empty iterator means the key is "out of domain" and won't be added to the filter. + fn extract<'a>(&self, key: &'a [u8]) -> Box + 'a>; + + /// Extracts the first prefix from a key. 
+ /// + /// By default, this is derived from `extract`, meaning it is equivalent to `extract(key).next()`, + /// however it can be overridden to skip the Box allocation of `extract` in some cases. + /// + /// Implementations that override this method must remain semantically identical to + /// `self.extract(key).next()`: return `None` if `extract` would yield no prefixes, + /// and otherwise return the same first prefix. Violating this invariant can cause + /// incorrect filter pruning during reads. + fn extract_first<'a>(&self, key: &'a [u8]) -> Option<&'a [u8]> { + self.extract(key).next() + } + + /// Extracts the most specific (last) prefix from a key. + /// + /// For single-prefix extractors, this is the same as `extract_first`. + /// For multi-prefix extractors, this returns the highest-cardinality + /// prefix, which gives the best Bloom filter pruning. + /// + /// Defaults to consuming `extract(key)` to get the last element. + /// Can be overridden to avoid the Box allocation when the last prefix + /// can be computed directly. + fn extract_last<'a>(&self, key: &'a [u8]) -> Option<&'a [u8]> { + self.extract(key).last() + } + + /// Returns a stable compatibility identifier for this prefix extractor. + /// + /// This value is persisted in table metadata and compared on reopen to determine + /// whether the filter is compatible with the current extractor. It must change + /// whenever extraction behavior or any behavior-affecting parameter changes + /// (e.g. prefix length). + fn name(&self) -> &str; +} + +/// A prefix extractor that returns the full key. +/// +/// Useful when callers want prefix-aware filtering that behaves identically to +/// full-key filtering (e.g., for testing or as an explicit no-op extractor). 
+pub struct FullKeyExtractor; + +impl PrefixExtractor for FullKeyExtractor { + fn extract<'a>(&self, key: &'a [u8]) -> Box + 'a> { + Box::new(std::iter::once(key)) + } + + fn extract_last<'a>(&self, key: &'a [u8]) -> Option<&'a [u8]> { + self.extract_first(key) + } + + fn extract_first<'a>(&self, key: &'a [u8]) -> Option<&'a [u8]> { + Some(key) + } + + fn name(&self) -> &'static str { + "full_key" + } +} + +/// A prefix extractor that returns a fixed-length prefix. +/// +/// Keys shorter than the prefix length are considered "out of domain" and +/// return `None`. This prevents the prefix filter from producing false +/// negatives when a prefix query key is shorter than the configured length. +pub struct FixedPrefixExtractor { + length: usize, + name: String, +} + +impl FixedPrefixExtractor { + /// Creates a new fixed-length prefix extractor. + #[must_use] + pub fn new(length: usize) -> Self { + Self { + length, + name: format!("fixed_prefix:{length}"), + } + } +} + +impl PrefixExtractor for FixedPrefixExtractor { + fn extract<'a>(&self, key: &'a [u8]) -> Box + 'a> { + Box::new(self.extract_first(key).into_iter()) + } + + fn extract_first<'a>(&self, key: &'a [u8]) -> Option<&'a [u8]> { + if key.len() < self.length { + None + } else { + key.get(..self.length) + } + } + + fn extract_last<'a>(&self, key: &'a [u8]) -> Option<&'a [u8]> { + self.extract_first(key) + } + + fn name(&self) -> &str { + &self.name + } +} + +/// A prefix extractor that requires keys to be at least a certain length. +/// +/// Keys shorter than the required length are considered "out of domain" +/// and won't be added to the filter. +/// This matches `RocksDB`'s behavior. +pub struct FixedLengthExtractor { + length: usize, + name: String, +} + +impl FixedLengthExtractor { + /// Creates a new fixed-length extractor. 
+ #[must_use] + pub fn new(length: usize) -> Self { + Self { + length, + name: format!("fixed_length:{length}"), + } + } +} + +impl PrefixExtractor for FixedLengthExtractor { + fn extract<'a>(&self, key: &'a [u8]) -> Box + 'a> { + Box::new(self.extract_first(key).into_iter()) + } + + fn extract_first<'a>(&self, key: &'a [u8]) -> Option<&'a [u8]> { + if key.len() < self.length { + // Key is too short - out of domain + None + } else { + #[expect( + clippy::expect_used, + reason = "length is already validated via key.len() >= self.length" + )] + Some( + key.get(..self.length) + .expect("prefix slice should be in bounds"), + ) + } + } + + fn extract_last<'a>(&self, key: &'a [u8]) -> Option<&'a [u8]> { + self.extract_first(key) + } + + fn name(&self) -> &str { + &self.name + } +} + +/// Users can implement their own prefix extractors that return multiple prefixes. +/// The filter will include all returned prefixes. +/// +/// # Examples +/// +/// ``` +/// use lsm_tree::prefix::PrefixExtractor; +/// use std::sync::Arc; +/// +/// // Example 1: Hierarchical prefix extractor based on delimiter +/// // For key "user/123/data" with delimiter '/', generates: +/// // - "user" +/// // - "user/123" +/// // - "user/123/data" (full key) +/// struct HierarchicalPrefixExtractor { +/// delimiter: u8, +/// } +/// +/// impl PrefixExtractor for HierarchicalPrefixExtractor { +/// fn extract<'a>(&self, key: &'a [u8]) -> Box + 'a> { +/// let delimiter = self.delimiter; +/// let mut prefixes = Vec::new(); +/// +/// // Generate all prefixes up to each delimiter +/// for (i, &byte) in key.iter().enumerate() { +/// if byte == delimiter { +/// prefixes.push(&key[0..i]); +/// } +/// } +/// +/// // Always include the full key +/// prefixes.push(key); +/// +/// Box::new(prefixes.into_iter()) +/// } +/// +/// fn name(&self) -> &str { +/// "hierarchical_prefix" +/// } +/// } +/// +/// // Example 2: Extract domain prefix for flipped email keys +/// // For "example.com@user", this extracts: +/// // - 
"example.com" (domain prefix for range scans) +/// // - "example.com@user" (full key for point lookups) +/// struct DomainPrefixExtractor; +/// +/// impl PrefixExtractor for DomainPrefixExtractor { +/// fn extract<'a>(&self, key: &'a [u8]) -> Box + 'a> { +/// if let Ok(key_str) = std::str::from_utf8(key) { +/// if let Some(at_pos) = key_str.find('@') { +/// // Return both domain prefix and full key +/// let domain_prefix = &key[..at_pos]; +/// return Box::new(vec![domain_prefix, key].into_iter()); +/// } +/// } +/// // If not a flipped email format, just return the full key +/// Box::new(std::iter::once(key)) +/// } +/// +/// fn name(&self) -> &str { +/// "domain_prefix" +/// } +/// } +/// +/// let ex = DomainPrefixExtractor; +/// assert_eq!(ex.name(), "domain_prefix"); +/// let prefixes: Vec<_> = ex.extract(b"example.com@user").collect(); +/// assert_eq!(prefixes, vec![b"example.com".as_ref(), b"example.com@user"]); +/// ``` +/// Type alias for a shared prefix extractor +pub type SharedPrefixExtractor = Arc; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_full_key_extractor() { + let extractor = FullKeyExtractor; + let key = b"test_key"; + let prefixes: Vec<_> = extractor.extract(key).collect(); + assert_eq!(prefixes.len(), 1); + assert_eq!(prefixes.first(), Some(&b"test_key".as_ref())); + } + + #[test] + fn test_fixed_prefix_extractor() { + let extractor = FixedPrefixExtractor::new(5); + + // Key longer than prefix + let key = b"longer_key"; + let prefixes: Vec<_> = extractor.extract(key).collect(); + assert_eq!(prefixes.len(), 1); + assert_eq!(prefixes.first(), Some(&b"longe".as_ref())); + + // Key shorter than prefix — out of domain + let key = b"key"; + let prefixes: Vec<_> = extractor.extract(key).collect(); + assert_eq!(prefixes.len(), 0); + + // Key exactly prefix length + let key = b"exact"; + let prefixes: Vec<_> = extractor.extract(key).collect(); + assert_eq!(prefixes.len(), 1); + assert_eq!(prefixes.first(), 
Some(&b"exact".as_ref())); + } + + #[test] + fn test_empty_key() { + let full_key = FullKeyExtractor; + let fixed = FixedPrefixExtractor::new(5); + + let key = b""; + + let prefixes: Vec<_> = full_key.extract(key).collect(); + assert_eq!(prefixes.len(), 1); + assert_eq!(prefixes.first(), Some(&b"".as_ref())); + + let prefixes: Vec<_> = fixed.extract(key).collect(); + assert_eq!(prefixes.len(), 0); // empty key is shorter than prefix length + } + + #[test] + fn test_fixed_length_extractor() { + let extractor = FixedLengthExtractor::new(5); + + // Key shorter than required length - out of domain + let key = b"abc"; + let prefixes: Vec<_> = extractor.extract(key).collect(); + assert_eq!(prefixes.len(), 0); // Empty iterator + + // Key exactly required length + let key = b"exact"; + let prefixes: Vec<_> = extractor.extract(key).collect(); + assert_eq!(prefixes.len(), 1); + assert_eq!(prefixes.first(), Some(&b"exact".as_ref())); + + // Key longer than required length + let key = b"longer_key"; + let prefixes: Vec<_> = extractor.extract(key).collect(); + assert_eq!(prefixes.len(), 1); + assert_eq!(prefixes.first(), Some(&b"longe".as_ref())); + } + + #[test] + fn test_extractor_names() { + assert_eq!(FullKeyExtractor.name(), "full_key"); + assert_eq!(FixedPrefixExtractor::new(4).name(), "fixed_prefix:4"); + assert_eq!(FixedPrefixExtractor::new(3).name(), "fixed_prefix:3"); + assert_eq!(FixedLengthExtractor::new(4).name(), "fixed_length:4"); + assert_eq!(FixedLengthExtractor::new(3).name(), "fixed_length:3"); + } +} diff --git a/src/range.rs b/src/range.rs index c0cd5df94..13a632fff 100644 --- a/src/range.rs +++ b/src/range.rs @@ -7,6 +7,7 @@ use crate::{ memtable::Memtable, merge::Merger, mvcc_stream::MvccStream, + prefix::SharedPrefixExtractor, run_reader::RunReader, value::{SeqNo, UserKey}, version::SuperVersion, @@ -68,6 +69,13 @@ pub fn prefix_to_range(prefix: &[u8]) -> (Bound, Bound) { pub struct IterState { pub(crate) version: SuperVersion, pub(crate) ephemeral: 
Option<(Arc, SeqNo)>, + pub(crate) prefix_extractor: Option, + + /// When set, this is the original prefix from a `tree.prefix()` call. + /// It allows the filter layer to consult the prefix filter even when the + /// range bounds (produced by `prefix_to_range`) have different extracted + /// prefixes. + pub(crate) prefix_hint: Option, } type BoxedMerge<'a> = Box> + Send + 'a>; @@ -96,6 +104,10 @@ impl DoubleEndedIterator for TreeIter { } impl TreeIter { + #[expect( + clippy::too_many_lines, + reason = "extended with prefix-hint validation and upfront pruning" + )] pub fn create_range, R: RangeBounds>( guard: IterState, range: R, @@ -165,17 +177,35 @@ impl TreeIter { range.start_bound().map(|x| &*x.user_key), range.end_bound().map(|x| &*x.user_key), )) { - let reader = table - .range(( + let mut skip = false; + if let Some(ex) = lock.prefix_extractor.as_ref() { + let ref_range = ( range.start_bound().map(|x| &x.user_key).cloned(), range.end_bound().map(|x| &x.user_key).cloned(), - )) - .filter(move |item| match item { - Ok(item) => seqno_filter(item.key.seqno, seqno), - Err(_) => true, - }); - - iters.push(Box::new(reader)); + ); + let hint = lock.prefix_hint.as_deref(); + if table.should_skip_range_by_prefix_filter( + &ref_range, + ex.as_ref(), + hint, + ) { + skip = true; + } + } + + if !skip { + let reader = table + .range(( + range.start_bound().map(|x| &x.user_key).cloned(), + range.end_bound().map(|x| &x.user_key).cloned(), + )) + .filter(move |item| match item { + Ok(item) => seqno_filter(item.key.seqno, seqno), + Err(_) => true, + }); + + iters.push(Box::new(reader)); + } } } _ => { @@ -185,6 +215,8 @@ impl TreeIter { range.start_bound().map(|x| &x.user_key).cloned(), range.end_bound().map(|x| &x.user_key).cloned(), ), + lock.prefix_extractor.clone(), + lock.prefix_hint.as_ref(), ) { iters.push(Box::new(reader.filter(move |item| match item { Ok(item) => seqno_filter(item.key.seqno, seqno), diff --git a/src/run_reader.rs b/src/run_reader.rs index 
96384e66b..90ee78504 100644 --- a/src/run_reader.rs +++ b/src/run_reader.rs @@ -2,7 +2,10 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use crate::{version::Run, BoxedIterator, InternalValue, Table, UserKey}; +use crate::{ + prefix::SharedPrefixExtractor, version::Run, BoxedIterator, InternalValue, Table, UserKey, +}; +use std::ops::Bound::{self}; use std::{ ops::{Deref, RangeBounds}, sync::Arc, @@ -15,46 +18,235 @@ pub struct RunReader { hi: usize, lo_reader: Option>, hi_reader: Option>, + + // Owned range bounds for creating new per-table readers during iteration + range_start: Bound, + range_end: Bound, + + // Optional extractor for prefix-aware pruning during lazy advancement + extractor: Option, + + // Pre-validated prefix hint for direct filter probing during lazy iteration. + // Set only when the prefix_hint passes the stability guard + // (extract_X(hint) == extract_X(hint + "\0")) for either extract_last + // (preferred, most specific prefix → best pruning) or, as a fallback, + // extract_first. When set, `validated_prefix_hash` holds the hash of + // that validated prefix, so probe_prefix_filter_with_hash can be called + // directly without per-table guard checks, extract() Box allocations, + // or rehashing. + validated_prefix_hint: Option, + + // Precomputed hash of the validated prefix (see `validated_prefix_hint`). + // Avoids per-table extract() Box allocation and hash computation in the + // lazy loop hot path. + validated_prefix_hash: Option, } impl RunReader { + /// Creates a run reader over a disjoint set of tables. Returns None when up-front + /// prefix filter pruning determines that no table in the run may contain keys for the range. + /// Uses common-prefix pruning only; per-table skipping happens lazily during iteration. 
#[must_use] - pub fn new + Clone + Send + 'static>( + #[expect( + clippy::too_many_lines, + reason = "extended with prefix-hint validation, hash precomputation, and upfront pruning" + )] + pub fn new>( run: Arc>, range: R, + extractor: Option, + prefix_hint: Option<&UserKey>, ) -> Option { + const MAX_UPFRONT_CHECKS: usize = 10; + assert!(!run.is_empty(), "level reader cannot read empty level"); let (lo, hi) = run.range_overlap_indexes(&range)?; - Some(Self::culled(run, range, (Some(lo), Some(hi)))) + // Validate the prefix hint and precompute the most specific stable + // prefix hash. For multi-prefix extractors, the last (most specific) + // prefix gives better Bloom filter pruning than the first (coarsest). + // + // We try extract_last first (higher cardinality → fewer false positives). + // If its stability guard fails, fall back to extract_first. + // Stability guard: extract_X(hint) == extract_X(hint + "\0"). + let (validated_prefix_hint, validated_prefix_hash) = + if let (Some(hint), Some(ex)) = (prefix_hint.as_ref(), extractor.as_ref()) { + let hint_bytes: &[u8] = hint.as_ref(); + let mut extended = Vec::with_capacity(hint_bytes.len() + 1); + extended.extend_from_slice(hint_bytes); + extended.push(0u8); + + // Try the most specific prefix first + let last_hint = ex.extract_last(hint_bytes); + let last_extended = ex.extract_last(&extended); + let best_hash = if let (Some(lh), Some(le)) = (last_hint, last_extended) { + if lh == le { + Some(crate::table::filter::standard_bloom::Builder::get_hash(lh)) + } else { + None + } + } else { + None + }; + + if let Some(hash) = best_hash { + (Some((*hint).clone()), Some(hash)) + } else { + // Fall back to first prefix + let first_hint = ex.extract_first(hint_bytes); + let first_extended = ex.extract_first(&extended); + match (first_hint, first_extended) { + (Some(fh), Some(fe)) if fh == fe => { + let hash = crate::table::filter::standard_bloom::Builder::get_hash(fh); + (Some((*hint).clone()), Some(hash)) + } + _ => 
(None, None), + } + } + } else { + (None, None) + }; + + // Determine whether upfront pruning is possible: we need a common extracted + // prefix that applies to all keys in the range. Either from the validated + // hint (single allocation above) or by comparing both range bounds. + let can_prune_upfront = if validated_prefix_hint.is_some() { + true + } else if let Some(ex) = extractor.as_ref() { + let start_first = match range.start_bound() { + Bound::Included(uk) | Bound::Excluded(uk) => ex.extract_first(uk.as_ref()), + Bound::Unbounded => None, + }; + let end_first = match range.end_bound() { + Bound::Included(uk) | Bound::Excluded(uk) => ex.extract_first(uk.as_ref()), + Bound::Unbounded => None, + }; + matches!((start_first, end_first), (Some(s), Some(e)) if s == e) + } else { + false + }; + + // Early optimization: upfront pruning + if let Some(ex) = extractor.as_ref() { + if can_prune_upfront { + // Compute probe key once: use validated hint or start bound + let start_key = match range.start_bound() { + std::ops::Bound::Included(k) | std::ops::Bound::Excluded(k) => Some(k.as_ref()), + std::ops::Bound::Unbounded => None, + }; + let probe = if let Some(ref hint) = validated_prefix_hint { + hint.as_ref() + } else { + #[expect( + clippy::expect_used, + reason = "can_prune_upfront is only true when both bounds are concrete or a validated hint is set" + )] + let key = + start_key.expect("can_prune_upfront requires both bounds to be concrete"); + key + }; + + let mut checks = 0usize; + let mut has_potential_match = false; + + for idx in lo..=hi { + #[expect( + clippy::expect_used, + reason = "lo..=hi is bounded by run.len() from range_overlap_indexes" + )] + let table = run.deref().get(idx).expect("should exist"); + // SAFETY INVARIANT: range_overlap_indexes uses binary search on + // table min/max keys and is exact for disjoint sorted runs — + // every table in lo..=hi genuinely overlaps the query range. + // If this invariant were ever violated (e.g. 
by a future refactor), + // the impact is benign: table.range() would return an empty iterator + // (no data corruption, just wasted I/O). + debug_assert!( + table.check_key_range_overlap(&( + range.start_bound().map(AsRef::as_ref), + range.end_bound().map(AsRef::as_ref), + )), + "range_overlap_indexes returned a non-overlapping table in upfront pruning" + ); + + if !matches!( + table.maybe_contains_prefix(probe, ex.as_ref()), + Ok(Some(false)) + ) { + has_potential_match = true; + break; + } + + checks += 1; + if checks >= MAX_UPFRONT_CHECKS { + has_potential_match = true; + break; + } + } + + if !has_potential_match { + return None; + } + } + } + + Some(Self::culled( + run, + range, + (Some(lo), Some(hi)), + extractor, + validated_prefix_hint, + validated_prefix_hash, + )) } + /// Creates a run reader with precomputed overlap indices. + /// + /// This variant assumes the caller already determined the overlapping table + /// indices. It initializes boundary table readers and + /// performs lazy per-table prefix-filter skipping during iteration. #[must_use] - pub fn culled + Clone + Send + 'static>( + pub fn culled>( run: Arc>, range: R, (lo, hi): (Option, Option), + extractor: Option, + validated_prefix_hint: Option, + validated_prefix_hash: Option, ) -> Self { + use std::ops::Bound::{Excluded, Included, Unbounded}; + let lo = lo.unwrap_or_default(); let hi = hi.unwrap_or(run.len() - 1); + // Materialize owned range bounds for reuse when creating readers for other tables + let owned_start: std::ops::Bound = match range.start_bound() { + Included(k) => Included(k.clone()), + Excluded(k) => Excluded(k.clone()), + Unbounded => Unbounded, + }; + let owned_end: std::ops::Bound = match range.end_bound() { + Included(k) => Included(k.clone()), + Excluded(k) => Excluded(k.clone()), + Unbounded => Unbounded, + }; + // TODO: lazily init readers? 
#[expect( clippy::expect_used, reason = "we trust the caller to pass valid indexes" )] let lo_table = run.deref().get(lo).expect("should exist"); - let lo_reader = lo_table.range(range.clone()); + let lo_reader = lo_table.range((owned_start.clone(), owned_end.clone())); - // TODO: lazily init readers? let hi_reader = if hi > lo { #[expect( clippy::expect_used, reason = "we trust the caller to pass valid indexes" )] let hi_table = run.deref().get(hi).expect("should exist"); - Some(hi_table.range(range)) + Some(hi_table.range((owned_start.clone(), owned_end.clone()))) } else { None }; @@ -65,6 +257,11 @@ impl RunReader { hi, lo_reader: Some(Box::new(lo_reader)), hi_reader: hi_reader.map(|x| Box::new(x) as BoxedIterator), + range_start: owned_start, + range_end: owned_end, + extractor, + validated_prefix_hint, + validated_prefix_hash, } } } @@ -84,13 +281,88 @@ impl Iterator for RunReader { self.lo += 1; if self.lo < self.hi { - self.lo_reader = Some(Box::new( + // Lazily advance to the next table that overlaps the key range + loop { + if self.lo >= self.hi { + break; + } + #[expect( clippy::expect_used, reason = "hi is at most equal to the last slot; so because 0 <= lo < hi, it must be a valid index" )] - self.run.get(self.lo).expect("should exist").iter(), - )); + let table = self.run.get(self.lo).expect("should exist"); + + // SAFETY INVARIANT: range_overlap_indexes uses binary search on + // table min/max keys and is exact for disjoint sorted runs — + // every table in lo..hi genuinely overlaps the query range. + // If this invariant were ever violated (e.g. by a future refactor), + // the impact is benign: table.range() would return an empty iterator + // (no data corruption, just wasted I/O). 
+ debug_assert!( + table.check_key_range_overlap(&( + self.range_start.as_ref().map(AsRef::as_ref), + self.range_end.as_ref().map(AsRef::as_ref), + )), + "range_overlap_indexes returned a non-overlapping table in forward lazy loop" + ); + + if let Some(ex) = &self.extractor { + // Use the pre-validated hint for a direct probe (no guard + // re-check, no Vec allocation). Falls back to the range-based + // path when no validated hint is available. + // + // prefix_filter_allowed must be checked per-table because + // optimize_runs can merge tables from different runs that + // were built with different extractor configs. + let skip = if let (Some(ref hint), Some(hash)) = + (&self.validated_prefix_hint, self.validated_prefix_hash) + { + if table.prefix_filter_allowed(Some(ex.name())) { + // Fast path: use precomputed hash to avoid + // per-table extract() Box allocation and hashing. + let probe = + table.probe_prefix_filter_with_hash(hint.as_ref(), hash); + + #[cfg(feature = "metrics")] + if matches!(&probe, Ok(Some(_))) { + use std::sync::atomic::Ordering::Relaxed; + table.metrics.filter_queries.fetch_add(1, Relaxed); + } + + let should_skip = matches!(probe, Ok(Some(false))); + + #[cfg(feature = "metrics")] + if should_skip { + use std::sync::atomic::Ordering::Relaxed; + table.metrics.io_skipped_by_filter.fetch_add(1, Relaxed); + } + + should_skip + } else { + false + } + } else { + // should_skip_range_by_prefix_filter handles + // its own metrics internally. 
+ let tmp_range = (self.range_start.clone(), self.range_end.clone()); + table.should_skip_range_by_prefix_filter( + &tmp_range, + ex.as_ref(), + None, + ) + }; + if skip { + self.lo += 1; + continue; + } + } + + let reader = + table.range((self.range_start.clone(), self.range_end.clone())); + self.lo_reader = Some(Box::new(reader)); + break; + } } } else if let Some(hi_reader) = &mut self.hi_reader { // NOTE: We reached the hi marker, so consume from it instead @@ -117,13 +389,80 @@ impl DoubleEndedIterator for RunReader { self.hi -= 1; if self.lo < self.hi { - self.hi_reader = Some(Box::new( + // Lazily move to previous table that overlaps the key range + loop { + if self.hi <= self.lo { + break; + } + #[expect( clippy::expect_used, reason = "because 0 <= lo <= hi, and hi monotonically decreases, hi must be a valid index" )] - self.run.get(self.hi).expect("should exist").iter(), - )); + let table = self.run.get(self.hi).expect("should exist"); + + // SAFETY INVARIANT: range_overlap_indexes uses binary search on + // table min/max keys and is exact for disjoint sorted runs — + // every table in lo..hi genuinely overlaps the query range. + // If this invariant were ever violated (e.g. by a future refactor), + // the impact is benign: table.range() would return an empty iterator + // (no data corruption, just wasted I/O). + debug_assert!( + table.check_key_range_overlap(&( + self.range_start.as_ref().map(AsRef::as_ref), + self.range_end.as_ref().map(AsRef::as_ref), + )), + "range_overlap_indexes returned a non-overlapping table in backward lazy loop" + ); + + if let Some(ex) = &self.extractor { + // prefix_filter_allowed must be checked per-table because + // optimize_runs can merge tables from different runs that + // were built with different extractor configs. 
+ let skip = if let (Some(ref hint), Some(hash)) = + (&self.validated_prefix_hint, self.validated_prefix_hash) + { + if table.prefix_filter_allowed(Some(ex.name())) { + let probe = + table.probe_prefix_filter_with_hash(hint.as_ref(), hash); + + #[cfg(feature = "metrics")] + if matches!(&probe, Ok(Some(_))) { + use std::sync::atomic::Ordering::Relaxed; + table.metrics.filter_queries.fetch_add(1, Relaxed); + } + + let should_skip = matches!(probe, Ok(Some(false))); + + #[cfg(feature = "metrics")] + if should_skip { + use std::sync::atomic::Ordering::Relaxed; + table.metrics.io_skipped_by_filter.fetch_add(1, Relaxed); + } + + should_skip + } else { + false + } + } else { + let tmp_range = (self.range_start.clone(), self.range_end.clone()); + table.should_skip_range_by_prefix_filter( + &tmp_range, + ex.as_ref(), + None, + ) + }; + if skip { + self.hi -= 1; + continue; + } + } + + let reader = + table.range((self.range_start.clone(), self.range_end.clone())); + self.hi_reader = Some(Box::new(reader)); + break; + } } } else if let Some(lo_reader) = &mut self.lo_reader { // NOTE: We reached the lo marker, so consume from it instead @@ -142,6 +481,7 @@ impl DoubleEndedIterator for RunReader { mod tests { use super::*; use crate::{AbstractTree, SequenceNumberCounter, Slice}; + use std::sync::Arc; use test_log::test; #[test] @@ -176,9 +516,15 @@ mod tests { let level = Arc::new(Run::new(tables).unwrap()); - assert!(RunReader::new(level.clone(), UserKey::from("y")..=UserKey::from("z"),).is_none()); + assert!(RunReader::new( + level.clone(), + UserKey::from("y")..=UserKey::from("z"), + None, + None + ) + .is_none()); - assert!(RunReader::new(level, UserKey::from("y")..).is_none()); + assert!(RunReader::new(level, UserKey::from("y").., None, None).is_none()); Ok(()) } @@ -217,7 +563,8 @@ mod tests { let level = Arc::new(Run::new(tables).unwrap()); { - let multi_reader = RunReader::culled(level.clone(), .., (Some(1), None)); + let multi_reader = + 
RunReader::culled(level.clone(), .., (Some(1), None), None, None, None); let mut iter = multi_reader.flatten(); assert_eq!(Slice::from(*b"d"), iter.next().unwrap().key.user_key); @@ -233,7 +580,7 @@ mod tests { } { - let multi_reader = RunReader::new(level.clone(), ..).unwrap(); + let multi_reader = RunReader::new(level.clone(), .., None, None).unwrap(); let mut iter = multi_reader.flatten(); @@ -253,7 +600,7 @@ mod tests { } { - let multi_reader = RunReader::new(level.clone(), ..).unwrap(); + let multi_reader = RunReader::new(level.clone(), .., None, None).unwrap(); let mut iter = multi_reader.rev().flatten(); @@ -273,7 +620,7 @@ mod tests { } { - let multi_reader = RunReader::new(level.clone(), ..).unwrap(); + let multi_reader = RunReader::new(level.clone(), .., None, None).unwrap(); let mut iter = multi_reader.flatten(); @@ -293,7 +640,8 @@ mod tests { } { - let multi_reader = RunReader::new(level.clone(), UserKey::from("g")..).unwrap(); + let multi_reader = + RunReader::new(level.clone(), UserKey::from("g").., None, None).unwrap(); let mut iter = multi_reader.flatten(); @@ -307,7 +655,7 @@ mod tests { } { - let multi_reader = RunReader::new(level, UserKey::from("g")..).unwrap(); + let multi_reader = RunReader::new(level, UserKey::from("g").., None, None).unwrap(); let mut iter = multi_reader.flatten().rev(); @@ -322,4 +670,611 @@ mod tests { Ok(()) } + + mod prefix_extractor { + use super::super::*; + use crate::prefix::{FixedLengthExtractor, SharedPrefixExtractor}; + use crate::{range::prefix_upper_range, AbstractTree, SequenceNumberCounter}; + use std::ops::Bound; + use test_log::test; + + #[test] + fn run_reader_prefix_range_pruning_absent() -> crate::Result<()> { + let tempdir = tempfile::tempdir()?; + let seqno = SequenceNumberCounter::default(); + let ex: SharedPrefixExtractor = Arc::new(FixedLengthExtractor::new(3)); + let tree = + crate::Config::new(&tempdir, seqno.clone(), SequenceNumberCounter::default()) + .prefix_extractor(ex.clone()) + .open()?; + 
+ // Create multiple tables with prefixes "aaa" and "bbb" + for p in [b"aaa", b"bbb"] { + for i in 0..10u32 { + let mut k = p.to_vec(); + k.extend_from_slice(format!("{i:04}").as_bytes()); + tree.insert(k, b"v", seqno.next()); + } + tree.flush_active_memtable(0)?; + } + + let tables = tree + .current_version() + .iter_tables() + .cloned() + .collect::>(); + let level = std::sync::Arc::new(Run::new(tables).unwrap()); + + // Query a prefix range for a non-existent prefix "zzz" + let prefix = b"zzz".to_vec(); + let start = Bound::Included(UserKey::from(prefix.clone())); + let end = prefix_upper_range(&prefix); + let ex = Some(ex); + + // All overlapped tables report Some(false) -> should prune (None) + let reader = RunReader::new(level, (start, end), ex, None); + assert!(reader.is_none()); + + Ok(()) + } + + #[test] + fn run_reader_prefix_range_no_pruning_when_possible_hit() -> crate::Result<()> { + let tempdir = tempfile::tempdir()?; + let seqno = SequenceNumberCounter::default(); + let ex: SharedPrefixExtractor = Arc::new(FixedLengthExtractor::new(3)); + let tree = + crate::Config::new(&tempdir, seqno.clone(), SequenceNumberCounter::default()) + .prefix_extractor(ex.clone()) + .open()?; + + // Tables with prefixes: "aaa" and "zzz" + for p in [b"aaa", b"zzz"] { + for i in 0..5u32 { + let mut k = p.to_vec(); + k.extend_from_slice(format!("{i:02}").as_bytes()); + tree.insert(k, b"v", seqno.next()); + } + tree.flush_active_memtable(0)?; + } + + let tables = tree + .current_version() + .iter_tables() + .cloned() + .collect::>(); + let level = std::sync::Arc::new(Run::new(tables).unwrap()); + + // Query a prefix range for existing prefix "zzz" + let prefix = b"zzz".to_vec(); + let start = Bound::Included(UserKey::from(prefix.clone())); + let end = prefix_upper_range(&prefix); + let ex = Some(ex); + + let reader = RunReader::new(level, (start, end), ex, None); + assert!(reader.is_some()); + + Ok(()) + } + + /// Helper: create a multi-table run where each table has a wide 
key range + /// (from "aaa..." to "zzz...") but only contains specific prefixes in its filter. + /// This ensures key range overlaps with queries for absent prefixes. + fn create_wide_range_run_with_prefixes( + prefixes_per_table: &[&[&[u8]]], + ) -> crate::Result<(tempfile::TempDir, Arc>, SharedPrefixExtractor)> { + let tempdir = tempfile::tempdir()?; + let seqno = SequenceNumberCounter::default(); + let ex: SharedPrefixExtractor = Arc::new(FixedLengthExtractor::new(3)); + let tree = + crate::Config::new(&tempdir, seqno.clone(), SequenceNumberCounter::default()) + .prefix_extractor(ex.clone()) + .open()?; + + for prefixes in prefixes_per_table { + for p in *prefixes { + for i in 0..5u32 { + let mut k = p.to_vec(); + k.extend_from_slice(format!("{i:02}").as_bytes()); + tree.insert(k, b"v", seqno.next()); + } + } + tree.flush_active_memtable(0)?; + } + + let tables = tree + .current_version() + .iter_tables() + .cloned() + .collect::>(); + let level = Arc::new(Run::new(tables).unwrap()); + + Ok((tempdir, level, ex)) + } + + /// Upfront pruning: all tables' key ranges overlap but ALL filters exclude the prefix. + /// The run reader should return None (no results possible). + #[test] + fn run_reader_upfront_pruning_all_excluded() -> crate::Result<()> { + use crate::config::{BloomConstructionPolicy, FilterPolicy, FilterPolicyEntry}; + + let tempdir = tempfile::tempdir()?; + let seqno = SequenceNumberCounter::default(); + // Extractor length 3: start "mmm..." and end "mmn..." both share prefix "mmm"/"mmn"... + // Actually: for prefix_upper_range("mmm") = Excluded("mmn"). Extract("mmm") = "mmm", + // Extract("mmn") = "mmn". Those differ! So we need the prefix range boundaries + // to share the same extracted prefix. Use the full prefix as the key: + // prefix = "mmm00" (5 bytes), extractor = 3, extract("mmm00") = "mmm", + // prefix_upper_range("mmm00") = Excluded("mmm01"), extract("mmm01") = "mmm". + // => common_prefix = "mmm". 
+ let ex: SharedPrefixExtractor = Arc::new(FixedLengthExtractor::new(3)); + let tree = + crate::Config::new(&tempdir, seqno.clone(), SequenceNumberCounter::default()) + .prefix_extractor(ex.clone()) + // Use high bits-per-key to minimize false positive rate + .filter_policy(FilterPolicy::all(FilterPolicyEntry::Bloom( + BloomConstructionPolicy::BitsPerKey(50.0), + ))) + .open()?; + + // Table 1: wide key range (aaa..zzz) but only specific prefixes + for p in [b"aaa" as &[u8], b"bbb", b"ccc", b"xxx", b"yyy", b"zzz"] { + for i in 0..30u32 { + let mut k = p.to_vec(); + k.extend_from_slice(format!("{i:04}").as_bytes()); + tree.insert(k, b"v", seqno.next()); + } + } + tree.flush_active_memtable(0)?; + + // Table 2: wide key range, different prefixes, still no "mmm" + for p in [b"ddd" as &[u8], b"eee", b"fff", b"vvv", b"www", b"zzz"] { + for i in 0..30u32 { + let mut k = p.to_vec(); + k.extend_from_slice(format!("{i:04}").as_bytes()); + tree.insert(k, b"v", seqno.next()); + } + } + tree.flush_active_memtable(0)?; + + let tables = tree + .current_version() + .iter_tables() + .cloned() + .collect::>(); + let level = Arc::new(Run::new(tables).unwrap()); + + // Use "mmm00" as the prefix range key so both bounds share extracted prefix "mmm": + // start = Included("mmm00"), extract = "mmm" + // end = prefix_upper_range("mmm00") = Excluded("mmm01"), extract = "mmm" + // => common_prefix = Some("mmm") + let prefix = b"mmm00".to_vec(); + let start = Bound::Included(UserKey::from(prefix.clone())); + let end = prefix_upper_range(&prefix); + + let reader = RunReader::new(level, (start, end), Some(ex), None); + assert!( + reader.is_none(), + "should prune: no table contains prefix mmm" + ); + + Ok(()) + } + + /// Upfront pruning: one table's filter contains the prefix, so pruning does NOT return None. + /// The run reader should return Some since a potential match exists. 
+ #[test] + fn run_reader_upfront_pruning_one_hit() -> crate::Result<()> { + // Table 1: prefixes "aaa" and "zzz" + // Table 2: prefixes "bbb" and "mmm" — this one has "mmm"! + let (_dir, level, ex) = + create_wide_range_run_with_prefixes(&[&[b"aaa", b"zzz"], &[b"bbb", b"mmm"]])?; + + // Use "mmm00" so both bounds share the same 3-byte extracted prefix "mmm": + // start=Included("mmm00") → prefix "mmm", end=Excluded("mmm01") → prefix "mmm" + let prefix = b"mmm00".to_vec(); + let start = Bound::Included(UserKey::from(prefix.clone())); + let end = prefix_upper_range(&prefix); + + let reader = RunReader::new(level, (start, end), Some(ex), None); + assert!( + reader.is_some(), + "should NOT prune: table 2 contains prefix mmm" + ); + + Ok(()) + } + + /// Upfront pruning with >10 tables: exceeds `MAX_UPFRONT_CHECKS` limit. + /// When too many tables need checking, pruning bails out and returns Some. + #[test] + fn run_reader_upfront_pruning_exceeds_max_checks() -> crate::Result<()> { + let tempdir = tempfile::tempdir()?; + let seqno = SequenceNumberCounter::default(); + let ex: SharedPrefixExtractor = Arc::new(FixedLengthExtractor::new(3)); + let tree = + crate::Config::new(&tempdir, seqno.clone(), SequenceNumberCounter::default()) + .prefix_extractor(ex.clone()) + .open()?; + + // Create 12 tables, each with wide key range (aaa..zzz) but unique middle prefixes + for i in 0..12u32 { + // Each table has "aaa" and "zzz" to ensure wide key range overlap + for p in [b"aaa", b"zzz"] { + let mut k = p.to_vec(); + k.extend_from_slice(format!("{i:02}").as_bytes()); + tree.insert(k, b"v", seqno.next()); + } + // Each table also has a unique prefix that is NOT "mmm" + let unique = format!("p{i:02}"); + tree.insert(unique.as_bytes(), b"v", seqno.next()); + tree.flush_active_memtable(0)?; + } + + let tables = tree + .current_version() + .iter_tables() + .cloned() + .collect::>(); + assert!(tables.len() >= 11, "need >10 tables for this test"); + let level = 
Arc::new(Run::new(tables).unwrap()); + + // Query for "mmm00" — no table has it, but after 10 checks we bail out + // and return Some (don't prune) because we exceeded MAX_UPFRONT_CHECKS. + // Use "mmm00" so both bounds share the same 3-byte extracted prefix "mmm". + let prefix = b"mmm00".to_vec(); + let start = Bound::Included(UserKey::from(prefix.clone())); + let end = prefix_upper_range(&prefix); + + let reader = RunReader::new(level, (start, end), Some(ex), None); + assert!( + reader.is_some(), + "should NOT prune: exceeded max upfront checks" + ); + + Ok(()) + } + + /// Unbounded start range with extractor: `common_prefix` should be None. + /// No upfront pruning occurs when the start bound is unbounded. + #[test] + fn run_reader_unbounded_start_with_extractor() -> crate::Result<()> { + let (_dir, level, ex) = + create_wide_range_run_with_prefixes(&[&[b"aaa", b"zzz"], &[b"bbb", b"yyy"]])?; + + // Unbounded start: common_prefix = None, no upfront pruning + let reader = RunReader::new(level, ..UserKey::from("mmm99"), Some(ex), None); + assert!(reader.is_some()); + + Ok(()) + } + + /// Unbounded end range with extractor: `common_prefix` should be None. + /// No upfront pruning occurs when the end bound is unbounded. + #[test] + fn run_reader_unbounded_end_with_extractor() -> crate::Result<()> { + let (_dir, level, ex) = + create_wide_range_run_with_prefixes(&[&[b"aaa", b"zzz"], &[b"bbb", b"yyy"]])?; + + // Unbounded end: common_prefix = None, no upfront pruning + let reader = RunReader::new(level, UserKey::from("mmm00").., Some(ex), None); + assert!(reader.is_some()); + + Ok(()) + } + + /// Cross-prefix range: start and end have different prefixes. + /// No upfront pruning occurs when the range spans multiple prefixes. 
+ #[test] + fn run_reader_cross_prefix_range() -> crate::Result<()> { + let (_dir, level, ex) = + create_wide_range_run_with_prefixes(&[&[b"aaa", b"zzz"], &[b"bbb", b"yyy"]])?; + + // Start prefix "aaa", end prefix "bbb" — different, so common_prefix = None + let reader = RunReader::new( + level, + UserKey::from("aaa00")..UserKey::from("bbb99"), + Some(ex), + None, + ); + assert!(reader.is_some()); + + Ok(()) + } + + /// Helper: creates overlapping L0 tables with wide key ranges for lazy skip testing. + /// + /// Each table spans "aaa" to "zzz" (via anchor keys) but only some tables + /// contain the target prefix "mmm". This forces ALL tables to overlap any + /// prefix query, so the lazy skip loop in RunReader must check the prefix + /// filter on middle tables rather than having them excluded by + /// `range_overlap_indexes`. + /// + /// Layout (4 tables, each with "aaa" and "zzz" anchors): + /// Table 0: "aaa", "mmm", "zzz" — has target prefix + /// Table 1: "aaa", "zzz" — NO target prefix → lazy skip fires + /// Table 2: "aaa", "mmm", "zzz" — has target prefix + /// Table 3: "aaa", "zzz" — NO target prefix + fn create_overlapping_run_for_lazy_skip( + ) -> crate::Result<(tempfile::TempDir, Arc>, SharedPrefixExtractor)> { + use crate::config::{BloomConstructionPolicy, FilterPolicy, FilterPolicyEntry}; + + let tempdir = tempfile::tempdir()?; + let seqno = SequenceNumberCounter::default(); + let ex: SharedPrefixExtractor = Arc::new(FixedLengthExtractor::new(3)); + let tree = + crate::Config::new(&tempdir, seqno.clone(), SequenceNumberCounter::default()) + .prefix_extractor(ex.clone()) + // High bits-per-key to eliminate false positives + .filter_policy(FilterPolicy::all(FilterPolicyEntry::Bloom( + BloomConstructionPolicy::BitsPerKey(50.0), + ))) + .open()?; + + // Table layout: each flush creates one L0 table. + // All tables have "aaa" and "zzz" keys as anchors for wide key range. 
+ let table_prefixes: &[&[&[u8]]] = &[ + &[b"aaa", b"mmm", b"zzz"], // Table 0: has "mmm" + &[b"aaa", b"zzz"], // Table 1: NO "mmm" + &[b"aaa", b"mmm", b"zzz"], // Table 2: has "mmm" + &[b"aaa", b"zzz"], // Table 3: NO "mmm" + ]; + + for prefixes in table_prefixes { + for p in *prefixes { + for i in 0..5u32 { + let mut k = p.to_vec(); + k.extend_from_slice(format!("{i:02}").as_bytes()); + tree.insert(k, b"v", seqno.next()); + } + } + tree.flush_active_memtable(0)?; + } + + let tables = tree + .current_version() + .iter_tables() + .cloned() + .collect::>(); + + assert_eq!( + tables.len(), + 4, + "expected exactly 4 L0 tables, got {}", + tables.len() + ); + + let level = Arc::new(Run::new(tables).unwrap()); + Ok((tempdir, level, ex)) + } + + /// Lazy per-table prefix skip during forward iteration. + /// + /// With 4 overlapping tables (all key ranges span "aaa".."zzz"), querying + /// prefix "mmm" causes `range_overlap_indexes` to set lo=0, hi=3. The lo + /// and hi readers are created for tables 0 and 3. As forward iteration + /// exhausts the lo reader and advances, the lazy skip loop processes middle + /// tables 1 and 2: + /// - Table 1 has no "mmm" in its filter → skip branch fires (lo += 1) + /// - Table 2 has "mmm" → reader is created + #[test] + fn run_reader_lazy_forward_prefix_skip() -> crate::Result<()> { + let (_dir, level, ex) = create_overlapping_run_for_lazy_skip()?; + + // Range ["mmm00", "mmm99"]: both bounds extract to "mmm" (FixedLengthExtractor(3)), + // so common_prefix = Some("mmm") and should_skip_range_by_prefix_filter works. + let start = Bound::Included(UserKey::from("mmm00")); + let end = Bound::Included(UserKey::from("mmm99")); + + let reader = RunReader::new(level, (start, end), Some(ex), None); + assert!(reader.is_some()); + + let results: Vec<_> = reader.unwrap().flatten().collect(); + // Tables 0 and 2 each have 5 "mmm" keys ("mmm00".."mmm04"). + // Tables 1 and 3 have no "mmm" keys. 
+ // Table 0 is read by lo_reader, table 3 by hi_reader (yields nothing). + // The lazy loop skips table 1 (no "mmm") and reads table 2 (has "mmm"). + // Total: 5 (table 0) + 5 (table 2) = 10 keys. + assert_eq!(results.len(), 10, "expected 10 mmm keys from 2 tables"); + for item in &results { + assert!( + item.key.user_key.starts_with(b"mmm"), + "unexpected key: {:?}", + item.key.user_key + ); + } + + Ok(()) + } + + /// Lazy per-table prefix skip during reverse iteration. + /// + /// Same 4-table layout. Reverse iteration starts from hi_reader (table 3), + /// then the lazy skip loop processes middle tables in reverse (2, then 1): + /// - Table 2 has "mmm" → reader created + /// - Table 1 has no "mmm" → skip branch fires (hi -= 1) + #[test] + fn run_reader_lazy_reverse_prefix_skip() -> crate::Result<()> { + let (_dir, level, ex) = create_overlapping_run_for_lazy_skip()?; + + // Range ["mmm00", "mmm99"]: both bounds extract to "mmm" + let start = Bound::Included(UserKey::from("mmm00")); + let end = Bound::Included(UserKey::from("mmm99")); + + let reader = RunReader::new(level, (start, end), Some(ex), None); + assert!(reader.is_some()); + + let results: Vec<_> = reader.unwrap().rev().flatten().collect(); + // Same 10 mmm keys, but in reverse order + assert_eq!(results.len(), 10, "expected 10 mmm keys from 2 tables"); + for item in &results { + assert!( + item.key.user_key.starts_with(b"mmm"), + "unexpected key: {:?}", + item.key.user_key + ); + } + + Ok(()) + } + + /// Helper: 4 tables where only the first and last contain the target prefix. 
+ /// + /// Layout (all tables share "aaa" and "zzz" anchors for wide key range): + /// Table 0: "aaa", "mmm", "zzz" — has target prefix (lo) + /// Table 1: "aaa", "zzz" — NO target prefix + /// Table 2: "aaa", "zzz" — NO target prefix + /// Table 3: "aaa", "mmm", "zzz" — has target prefix (hi) + /// + /// During backward iteration, after the hi reader (T3) is exhausted, + /// the inner loop must skip T2 and T1 (both lack "mmm"), decrementing + /// hi all the way down to lo — exercising the `hi <= lo` break. + #[cfg_attr(coverage_nightly, coverage(off))] + fn create_run_for_backward_hi_meets_lo( + ) -> crate::Result<(tempfile::TempDir, Arc>, SharedPrefixExtractor)> { + use crate::config::{BloomConstructionPolicy, FilterPolicy, FilterPolicyEntry}; + + let tempdir = tempfile::tempdir()?; + let seqno = SequenceNumberCounter::default(); + let ex: SharedPrefixExtractor = Arc::new(FixedLengthExtractor::new(3)); + let tree = + crate::Config::new(&tempdir, seqno.clone(), SequenceNumberCounter::default()) + .prefix_extractor(ex.clone()) + .filter_policy(FilterPolicy::all(FilterPolicyEntry::Bloom( + BloomConstructionPolicy::BitsPerKey(50.0), + ))) + .open()?; + + let table_prefixes: &[&[&[u8]]] = &[ + &[b"aaa", b"mmm", b"zzz"], // Table 0: has "mmm" + &[b"aaa", b"zzz"], // Table 1: NO "mmm" + &[b"aaa", b"zzz"], // Table 2: NO "mmm" + &[b"aaa", b"mmm", b"zzz"], // Table 3: has "mmm" + ]; + + for prefixes in table_prefixes { + for p in *prefixes { + for i in 0..5u32 { + let mut k = p.to_vec(); + k.extend_from_slice(format!("{i:02}").as_bytes()); + tree.insert(k, b"v", seqno.next()); + } + } + tree.flush_active_memtable(0)?; + } + + let tables = tree + .current_version() + .iter_tables() + .cloned() + .collect::>(); + + assert_eq!( + tables.len(), + 4, + "expected exactly 4 L0 tables, got {}", + tables.len() + ); + + let level = Arc::new(Run::new(tables).unwrap()); + Ok((tempdir, level, ex)) + } + + /// Backward lazy loop: hi decrements past all middle tables to meet lo. 
+ /// + /// With T0 and T3 having "mmm" and T1/T2 lacking it, reverse iteration + /// exhausts hi_reader (T3), then the inner loop skips T2 and T1 via the + /// prefix filter, decrementing hi to 0 where `hi <= lo` triggers the break. + /// Iteration then falls through to lo_reader (T0). + #[test] + fn run_reader_backward_lazy_hi_meets_lo() -> crate::Result<()> { + let (_dir, level, ex) = create_run_for_backward_hi_meets_lo()?; + + let start = Bound::Included(UserKey::from("mmm00")); + let end = Bound::Included(UserKey::from("mmm99")); + + let reader = RunReader::new(level, (start, end), Some(ex), None); + assert!(reader.is_some()); + + let results: Vec<_> = reader.unwrap().rev().flatten().collect(); + // T3 has 5 "mmm" keys, T0 has 5 "mmm" keys → 10 total + assert_eq!( + results.len(), + 10, + "expected 10 mmm keys from tables 0 and 3" + ); + for item in &results { + assert!( + item.key.user_key.starts_with(b"mmm"), + "unexpected key: {:?}", + item.key.user_key + ); + } + + Ok(()) + } + + /// Excluded start bound: verifies that a range with an excluded start bound + /// is handled correctly and still returns results. + #[test] + fn run_reader_excluded_start_bound() -> crate::Result<()> { + let (_dir, level, ex) = + create_wide_range_run_with_prefixes(&[&[b"aaa", b"zzz"], &[b"bbb", b"yyy"]])?; + + // Use Excluded start bound — not a common API pattern, but exercises the branch + let reader = RunReader::new( + level, + ( + Bound::Excluded(UserKey::from("aaa00")), + Bound::Included(UserKey::from("zzz99")), + ), + Some(ex), + None, + ); + assert!(reader.is_some()); + + Ok(()) + } + + /// Terminal None in backward iteration: after forward iteration fully + /// consumes `lo_reader`, calling `next_back()` with no `hi_reader` and + /// no `lo_reader` returns None immediately. 
+ #[test] + fn run_reader_backward_terminal_none_after_forward_exhaustion() -> crate::Result<()> { + let tempdir = tempfile::tempdir()?; + let tree = crate::Config::new( + &tempdir, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .open()?; + + // Two tables with disjoint key ranges + tree.insert("a", vec![], 0); + tree.flush_active_memtable(0)?; + tree.insert("z", vec![], 0); + tree.flush_active_memtable(0)?; + + let tables = tree + .current_version() + .iter_tables() + .cloned() + .collect::>(); + let level = Arc::new(Run::new(tables).unwrap()); + + // Create reader over a narrow range that only overlaps one table + // hi == lo → no hi_reader created + let mut reader = + RunReader::new(level, UserKey::from("a")..=UserKey::from("a"), None, None).unwrap(); + + // Forward-exhaust the lo_reader + assert!(reader.next().is_some()); // "a" + assert!(reader.next().is_none()); // lo_reader exhausted, lo_reader = None + + // Now call next_back — no hi_reader, no lo_reader → return None (line 291) + assert!(reader.next_back().is_none()); + + Ok(()) + } + } } diff --git a/src/table/filter/block.rs b/src/table/filter/block.rs index 46f759e20..2f159c09e 100644 --- a/src/table/filter/block.rs +++ b/src/table/filter/block.rs @@ -17,6 +17,22 @@ impl FilterBlock { Ok(StandardBloomFilterReader::new(&self.0.data)?.contains_hash(hash)) } + /// Returns Ok(Some(true)) if the key's first extracted prefix may be + /// contained, Ok(Some(false)) if the filter indicates that prefix is + /// not present, or Ok(None) if the key is out of the extractor's + /// domain (`extract_first` returns None). + /// + /// Only `extract_first` is consulted. For multi-prefix extractors that + /// want to probe the most-specific prefix, use the hash-based probe + /// path on `Table` instead. 
+ pub fn maybe_contains_prefix( + &self, + key: &[u8], + extractor: &dyn crate::prefix::PrefixExtractor, + ) -> crate::Result> { + Ok(StandardBloomFilterReader::new(&self.0.data)?.contains_prefix(key, extractor)) + } + /// Returns the block size in bytes. #[must_use] pub fn size(&self) -> usize { diff --git a/src/table/filter/standard_bloom/mod.rs b/src/table/filter/standard_bloom/mod.rs index d90bc7bbd..a75a78fb3 100644 --- a/src/table/filter/standard_bloom/mod.rs +++ b/src/table/filter/standard_bloom/mod.rs @@ -128,6 +128,25 @@ impl<'a> StandardBloomFilterReader<'a> { self.contains_hash(Self::get_hash(key)) } + /// Returns `Some(true)` if the key's first extracted prefix may be + /// contained, `Some(false)` if it is definitively absent, or `None` + /// if the key is out of the extractor's domain (`extract_first` + /// returns None). + /// + /// Note: only the first extracted prefix is consulted. For multi-prefix + /// extractors, this is a coarse pruning probe — callers that need to + /// check the most-specific prefix should compute that prefix's hash + /// themselves and use a hash-based probe path on `Table`. + #[must_use] + pub fn contains_prefix( + &self, + key: &[u8], + extractor: &dyn crate::prefix::PrefixExtractor, + ) -> Option { + let prefix = extractor.extract_first(key)?; + Some(self.contains_hash(Self::get_hash(prefix))) + } + /// Returns `true` if the bit at `idx` is `1`. fn has_bit(&self, idx: usize) -> bool { self.inner.get(idx) diff --git a/src/table/meta.rs b/src/table/meta.rs index ff1810f10..68e215bc2 100644 --- a/src/table/meta.rs +++ b/src/table/meta.rs @@ -50,6 +50,18 @@ pub struct ParsedMeta { pub data_block_compression: CompressionType, pub index_block_compression: CompressionType, + + /// Optional name of the prefix extractor used when this table was created. + pub prefix_extractor_name: Option, + + /// Whether this table's filter contains full-key hashes alongside prefix hashes. 
+ /// Only meaningful when `prefix_extractor_name` is `Some`. When false, the filter + /// has only prefix hashes and the full-key Bloom path is unsafe to use (would + /// produce false negatives = data loss). + /// + /// Defaults to `true` for backward compatibility with tables written before this + /// field was persisted (those tables always wrote full-key hashes). + pub whole_key_filtering: bool, } macro_rules! read_u8 { @@ -207,6 +219,21 @@ impl ParsedMeta { CompressionType::decode_from(&mut bytes)? }; + let prefix_extractor_name = block + .point_read(b"prefix_extractor", SeqNo::MAX) + .map(|v| { + String::from_utf8(v.value.to_vec()).map_err(|e| crate::Error::Utf8(e.utf8_error())) + }) + .transpose()?; + + // whole_key_filtering: defaults to true for tables written before this + // field was persisted. Such tables predate prefix_extractor support, + // so they always wrote full-key hashes (no extractor → register_key + // for every key). + let whole_key_filtering = block + .point_read(b"whole_key_filtering", SeqNo::MAX) + .is_none_or(|v| v.value.first().copied().unwrap_or(1) != 0); + Ok(Self { id, created_at, @@ -221,6 +248,8 @@ impl ParsedMeta { weak_tombstone_reclaimable, data_block_compression, index_block_compression, + prefix_extractor_name, + whole_key_filtering, }) } } diff --git a/src/table/mod.rs b/src/table/mod.rs index 594653b0b..635bdfb37 100644 --- a/src/table/mod.rs +++ b/src/table/mod.rs @@ -44,7 +44,6 @@ use block_index::BlockIndexImpl; use inner::Inner; use iter::Iter; use std::{ - borrow::Cow, fs::File, ops::{Bound, RangeBounds}, path::PathBuf, @@ -90,6 +89,79 @@ impl Table { self.0.global_seqno } + /// Returns true if the table's stored prefix extractor configuration is compatible + /// with the currently configured extractor name. This is used to decide whether + /// prefix-aware filtering is allowed. 
+ pub(crate) fn prefix_filter_allowed(&self, current_extractor_name: Option<&str>) -> bool { + match ( + self.metadata.prefix_extractor_name.as_deref(), + current_extractor_name, + ) { + (Some(a), Some(b)) => a == b, + (Some(_), None) | (None, Some(_)) => false, + (None, None) => true, + } + } + + /// Loads the filter block corresponding to `key`, if any. This unifies the logic used by + /// both `get()` and `maybe_contains_prefix()`. + fn load_filter_block_for_key( + &self, + key: &[u8], + ) -> crate::Result>> { + if let Some(block) = &self.pinned_filter_block { + return Ok(Some(std::borrow::Cow::Borrowed(block))); + } + + if let Some(filter_idx) = &self.pinned_filter_index { + let mut iter = filter_idx.iter(); + // NOTE: For filter block lookup, we use SeqNo::MAX to find the block + // that covers this key regardless of sequence number + let found = iter.seek(key, SeqNo::MAX); + + let handle = if found { + iter.next() + } else { + // The key is beyond all TLI entries. Fall back to the last filter + // partition: since the key exceeds the table's range it was never + // inserted, so the Bloom check will correctly report "not present". 
+ filter_idx.iter().next_back() + }; + + if let Some(filter_block_handle) = handle { + let filter_block_handle = filter_block_handle.materialize(filter_idx.as_slice()); + + let block = self.load_block( + &filter_block_handle.into_inner(), + BlockType::Filter, + CompressionType::None, + )?; + let block = FilterBlock::new(block); + + return Ok(Some(std::borrow::Cow::Owned(block))); + } + return Ok(None); + } + + if let Some(_filter_tli_handle) = &self.regions.filter_tli { + // Unpinned filter TLI not supported yet + return Ok(None); + } + + if let Some(filter_block_handle) = &self.regions.filter { + let block = self.load_block( + filter_block_handle, + BlockType::Filter, + CompressionType::None, + )?; + let block = FilterBlock::new(block); + + return Ok(Some(std::borrow::Cow::Owned(block))); + } + + Ok(None) + } + pub fn referenced_blob_bytes(&self) -> crate::Result { if let Some(v) = self.0.cached_blob_bytes.get() { return Ok(*v); @@ -226,6 +298,30 @@ impl Table { self.metadata.file_size } + pub(crate) fn get_without_filter( + &self, + key: &[u8], + seqno: SeqNo, + ) -> crate::Result> { + // Translate seqno to "our" seqno, same as Table::get + let seqno = seqno.saturating_sub(self.global_seqno()); + + if self.metadata.seqnos.0 >= seqno { + return Ok(None); + } + + self.point_read(key, seqno) + } + + /// Looks up `key` at or below `seqno`, returning the newest visible value if present. + /// + /// This method performs a hash-based filter check (using `key_hash`) to skip + /// data block I/O when possible. It does NOT perform prefix-aware filtering; that is + /// handled at a higher level by `Tree::point_read_from_table`. + /// + /// Returns Ok(None) when the key is not present or shadowed by sequence rules. + /// + /// Errors reflect I/O or decoding failures when loading index or data blocks. 
pub fn get( &self, key: &[u8], @@ -242,40 +338,7 @@ impl Table { return Ok(None); } - let filter_block = if let Some(block) = &self.pinned_filter_block { - Some(Cow::Borrowed(block)) - } else if let Some(filter_idx) = &self.pinned_filter_index { - let mut iter = filter_idx.iter(); - iter.seek(key, seqno); - - if let Some(filter_block_handle) = iter.next() { - let filter_block_handle = filter_block_handle.materialize(filter_idx.as_slice()); - - let block = self.load_block( - &filter_block_handle.into_inner(), - BlockType::Filter, - CompressionType::None, // NOTE: We never write a filter block with compression - )?; - let block = FilterBlock::new(block); - - Some(Cow::Owned(block)) - } else { - None - } - } else if let Some(_filter_tli_handle) = &self.regions.filter_tli { - unimplemented!("unpinned filter TLI not supported"); - } else if let Some(filter_block_handle) = &self.regions.filter { - let block = self.load_block( - filter_block_handle, - BlockType::Filter, - CompressionType::None, // NOTE: We never write a filter block with compression - )?; - let block = FilterBlock::new(block); - - Some(Cow::Owned(block)) - } else { - None - }; + let filter_block = self.load_filter_block_for_key(key)?; if let Some(filter_block) = &filter_block { if !filter_block.maybe_contains_hash(key_hash)? { @@ -310,6 +373,73 @@ impl Table { } } + /// Checks via the filter whether the key's first extracted prefix may be + /// present in this table. Returns: + /// - Ok(Some(true)) if the filter indicates a possible match + /// - Ok(Some(false)) if the filter indicates no match + /// - Ok(None) if the key is out of the extractor's domain, the table's + /// stored extractor is incompatible with the current one, or no + /// filter block is available for this table. + /// + /// Only `extract_first` is consulted (single-probe). Callers that have + /// already computed a hash for a more specific prefix should use + /// `probe_prefix_filter_with_hash` instead. 
+ pub fn maybe_contains_prefix( + &self, + key: &[u8], + extractor: &dyn crate::prefix::PrefixExtractor, + ) -> crate::Result> { + // Only consult the prefix-aware filter if the table's stored extractor + // configuration is compatible with the current one. Incompatible + // extractors or missing filters return None (cannot determine). + if !self.prefix_filter_allowed(Some(extractor.name())) { + return Ok(None); + } + + self.probe_prefix_filter(key, extractor) + } + + /// Fast prefix filter probe using a precomputed hash. Uses `key` only to + /// locate the correct filter partition (TLI seek); the Bloom check uses the + /// precomputed `hash` directly, avoiding the `extract()` Box allocation and + /// hash computation. + /// + /// Does NOT update filter metrics. + pub(crate) fn probe_prefix_filter_with_hash( + &self, + key: &[u8], + hash: u64, + ) -> crate::Result> { + let filter_block = self.load_filter_block_for_key(key)?; + + if let Some(filter_block) = filter_block { + return Ok(Some(filter_block.maybe_contains_hash(hash)?)); + } + + Ok(None) + } + + /// Core prefix filter probe — assumes the caller already validated extractor + /// compatibility. Returns `Ok(Some(false))` if the prefix is definitively absent, + /// `Ok(Some(true))` if maybe present, or `Ok(None)` if the key is out-of-domain. + /// + /// Does NOT update filter metrics — callers are responsible for tracking + /// `filter_queries` and `io_skipped_by_filter` in a context-appropriate way. 
+ pub(crate) fn probe_prefix_filter( + &self, + key: &[u8], + extractor: &dyn crate::prefix::PrefixExtractor, + ) -> crate::Result> { + let filter_block = self.load_filter_block_for_key(key)?; + + if let Some(filter_block) = filter_block { + return filter_block.maybe_contains_prefix(key, extractor); + } + + // No filter available => cannot determine membership + Ok(None) + } + // TODO: maybe we can skip Fuse costs of the user key // TODO: because we just want to return the value // TODO: we would need to return something like ValueType + Value @@ -624,6 +754,129 @@ impl Table { self.metadata.seqnos.1 + self.global_seqno() } + /// Returns the minimum user key in this table's key range. + #[must_use] + pub fn min_key(&self) -> &UserKey { + self.metadata.key_range.min() + } + + /// Determines if this table can be skipped for a given user range by consulting the prefix filter. + /// + /// Behavior: + /// - If both bounds share the same extracted prefix, consult once using a bound key. + /// A definite negative (Ok(Some(false))) means the table can be skipped. + /// - If the table's stored extractor is incompatible with the provided extractor, do not skip. + pub(crate) fn should_skip_range_by_prefix_filter, R: RangeBounds>( + &self, + range: &R, + extractor: &dyn crate::prefix::PrefixExtractor, + prefix_hint: Option<&[u8]>, + ) -> bool { + if !self.prefix_filter_allowed(Some(extractor.name())) { + return false; + } + + // If a prefix hint is available (from tree.prefix()), try to use it + // for a direct probe. This handles the case where prefix_to_range produces + // bounds with different extracted prefixes (e.g. prefix "h" → range "h".."i" + // with extractor length 1). + // + // We can only trust the probe when the extractor produces the same prefix + // for both the hint AND for keys that would match the prefix query. We + // verify this with a stability guard: extract_X(hint) == extract_X(hint + "\0"). 
+ // If appending a byte changes the extracted prefix, the hint is not a + // stable prefix for the keys in the range and the probe could yield a + // false negative. + // + // We try extract_last first (most specific prefix → best Bloom pruning), + // and fall back to extract_first if the last-prefix guard fails. + if let Some(hint) = prefix_hint { + let extended: Vec = hint.iter().copied().chain(std::iter::once(0u8)).collect(); + + // Try the most specific (last) prefix first for better pruning. + // Fall back to the first prefix if the last isn't stable. + let best_hash = { + let last_hint = extractor.extract_last(hint); + let last_extended = extractor.extract_last(&extended); + match (last_hint, last_extended) { + (Some(lh), Some(le)) if lh == le => { + Some(crate::table::filter::standard_bloom::Builder::get_hash(lh)) + } + _ => { + let first_hint = extractor.extract_first(hint); + let first_extended = extractor.extract_first(&extended); + match (first_hint, first_extended) { + (Some(fh), Some(fe)) if fh == fe => { + Some(crate::table::filter::standard_bloom::Builder::get_hash(fh)) + } + _ => None, + } + } + } + }; + + if let Some(hash) = best_hash { + let probe = self.probe_prefix_filter_with_hash(hint, hash); + + #[cfg(feature = "metrics")] + if matches!(&probe, Ok(Some(_))) { + use std::sync::atomic::Ordering::Relaxed; + self.metrics.filter_queries.fetch_add(1, Relaxed); + } + + if matches!(probe, Ok(Some(false))) { + #[cfg(feature = "metrics")] + { + use std::sync::atomic::Ordering::Relaxed; + self.metrics.io_skipped_by_filter.fetch_add(1, Relaxed); + } + + return true; + } + } + return false; + } + + let start_key = match range.start_bound() { + std::ops::Bound::Included(k) | std::ops::Bound::Excluded(k) => Some(k.as_ref()), + std::ops::Bound::Unbounded => None, + }; + let end_key = match range.end_bound() { + std::ops::Bound::Included(k) | std::ops::Bound::Excluded(k) => Some(k.as_ref()), + std::ops::Bound::Unbounded => None, + }; + + let start_pref = 
start_key.and_then(|k| extractor.extract_first(k)); + let end_pref = end_key.and_then(|k| extractor.extract_first(k)); + + if let (Some(sp), Some(ep)) = (start_pref, end_pref) { + if sp == ep { + if let Some(sk) = start_key { + let probe = self.probe_prefix_filter(sk, extractor); + + #[cfg(feature = "metrics")] + if matches!(&probe, Ok(Some(_))) { + use std::sync::atomic::Ordering::Relaxed; + self.metrics.filter_queries.fetch_add(1, Relaxed); + } + + if matches!(probe, Ok(Some(false))) { + #[cfg(feature = "metrics")] + { + use std::sync::atomic::Ordering::Relaxed; + self.metrics.io_skipped_by_filter.fetch_add(1, Relaxed); + } + + return true; + } + } + return false; + } + } + + false + } + /// Returns the number of tombstone markers in the `Table`. #[must_use] #[doc(hidden)] diff --git a/src/table/multi_writer.rs b/src/table/multi_writer.rs index fefed7ddf..021054e23 100644 --- a/src/table/multi_writer.rs +++ b/src/table/multi_writer.rs @@ -48,6 +48,11 @@ pub struct MultiWriter { /// Level the tables are written to initial_level: u8, + + /// Optional prefix extractor to register prefixes in filters. 
+ prefix_extractor: Option, + + whole_key_filtering: bool, } impl MultiWriter { @@ -91,6 +96,9 @@ impl MultiWriter { current_key: None, linked_blobs: HashMap::default(), + + prefix_extractor: None, + whole_key_filtering: true, }) } @@ -124,6 +132,23 @@ impl MultiWriter { self } + #[must_use] + pub fn use_prefix_extractor( + mut self, + extractor: Option, + ) -> Self { + self.prefix_extractor.clone_from(&extractor); + self.writer = self.writer.use_prefix_extractor(extractor); + self + } + + #[must_use] + pub fn use_whole_key_filtering(mut self, enabled: bool) -> Self { + self.whole_key_filtering = enabled; + self.writer = self.writer.use_whole_key_filtering(enabled); + self + } + #[must_use] pub fn use_data_block_restart_interval(mut self, interval: u8) -> Self { self.data_block_restart_interval = interval; @@ -193,6 +218,9 @@ impl MultiWriter { .use_bloom_policy(self.bloom_policy) .use_data_block_hash_ratio(self.data_block_hash_ratio); + new_writer = new_writer.use_prefix_extractor(self.prefix_extractor.clone()); + new_writer = new_writer.use_whole_key_filtering(self.whole_key_filtering); + if self.use_partitioned_index { new_writer = new_writer.use_partitioned_index(); } diff --git a/src/table/tests.rs b/src/table/tests.rs index 5556fb903..f8794f2eb 100644 --- a/src/table/tests.rs +++ b/src/table/tests.rs @@ -1428,6 +1428,175 @@ fn table_global_seqno() -> crate::Result<()> { Ok(()) } +/// `get_without_filter` must apply the same `global_seqno` normalization as `Table::get`. +/// +/// A table with `global_seqno = 7` stores entry "a1" at internal seqno 1 +/// (logical seqno = 1 + 7 = 8). Querying at seqno 8 should NOT see "a1" +/// because `8 - 7 = 1` and the entry-level check `1 >= 1` filters it out. +/// Before the fix, `get_without_filter` skipped normalization and would +/// pass raw seqno 8 to `point_read`, where `1 >= 8` is false — incorrectly +/// returning the entry. 
+#[test] +#[expect(clippy::unwrap_used)] +fn table_get_without_filter_applies_global_seqno() -> crate::Result<()> { + use crate::ValueType::Value; + + let items = [ + InternalValue::from_components("a0", "a0", 0, Value), + InternalValue::from_components("a1", "a1", 1, Value), + InternalValue::from_components("b", "b", 8, Value), + ]; + + let dir = tempfile::tempdir()?; + let file = dir.path().join("table_gwf_global_seqno"); + + let mut writer = crate::table::Writer::new(file.clone(), 0, 0) + .unwrap() + .use_partitioned_filter() + .use_data_block_size(1) + .use_meta_partition_size(1); + + for item in items.iter().cloned() { + writer.write(item).unwrap(); + } + + let _trailer = writer.finish().unwrap(); + + let table = crate::Table::recover( + file, + crate::Checksum::from_raw(0), + 7, + 0, + Arc::new(crate::Cache::with_capacity_bytes(0)), + Some(Arc::new(crate::DescriptorTable::new(10))), + true, + true, + #[cfg(feature = "metrics")] + Default::default(), + ) + .unwrap(); + + // global_seqno is 7, so "a1" has logical seqno 1+7=8. + // Querying at seqno=8 should NOT return "a1" (normalized: 8-7=1, entry seqno 1 >= 1). + assert!( + table.get_without_filter(b"a1", 8)?.is_none(), + "a1 should be invisible at seqno 8 (logical seqno equals query seqno)", + ); + + // "a0" has logical seqno 0+7=7. Querying at seqno=8: normalized 8-7=1, entry seqno 0 < 1 → visible. + assert_eq!(b"a0", &*table.get_without_filter(b"a0", 8)?.unwrap().value,); + + // "b" has logical seqno 8+7=15. Querying at seqno=8: normalized 1, entry seqno 8 >= 1 → invisible. + assert!( + table.get_without_filter(b"b", 8)?.is_none(), + "b should be invisible at seqno 8 (logical seqno 15 > 8)", + ); + + // Verify consistency: get_without_filter and get agree on every key. 
+ for (key, seqno) in [ + (b"a0" as &[u8], 8), + (b"a1", 8), + (b"b", 8), + (b"a0", 16), + (b"a1", 16), + (b"b", 16), + ] { + let hash = BloomBuilder::get_hash(key); + let with_filter = table.get(key, seqno, hash)?; + let without_filter = table.get_without_filter(key, seqno)?; + assert_eq!( + with_filter.as_ref().map(|v| &*v.value), + without_filter.as_ref().map(|v| &*v.value), + "get and get_without_filter must agree for key={:?} seqno={}", + std::str::from_utf8(key).unwrap_or("?"), + seqno, + ); + } + + Ok(()) +} + +/// Exercises the partition spill inside `PartitionedFilterWriter::register_bytes`. +/// With a prefix extractor, prefix hashes are registered via `register_bytes` +/// rather than `register_key`. Using a tiny partition size (1 byte) forces the +/// filter to spill after every prefix hash. +#[test] +#[expect(clippy::unwrap_used)] +fn table_partitioned_prefix_filter_spills_during_register_bytes() -> crate::Result<()> { + use crate::prefix::FixedLengthExtractor; + + let dir = tempdir()?; + let file = dir.path().join("table_partitioned_prefix_spill"); + let ex: crate::prefix::SharedPrefixExtractor = Arc::new(FixedLengthExtractor::new(3)); + + let mut writer = Writer::new(file.clone(), 0, 0)?; + writer = writer + .use_bloom_policy(BloomConstructionPolicy::BitsPerKey(50.0)) + .use_prefix_extractor(Some(ex.clone())) + .use_partitioned_filter() + .use_meta_partition_size(1); // Force spills on every prefix hash + + for p in [b"aaa", b"bbb", b"ccc", b"ddd"] { + for i in 0..20u32 { + let mut k = p.to_vec(); + k.extend_from_slice(format!("{i:04}").as_bytes()); + writer.write(InternalValue::from_components( + &k, + [], + 0, + crate::ValueType::Value, + ))?; + } + } + let (_, checksum) = writer.finish()?.unwrap(); + + #[cfg(feature = "metrics")] + let metrics = Arc::new(crate::Metrics::default()); + + let table = Table::recover( + file, + checksum, + 0, + 0, + Arc::new(crate::Cache::with_capacity_bytes(1_000_000)), + 
Some(Arc::new(crate::DescriptorTable::new(10))), + true, + true, + #[cfg(feature = "metrics")] + metrics, + )?; + + // Verify the filter was built as a partitioned filter (has a top-level index) + assert!( + table.pinned_filter_index.is_some(), + "expected partitioned filter with top-level index", + ); + + // Verify data is still readable through the prefix filter. + // With a prefix extractor, the filter contains prefix hashes (not full-key + // hashes), so we probe via maybe_contains_prefix rather than get(). + assert_eq!( + Some(true), + table.maybe_contains_prefix(b"aaa0000", ex.as_ref())?, + ); + assert_eq!( + Some(true), + table.maybe_contains_prefix(b"ddd0019", ex.as_ref())?, + ); + // Prefix "zzz" was never written — the filter should reject it + assert_eq!( + Some(false), + table.maybe_contains_prefix(b"zzz0000", ex.as_ref())?, + ); + + // Also verify actual data reads bypass the filter successfully + assert!(table.point_read(b"aaa0000", SeqNo::MAX)?.is_some()); + assert!(table.point_read(b"ddd0019", SeqNo::MAX)?.is_some()); + assert!(table.point_read(b"zzz0000", SeqNo::MAX)?.is_none()); + + Ok(()) +} + #[test] #[expect(clippy::unwrap_used)] fn table_return_global_seqno() -> crate::Result<()> { @@ -1470,3 +1639,154 @@ fn table_return_global_seqno() -> crate::Result<()> { Ok(()) } + +#[test] +#[expect(clippy::unwrap_used)] +fn table_should_skip_range_by_prefix_filter() -> crate::Result<()> { + use crate::prefix::FixedLengthExtractor; + use crate::range::prefix_upper_range; + use std::ops::Bound; + + let dir = tempdir()?; + let file = dir.path().join("table"); + let ex: crate::prefix::SharedPrefixExtractor = Arc::new(FixedLengthExtractor::new(3)); + + // Write a table containing keys with prefixes "aaa" and "bbb" only + let mut writer = Writer::new(file.clone(), 0, 0)?; + writer = writer + .use_bloom_policy(BloomConstructionPolicy::BitsPerKey(50.0)) + .use_prefix_extractor(Some(ex.clone())); + + for p in [b"aaa", b"bbb"] { + for i in 0..20u32 { + let mut k = 
p.to_vec(); + k.extend_from_slice(format!("{i:04}").as_bytes()); + writer.write(InternalValue::from_components( + &k, + [], + 0, + crate::ValueType::Value, + ))?; + } + } + let (_, checksum) = writer.finish()?.unwrap(); + + #[cfg(feature = "metrics")] + let metrics = Arc::new(crate::Metrics::default()); + + let table = Table::recover( + file, + checksum, + 0, + 0, + Arc::new(crate::Cache::with_capacity_bytes(1_000_000)), + Some(Arc::new(crate::DescriptorTable::new(10))), + true, + true, + #[cfg(feature = "metrics")] + metrics, + )?; + + // Absent prefix "zzz": filter should say skip + let prefix = b"zzz00".to_vec(); + let start = Bound::Included(crate::UserKey::from(prefix.clone())); + let end = prefix_upper_range(&prefix); + assert!( + table.should_skip_range_by_prefix_filter(&(start, end), ex.as_ref(), None), + "should skip: table does not contain prefix zzz" + ); + + // Present prefix "aaa": filter should NOT say skip + let prefix = b"aaa00".to_vec(); + let start = Bound::Included(crate::UserKey::from(prefix.clone())); + let end = prefix_upper_range(&prefix); + assert!( + !table.should_skip_range_by_prefix_filter(&(start, end), ex.as_ref(), None), + "should NOT skip: table contains prefix aaa" + ); + + // Incompatible extractor name: should not skip (conservative) + let other_ex: crate::prefix::SharedPrefixExtractor = + Arc::new(crate::prefix::FixedPrefixExtractor::new(3)); + let prefix = b"zzz00".to_vec(); + let start = Bound::Included(crate::UserKey::from(prefix.clone())); + let end = prefix_upper_range(&prefix); + assert!( + !table.should_skip_range_by_prefix_filter(&(start, end), other_ex.as_ref(), None), + "should NOT skip: extractor name mismatch" + ); + + Ok(()) +} + +/// A multi-prefix range (start and end prefixes differ) must never be skipped +/// by the prefix filter, even when the start prefix matches the table's min key +/// prefix. The table may contain keys under other prefixes that fall within the +/// queried range. 
+#[test] +#[expect(clippy::unwrap_used)] +fn table_should_skip_range_multi_prefix_start_match_min_key() -> crate::Result<()> { + use crate::prefix::FixedLengthExtractor; + + let dir = tempdir()?; + let file = dir.path().join("table_multi_prefix_skip"); + let ex: crate::prefix::SharedPrefixExtractor = Arc::new(FixedLengthExtractor::new(3)); + + // Build a table with keys in two prefixes: "aaa" (min key prefix) and "bbb". + // Use a tiny data block size and partition size so the two prefixes land in + // different filter partitions. + let mut writer = Writer::new(file.clone(), 0, 0)?; + writer = writer + .use_bloom_policy(BloomConstructionPolicy::BitsPerKey(50.0)) + .use_prefix_extractor(Some(ex.clone())) + .use_partitioned_filter() + .use_data_block_size(1) + .use_meta_partition_size(1); + + for p in [b"aaa", b"bbb"] { + for i in 0..20u32 { + let mut k = p.to_vec(); + k.extend_from_slice(format!("{i:04}").as_bytes()); + writer.write(InternalValue::from_components( + &k, + [], + 0, + crate::ValueType::Value, + ))?; + } + } + let (_, checksum) = writer.finish()?.unwrap(); + + #[cfg(feature = "metrics")] + let metrics = Arc::new(crate::Metrics::default()); + + let table = Table::recover( + file, + checksum, + 0, + 0, + Arc::new(crate::Cache::with_capacity_bytes(1_000_000)), + Some(Arc::new(crate::DescriptorTable::new(10))), + true, + true, + #[cfg(feature = "metrics")] + metrics, + )?; + + // Range: start = "aaa9999" (after all "aaa" keys), end = "ccc0000". + // start_pref = "aaa", end_pref = "ccc" → sp != ep (multi-prefix range). + // sp == table.min_key_prefix ("aaa") → enters the second optimization block. + // The filter partition for "aaa9999" may not contain hash("aaa") because no + // "aaa" keys exist near "aaa9999" — but "bbb" keys ARE in the table and in range. + // The table must NOT be skipped. 
+ let range = ( + std::ops::Bound::Included(b"aaa9999".to_vec()), + std::ops::Bound::Included(b"ccc0000".to_vec()), + ); + assert!( + !table.should_skip_range_by_prefix_filter(&range, ex.as_ref(), None), + "must NOT skip: table contains 'bbb' keys within the multi-prefix range" + ); + + Ok(()) +} diff --git a/src/table/writer/filter/full.rs b/src/table/writer/filter/full.rs index 66e7c2aa9..50e0f0ca2 100644 --- a/src/table/writer/filter/full.rs +++ b/src/table/writer/filter/full.rs @@ -16,6 +16,11 @@ pub struct FullFilterWriter { pub bloom_hash_buffer: Vec, bloom_policy: BloomConstructionPolicy, + + /// When true, sort+dedup the hash buffer at finish time to eliminate + /// duplicate prefix hashes. Enabled by `enable_dedup()` when a prefix + /// extractor is configured. + needs_dedup: bool, } impl FullFilterWriter { @@ -23,6 +28,7 @@ impl FullFilterWriter { Self { bloom_hash_buffer: Vec::new(), bloom_policy, + needs_dedup: false, } } } @@ -49,47 +55,69 @@ impl FilterWriter for FullFilterWriter { Ok(()) } + fn register_bytes(&mut self, bytes: &[u8]) -> crate::Result<()> { + self.bloom_hash_buffer.push(Builder::get_hash(bytes)); + Ok(()) + } + + fn enable_dedup(&mut self) { + self.needs_dedup = true; + } + fn finish( self: Box, file_writer: &mut sfa::Writer>>, ) -> crate::Result { if self.bloom_hash_buffer.is_empty() { log::trace!("Filter writer has no buffered hashes - not building filter"); - } else { - file_writer.start("filter")?; + return Ok(0); + } - let n = self.bloom_hash_buffer.len(); + file_writer.start("filter")?; - log::trace!( - "Constructing Bloom filter with {n} entries: {:?}", - self.bloom_policy, - ); + let mut hashes = self.bloom_hash_buffer; - let start = std::time::Instant::now(); + // When a prefix extractor is configured, multiple keys can produce the + // same prefix hash. Sort + dedup so the filter is sized for the + // true number of unique prefixes. 
Skipped for the full-key path where + // each hash is already unique (the Writer deduplicates at the user-key + // level before calling register_key). + if self.needs_dedup { + hashes.sort_unstable(); + hashes.dedup(); + } - let filter_bytes = { - let mut builder = self.bloom_policy.init(n); + let n = hashes.len(); - for hash in self.bloom_hash_buffer { - builder.set_with_hash(hash); - } + log::trace!( + "Constructing Bloom filter with {n} entries: {:?}", + self.bloom_policy, + ); - builder.build() - }; + let start = std::time::Instant::now(); - log::trace!( - "Built Bloom filter ({}B) in {:?}", - filter_bytes.len(), - start.elapsed(), - ); + let filter_bytes = { + let mut builder = self.bloom_policy.init(n); - Block::write_into( - file_writer, - &filter_bytes, - crate::table::block::BlockType::Filter, - CompressionType::None, - )?; - } + for hash in hashes { + builder.set_with_hash(hash); + } + + builder.build() + }; + + log::trace!( + "Built Bloom filter ({}B) in {:?}", + filter_bytes.len(), + start.elapsed(), + ); + + Block::write_into( + file_writer, + &filter_bytes, + crate::table::block::BlockType::Filter, + CompressionType::None, + )?; Ok(1) } diff --git a/src/table/writer/filter/mod.rs b/src/table/writer/filter/mod.rs index 892027542..90ca46460 100644 --- a/src/table/writer/filter/mod.rs +++ b/src/table/writer/filter/mod.rs @@ -16,9 +16,34 @@ use std::{fs::File, io::BufWriter}; pub trait FilterWriter { // NOTE: We purposefully use a UserKey instead of &[u8] // so we can clone it without heap allocation, if needed - /// Registers a key in the block index. + /// Registers a key in the filter by hashing it. fn register_key(&mut self, key: &UserKey) -> crate::Result<()>; + /// Registers arbitrary bytes into the filter (used for prefix entries). + /// Implementations should hash the bytes identically to full keys. + fn register_bytes(&mut self, bytes: &[u8]) -> crate::Result<()>; + + /// Enables hash deduplication at flush time. 
Should be called when a prefix + /// extractor is configured, since multiple keys can produce the same prefix + /// hash. Without a prefix extractor, each key produces a unique hash and + /// dedup is unnecessary. + fn enable_dedup(&mut self) {} + + /// Informs the filter writer that a new user key is about to be registered. + /// Implementations may use this to spill an oversized buffered partition + /// on key boundaries, so a partition's TLI key always corresponds to a + /// key whose hashes are fully committed to that partition. No-op for + /// non-partitioned filters. + /// + /// # Errors + /// + /// Returns an error if a partition spill triggered by this call fails. + /// Only possible for partitioned implementations, which perform I/O + /// during spills. + fn notify_key(&mut self, _key: &UserKey) -> crate::Result<()> { + Ok(()) + } + /// Writes the filter to a file. /// /// Returns the number of filter blocks written (always 1 in case of full filter block). diff --git a/src/table/writer/filter/partitioned.rs b/src/table/writer/filter/partitioned.rs index 1cd8068ff..fecd6ca0d 100644 --- a/src/table/writer/filter/partitioned.rs +++ b/src/table/writer/filter/partitioned.rs @@ -34,6 +34,11 @@ pub struct PartitionedFilterWriter { last_key: Option, + /// When true, sort+dedup per-partition hash buffer to eliminate duplicate + /// prefix hashes. Enabled by `enable_dedup()` when a prefix extractor is + /// configured. 
+ needs_dedup: bool, + compression: CompressionType, } @@ -52,12 +57,18 @@ impl PartitionedFilterWriter { relative_file_pos: 0, last_key: None, + needs_dedup: false, compression: CompressionType::None, } } fn spill_filter_partition(&mut self, key: &UserKey) -> crate::Result<()> { + if self.needs_dedup { + self.bloom_hash_buffer.sort_unstable(); + self.bloom_hash_buffer.dedup(); + } + let filter_bytes = { let mut builder = self.bloom_policy.init(self.bloom_hash_buffer.len()); @@ -160,6 +171,26 @@ impl FilterWriter for PartitionedFilterWri self } + fn notify_key(&mut self, key: &UserKey) -> crate::Result<()> { + // If the buffered partition is over the threshold, spill it now, + // before any of the new key's hashes are buffered. The spilled + // partition's TLI key is the *previous* user key, whose hashes are + // all already committed. register_bytes never spills; register_key + // may spill at the end of a user key, after all of that key's + // hashes (prefixes + full) have been buffered. + if self.approx_filter_size >= self.partition_size as usize { + if let Some(prev_key) = self.last_key.clone() { + self.spill_filter_partition(&prev_key)?; + } + } + self.last_key = Some(key.clone()); + Ok(()) + } + + fn enable_dedup(&mut self) { + self.needs_dedup = true; + } + fn register_key(&mut self, key: &UserKey) -> crate::Result<()> { self.bloom_hash_buffer.push(Builder::get_hash(key)); @@ -169,6 +200,10 @@ impl FilterWriter for PartitionedFilterWri self.last_key = Some(key.clone()); + // Spilling here is safe because register_key is called once per user + // key, after all of that key's prefix hashes (if any) have been + // registered. The spilled partition's TLI key correctly maps to a + // key whose hashes are fully present in the partition. 
if self.approx_filter_size >= self.partition_size as usize { self.spill_filter_partition(key)?; } @@ -176,6 +211,20 @@ impl FilterWriter for PartitionedFilterWri Ok(()) } + fn register_bytes(&mut self, bytes: &[u8]) -> crate::Result<()> { + // Buffer the hash without spilling. Mid-key spills are deferred to + // the next `notify_key` (or `register_key` for the same user key) + // so the spilled partition's TLI key always reflects fully committed + // hashes for that key. + self.bloom_hash_buffer.push(Builder::get_hash(bytes)); + + self.approx_filter_size = self + .bloom_policy + .estimated_filter_size(self.bloom_hash_buffer.len()); + + Ok(()) + } + fn finish( mut self: Box, file_writer: &mut sfa::Writer>>, @@ -194,6 +243,14 @@ impl FilterWriter for PartitionedFilterWri self.spill_filter_partition(&last_key)?; } + // If no filter partitions were created (e.g. a prefix extractor was + // configured but every key was shorter than the required prefix length, + // so no hashes were registered), skip writing the empty filter. + if self.tli_handles.is_empty() { + log::trace!("No filter partitions created - not building filter"); + return Ok(0); + } + let index_base_offset = BlockOffset(file_writer.get_mut().stream_position()?); file_writer.start("filter")?; diff --git a/src/table/writer/mod.rs b/src/table/writer/mod.rs index f85d8c845..42906289c 100644 --- a/src/table/writer/mod.rs +++ b/src/table/writer/mod.rs @@ -92,6 +92,14 @@ pub struct Writer { linked_blob_files: Vec, initial_level: u8, + + /// Optional prefix extractor used to register extracted prefixes into the filter. + /// When present, extracted prefixes are registered instead of the full key. + prefix_extractor: Option, + + /// When true, full-key hashes are always added to the filter (even with a + /// prefix extractor), enabling precise point-read filtering. 
+ whole_key_filtering: bool, } impl Writer { @@ -140,6 +148,9 @@ impl Writer { previous_item: None, linked_blob_files: Vec::new(), + + prefix_extractor: None, + whole_key_filtering: true, }) } @@ -162,6 +173,14 @@ impl Writer { pub fn use_partitioned_filter(mut self) -> Self { self.filter_writer = Box::new(filter::PartitionedFilterWriter::new(self.bloom_policy)) .use_tli_compression(self.index_block_compression); + // If a prefix extractor was already configured, propagate dedup to + // the freshly-installed partitioned writer. Without this, callers + // that invoke `use_prefix_extractor` *before* `use_partitioned_filter` + // would silently lose dedup because the previous filter writer + // (where `enable_dedup` was applied) is discarded above. + if self.prefix_extractor.is_some() { + self.filter_writer.enable_dedup(); + } self } @@ -233,6 +252,25 @@ impl Writer { self } + /// Sets the prefix extractor to enable prefix-aware filter construction. + #[must_use] + pub fn use_prefix_extractor( + mut self, + extractor: Option, + ) -> Self { + if extractor.is_some() { + self.filter_writer.enable_dedup(); + } + self.prefix_extractor = extractor; + self + } + + #[must_use] + pub fn use_whole_key_filtering(mut self, enabled: bool) -> Self { + self.whole_key_filtering = enabled; + self + } + /// Writes an item. /// /// # Note @@ -273,7 +311,40 @@ impl Writer { // of the same key if self.bloom_policy.is_active() { - self.filter_writer.register_key(&user_key)?; + // When a prefix extractor is configured, register extracted + // prefix hashes. When whole_key_filtering is also enabled + // (the default), register the full key hash too. This allows: + // - Prefix scans to use the prefix filter (coarse, table-level) + // - Point reads to use the full-key Bloom (precise, when enabled) + // This matches RocksDB's whole_key_filtering + prefix_extractor + // approach. + // + // Order matters for partitioned filters: + // 1. 
notify_key flushes any pending oversized partition + // first, using the *previous* user key as the TLI + // boundary — so partition i covers everything up to and + // including the previous user key. + // 2. Prefix hashes are buffered (register_bytes never spills). + // 3. The full-key hash is registered last; register_key is + // the only spill trigger inside this key, and any spill + // it causes uses the *current* user key as the TLI + // boundary, after all of this key's hashes (prefixes + + // full) are committed to the partition. + // This guarantees a partition's TLI key always corresponds + // to a key whose hashes are fully present in that partition. + if let Some(ref extractor) = self.prefix_extractor { + self.filter_writer.notify_key(&user_key)?; + + for prefix in extractor.extract(user_key.as_ref()) { + self.filter_writer.register_bytes(prefix)?; + } + + if self.whole_key_filtering { + self.filter_writer.register_key(&user_key)?; + } + } else { + self.filter_writer.register_key(&user_key)?; + } } } @@ -418,7 +489,7 @@ impl Writer { InternalValue::from_components(key, value, 0, crate::ValueType::Value) } - let meta_items = [ + let mut meta_items = vec![ meta( "block_count#data", &(self.meta.data_block_count as u64).to_le_bytes(), @@ -453,47 +524,63 @@ impl Writer { meta("item_count", &(self.meta.item_count as u64).to_le_bytes()), meta( "key#max", - // NOTE: At the beginning we check that we have written at least 1 item, so last_key must exist #[expect(clippy::expect_used)] self.meta.last_key.as_ref().expect("should exist"), ), meta( "key#min", - // NOTE: At the beginning we check that we have written at least 1 item, so first_key must exist #[expect(clippy::expect_used)] self.meta.first_key.as_ref().expect("should exist"), ), meta("key_count", &(self.meta.key_count as u64).to_le_bytes()), meta("prefix_truncation#data", &[1]), // NOTE: currently prefix truncation can not be disabled meta("prefix_truncation#index", &[1]), // NOTE: currently prefix 
truncation can not be disabled - meta( - "restart_interval#data", - &self.data_block_restart_interval.to_le_bytes(), - ), - meta( - "restart_interval#index", - &self.index_block_restart_interval.to_le_bytes(), - ), - meta("seqno#max", &self.meta.highest_seqno.to_le_bytes()), - meta("seqno#min", &self.meta.lowest_seqno.to_le_bytes()), - meta("table_id", &self.table_id.to_le_bytes()), - meta("table_version", &[3u8]), - meta( - "tombstone_count", - &(self.meta.tombstone_count as u64).to_le_bytes(), - ), - meta("user_data_size", &self.meta.uncompressed_size.to_le_bytes()), - meta( - "weak_tombstone_count", - &(self.meta.weak_tombstone_count as u64).to_le_bytes(), - ), - meta( - "weak_tombstone_reclaimable", - &(self.meta.weak_tombstone_reclaimable_count as u64).to_le_bytes(), - ), ]; + // Persist the extractor name so recovery can compare it to the current extractor. + // If names differ, disable prefix-based pruning for this table to avoid false negatives. + if let Some(ref extractor) = self.prefix_extractor { + meta_items.push(meta("prefix_extractor", extractor.name().as_bytes())); + // Persist whether this table contains full-key hashes alongside + // prefix hashes. Reads must use this value (not the runtime + // config) to decide whether the full-key Bloom is trustworthy. + // Mismatched config at recovery would otherwise cause data loss. 
+ meta_items.push(meta( + "whole_key_filtering", + &[u8::from(self.whole_key_filtering)], + )); + } + meta_items.push(meta( + "restart_interval#data", + &self.data_block_restart_interval.to_le_bytes(), + )); + meta_items.push(meta( + "restart_interval#index", + &self.index_block_restart_interval.to_le_bytes(), + )); + meta_items.push(meta("seqno#max", &self.meta.highest_seqno.to_le_bytes())); + meta_items.push(meta("seqno#min", &self.meta.lowest_seqno.to_le_bytes())); + meta_items.push(meta("table_id", &self.table_id.to_le_bytes())); + meta_items.push(meta("table_version", &[3u8])); + meta_items.push(meta( + "tombstone_count", + &(self.meta.tombstone_count as u64).to_le_bytes(), + )); + meta_items.push(meta( + "user_data_size", + &self.meta.uncompressed_size.to_le_bytes(), + )); + meta_items.push(meta( + "weak_tombstone_count", + &(self.meta.weak_tombstone_count as u64).to_le_bytes(), + )); + meta_items.push(meta( + "weak_tombstone_reclaimable", + &(self.meta.weak_tombstone_reclaimable_count as u64).to_le_bytes(), + )); + + // Ensure deterministic ordering for metadata entries without cloning keys + meta_items.sort_by(|a, b| a.key.cmp(&b.key)); - // NOTE: Just to make sure the items are definitely sorted #[cfg(debug_assertions)] { let is_sorted = meta_items.iter().is_sorted_by_key(|kv| &kv.key); diff --git a/src/tree/ingest.rs b/src/tree/ingest.rs index 0a588ab6d..7bf826f20 100644 --- a/src/tree/ingest.rs +++ b/src/tree/ingest.rs @@ -94,6 +94,11 @@ impl<'a> Ingestion<'a> { .get(INITIAL_CANONICAL_LEVEL), ); + // Propagate the configured prefix extractor so writers can register extracted + // prefixes and persist the extractor name in table metadata. 
+ writer = writer.use_prefix_extractor(tree.config.prefix_extractor.clone()); + writer = writer.use_whole_key_filtering(tree.config.whole_key_filtering); + if index_partitioning { writer = writer.use_partitioned_index(); } diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 453e9891c..fbdc94ed5 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -162,7 +162,7 @@ impl AbstractTree for Tree { .expect("lock is poisoned") .get_version_for_snapshot(seqno); - Self::get_internal_entry_from_version(&super_version, key, seqno) + Self::get_internal_entry_from_version(&super_version, key, seqno, &self.config) } fn current_version(&self) -> Version { @@ -387,7 +387,12 @@ impl AbstractTree for Tree { Bloom(policy) => policy, None => BloomConstructionPolicy::BitsPerKey(0.0), } - }); + }) + // Ensure tables built during flush carry the configured extractor. + // This lets writers register prefixes and persist the extractor name in metadata + // for compatibility checks at read time. + .use_prefix_extractor(self.config.prefix_extractor.clone()) + .use_whole_key_filtering(self.config.whole_key_filtering); if index_partitioning { table_writer = table_writer.use_partitioned_index(); @@ -673,6 +678,8 @@ impl Tree { range: &'a R, seqno: SeqNo, ephemeral: Option<(Arc, SeqNo)>, + prefix_extractor: Option, + prefix_hint: Option<&[u8]>, ) -> impl DoubleEndedIterator> + 'static { use crate::range::{IterState, TreeIter}; use std::ops::Bound::{self, Excluded, Included, Unbounded}; @@ -691,7 +698,14 @@ impl Tree { let bounds: (Bound, Bound) = (lo, hi); - let iter_state = { IterState { version, ephemeral } }; + let iter_state = { + IterState { + version, + ephemeral, + prefix_extractor, + prefix_hint: prefix_hint.map(Into::into), + } + }; TreeIter::create_range(iter_state, bounds, seqno) } @@ -700,6 +714,7 @@ impl Tree { super_version: &SuperVersion, key: &[u8], seqno: SeqNo, + config: &Config, ) -> crate::Result> { if let Some(entry) = super_version.active_memtable.get(key, seqno) { return 
Ok(ignore_tombstone_value(entry)); @@ -713,13 +728,14 @@ impl Tree { } // Now look in tables... this may involve disk I/O - Self::get_internal_entry_from_tables(&super_version.version, key, seqno) + Self::get_internal_entry_from_tables(&super_version.version, key, seqno, config) } fn get_internal_entry_from_tables( version: &Version, key: &[u8], seqno: SeqNo, + config: &Config, ) -> crate::Result> { // NOTE: Create key hash for hash sharing // https://fjall-rs.github.io/post/bloom-filter-hash-sharing/ @@ -730,7 +746,7 @@ impl Tree { .flat_map(|lvl| lvl.iter()) .filter_map(|run| run.get_for_key(key)) { - if let Some(item) = table.get(key, seqno, key_hash)? { + if let Some(item) = Self::point_read_from_table(config, table, key, seqno, key_hash)? { return Ok(ignore_tombstone_value(item)); } } @@ -840,6 +856,77 @@ impl Tree { .is_empty() } + /// Centralized point-read from a single table with prefix-aware pre-checks and + /// compatibility gating. Returns Ok(None) if the prefix filter definitively excludes + /// the key or if the table lookup returns no match. + fn point_read_from_table( + config: &Config, + table: &Table, + key: &[u8], + seqno: SeqNo, + key_hash: u64, + ) -> crate::Result> { + // Determine compatibility of table's stored extractor with current config + let allow_filter = + table.prefix_filter_allowed(config.prefix_extractor.as_ref().map(|e| e.name())); + + if allow_filter { + if let Some(ex) = config.prefix_extractor.as_ref() { + // Use the TABLE's persisted whole_key_filtering value, NOT the + // runtime config. Otherwise reopening a tree with a different + // config could route reads through the wrong filter type and + // produce false negatives (data loss). + if table.metadata.whole_key_filtering { + // The table's filter contains full-key hashes. The full-key + // Bloom is strictly more precise than the prefix pre-check + // for point reads, so skip the prefix check and go straight + // to the Bloom. 
This avoids a redundant filter probe on + // every point read. + } else { + // Table only has prefix hashes; use the prefix filter as + // the sole pre-check. + let probe = table.maybe_contains_prefix(key, ex.as_ref())?; + + #[cfg(feature = "metrics")] + if probe.is_some() { + use std::sync::atomic::Ordering::Relaxed; + table.metrics.filter_queries.fetch_add(1, Relaxed); + } + + if probe == Some(false) { + #[cfg(feature = "metrics")] + { + use std::sync::atomic::Ordering::Relaxed; + table.metrics.io_skipped_by_filter.fetch_add(1, Relaxed); + } + + return Ok(None); + } + + return table.get_without_filter(key, seqno); + } + } + + return table.get(key, seqno, key_hash); + } + + // Filter compatibility failed (mismatched/missing extractor) but the + // full-key Bloom may still be valid. The filter contains full-key + // hashes whenever `whole_key_filtering` is true; that flag is + // independent of the prefix extractor and is preserved across + // reopens with a different (or absent) extractor. Tables written + // without an extractor parse `whole_key_filtering=true` by default + // (see meta.rs), so legacy tables also take this path. + if table.metadata.whole_key_filtering { + return table.get(key, seqno, key_hash); + } + + // Table was written with an extractor and `whole_key_filtering=false`, + // so its filter contains only prefix hashes that aren't queryable with + // the full-key hash. Skip the filter and read the data block directly. 
+ table.get_without_filter(key, seqno) + } + fn inner_compact( &self, strategy: Arc, @@ -881,7 +968,15 @@ impl Tree { .expect("lock is poisoned") .get_version_for_snapshot(seqno); - Self::create_internal_range(super_version, range, seqno, ephemeral).map(|item| match item { + Self::create_internal_range( + super_version, + range, + seqno, + ephemeral, + self.config.prefix_extractor.clone(), + None, + ) + .map(|item| match item { Ok(kv) => Ok((kv.key.user_key, kv.value)), Err(e) => Err(e), }) @@ -896,8 +991,28 @@ impl Tree { ) -> impl DoubleEndedIterator> + 'static { use crate::range::prefix_to_range; - let range = prefix_to_range(prefix.as_ref()); - self.create_range(&range, seqno, ephemeral) + let prefix_bytes = prefix.as_ref(); + let range = prefix_to_range(prefix_bytes); + + #[expect(clippy::expect_used, reason = "lock is expected to not be poisoned")] + let super_version = self + .version_history + .read() + .expect("lock is poisoned") + .get_version_for_snapshot(seqno); + + Self::create_internal_range( + super_version, + &range, + seqno, + ephemeral, + self.config.prefix_extractor.clone(), + Some(prefix_bytes), + ) + .map(|item| match item { + Ok(kv) => Ok((kv.key.user_key, kv.value)), + Err(e) => Err(e), + }) } /// Adds an item to the active memtable. 
diff --git a/tests/prefix_filter.rs b/tests/prefix_filter.rs new file mode 100644 index 000000000..8b1434d41 --- /dev/null +++ b/tests/prefix_filter.rs @@ -0,0 +1,6456 @@ +use lsm_tree::config::{ + BloomConstructionPolicy, FilterPolicy, FilterPolicyEntry, KvSeparationOptions, PinningPolicy, +}; +use lsm_tree::Guard; +use lsm_tree::SequenceNumberCounter; +use lsm_tree::{ + prefix::{FixedLengthExtractor, FixedPrefixExtractor, FullKeyExtractor, PrefixExtractor}, + AbstractTree, Config, SeqNo, +}; +use std::sync::Arc; + +// Helper function to generate test keys with prefixes +fn generate_test_key(prefix: &str, suffix: &str) -> Vec { + format!("{}{}", prefix, suffix).into_bytes() +} + +/// Tests that an empty filter does not crash the partitioned filter writer (found by fuzz testing) +#[test] +fn test_prefix_filter_partitioned_filter_fuzz_0() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedLengthExtractor::new(100))) + .filter_block_partitioning_policy(PinningPolicy::all(true)) + .open()?; + + for i in 0..50u32 { + let key = format!("zebra_{:04}", i); + tree.insert(key.as_bytes(), b"v", 0); + } + tree.flush_active_memtable(0)?; + + for i in 0..50u32 { + let key = format!("zulu_{:04}", i); + tree.insert(key.as_bytes(), b"v", 0); + } + tree.flush_active_memtable(0)?; + + let count = tree + .range::<&[u8], std::ops::RangeFrom<&[u8]>>((&b"user1_0000"[..]).., u64::MAX, None) + .count(); + assert_eq!(count, 100); + + Ok(()) +} + +#[test] +fn test_prefix_filter_range_start_only_prefix_no_upfront_prune() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let prefix_len = 5; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(prefix_len))) + .open()?; + + for i in 0..50u32 { + let key = 
format!("zebra_{:04}", i); + tree.insert(key.as_bytes(), b"v", 0); + } + tree.flush_active_memtable(0)?; + + for i in 0..50u32 { + let key = format!("zulu_{:04}", i); + tree.insert(key.as_bytes(), b"v", 0); + } + tree.flush_active_memtable(0)?; + + let count = tree + .range::<&[u8], std::ops::RangeFrom<&[u8]>>((&b"user1_0000"[..]).., u64::MAX, None) + .count(); + assert_eq!(count, 100); + + Ok(()) +} + +#[test] +fn test_prefix_filter_with_fixed_prefix() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let prefix_len = 8; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(prefix_len))) + .open()?; + + // Insert keys with common prefixes + let prefix1 = "prefix01"; + let prefix2 = "prefix02"; + + for i in 0..100 { + let key1 = generate_test_key(prefix1, &format!("_{:04}", i)); + let key2 = generate_test_key(prefix2, &format!("_{:04}", i)); + + tree.insert(key1, b"value1", 0); + tree.insert(key2, b"value2", 0); + } + + tree.flush_active_memtable(0)?; + + // Test that keys with matching prefixes are found + for i in 0..100 { + let key1 = generate_test_key(prefix1, &format!("_{:04}", i)); + let key2 = generate_test_key(prefix2, &format!("_{:04}", i)); + + assert!(tree.contains_key(&key1, u64::MAX)?); + assert!(tree.contains_key(&key2, u64::MAX)?); + } + + // Test that keys with non-matching prefixes work correctly + let non_existent_key = generate_test_key("prefix99", "_0000"); + assert!(!tree.contains_key(&non_existent_key, u64::MAX)?); + + #[cfg(feature = "metrics")] + { + // Create a new wide-range segment so an absent prefix falls within it + tree.insert(b"aaaaaaaa_0000", b"lo", 0); + tree.insert(b"zzzzzzzz_0000", b"hi", 0); + tree.flush_active_memtable(0)?; + + let initial_queries = tree.metrics().filter_queries(); + let initial_hits = tree.metrics().io_skipped_by_filter(); + // "prefix05" has 8-byte prefix "prefix05" which is 
absent from this new segment's + // filter (which only has "aaaaaaaa" and "zzzzzzzz") and falls within range. + let _ = tree.contains_key(b"prefix05_0000", u64::MAX)?; + let final_queries = tree.metrics().filter_queries(); + let final_hits = tree.metrics().io_skipped_by_filter(); + assert!( + final_queries > initial_queries, + "filter queries should have increased" + ); + assert!( + final_hits > initial_hits, + "io_skipped_by_filter should have increased" + ); + } + + Ok(()) +} + +#[test] +fn test_prefix_filter_with_fixed_length() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let required_len = 10; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedLengthExtractor::new(required_len))) + .open()?; + + // Insert keys with exactly the required length prefix + for i in 0..50 { + let key = format!("exactlen{:02}_suffix_{}", i, i); + tree.insert(key.as_bytes(), b"value", 0); + } + + // Insert keys that are too short (out of domain) + for i in 0..20 { + let short_key = format!("key{}", i); + tree.insert(short_key.as_bytes(), b"short_value", 0); + } + + tree.flush_active_memtable(0)?; + + // Verify keys with matching length are found + for i in 0..50 { + let key = format!("exactlen{:02}_suffix_{}", i, i); + assert!(tree.contains_key(key.as_bytes(), u64::MAX)?); + } + + // Verify short keys are also found (they're stored but not in filter) + for i in 0..20 { + let short_key = format!("key{}", i); + assert!(tree.contains_key(short_key.as_bytes(), u64::MAX)?); + } + + // Verify non-existent prefix is quickly rejected + // Use a key that matches the required length to ensure it's in-domain + let range = tree.range("nonexist00".."nonexist99", u64::MAX, None); + assert_eq!(range.count(), 0); + + #[cfg(feature = "metrics")] + { + // Look up a key whose 10-byte prefix ("exactlenZZ") is absent but within range + let initial_queries = tree.metrics().filter_queries(); 
+ let initial_hits = tree.metrics().io_skipped_by_filter(); + let _ = tree.contains_key(b"exactlenZZ_suffix_X", u64::MAX)?; + let final_queries = tree.metrics().filter_queries(); + let final_hits = tree.metrics().io_skipped_by_filter(); + assert!( + final_queries > initial_queries, + "filter queries should have increased" + ); + assert!( + final_hits > initial_hits, + "io_skipped_by_filter should have increased" + ); + } + + Ok(()) +} + +#[test] +fn test_prefix_filter_full_key() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + // Using FullKeyExtractor (default behavior) + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FullKeyExtractor)) + .open()?; + + // Insert various keys + let keys = vec![ + b"apple".to_vec(), + b"banana".to_vec(), + b"cherry".to_vec(), + b"date".to_vec(), + b"elderberry".to_vec(), + ]; + + for key in &keys { + tree.insert(key.clone(), b"value", 0); + } + + tree.flush_active_memtable(0)?; + + // All keys should be found + for key in &keys { + assert!(tree.contains_key(key, u64::MAX)?); + } + + // Non-existent key test + assert!(!tree.contains_key(b"fig", u64::MAX)?); + + #[cfg(feature = "metrics")] + { + // Look up a key within range [apple..elderberry] whose full-key prefix is absent. + // "blueberry" is between "banana" and "cherry" lexicographically. 
+ let initial_queries = tree.metrics().filter_queries(); + let initial_hits = tree.metrics().io_skipped_by_filter(); + let _ = tree.contains_key(b"blueberry", u64::MAX)?; + let final_queries = tree.metrics().filter_queries(); + let final_hits = tree.metrics().io_skipped_by_filter(); + assert!( + final_queries > initial_queries, + "filter queries should have increased" + ); + assert!( + final_hits > initial_hits, + "io_skipped_by_filter should have increased" + ); + } + + Ok(()) +} + +#[test] +fn test_prefix_filter_range_queries() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let prefix_len = 5; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(prefix_len))) + .open()?; + + // Insert keys with common prefixes + let prefixes = vec!["user_", "post_", "comm_"]; + + for prefix in &prefixes { + for i in 0..20 { + let key = format!("{}{:04}", prefix, i); + tree.insert(key.as_bytes(), format!("value_{}", i).as_bytes(), 0); + } + } + + tree.flush_active_memtable(0)?; + + // Test prefix iteration + for prefix in &prefixes { + let start_key = prefix.to_string(); + let end_key = format!("{}~", prefix); // '~' is after all digits and letters + + let count = tree + .range(start_key.as_bytes()..end_key.as_bytes(), u64::MAX, None) + .count(); + assert_eq!(count, 20); + } + + // Test non-existent prefix range + let count = tree + .range(&b"none_"[..]..&b"none~"[..], u64::MAX, None) + .count(); + assert_eq!(count, 0); + + #[cfg(feature = "metrics")] + { + // Look up a key with absent 5-byte prefix "none_" that is within range [comm_..user~] + let initial_queries = tree.metrics().filter_queries(); + let initial_hits = tree.metrics().io_skipped_by_filter(); + let _ = tree.contains_key(b"none_0000", u64::MAX)?; + let final_queries = tree.metrics().filter_queries(); + let final_hits = tree.metrics().io_skipped_by_filter(); + assert!( + final_queries 
> initial_queries, + "filter queries should have increased" + ); + assert!( + final_hits > initial_hits, + "io_skipped_by_filter should have increased" + ); + } + + Ok(()) +} + +#[test] +fn test_prefix_filter_after_compaction() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let prefix_len = 6; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(prefix_len))) + .open()?; + + // Insert first batch of keys + for i in 0..50 { + let key = format!("batch1_{:04}", i); + tree.insert(key.as_bytes(), b"value1", 0); + } + + tree.flush_active_memtable(0)?; + + // Insert second batch with overlapping keys + for i in 25..75 { + let key = format!("batch1_{:04}", i); + tree.insert(key.as_bytes(), b"value2", 0); + } + + tree.flush_active_memtable(0)?; + + // Force compaction + use lsm_tree::compaction::Leveled; + tree.compact(Arc::new(Leveled::default()), 0)?; + + // All keys should still be found after compaction + for i in 0..75 { + let key = format!("batch1_{:04}", i); + assert!(tree.contains_key(key.as_bytes(), u64::MAX)?); + } + + #[cfg(feature = "metrics")] + { + // Create a new wide-range segment so an absent prefix falls within it + tree.insert(b"aaaaaa_0000", b"lo", 0); + tree.insert(b"zzzzzz_0000", b"hi", 0); + tree.flush_active_memtable(0)?; + + let initial_queries = tree.metrics().filter_queries(); + let initial_hits = tree.metrics().io_skipped_by_filter(); + // "batchM" is between "aaaaaa" and "zzzzzz" and not in the filter + let _ = tree.contains_key(b"batchM_0000", u64::MAX)?; + let final_queries = tree.metrics().filter_queries(); + let final_hits = tree.metrics().io_skipped_by_filter(); + assert!( + final_queries > initial_queries, + "filter queries should have increased after compaction" + ); + assert!( + final_hits > initial_hits, + "io_skipped_by_filter should have increased" + ); + } + + Ok(()) +} + +#[test] +fn 
test_prefix_filter_with_deletions() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let prefix_len = 7; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(prefix_len))) + .open()?; + + // Insert keys + for i in 0..100 { + let key = format!("deltest_{:04}", i); + tree.insert(key.as_bytes(), b"value", 0); + } + // Sentinel with a different 7-byte prefix to widen the key range + tree.insert(b"zzzzzzz_sentinel", b"value", 0); + + tree.flush_active_memtable(0)?; + + // Delete some keys + for i in (0..100).step_by(2) { + let key = format!("deltest_{:04}", i); + tree.remove(key.as_bytes(), 0); + } + + tree.flush_active_memtable(0)?; + + #[cfg(feature = "metrics")] + let initial_queries = tree.metrics().filter_queries(); + #[cfg(feature = "metrics")] + let initial_hits = tree.metrics().io_skipped_by_filter(); + + // Verify deletions + for i in 0..100 { + let key = format!("deltest_{:04}", i); + if i % 2 == 0 { + assert!(!tree.contains_key(key.as_bytes(), u64::MAX)?); + } else { + assert!(tree.contains_key(key.as_bytes(), u64::MAX)?); + } + } + + #[cfg(feature = "metrics")] + { + // Look up a key with a prefix absent from the filter to verify + // the filter is functional after deletions. + let before_q = tree.metrics().filter_queries(); + let before_s = tree.metrics().io_skipped_by_filter(); + + // "delzzz_0000" has prefix "delzz..." absent from filter, + // and falls within the table's key range. 
+ assert!(!tree.contains_key(b"delzzzz_0000", u64::MAX)?); + + let after_q = tree.metrics().filter_queries(); + let after_s = tree.metrics().io_skipped_by_filter(); + + assert!( + after_q > before_q, + "filter should be consulted for absent-prefix key after deletions" + ); + assert!( + after_s > before_s, + "filter should skip absent-prefix key after deletions" + ); + } + + Ok(()) +} + +#[test] +fn test_prefix_filter_edge_cases() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + // Test with prefix length of 1 + let tree = Config::new( + folder.path().join("test1"), + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(1))) + .open()?; + + tree.insert(b"a", b"value", 0); + tree.insert(b"b", b"value", 0); + tree.insert(b"ab", b"value", 0); + tree.insert(b"ba", b"value", 0); + tree.insert(b"d", b"value", 0); // widen range so "c" prefix is within [a..d] + + tree.flush_active_memtable(0)?; + + assert!(tree.contains_key(b"a", u64::MAX)?); + assert!(tree.contains_key(b"b", u64::MAX)?); + assert!(tree.contains_key(b"ab", u64::MAX)?); + assert!(tree.contains_key(b"ba", u64::MAX)?); + + #[cfg(feature = "metrics")] + { + // Look up a key with absent 1-byte prefix "c" within range [a..d] + let initial_queries = tree.metrics().filter_queries(); + let initial_hits = tree.metrics().io_skipped_by_filter(); + let _ = tree.contains_key(b"c", u64::MAX)?; + let final_queries = tree.metrics().filter_queries(); + let final_hits = tree.metrics().io_skipped_by_filter(); + assert!( + final_queries > initial_queries, + "filter queries should have increased for point lookups" + ); + assert!( + final_hits > initial_hits, + "io_skipped_by_filter should have increased" + ); + } + + // Test with empty keys + let tree2 = Config::new( + folder.path().join("test2"), + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(5))) + 
.open()?; + + tree2.insert(b"test", b"short_key", 0); + tree2.insert(b"longer_key", b"long_key", 0); + + tree2.flush_active_memtable(0)?; + + assert!(tree2.contains_key(b"test", u64::MAX)?); + assert!(tree2.contains_key(b"longer_key", u64::MAX)?); + + #[cfg(feature = "metrics")] + { + // Look up a key with absent 5-byte prefix "mzzzz" within range [longer_key..test] + let initial_queries2 = tree2.metrics().filter_queries(); + let initial_hits2 = tree2.metrics().io_skipped_by_filter(); + let _ = tree2.contains_key(b"mzzzz_key", u64::MAX)?; + let final_queries2 = tree2.metrics().filter_queries(); + let final_hits2 = tree2.metrics().io_skipped_by_filter(); + assert!( + final_queries2 > initial_queries2, + "filter queries should have increased for short/long key lookups" + ); + assert!( + final_hits2 > initial_hits2, + "io_skipped_by_filter should have increased" + ); + } + + Ok(()) +} + +#[test] +fn test_prefix_filter_large_dataset() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let prefix_len = 12; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(prefix_len))) + .open()?; + + // Insert a large number of keys with various prefixes + let prefixes = vec![ + "transaction_", + "userprofile_", + "sessiondata_", + "logentryval_", + ]; + + for prefix in &prefixes { + for i in 0..1000 { + let key = format!("{}{:08}", prefix, i); + let value = format!("data_{}", i); + tree.insert(key.as_bytes(), value.as_bytes(), 0); + + // Flush periodically to create multiple segments + if i % 250 == 249 { + tree.flush_active_memtable(0)?; + } + } + } + + // Final flush + tree.flush_active_memtable(0)?; + + // Verify all keys are found + for prefix in &prefixes { + for i in 0..1000 { + let key = format!("{}{:08}", prefix, i); + assert!( + tree.contains_key(key.as_bytes(), u64::MAX)?, + "Key {} not found", + key + ); + } + } + + // Test non-existent keys 
with matching prefixes + for prefix in &prefixes { + let non_existent_key = format!("{}{:08}", prefix, 9999); + assert!(!tree.contains_key(non_existent_key.as_bytes(), u64::MAX)?); + } + + #[cfg(feature = "metrics")] + { + // Create a new wide-range segment so an absent prefix falls within it + tree.insert(b"aaaaaaaaaaaa00000000", b"lo", 0); + tree.insert(b"zzzzzzzzzzzz00000000", b"hi", 0); + tree.flush_active_memtable(0)?; + + let initial_queries = tree.metrics().filter_queries(); + let initial_hits = tree.metrics().io_skipped_by_filter(); + // "mmmmmmmmmmmm00000000" has 12-byte prefix "mmmmmmmmmmmm" which is + // absent from the new segment's filter and within its range. + let _ = tree.contains_key(b"mmmmmmmmmmmm00000000", u64::MAX)?; + let final_queries = tree.metrics().filter_queries(); + let final_hits = tree.metrics().io_skipped_by_filter(); + assert!( + final_queries > initial_queries, + "filter queries should have increased for large dataset" + ); + assert!( + final_hits > initial_hits, + "io_skipped_by_filter should have increased" + ); + } + + Ok(()) +} + +#[test] +fn test_prefix_filter_concurrent_access() -> lsm_tree::Result<()> { + use std::thread; + + let folder = tempfile::tempdir()?; + let prefix_len = 8; + + let tree = Arc::new( + Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(prefix_len))) + .open()?, + ); + + // Spawn multiple threads to insert data + let handles: Vec<_> = (0..4) + .map(|thread_id| { + let tree = Arc::clone(&tree); + thread::spawn(move || { + for i in 0..250 { + let key = format!("thread{:02}_{:04}", thread_id, i); + tree.insert(key.as_bytes(), b"value", 0); + } + }) + }) + .collect(); + + // Wait for all threads to complete + for handle in handles { + handle.join().unwrap(); + } + + // Add a key with prefix "thread99" to widen the range for filter testing + tree.insert(b"thread99_0000", b"value", 0); + + 
tree.flush_active_memtable(0)?; + + // Verify all keys from all threads + for thread_id in 0..4 { + for i in 0..250 { + let key = format!("thread{:02}_{:04}", thread_id, i); + assert!(tree.contains_key(key.as_bytes(), u64::MAX)?); + } + } + + #[cfg(feature = "metrics")] + { + // Look up a key with absent 8-byte prefix "thread50" within range [thread00..thread99] + let initial_queries = tree.metrics().filter_queries(); + let initial_hits = tree.metrics().io_skipped_by_filter(); + let _ = tree.contains_key(b"thread50_0000", u64::MAX)?; + let final_queries = tree.metrics().filter_queries(); + let final_hits = tree.metrics().io_skipped_by_filter(); + assert!( + final_queries > initial_queries, + "filter queries should have increased for concurrent access" + ); + assert!( + final_hits > initial_hits, + "io_skipped_by_filter should have increased" + ); + } + + Ok(()) +} + +#[test] +fn test_prefix_filter_sequence_consistency() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let prefix_len = 9; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(prefix_len))) + .open()?; + + // Insert initial data with sequence number 0-49 + for i in 0..50 { + let key = format!("seqtest1_{:04}", i); + tree.insert(key.as_bytes(), b"v1", i as u64); + } + // Sentinel with a different 9-byte prefix to widen the key range + tree.insert(b"zzzzzzzzz_sentinel", b"v1", 0); + + tree.flush_active_memtable(0)?; + + // Insert more data with sequence numbers 50-99 + for i in 50..100 { + let key = format!("seqtest1_{:04}", i); + tree.insert(key.as_bytes(), b"v2", i as u64); + } + + tree.flush_active_memtable(0)?; + + // Verify that at sequence number 50, only the first 50 keys are visible + // (keys inserted at seqno 0-49 are visible at seqno >= their insert seqno) + for i in 0..50 { + let key = format!("seqtest1_{:04}", i); + assert!(tree.contains_key(key.as_bytes(), 
50)?); + } + + for i in 50..100 { + let key = format!("seqtest1_{:04}", i); + assert!(!tree.contains_key(key.as_bytes(), 50)?); + } + + // Verify tree sees all data at max sequence number + for i in 0..100 { + let key = format!("seqtest1_{:04}", i); + assert!(tree.contains_key(key.as_bytes(), u64::MAX)?); + } + + #[cfg(feature = "metrics")] + { + // Look up a key with an absent prefix to verify filter is functional + let before = tree.metrics().filter_queries(); + + assert!(!tree.contains_key(b"seqzzzzz_0000", u64::MAX)?); + + let after = tree.metrics().filter_queries(); + assert!( + after > before, + "filter queries should have increased for sequence consistency checks" + ); + } + + Ok(()) +} + +#[test] +fn test_prefix_filter_seek_optimization() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let prefix_len = 8; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(prefix_len))) + .open()?; + + // Insert keys with specific prefixes + for i in 0..100 { + let key = format!("prefix_a_{:04}", i); + tree.insert(key.as_bytes(), b"value_a", 0); + } + + for i in 0..100 { + let key = format!("prefix_b_{:04}", i); + tree.insert(key.as_bytes(), b"value_b", 0); + } + + tree.flush_active_memtable(0)?; + + // Seek with existing prefix should find keys + let range_a = tree.range("prefix_a_0000".."prefix_a_9999", u64::MAX, None); + assert_eq!(range_a.count(), 100); + + // Seek with non-existent prefix should return empty (optimized via filter) + let range_c = tree.range("prefix_c_0000".."prefix_c_9999", u64::MAX, None); + assert_eq!(range_c.count(), 0); + + #[cfg(feature = "metrics")] + { + // Create a new wide-range segment so an absent prefix falls within it + tree.insert(b"aaaaaaaa_0000", b"lo", 0); + tree.insert(b"zzzzzzzz_0000", b"hi", 0); + tree.flush_active_memtable(0)?; + + let initial_queries = tree.metrics().filter_queries(); + let 
initial_hits = tree.metrics().io_skipped_by_filter(); + // "prefix_m" has 8-byte prefix "prefix_m" which is absent from the new + // segment's filter and falls within its range [aaaaaaaa..zzzzzzzz]. + let _ = tree.contains_key(b"prefix_m_0000", u64::MAX)?; + let final_queries = tree.metrics().filter_queries(); + let final_hits = tree.metrics().io_skipped_by_filter(); + assert!( + final_queries > initial_queries, + "filter queries should have increased for range operations" + ); + assert!( + final_hits > initial_hits, + "io_skipped_by_filter should have increased" + ); + } + + // Verify partial prefix matches work + let range_partial = tree.range("prefix_a_0050".."prefix_a_0060", u64::MAX, None); + assert_eq!(range_partial.count(), 10); + + Ok(()) +} + +#[test] +fn test_no_prefix_extractor() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + // Create tree without prefix extractor (default behavior) + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .open()?; + + // Insert various keys + for i in 0..100 { + let key = format!("noprefix_{:04}", i); + tree.insert(key.as_bytes(), b"value", 0); + } + + tree.flush_active_memtable(0)?; + + // All keys should be found (full key matching) + for i in 0..100 { + let key = format!("noprefix_{:04}", i); + assert!(tree.contains_key(key.as_bytes(), u64::MAX)?); + } + + #[cfg(feature = "metrics")] + { + // Look up a key that does NOT exist but falls within the table's key + // range so the filter is actually consulted. Under the current metrics + // semantics (issue #246), filter_queries only increments when the filter + // definitively excludes a key or when the filter lets a key through but + // the key is not found (false positive / wasted I/O). 
+ let initial_queries = tree.metrics().filter_queries(); + + assert!(!tree.contains_key(b"noprefix_0050x", u64::MAX)?); + + let final_queries = tree.metrics().filter_queries(); + assert!( + final_queries > initial_queries, + "filter queries should increment for a missing key lookup" + ); + } + + Ok(()) +} + +// Custom segmented prefix extractor for account_id#user_id pattern +struct SegmentedPrefixExtractor { + delimiter: u8, +} + +impl SegmentedPrefixExtractor { + fn new(delimiter: u8) -> Self { + Self { delimiter } + } +} + +impl PrefixExtractor for SegmentedPrefixExtractor { + fn extract<'a>(&self, key: &'a [u8]) -> Box + 'a> { + let mut prefixes = Vec::new(); + + // Find the first delimiter position + if let Some(first_delim_pos) = key.iter().position(|&b| b == self.delimiter) { + // Add the prefix up to the first delimiter (account_id) + prefixes.push(&key[..first_delim_pos]); + + // Find the second delimiter position + if let Some(second_delim_pos) = key[first_delim_pos + 1..] + .iter() + .position(|&b| b == self.delimiter) + { + // Add the prefix up to the second delimiter (account_id#user_id) + let full_prefix_end = first_delim_pos + 1 + second_delim_pos; + prefixes.push(&key[..full_prefix_end]); + } else { + // If no second delimiter, use the entire key as prefix + prefixes.push(key); + } + } else { + // No delimiter found, use the entire key + prefixes.push(key); + } + + Box::new(prefixes.into_iter()) + } + + fn name(&self) -> &str { + "SegmentedPrefixExtractor" + } +} + +#[test] +fn test_prefix_filter_segmented_extractor() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let delimiter = b'#'; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(SegmentedPrefixExtractor::new(delimiter))) + .open()?; + + // Insert keys with account_id#user_id#data pattern + let account1 = "acc001"; + let account2 = "acc002"; + + // Insert users for account1 + for 
user_id in 1..=5 { + for data_id in 1..=10 { + let key = format!("{}#user{:03}#data{:04}", account1, user_id, data_id); + let value = format!("value_{}_{}", user_id, data_id); + tree.insert(key.as_bytes(), value.as_bytes(), 0); + } + } + + // Insert users for account2 + for user_id in 1..=3 { + for data_id in 1..=10 { + let key = format!("{}#user{:03}#data{:04}", account2, user_id, data_id); + let value = format!("value_{}_{}", user_id, data_id); + tree.insert(key.as_bytes(), value.as_bytes(), 0); + } + } + + tree.flush_active_memtable(0)?; + + #[cfg(feature = "metrics")] + let initial_queries = tree.metrics().filter_queries(); + + // Test 1: Query for specific user within account1 + let user_key = format!("{}#user002#data0005", account1); + assert!(tree.contains_key(user_key.as_bytes(), u64::MAX)?); + + // Test 2: Query for all data of a specific user (prefix range query) + let user_prefix_start = format!("{}#user002#", account1); + let user_prefix_end = format!("{}#user002~", account1); // ~ is after # + let user_range = tree.range( + user_prefix_start.as_bytes()..user_prefix_end.as_bytes(), + u64::MAX, + None, + ); + assert_eq!(user_range.count(), 10); // Should find 10 data items for this user + + // Test 3: Query for all users in account1 (account-level prefix) + let account_prefix_start = format!("{}#", account1); + let account_prefix_end = format!("{}~", account1); // ~ is after # + let account_range = tree.range( + account_prefix_start.as_bytes()..account_prefix_end.as_bytes(), + u64::MAX, + None, + ); + assert_eq!(account_range.count(), 50); // 5 users * 10 data items + + // Test 4: Query for non-existent account + let non_existent_start = "acc999#"; + let non_existent_end = "acc999~"; + let non_existent_range = tree.range( + non_existent_start.as_bytes()..non_existent_end.as_bytes(), + u64::MAX, + None, + ); + assert_eq!(non_existent_range.count(), 0); + + // Test 5: Query for non-existent user in existing account + let non_user_key = 
format!("{}#user999#data0001", account1); + assert!(!tree.contains_key(non_user_key.as_bytes(), u64::MAX)?); + + #[cfg(feature = "metrics")] + { + let final_queries = tree.metrics().filter_queries(); + // Should have filter queries for all lookups + assert!( + final_queries > initial_queries, + "filter queries should have increased for segmented lookups" + ); + } + + Ok(()) +} + +#[test] +fn test_prefix_filter_single_byte_keys() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(2))) + .open()?; + + // Insert single-byte keys + for i in 0u8..10 { + tree.insert([i], format!("value_{}", i).as_bytes(), 0); + } + + // Insert two-byte keys + for i in 0u8..10 { + tree.insert([i, i], format!("value_{}{}", i, i).as_bytes(), 0); + } + + tree.flush_active_memtable(0)?; + + // All keys should be found + for i in 0u8..10 { + assert!(tree.contains_key([i], u64::MAX)?); + assert!(tree.contains_key([i, i], u64::MAX)?); + } + + // Non-existent single-byte key + assert!(!tree.contains_key([255], u64::MAX)?); + + #[cfg(feature = "metrics")] + { + // Look up a key with absent 2-byte prefix [0x05, 0xAA] within range [0x00..0x09,0x09] + // Using 0xAA to make collision with existing prefixes extremely unlikely. 
+ let initial_queries = tree.metrics().filter_queries(); + let initial_hits = tree.metrics().io_skipped_by_filter(); + let _ = tree.contains_key(&[0x05, 0xAA, 0xBB], u64::MAX)?; + let final_queries = tree.metrics().filter_queries(); + let final_hits = tree.metrics().io_skipped_by_filter(); + assert!( + final_queries > initial_queries, + "filter queries should increase for single/two-byte key lookups" + ); + assert!( + final_hits > initial_hits, + "io_skipped_by_filter should have increased" + ); + } + + Ok(()) +} + +#[test] +fn test_prefix_filter_null_bytes() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(3))) + .open()?; + + // Insert keys with null bytes + tree.insert(b"\0\0\0data", b"null_prefix", 0); + tree.insert(b"pre\0fix", b"null_middle", 0); + tree.insert(b"suffix\0", b"null_end", 0); + tree.insert(b"\0", b"single_null", 0); + + tree.flush_active_memtable(0)?; + + #[cfg(feature = "metrics")] + let initial_queries = tree.metrics().filter_queries(); + + // All keys should be found + assert!(tree.contains_key(b"\0\0\0data", u64::MAX)?); + assert!(tree.contains_key(b"pre\0fix", u64::MAX)?); + assert!(tree.contains_key(b"suffix\0", u64::MAX)?); + assert!(tree.contains_key(b"\0", u64::MAX)?); + + // Non-existent keys with null bytes + assert!(!tree.contains_key(b"\0\0\0missing", u64::MAX)?); + assert!(!tree.contains_key(b"pre\0missing", u64::MAX)?); + + #[cfg(feature = "metrics")] + { + let final_queries = tree.metrics().filter_queries(); + + assert!( + final_queries > initial_queries, + "filter queries should increase for null byte key lookups" + ); + } + + Ok(()) +} + +#[test] +fn test_prefix_filter_non_ascii() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + 
SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(6))) + .open()?; + + // Insert keys with UTF-8 characters + tree.insert("prefix_测试_data".as_bytes(), b"chinese", 0); + tree.insert("prefix_тест_data".as_bytes(), b"cyrillic", 0); + tree.insert("prefix_🦀_data".as_bytes(), b"emoji", 0); + tree.insert("prefix_café".as_bytes(), b"accented", 0); + + // Insert binary keys (non-UTF8) + tree.insert([0xFF, 0xFE, 0xFD, 0xFC, 0xFB, 0xFA], b"binary", 0); + + tree.flush_active_memtable(0)?; + + #[cfg(feature = "metrics")] + let initial_queries = tree.metrics().filter_queries(); + + // All keys should be found + assert!(tree.contains_key("prefix_测试_data".as_bytes(), u64::MAX)?); + assert!(tree.contains_key("prefix_тест_data".as_bytes(), u64::MAX)?); + assert!(tree.contains_key("prefix_🦀_data".as_bytes(), u64::MAX)?); + assert!(tree.contains_key("prefix_café".as_bytes(), u64::MAX)?); + assert!(tree.contains_key([0xFF, 0xFE, 0xFD, 0xFC, 0xFB, 0xFA], u64::MAX)?); + + // Non-existent keys + assert!(!tree.contains_key("prefix_missing".as_bytes(), u64::MAX)?); + assert!(!tree.contains_key([0xFF, 0xFE, 0xFD, 0x00, 0x00, 0x00], u64::MAX)?); + + #[cfg(feature = "metrics")] + { + let final_queries = tree.metrics().filter_queries(); + + assert!( + final_queries > initial_queries, + "filter queries should increase for non-ASCII key lookups" + ); + } + + Ok(()) +} + +#[test] +fn test_prefix_filter_keys_as_prefixes() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .open()?; + + // Insert keys where some are prefixes of others + tree.insert(b"a", b"value1", 0); + tree.insert(b"ab", b"value2", 0); + tree.insert(b"abc", b"value3", 0); + tree.insert(b"abcd", b"value4", 0); + tree.insert(b"abcde", b"value5", 0); + tree.insert(b"abcdef", b"value6", 0); + + 
tree.flush_active_memtable(0)?; + + // All keys should be found regardless of prefix relationships + assert!(tree.contains_key(b"a", u64::MAX)?); + assert!(tree.contains_key(b"ab", u64::MAX)?); + assert!(tree.contains_key(b"abc", u64::MAX)?); + assert!(tree.contains_key(b"abcd", u64::MAX)?); + assert!(tree.contains_key(b"abcde", u64::MAX)?); + assert!(tree.contains_key(b"abcdef", u64::MAX)?); + + // Non-existent keys with same prefix + assert!(!tree.contains_key(b"abcdx", u64::MAX)?); + assert!(!tree.contains_key(b"abx", u64::MAX)?); + + #[cfg(feature = "metrics")] + { + // Look up a key with absent 4-byte prefix "abcc" within range [a..abcdef] + // "abcc" is not among existing prefixes (a, ab, abc, abcd) and + // "abcc_test" < "abcdef", so it falls within the table's key range. + let initial_queries = tree.metrics().filter_queries(); + let initial_hits = tree.metrics().io_skipped_by_filter(); + let _ = tree.contains_key(b"abcc_test", u64::MAX)?; + let final_queries = tree.metrics().filter_queries(); + let final_hits = tree.metrics().io_skipped_by_filter(); + assert!( + final_queries > initial_queries, + "filter queries should increase for prefix-related key lookups" + ); + assert!( + final_hits > initial_hits, + "io_skipped_by_filter should have increased" + ); + } + + Ok(()) +} + +#[test] +fn test_prefix_filter_very_long_keys() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(10))) + .open()?; + + // Create very long keys + let long_key1 = vec![b'a'; 10000]; + let long_key2 = vec![b'b'; 10000]; + let mut long_key3 = vec![b'c'; 5000]; + long_key3.extend(vec![b'd'; 5000]); + + tree.insert(&long_key1, b"long1", 0); + tree.insert(&long_key2, b"long2", 0); + tree.insert(&long_key3, b"long3", 0); + + tree.flush_active_memtable(0)?; + + // All long keys should be found + 
assert!(tree.contains_key(&long_key1, u64::MAX)?); + assert!(tree.contains_key(&long_key2, u64::MAX)?); + assert!(tree.contains_key(&long_key3, u64::MAX)?); + + // Non-existent long key + let non_existent = vec![b'x'; 10000]; + assert!(!tree.contains_key(&non_existent, u64::MAX)?); + + #[cfg(feature = "metrics")] + { + // Create a new wide-range segment so an absent prefix falls within it + let lo_key = vec![b'A'; 10000]; // 'A' < 'a' + let hi_key = vec![b'z'; 10000]; + tree.insert(&lo_key, b"lo", 0); + tree.insert(&hi_key, b"hi", 0); + tree.flush_active_memtable(0)?; + + let initial_queries = tree.metrics().filter_queries(); + let initial_hits = tree.metrics().io_skipped_by_filter(); + // 10-byte prefix "eeeeeeeeee" is between "AAAAAAAAAA" and "zzzzzzzzzz" + // and not in the new segment's filter + let absent_key = vec![b'e'; 10000]; + let _ = tree.contains_key(&absent_key, u64::MAX)?; + let final_queries = tree.metrics().filter_queries(); + let final_hits = tree.metrics().io_skipped_by_filter(); + + assert!( + final_queries > initial_queries, + "filter queries should increase for very long key lookups" + ); + assert!( + final_hits > initial_hits, + "io_skipped_by_filter should have increased" + ); + } + + Ok(()) +} + +#[test] +fn test_prefix_filter_all_same_byte() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(5))) + .open()?; + + // Insert keys that are all the same byte + for len in 1..=10 { + let key = vec![b'x'; len]; + tree.insert(&key, format!("value_{}", len).as_bytes(), 0); + } + + tree.flush_active_memtable(0)?; + + // All keys should be found + for len in 1..=10 { + let key = vec![b'x'; len]; + assert!(tree.contains_key(&key, u64::MAX)?); + } + + // Non-existent key with same pattern + assert!(!tree.contains_key(vec![b'x'; 15], u64::MAX)?); + 
assert!(!tree.contains_key(vec![b'y'; 5], u64::MAX)?);
+
+ #[cfg(feature = "metrics")]
+ {
+ // Create a new wide-range segment so an absent prefix falls within it
+ tree.insert(&vec![b'a'; 5], b"lo", 0);
+ tree.insert(&vec![b'z'; 5], b"hi", 0);
+ tree.flush_active_memtable(0)?;
+
+ let initial_queries = tree.metrics().filter_queries();
+ let initial_hits = tree.metrics().io_skipped_by_filter();
+ // 5-byte prefix "mmmmm" is between "aaaaa" and "zzzzz" and not in the filter
+ let _ = tree.contains_key(&vec![b'm'; 5], u64::MAX)?;
+ let final_queries = tree.metrics().filter_queries();
+ let final_hits = tree.metrics().io_skipped_by_filter();
+
+ assert!(
+ final_queries > initial_queries,
+ "filter queries should increase for same-byte key lookups"
+ );
+ assert!(
+ final_hits > initial_hits,
+ "io_skipped_by_filter should have increased"
+ );
+ }
+
+ Ok(())
+}
+
+// Custom extractor that returns many prefixes for stress testing
+struct ManyPrefixExtractor;
+
+impl PrefixExtractor for ManyPrefixExtractor {
+ fn extract<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {
+ let mut prefixes = Vec::new();
+
+ // Generate all possible prefixes (up to 20 or key length)
+ for i in 1..=key.len().min(20) {
+ prefixes.push(&key[0..i]);
+ }
+
+ // Also add the full key
+ if !prefixes.is_empty() {
+ prefixes.push(key);
+ }
+
+ Box::new(prefixes.into_iter())
+ }
+
+ fn name(&self) -> &str {
+ "ManyPrefixExtractor"
+ }
+}
+
+#[test]
+fn test_prefix_filter_many_prefixes() -> lsm_tree::Result<()> {
+ let folder = tempfile::tempdir()?;
+
+ let tree = Config::new(
+ &folder,
+ SequenceNumberCounter::default(),
+ SequenceNumberCounter::default(),
+ )
+ .prefix_extractor(Arc::new(ManyPrefixExtractor))
+ .open()?;
+
+ // Insert keys that will generate many prefixes
+ tree.insert(b"this_is_a_very_long_key_for_testing", b"value1", 0);
+ tree.insert(b"another_long_key_with_many_prefixes", b"value2", 0);
+ tree.insert(b"short", b"value3", 0);
+
+ tree.flush_active_memtable(0)?;
+
+ #[cfg(feature = 
"metrics")] + let initial_queries = tree.metrics().filter_queries(); + + // All keys should be found + assert!(tree.contains_key(b"this_is_a_very_long_key_for_testing", u64::MAX)?); + assert!(tree.contains_key(b"another_long_key_with_many_prefixes", u64::MAX)?); + assert!(tree.contains_key(b"short", u64::MAX)?); + + // Test non-existent key + assert!(!tree.contains_key(b"non_existent_key_with_many_prefixes", u64::MAX)?); + + // Range queries should work with many prefixes + let range = tree.range(b"this".as_ref().., u64::MAX, None); + assert!(range.count() > 0); + + let range = tree.range(b"anot".as_ref().., u64::MAX, None); + assert!(range.count() > 0); + + #[cfg(feature = "metrics")] + { + let final_queries = tree.metrics().filter_queries(); + + assert!( + final_queries > initial_queries, + "filter queries should increase for many-prefix extractor" + ); + } + + Ok(()) +} + +#[test] +fn test_prefix_filter_disabled() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + // Create tree with filter disabled + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(5))) + .filter_policy(FilterPolicy::all(FilterPolicyEntry::None)) // Disable filter + .open()?; + + // Insert some keys + for i in 0..100 { + let key = format!("disabled_{:04}", i); + tree.insert(key.as_bytes(), b"value", 0); + } + + tree.flush_active_memtable(0)?; + + #[cfg(feature = "metrics")] + let initial_queries = tree.metrics().filter_queries(); + + // Keys should still be found (via actual disk lookups) + for i in 0..100 { + let key = format!("disabled_{:04}", i); + assert!(tree.contains_key(key.as_bytes(), u64::MAX)?); + } + + #[cfg(feature = "metrics")] + { + let final_queries = tree.metrics().filter_queries(); + // Should have no filter queries when disabled + assert_eq!( + final_queries, initial_queries, + "No filter queries when disabled" + ); + } + + Ok(()) +} + +#[test] 
+fn test_prefix_filter_false_positive_rate() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + // Use higher bits per key for lower false positive rate + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(8))) + .filter_policy(FilterPolicy::all(FilterPolicyEntry::Bloom( + BloomConstructionPolicy::BitsPerKey(20.0), + ))) // Higher bits for lower FP rate + .open()?; + + // Insert a specific set of keys + for i in 0..1000 { + let key = format!("fptest_{:06}", i * 2); // Even numbers only + tree.insert(key.as_bytes(), b"value", 0); + } + + tree.flush_active_memtable(0)?; + + #[cfg(feature = "metrics")] + let initial_queries = tree.metrics().filter_queries(); + #[cfg(feature = "metrics")] + let initial_hits = tree.metrics().io_skipped_by_filter(); + + let mut false_positives = 0; + let total_checks = 1000; + + // Check for non-existent keys (odd numbers) + for i in 0..total_checks { + let key = format!("fptest_{:06}", i * 2 + 1); + if tree.contains_key(key.as_bytes(), u64::MAX)? { + false_positives += 1; + } + } + + // With 20 bits per key, false positive rate should be very low + let fp_rate = false_positives as f64 / total_checks as f64; + assert!( + fp_rate < 0.01, + "False positive rate {} should be less than 1% with 20 bits per key", + fp_rate + ); + + #[cfg(feature = "metrics")] + { + let final_queries = tree.metrics().filter_queries(); + let final_hits = tree.metrics().io_skipped_by_filter(); + + // The full-key Bloom should catch most nonexistent keys, incrementing + // both filter_queries and io_skipped_by_filter. 
+ assert!( + final_queries > initial_queries, + "filter queries should increase for false-positive rate test" + ); + assert!( + final_hits > initial_hits, + "filter should skip nonexistent keys via full-key Bloom" + ); + } + + Ok(()) +} + +#[test] +fn test_prefix_filter_mixed_domain_keys() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedLengthExtractor::new(8))) + .open()?; + + // Mix of in-domain and out-of-domain keys + tree.insert(b"12345678_data", b"in_domain", 0); // In domain + tree.insert(b"short", b"out_of_domain", 0); // Out of domain + tree.insert(b"12345678", b"exact_length", 0); // Exact length + tree.insert(b"1234567", b"too_short", 0); // Out of domain + tree.insert(b"123456789", b"longer", 0); // In domain + + tree.flush_active_memtable(0)?; + + #[cfg(feature = "metrics")] + let initial_queries = tree.metrics().filter_queries(); + + // All keys should be found + assert!(tree.contains_key(b"12345678_data", u64::MAX)?); + assert!(tree.contains_key(b"short", u64::MAX)?); + assert!(tree.contains_key(b"12345678", u64::MAX)?); + assert!(tree.contains_key(b"1234567", u64::MAX)?); + assert!(tree.contains_key(b"123456789", u64::MAX)?); + + // Non-existent keys with different domain status + assert!(!tree.contains_key(b"12345678_missing", u64::MAX)?); // Would be in domain + assert!(!tree.contains_key(b"tiny", u64::MAX)?); // Would be out of domain + + #[cfg(feature = "metrics")] + { + let final_queries = tree.metrics().filter_queries(); + + assert!( + final_queries > initial_queries, + "filter queries should increase for mixed domain key lookups" + ); + } + + Ok(()) +} + +/// Test that range queries don't incorrectly skip segments when the start bound +/// doesn't exist in the filter but other keys in the range do exist +#[test] +fn test_prefix_filter_range_with_missing_start_bound() -> 
lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + // Use full key as prefix (FullKeyExtractor) + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FullKeyExtractor)) + .open()?; + + // Insert keys b and c, but not a + tree.insert(b"b", b"value_b", 0); + tree.insert(b"c", b"value_c", 0); + tree.flush_active_memtable(0)?; + + #[cfg(feature = "metrics")] + let initial_queries = tree.metrics().filter_queries(); + + // Query range a..=c + // Extract common prefix from both bounds (empty for "a" and "c") + // But now we check if start bound "a" exists - it doesn't, but segment starts with "b" + // So we can't skip the segment (different prefixes) + let mut results = Vec::new(); + for item in tree.range(&b"a"[..]..=&b"c"[..], u64::MAX, None) { + results.push(item.key()?.to_vec()); + } + + // Should return b and c (even though a doesn't exist) + assert_eq!(results.len(), 2); + assert_eq!(results[0], b"b"); + assert_eq!(results[1], b"c"); + + #[cfg(feature = "metrics")] + { + let final_queries = tree.metrics().filter_queries(); + + assert_eq!( + final_queries, initial_queries, + "filter should not be queried" + ); + } + + Ok(()) +} + +/// Test the new optimization: when range has no common prefix but start bound prefix doesn't exist +#[test] +fn test_prefix_filter_range_start_prefix_optimization() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + // Use a fixed prefix extractor + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(3))) + .open()?; + + // Insert keys that all start with "bbb" + tree.insert(b"bbb_1", b"value1", 0); + tree.insert(b"bbb_2", b"value2", 0); + tree.insert(b"bbb_3", b"value3", 0); + tree.flush_active_memtable(0)?; + + #[cfg(feature = "metrics")] + let initial_queries = tree.metrics().filter_queries(); + + // Query 
range aaa..zzz (no common prefix)
+ // The segment starts with "bbb" and "aaa" doesn't exist
+ // But since segment min ("bbb") != start prefix ("aaa"), we can't skip
+ let mut results = Vec::new();
+ for item in tree.range(&b"aaa"[..]..&b"zzz"[..], u64::MAX, None) {
+ results.push(item.key()?.to_vec());
+ }
+ assert_eq!(results.len(), 3, "Should find all bbb keys");
+
+ #[cfg(feature = "metrics")]
+ {
+ let final_queries = tree.metrics().filter_queries();
+
+ assert_eq!(
+ final_queries, initial_queries,
+ "filter should not be queried"
+ );
+ }
+
+ // Now test where we CAN skip: segment that starts with same prefix as missing start bound
+ let tree2 = Config::new(
+ folder.path().join("test2"),
+ SequenceNumberCounter::default(),
+ SequenceNumberCounter::default(),
+ )
+ .prefix_extractor(Arc::new(FixedPrefixExtractor::new(3)))
+ .open()?;
+
+ // Create a tree with keys having prefix "aaa" and "aac" but not "aab"
+ tree2.insert(b"aaa_1", b"value1", 0);
+ tree2.insert(b"aaa_2", b"value2", 0);
+ tree2.insert(b"aac_1", b"value3", 0);
+ tree2.insert(b"aac_2", b"value4", 0);
+ tree2.flush_active_memtable(0)?;
+
+ #[cfg(feature = "metrics")]
+ let initial_queries = tree2.metrics().filter_queries();
+
+ // First verify the tree has data
+ assert!(tree2.contains_key(b"aaa_1", u64::MAX)?);
+ assert!(tree2.contains_key(b"aac_1", u64::MAX)?);
+
+ #[cfg(feature = "metrics")]
+ {
+ // NOTE(review): must read tree2's counter here — initial_queries came from tree2
+ let final_queries = tree2.metrics().filter_queries();
+
+ assert_eq!(
+ final_queries, initial_queries,
+ "filter should not be queried"
+ );
+ }
+
+ #[cfg(feature = "metrics")]
+ let initial_queries = tree2.metrics().filter_queries();
+
+ // Query for range with common prefix "aab" - no keys exist with this prefix
+ // Range: aab_1..aab_9 has common prefix "aab"
+ // The segment contains "aaa" and "aac" keys, so it overlaps the range
+ // filter will be checked for "aab" and should indicate it doesn't exist
+ let range_iter = tree2.range(&b"aab_1"[..]..&b"aab_9"[..], u64::MAX, None);
+ let results: 
Vec<_> = range_iter.collect(); + assert_eq!( + results.len(), + 0, + "No keys should match since aab prefix doesn't exist" + ); + + #[cfg(feature = "metrics")] + { + let final_queries = tree2.metrics().filter_queries(); + + assert!( + final_queries > initial_queries, + "filter queries should increase for range operations" + ); + } + + Ok(()) +} + +/// Test that range queries correctly handle different prefix scenarios: +/// same prefix, different prefixes, and non-existent prefixes +#[test] +fn test_prefix_filter_range_across_different_prefixes() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + folder.path(), + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(5))) + .open()?; + + // Store keys with same prefix + tree.insert("user1_a", "v1", 0); + tree.insert("user1_b", "v2", 0); + tree.flush_active_memtable(0)?; + + tree.insert("user2_a", "v3", 1); + tree.insert("user2_b", "v4", 1); + tree.flush_active_memtable(0)?; + + // Query with common prefix "user1" - should find entries + let count = tree.range("user1_a"..="user1_z", u64::MAX, None).count(); + assert_eq!(count, 2, "Should find user1 entries"); + + // Query with non-existent prefix - should return nothing + let count = tree.range("user3_a"..="user3_z", u64::MAX, None).count(); + assert_eq!(count, 0, "Should find no user3 entries"); + + // Query across different prefixes - no common prefix + let count = tree.range("user1_a"..="user2_b", u64::MAX, None).count(); + assert_eq!(count, 4, "Should find all entries when no common prefix"); + + #[cfg(feature = "metrics")] + { + // Create a new wide-range segment so an absent prefix falls within it + tree.insert("aaaa_z", "lo", 0); + tree.insert("zzzz_z", "hi", 0); + tree.flush_active_memtable(0)?; + + let initial_queries = tree.metrics().filter_queries(); + let initial_hits = tree.metrics().io_skipped_by_filter(); + // "user5" (5-byte prefix) is 
between "aaaa_" and "zzzz_" and not in the filter + let _ = tree.contains_key(b"user5_a", u64::MAX)?; + let final_queries = tree.metrics().filter_queries(); + let final_hits = tree.metrics().io_skipped_by_filter(); + + assert!( + final_queries > initial_queries, + "filter queries should increase for range operations" + ); + assert!( + final_hits > initial_hits, + "io_skipped_by_filter should have increased" + ); + } + + Ok(()) +} + +/// Test range queries with reversed bounds (should return empty) +#[test] +fn test_prefix_filter_range_reversed_bounds() -> lsm_tree::Result<()> { + use std::ops::Bound; + + let folder = tempfile::tempdir()?; + + let tree = Config::new( + folder.path(), + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FullKeyExtractor)) + .open()?; + + // Insert some keys + tree.insert("a", "value_a", 0); + tree.insert("b", "value_b", 0); + tree.insert("c", "value_c", 0); + tree.flush_active_memtable(0)?; + + #[cfg(feature = "metrics")] + let initial_queries = tree.metrics().filter_queries(); + + // Query with reversed bounds - should return empty + let count = tree.range("c".."a", u64::MAX, None).count(); + assert_eq!(count, 0, "Reversed bounds should return empty"); + + // Also test with excluded bounds reversed + let count = tree + .range::<&str, _>((Bound::Excluded("c"), Bound::Included("a")), u64::MAX, None) + .count(); + assert_eq!(count, 0, "Reversed excluded bounds should return empty"); + + #[cfg(feature = "metrics")] + { + let final_queries = tree.metrics().filter_queries(); + // Reversed bounds may skip filter entirely + assert_eq!( + final_queries, initial_queries, + "filter should not be queried for reversed (empty) ranges" + ); + } + + Ok(()) +} + +/// Test range with same key but different bound types +#[test] +fn test_prefix_filter_range_same_key_different_bounds() -> lsm_tree::Result<()> { + use std::ops::Bound; + + let folder = tempfile::tempdir()?; + + let tree = Config::new( + 
folder.path(), + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FullKeyExtractor)) + .open()?; + + tree.insert("key", "value", 0); + tree.insert("key2", "value2", 0); + tree.flush_active_memtable(0)?; + + // Included..Excluded with same key (empty range) + let count = tree + .range::<&str, _>( + (Bound::Included("key"), Bound::Excluded("key")), + u64::MAX, + None, + ) + .count(); + assert_eq!(count, 0, "Included..Excluded same key should be empty"); + + // Excluded..Included with same key (empty range) + let count = tree + .range::<&str, _>( + (Bound::Excluded("key"), Bound::Included("key")), + u64::MAX, + None, + ) + .count(); + assert_eq!(count, 0, "Excluded..Included same key should be empty"); + + // Included..Included with same key (single item) + let count = tree + .range::<&str, _>( + (Bound::Included("key"), Bound::Included("key")), + u64::MAX, + None, + ) + .count(); + assert_eq!(count, 1, "Included..Included same key should return 1"); + + #[cfg(feature = "metrics")] + { + // Look up a key with absent full-key prefix within range [key..key2] + // "key1" is between "key" and "key2" lexicographically and not in the filter. 
+ let initial_queries = tree.metrics().filter_queries(); + let initial_hits = tree.metrics().io_skipped_by_filter(); + let _ = tree.contains_key(b"key1", u64::MAX)?; + let final_queries = tree.metrics().filter_queries(); + let final_hits = tree.metrics().io_skipped_by_filter(); + + assert!( + final_queries > initial_queries, + "filter queries should increase for same-key range operations" + ); + assert!( + final_hits > initial_hits, + "io_skipped_by_filter should have increased" + ); + } + + Ok(()) +} + +/// Test range with non-consecutive keys having common prefix +#[test] +fn test_prefix_filter_range_non_consecutive_keys() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + folder.path(), + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(3))) + .open()?; + + // Insert non-consecutive keys with same prefix + tree.insert("app_1", "v1", 0); + tree.insert("app_3", "v3", 0); + tree.insert("app_5", "v5", 0); + tree.insert("app_7", "v7", 0); + tree.flush_active_memtable(0)?; + + // Query for range that includes missing keys + let count = tree.range("app_2"..="app_6", u64::MAX, None).count(); + assert_eq!(count, 2, "Should find app_3 and app_5"); + + // Query for range entirely between existing keys + let count = tree.range("app_4".."app_5", u64::MAX, None).count(); + assert_eq!(count, 0, "No keys in range app_4..app_5"); + + #[cfg(feature = "metrics")] + { + // Create a new wide-range segment so an absent prefix falls within it + tree.insert("aaa_1", "lo", 0); + tree.insert("zzz_1", "hi", 0); + tree.flush_active_memtable(0)?; + + let initial_queries = tree.metrics().filter_queries(); + let initial_hits = tree.metrics().io_skipped_by_filter(); + // 3-byte prefix "mmm" is between "aaa" and "zzz" and not in the filter + let _ = tree.contains_key(b"mmm_1", u64::MAX)?; + let final_queries = tree.metrics().filter_queries(); + let final_hits = 
tree.metrics().io_skipped_by_filter(); + + assert!( + final_queries > initial_queries, + "filter queries should work with non-consecutive keys" + ); + assert!( + final_hits > initial_hits, + "io_skipped_by_filter should have increased" + ); + } + + Ok(()) +} + +/// Test range queries across multiple segments with different prefixes +#[test] +fn test_prefix_filter_range_multiple_segments() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + folder.path(), + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .open()?; + + // Create first segment with user prefix + tree.insert("user_001", "v1", 0); + tree.insert("user_002", "v2", 0); + tree.flush_active_memtable(0)?; + + // Create second segment with item prefix + tree.insert("item_001", "v3", 1); + tree.insert("item_002", "v4", 1); + tree.flush_active_memtable(0)?; + + // Create third segment with both prefixes + tree.insert("user_003", "v5", 2); + tree.insert("item_003", "v6", 2); + tree.flush_active_memtable(0)?; + + #[cfg(feature = "metrics")] + let initial_queries = tree.metrics().filter_queries(); + + // Query across all segments + let count = tree.range("item_001"..="user_003", u64::MAX, None).count(); + assert_eq!(count, 6, "Should find all items and users in range"); + + // Query for non-existent prefix across segments + let count = tree.range("test_001"..="test_999", u64::MAX, None).count(); + assert_eq!(count, 0, "Non-existent prefix should return nothing"); + + #[cfg(feature = "metrics")] + { + let final_queries = tree.metrics().filter_queries(); + // Queries across multiple segments should check filters + assert!( + final_queries > initial_queries, + "filter queries should increase for multi-segment range queries" + ); + } + + Ok(()) +} + +/// Test range with keys where prefix changes at segment boundary +#[test] +fn test_prefix_filter_range_prefix_boundary() -> 
lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + folder.path(), + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(3))) + .open()?; + + // First segment ends with "aaz" + tree.insert("aax_1", "v1", 0); + tree.insert("aay_1", "v2", 0); + tree.insert("aaz_1", "v3", 0); + tree.flush_active_memtable(0)?; + + // Second segment starts with "aba" (different prefix) + tree.insert("aba_1", "v4", 1); + tree.insert("abb_1", "v5", 1); + tree.insert("abc_1", "v6", 1); + tree.flush_active_memtable(0)?; + + #[cfg(feature = "metrics")] + let initial_queries = tree.metrics().filter_queries(); + + // Query across the boundary + let count = tree.range("aay_1"..="abb_1", u64::MAX, None).count(); + assert_eq!(count, 4, "Should find keys from both segments"); + + // Query that spans missing prefix between segments + let count = tree.range("aaz_2"..="aba_0", u64::MAX, None).count(); + assert_eq!(count, 0, "No keys in the gap between segments"); + + #[cfg(feature = "metrics")] + { + let final_queries = tree.metrics().filter_queries(); + // Common prefix is only 2 chars ("aa" and "ab"), less than extractor length (3) + // So filter may be bypassed + assert_eq!( + final_queries, initial_queries, + "filter should be bypassed when common prefix is shorter than extractor" + ); + } + + Ok(()) +} + +/// Test range with no prefix extractor (should not use filter optimization) +#[test] +fn test_prefix_filter_range_no_extractor() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + // Create tree without prefix extractor + let tree = Config::new( + folder.path(), + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .open()?; + + // Insert various keys + tree.insert("a", "v1", 0); + tree.insert("b", "v2", 0); + tree.insert("c", "v3", 0); + tree.insert("d", "v4", 0); + tree.insert("e", "v5", 0); + tree.flush_active_memtable(0)?; + + 
#[cfg(feature = "metrics")] + let initial_queries = tree.metrics().filter_queries(); + + // Range queries should work normally without filter optimization + let count = tree.range("a"..="c", u64::MAX, None).count(); + assert_eq!(count, 3, "Should find a, b, c"); + + let count = tree.range("b"..="d", u64::MAX, None).count(); + assert_eq!(count, 3, "Should find b, c, d"); + + // Empty range + let count = tree.range("f"..="z", u64::MAX, None).count(); + assert_eq!(count, 0, "Should find nothing"); + + #[cfg(feature = "metrics")] + { + let final_queries = tree.metrics().filter_queries(); + // Without prefix extractor, filter optimization is not used for ranges + assert_eq!( + final_queries, initial_queries, + "filter should not be used for ranges without prefix extractor" + ); + } + + Ok(()) +} + +/// Test range with both bounds excluded +#[test] +fn test_prefix_filter_range_both_excluded() -> lsm_tree::Result<()> { + use std::ops::Bound; + + let folder = tempfile::tempdir()?; + + let tree = Config::new( + folder.path(), + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FullKeyExtractor)) + .open()?; + + // Insert keys + for key in ["a", "b", "c", "d", "e"] { + tree.insert(key, "value", 0); + } + tree.flush_active_memtable(0)?; + + #[cfg(feature = "metrics")] + let initial_queries = tree.metrics().filter_queries(); + #[cfg(feature = "metrics")] + let initial_hits = tree.metrics().io_skipped_by_filter(); + + // Test with both bounds excluded + let count = tree + .range::<&str, _>((Bound::Excluded("a"), Bound::Excluded("e")), u64::MAX, None) + .count(); + assert_eq!(count, 3, "Should return b, c, d"); + + // Edge case: adjacent keys with both excluded + let count = tree + .range::<&str, _>((Bound::Excluded("b"), Bound::Excluded("c")), u64::MAX, None) + .count(); + assert_eq!(count, 0, "No keys between adjacent excluded bounds"); + + #[cfg(feature = "metrics")] + { + let final_queries = 
tree.metrics().filter_queries(); + let final_hits = tree.metrics().io_skipped_by_filter(); + + // Range queries with excluded bounds may or may not use filter + // depending on prefix extraction logic + assert!( + final_queries >= initial_queries, + "filter queries should not decrease for excluded bound ranges" + ); + + // All keys exist, hits should not increase + assert_eq!( + final_hits, initial_hits, + "filter hits should not increase for existing keys" + ); + } + + Ok(()) +} + +/// Test range after compaction with prefix filters +#[test] +fn test_prefix_filter_range_after_compaction() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + folder.path(), + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .open()?; + + // Create multiple segments + for i in 0..3 { + tree.insert(format!("user_{}", i), format!("v{}", i), i); + tree.insert(format!("item_{}", i), format!("i{}", i), i); + tree.flush_active_memtable(0)?; + } + + // Skip compaction test since it's not implemented + // tree.major_compact(u64::MAX)?; + + // Verify range queries still work after compaction + let count = tree.range("user_0"..="user_2", u64::MAX, None).count(); + assert_eq!(count, 3, "Should find all user keys after compaction"); + + let count = tree.range("item_0"..="item_2", u64::MAX, None).count(); + assert_eq!(count, 3, "Should find all item keys after compaction"); + + // Query across prefixes + let count = tree.range("item_1"..="user_1", u64::MAX, None).count(); + assert_eq!(count, 4, "Should find mixed keys after compaction"); + + #[cfg(feature = "metrics")] + { + // Create a new wide-range segment so an absent prefix falls within it + tree.insert("aaaa_0", "lo", 0); + tree.insert("zzzz_0", "hi", 0); + tree.flush_active_memtable(0)?; + + let initial_queries = tree.metrics().filter_queries(); + let initial_hits = tree.metrics().io_skipped_by_filter(); + // 
"nnnn" is between "aaaa" and "zzzz" and not in the filter + let _ = tree.contains_key(b"nnnn_0", u64::MAX)?; + let final_queries = tree.metrics().filter_queries(); + let final_hits = tree.metrics().io_skipped_by_filter(); + assert!( + final_queries > initial_queries, + "filter queries should increase for range operations after compaction" + ); + assert!( + final_hits > initial_hits, + "io_skipped_by_filter should have increased" + ); + } + + Ok(()) +} + +/// Test range with Unicode/UTF-8 prefix boundaries +#[test] +fn test_prefix_filter_range_utf8_boundaries() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + folder.path(), + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(6))) // 6 bytes = 2 UTF-8 chars for these emojis + .open()?; + + // Insert keys with emoji prefixes (each emoji is 3-4 bytes) + tree.insert("🎈🎈_001", "v1", 0); + tree.insert("🎈🎈_002", "v2", 0); + tree.insert("🎉🎉_001", "v3", 0); + tree.insert("🎉🎉_002", "v4", 0); + tree.flush_active_memtable(0)?; + + // Query within same emoji prefix + let count = tree.range("🎈🎈_001"..="🎈🎈_002", u64::MAX, None).count(); + assert_eq!(count, 2, "Should find keys with balloon prefix"); + + // Query across different emoji prefixes + let count = tree.range("🎈🎈_002"..="🎉🎉_001", u64::MAX, None).count(); + assert_eq!(count, 2, "Should find keys across emoji boundaries"); + + #[cfg(feature = "metrics")] + { + // Look up a key with absent 6-byte prefix within range. + // 🎈 = F0 9F 8E 88 (4 bytes), 🎉 = F0 9F 8E 89 (4 bytes) + // 6-byte prefix of "🎈🎈_001" is F0 9F 8E 88 F0 9F + // Use bytes that form a distinct 6-byte prefix between the two emoji prefixes. 
+        // F0 9F 8E 88 F0 A0 is between F0 9F 8E 88 F0 9F and F0 9F 8E 89 F0 9F
+        let absent_key: &[u8] = &[0xF0, 0x9F, 0x8E, 0x88, 0xF0, 0xA0, b'_', b'0', b'0', b'1'];
+        let initial_queries = tree.metrics().filter_queries();
+        let initial_hits = tree.metrics().io_skipped_by_filter();
+        let _ = tree.contains_key(absent_key, u64::MAX)?;
+        let final_queries = tree.metrics().filter_queries();
+        let final_hits = tree.metrics().io_skipped_by_filter();
+        assert!(
+            final_queries > initial_queries,
+            "filter queries should increase for UTF-8 boundary range queries"
+        );
+        assert!(
+            final_hits > initial_hits,
+            "io_skipped_by_filter should have increased"
+        );
+    }
+
+    Ok(())
+}
+
+/// Test with custom extractor returning multiple prefixes
+#[test]
+fn test_prefix_filter_range_multi_prefix_extractor() -> lsm_tree::Result<()> {
+    let folder = tempfile::tempdir()?;
+
+    // Custom extractor that returns multiple prefixes
+    struct MultiPrefixExtractor;
+    impl PrefixExtractor for MultiPrefixExtractor {
+        fn extract<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {
+            if key.len() >= 6 {
+                // Return both 3-byte and 6-byte prefixes
+                Box::new(vec![&key[..3], &key[..6]].into_iter())
+            } else if key.len() >= 3 {
+                Box::new(std::iter::once(&key[..3]))
+            } else {
+                Box::new(std::iter::once(key))
+            }
+        }
+        fn name(&self) -> &str {
+            "MultiPrefixExtractor"
+        }
+    }
+
+    let tree = Config::new(
+        folder.path(),
+        SequenceNumberCounter::default(),
+        SequenceNumberCounter::default(),
+    )
+    .prefix_extractor(Arc::new(MultiPrefixExtractor))
+    .open()?;
+
+    tree.insert("abc123_data", "v1", 0);
+    tree.insert("abc456_data", "v2", 0);
+    tree.insert("def123_data", "v3", 0);
+    tree.flush_active_memtable(0)?;
+
+    // Query should work with common 3-byte prefix
+    let count = tree.range("abc000"..="abc999", u64::MAX, None).count();
+    assert_eq!(count, 2, "Should find keys with abc prefix");
+
+    #[cfg(feature = "metrics")]
+    {
+        // Look up a key with absent prefix within range [abc123_data..def123_data].
+ // "bcd000_data" has 3-byte prefix "bcd" and 6-byte prefix "bcd000", + // neither of which is in the filter (existing: abc, abc123, abc456, def, def123). + let initial_queries = tree.metrics().filter_queries(); + let initial_hits = tree.metrics().io_skipped_by_filter(); + let _ = tree.contains_key(b"bcd000_data", u64::MAX)?; + let final_queries = tree.metrics().filter_queries(); + let final_hits = tree.metrics().io_skipped_by_filter(); + + assert!( + final_queries > initial_queries, + "filter queries should work with segmented extractor" + ); + assert!( + final_hits > initial_hits, + "io_skipped_by_filter should have increased" + ); + } + + Ok(()) +} + +/// Test range with bytes at UTF-8 boundary splitting +#[test] +fn test_prefix_filter_range_utf8_split() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + // Use a fixed byte extractor that might split UTF-8 chars + let tree = Config::new( + folder.path(), + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(2))) + .open()?; + + // Insert keys with multi-byte UTF-8 characters + tree.insert("中文_1", "v1", 0); + tree.insert("中文_2", "v2", 0); + tree.insert("日本_1", "v3", 0); + tree.flush_active_memtable(0)?; + + // The prefix will be the first 2 bytes, which splits the UTF-8 character + // This tests that the implementation handles partial UTF-8 correctly + let count = tree.range("中文_1"..="中文_2", u64::MAX, None).count(); + assert_eq!(count, 2, "Should find keys despite UTF-8 splitting"); + + #[cfg(feature = "metrics")] + { + // Look up a key with absent 2-byte prefix within range. + // 中 = E4 B8 AD, 日 = E6 97 A5. 2-byte prefixes: E4 B8 and E6 97. + // A key with 2-byte prefix E5 00 is between E4 B8 and E6 97. 
+ let absent_key: &[u8] = &[0xE5, 0x00, b'_', b'1']; + let initial_queries = tree.metrics().filter_queries(); + let initial_hits = tree.metrics().io_skipped_by_filter(); + let _ = tree.contains_key(absent_key, u64::MAX)?; + let final_queries = tree.metrics().filter_queries(); + let final_hits = tree.metrics().io_skipped_by_filter(); + + assert!( + final_queries > initial_queries, + "filter queries should increase for UTF-8 split range" + ); + assert!( + final_hits > initial_hits, + "io_skipped_by_filter should have increased" + ); + } + + Ok(()) +} + +/// Test empty range (start > end after normalization) +#[test] +fn test_prefix_filter_empty_normalized_range() -> lsm_tree::Result<()> { + use std::ops::Bound; + + let folder = tempfile::tempdir()?; + + let tree = Config::new( + folder.path(), + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FullKeyExtractor)) + .open()?; + + tree.insert("b", "value", 0); + tree.flush_active_memtable(0)?; + + #[cfg(feature = "metrics")] + let initial_queries = tree.metrics().filter_queries(); + #[cfg(feature = "metrics")] + let initial_hits = tree.metrics().io_skipped_by_filter(); + + // Create a range that becomes empty after normalization + let count = tree + .range::<&str, _>((Bound::Excluded("b"), Bound::Excluded("b")), u64::MAX, None) + .count(); + assert_eq!(count, 0, "Empty normalized range should return nothing"); + + #[cfg(feature = "metrics")] + { + let final_queries = tree.metrics().filter_queries(); + let final_hits = tree.metrics().io_skipped_by_filter(); + + // Empty normalized range may skip filter + assert_eq!( + final_queries, initial_queries, + "filter should not be queried for empty normalized range" + ); + + assert_eq!( + final_hits, initial_hits, + "filter hits should not change for empty range" + ); + } + + Ok(()) +} + +/// A test prefix extractor that extracts a fixed prefix with a custom name +struct TestPrefixExtractor { + length: usize, + name: String, 
+}
+
+impl TestPrefixExtractor {
+    fn new(length: usize, name: &str) -> Self {
+        Self {
+            length,
+            name: name.to_string(),
+        }
+    }
+}
+
+impl PrefixExtractor for TestPrefixExtractor {
+    fn extract<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {
+        if key.len() >= self.length {
+            Box::new(std::iter::once(&key[..self.length]))
+        } else {
+            Box::new(std::iter::once(key))
+        }
+    }
+
+    fn name(&self) -> &str {
+        &self.name
+    }
+}
+
+#[test]
+fn test_same_extractor_compatibility() -> lsm_tree::Result<()> {
+    let temp_dir = tempfile::tempdir()?;
+    let path = temp_dir.path();
+
+    let extractor = Arc::new(TestPrefixExtractor::new(4, "test_extractor"));
+
+    // Create a tree with prefix extractor
+    {
+        let tree = Config::new(
+            path,
+            SequenceNumberCounter::default(),
+            SequenceNumberCounter::default(),
+        )
+        .prefix_extractor(extractor.clone())
+        .open()?;
+
+        tree.insert("user_key1", "value1", 0);
+        tree.insert("user_key2", "value2", 0);
+        tree.insert("data_key1", "value3", 0);
+        tree.flush_active_memtable(0)?;
+    }
+
+    // Reopen with the same extractor - should work fine with prefix filtering
+    {
+        let tree = Config::new(
+            path,
+            SequenceNumberCounter::default(),
+            SequenceNumberCounter::default(),
+        )
+        .prefix_extractor(extractor)
+        .open()?;
+
+        assert_eq!(&*tree.get("user_key1", u64::MAX)?.unwrap(), b"value1");
+        assert_eq!(&*tree.get("user_key2", u64::MAX)?.unwrap(), b"value2");
+        assert_eq!(&*tree.get("data_key1", u64::MAX)?.unwrap(), b"value3");
+
+        #[cfg(feature = "metrics")]
+        {
+            // Look up a key with absent 4-byte prefix within range [data_key1..user_key2].
+            // "nnnn_key1" has prefix "nnnn" which is between "data" and "user".
+ let initial_queries = tree.metrics().filter_queries(); + let initial_hits = tree.metrics().io_skipped_by_filter(); + let _ = tree.get("nnnn_key1", u64::MAX)?; + let final_queries = tree.metrics().filter_queries(); + let final_hits = tree.metrics().io_skipped_by_filter(); + assert!( + final_queries > initial_queries, + "Compatible extractor should increment filter queries: {} -> {}", + initial_queries, + final_queries + ); + assert!( + final_hits > initial_hits, + "io_skipped_by_filter should have increased" + ); + } + + // Test range queries with prefix filtering optimization + let items: Vec<_> = tree.range("user"..="user_zzzz", u64::MAX, None).collect(); + assert_eq!(items.len(), 2); + } + + Ok(()) +} + +#[test] +fn test_different_extractor_incompatible() -> lsm_tree::Result<()> { + let temp_dir = tempfile::tempdir()?; + let path = temp_dir.path(); + + let extractor1 = Arc::new(TestPrefixExtractor::new(4, "test_extractor_v1")); + let extractor2 = Arc::new(TestPrefixExtractor::new(4, "test_extractor_v2")); + + // Create a tree with first extractor + { + let tree = Config::new( + path, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(extractor1) + .open()?; + + tree.insert("user_key1", "value1", 0); + tree.insert("user_key2", "value2", 0); + tree.insert("data_key1", "value3", 0); + tree.flush_active_memtable(0)?; + } + + // Reopen with different extractor - should disable prefix filtering for old segments + { + let tree = Config::new( + path, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(extractor2) + .open()?; + + // Should still work, but without prefix filtering optimization for old segments + // The incompatible extractor means filter is completely bypassed + #[cfg(feature = "metrics")] + let initial_queries = tree.metrics().filter_queries(); + + assert_eq!(&*tree.get("user_key1", u64::MAX)?.unwrap(), b"value1"); + assert_eq!(&*tree.get("user_key2", 
u64::MAX)?.unwrap(), b"value2"); + assert_eq!(&*tree.get("data_key1", u64::MAX)?.unwrap(), b"value3"); + + #[cfg(feature = "metrics")] + { + let final_queries = tree.metrics().filter_queries(); + // Should NOT have incremented filter queries since extractor is incompatible + assert_eq!( + final_queries, initial_queries, + "Incompatible extractor should not increment filter queries: {} -> {}", + initial_queries, final_queries + ); + } + + // Range queries should still work correctly (but without optimization for old segments) + let items: Vec<_> = tree.range("user"..="user_zzzz", u64::MAX, None).collect(); + assert_eq!(items.len(), 2); + + // New writes should use the new extractor + tree.insert("test_key1", "value4", 1); + tree.flush_active_memtable(0)?; + + assert_eq!(&*tree.get("test_key1", u64::MAX)?.unwrap(), b"value4"); + } + + Ok(()) +} + +#[test] +fn test_no_extractor_to_extractor() -> lsm_tree::Result<()> { + let temp_dir = tempfile::tempdir()?; + let path = temp_dir.path(); + + // Create a tree without prefix extractor + { + let tree = Config::new( + path, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .open()?; + + tree.insert("user_key1", "value1", 0); + tree.insert("user_key2", "value2", 0); + tree.insert("data_key1", "value3", 0); + tree.flush_active_memtable(0)?; + } + + // Reopen with prefix extractor - should disable prefix filtering for old segments + { + let extractor = Arc::new(TestPrefixExtractor::new(4, "test_extractor")); + let tree = Config::new( + path, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(extractor) + .open()?; + + // Should still work, but old segments won't use prefix filtering + assert_eq!(&*tree.get("user_key1", u64::MAX)?.unwrap(), b"value1"); + assert_eq!(&*tree.get("user_key2", u64::MAX)?.unwrap(), b"value2"); + assert_eq!(&*tree.get("data_key1", u64::MAX)?.unwrap(), b"value3"); + + // New writes should use prefix extractor + 
tree.insert("test_key1", "value4", 1); + tree.flush_active_memtable(0)?; + + assert_eq!(&*tree.get("test_key1", u64::MAX)?.unwrap(), b"value4"); + } + + Ok(()) +} + +#[test] +fn test_extractor_to_no_extractor() -> lsm_tree::Result<()> { + let temp_dir = tempfile::tempdir()?; + let path = temp_dir.path(); + + let extractor = Arc::new(TestPrefixExtractor::new(4, "test_extractor")); + + // Create a tree with prefix extractor + { + let tree = Config::new( + path, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(extractor) + .open()?; + + tree.insert("user_key1", "value1", 0); + tree.insert("user_key2", "value2", 0); + tree.insert("data_key1", "value3", 0); + tree.flush_active_memtable(0)?; + } + + // Reopen without prefix extractor - should disable prefix filtering for old segments + { + let tree = Config::new( + path, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .open()?; + + // Should still work, but old segments won't use prefix filtering + assert_eq!(&*tree.get("user_key1", u64::MAX)?.unwrap(), b"value1"); + assert_eq!(&*tree.get("user_key2", u64::MAX)?.unwrap(), b"value2"); + assert_eq!(&*tree.get("data_key1", u64::MAX)?.unwrap(), b"value3"); + + // Range queries should still work + let items: Vec<_> = tree.range("user"..="user_zzzz", u64::MAX, None).collect(); + assert_eq!(items.len(), 2); + } + + Ok(()) +} + +#[test] +fn test_builtin_extractors_compatibility() -> lsm_tree::Result<()> { + let temp_dir = tempfile::tempdir()?; + let path = temp_dir.path(); + + // Create with FixedPrefixExtractor + { + let tree = Config::new( + path, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .open()?; + + tree.insert("user_key1", "value1", 0); + tree.insert("user_key2", "value2", 0); + tree.flush_active_memtable(0)?; + } + + // Reopen with FixedLengthExtractor (different name) - should be incompatible + { + 
let tree = Config::new( + path, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedLengthExtractor::new(4))) + .open()?; + + // Should work but without prefix filtering for old segments + assert_eq!(&*tree.get("user_key1", u64::MAX)?.unwrap(), b"value1"); + assert_eq!(&*tree.get("user_key2", u64::MAX)?.unwrap(), b"value2"); + } + + // Reopen with same type (FixedPrefixExtractor) - should be compatible + { + let tree = Config::new( + path, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .open()?; + + // Should work with prefix filtering for old segments + assert_eq!(&*tree.get("user_key1", u64::MAX)?.unwrap(), b"value1"); + assert_eq!(&*tree.get("user_key2", u64::MAX)?.unwrap(), b"value2"); + } + + Ok(()) +} + +#[test] +fn test_new_segments_use_new_extractor() -> lsm_tree::Result<()> { + let temp_dir = tempfile::tempdir()?; + let path = temp_dir.path(); + + let extractor1 = Arc::new(TestPrefixExtractor::new(4, "old_extractor")); + let extractor2 = Arc::new(TestPrefixExtractor::new(4, "new_extractor")); + + // Create first segment with old extractor + { + let tree = Config::new( + path, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(extractor1) + .open()?; + + tree.insert("old_key1", "value1", 0); + tree.insert("old_key2", "value2", 0); + tree.flush_active_memtable(0)?; + } + + // Reopen with new extractor and create new segment + { + let tree = Config::new( + path, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(extractor2) + .open()?; + + // Add data to create a new segment with the new extractor. + // Use keys whose range does NOT overlap old keys (old_key1..old_key2) + // so that old-key lookups don't hit the new segment's filter. 
+ tree.insert("aaa_key1", "value3", 1); + tree.insert("aaa_key2", "value4", 1); + tree.insert("hhh_key1", "sentinel", 1); + tree.flush_active_memtable(0)?; + + // Test that old segment uses no filtering (extractor incompatible) + #[cfg(feature = "metrics")] + let initial_queries = tree.metrics().filter_queries(); + + // Query old keys - should NOT increment filter queries (incompatible extractor) + // old_key1/old_key2 are outside the new segment's range [aaa_key1..hhh_key1] + // since 'o' > 'h'. + assert_eq!(&*tree.get("old_key1", u64::MAX)?.unwrap(), b"value1"); + assert_eq!(&*tree.get("old_key2", u64::MAX)?.unwrap(), b"value2"); + + #[cfg(feature = "metrics")] + let after_old_queries = tree.metrics().filter_queries(); + + // Query a nonexistent key with absent 4-byte prefix "cccc" within range + // [aaa_key1..hhh_key1] on the new segment (compatible extractor). + let _ = tree.get("cccc_key1", u64::MAX)?; + + #[cfg(feature = "metrics")] + { + let final_queries = tree.metrics().filter_queries(); + + // Old keys should not have incremented filter queries + assert_eq!( + after_old_queries, initial_queries, + "Old keys should not increment filter queries due to incompatible extractor" + ); + + // Absent prefix on new segment should have incremented filter queries + assert!( + final_queries > after_old_queries, + "New keys should increment filter queries with compatible extractor: {} -> {}", + after_old_queries, + final_queries + ); + } + + // Verify new keys are still readable + assert_eq!(&*tree.get("aaa_key1", u64::MAX)?.unwrap(), b"value3"); + assert_eq!(&*tree.get("aaa_key2", u64::MAX)?.unwrap(), b"value4"); + } + + Ok(()) +} + +#[test] +fn test_multiple_extractor_changes() -> lsm_tree::Result<()> { + let temp_dir = tempfile::tempdir()?; + let path = temp_dir.path(); + + let extractor1 = Arc::new(TestPrefixExtractor::new(2, "v1")); + let extractor2 = Arc::new(TestPrefixExtractor::new(2, "v2")); + let extractor3 = Arc::new(TestPrefixExtractor::new(2, "v3")); + + 
// Create segments with different extractors over time + { + let tree = Config::new( + path, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(extractor1) + .open()?; + tree.insert("aa_data1", "value1", 0); + tree.flush_active_memtable(0)?; + } + + { + let tree = Config::new( + path, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(extractor2) + .open()?; + tree.insert("bb_data2", "value2", 0); + tree.flush_active_memtable(0)?; + } + + { + let tree = Config::new( + path, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(extractor3) + .open()?; + tree.insert("cc_data3", "value3", 0); + tree.insert("zz_sent1", "sentinel", 0); // widen range + tree.flush_active_memtable(0)?; + + // Only the last segment should use filtering + #[cfg(feature = "metrics")] + let initial_queries = tree.metrics().filter_queries(); + + // These should not increment filter queries (incompatible) + assert_eq!(&*tree.get("aa_data1", u64::MAX)?.unwrap(), b"value1"); + assert_eq!(&*tree.get("bb_data2", u64::MAX)?.unwrap(), b"value2"); + + #[cfg(feature = "metrics")] + let middle_queries = tree.metrics().filter_queries(); + + // Look up absent prefix "mm" within range [cc_data3..zz_sent1] on new segment + let _ = tree.get("mm_data3", u64::MAX)?; + + #[cfg(feature = "metrics")] + { + let final_queries = tree.metrics().filter_queries(); + assert_eq!( + middle_queries, initial_queries, + "Old segments should not increment metrics" + ); + assert!( + final_queries > middle_queries, + "New segment should increment metrics" + ); + } + + // Verify all data is still readable + assert_eq!(&*tree.get("cc_data3", u64::MAX)?.unwrap(), b"value3"); + } + + Ok(()) +} + +/// Prefix API with forward iteration returns only keys matching the prefix. 
+#[test] +fn test_prefix_filter_prefix_api_forward() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .open()?; + + for i in 0..20u32 { + tree.insert(format!("aaa_{:04}", i).as_bytes(), b"v", 0); + } + for i in 0..10u32 { + tree.insert(format!("bbb_{:04}", i).as_bytes(), b"v", 0); + } + tree.flush_active_memtable(0)?; + + let keys: Vec<_> = tree + .prefix(b"aaa_", SeqNo::MAX, None) + .map(|g| g.key().unwrap()) + .collect(); + assert_eq!(keys.len(), 20); + assert!(keys[0].starts_with(b"aaa_")); + + let keys: Vec<_> = tree + .prefix(b"bbb_", SeqNo::MAX, None) + .map(|g| g.key().unwrap()) + .collect(); + assert_eq!(keys.len(), 10); + + // Non-existent prefix returns nothing. + assert_eq!(tree.prefix(b"zzz_", SeqNo::MAX, None).count(), 0); + + Ok(()) +} + +/// Prefix API with reverse iteration returns only keys matching the prefix in reverse order. +#[test] +fn test_prefix_filter_prefix_api_reverse() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .open()?; + + for i in 0..30u32 { + tree.insert(format!("aaa_{:04}", i).as_bytes(), b"v", 0); + } + tree.flush_active_memtable(0)?; + + let keys: Vec<_> = tree + .range::<&[u8], _>(&b"aaa_0000"[..]..=&b"aaa_0029"[..], SeqNo::MAX, None) + .rev() + .map(|g| g.key().unwrap()) + .collect(); + assert_eq!(keys.len(), 30); + assert_eq!(&*keys[0], b"aaa_0029"); + assert_eq!(&*keys[29], b"aaa_0000"); + + Ok(()) +} + +/// Verifies that partitioned filters are actually built when a prefix extractor +/// is configured. 
+/// +/// Background: the PartitionedFilterWriter tracks `last_key` (set by `register_key`) +/// to create TLI partition boundary entries and to know whether any data was written. +/// When a prefix extractor is configured, only `register_bytes` is called (not +/// `register_key`), so `last_key` was never set, causing `finish()` to bail out +/// with 0 filter blocks. +#[test] +fn test_prefix_filter_partitioned_filter_actually_built() -> lsm_tree::Result<()> { + use lsm_tree::config::PinningPolicy; + + let dir = tempfile::tempdir()?; + let seqno = SequenceNumberCounter::default(); + + let tree = Config::new(&dir, seqno.clone(), SequenceNumberCounter::default()) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + // Force partitioned filters for ALL levels (including level 0 where flush goes) + .filter_block_partitioning_policy(PinningPolicy::all(true)) + // Pin filters so they are loaded and available for consultation + .filter_block_pinning_policy(PinningPolicy::all(true)) + .open()?; + + // Insert keys with known prefixes + for prefix in [b"aaaa", b"bbbb", b"cccc"] { + for i in 0..50u32 { + let mut key = prefix.to_vec(); + key.extend_from_slice(format!("{:04}", i).as_bytes()); + tree.insert(key, b"value", seqno.next()); + } + } + tree.flush_active_memtable(0)?; + + // Grab the flushed table + let version = tree.current_version(); + let tables: Vec<_> = version.iter_tables().collect(); + assert!( + !tables.is_empty(), + "should have at least one table after flush" + ); + + let extractor = FixedPrefixExtractor::new(4); + + // A non-existent prefix should be definitively excluded by the filter. + // If the filter was NOT built (the bug), this returns Some(true) — "cannot exclude". + // If the filter WAS built (fixed), this returns Some(false) — "definitely not present". 
+ let result = tables[0].maybe_contains_prefix(b"zzzz_test", &extractor)?; + assert_eq!( + result, + Some(false), + "Partitioned filter with prefix extractor must be built and exclude absent prefixes" + ); + + // An existing prefix should NOT be excluded + let result = tables[0].maybe_contains_prefix(b"aaaa_test", &extractor)?; + assert_ne!( + result, + Some(false), + "Partitioned filter should not exclude a prefix that exists in the table" + ); + + Ok(()) +} + +/// BlobTree flush stores prefix entries in the filter and persists the +/// extractor name in table metadata. +#[test] +fn test_prefix_filter_blob_tree_flush() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let big_value = vec![b'x'; 2_000]; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .with_kv_separation(Some(KvSeparationOptions::default())) + .open()?; + + for i in 0..50u32 { + tree.insert(format!("aaa_{:04}", i).as_bytes(), &big_value, 0); + } + for i in 0..50u32 { + tree.insert(format!("bbb_{:04}", i).as_bytes(), &big_value, 0); + } + tree.flush_active_memtable(0)?; + + for i in 0..50u32 { + let key = format!("aaa_{:04}", i); + assert!(tree.contains_key(key.as_bytes(), u64::MAX)?); + } + for i in 0..50u32 { + let key = format!("bbb_{:04}", i); + assert!(tree.contains_key(key.as_bytes(), u64::MAX)?); + } + assert!(!tree.contains_key(b"zzz_0000", u64::MAX)?); + + #[cfg(feature = "metrics")] + { + // Create a new wide-range segment so an absent prefix falls within it + tree.insert(b"AAA_0000", &big_value, 0); + tree.insert(b"zzz_0000", &big_value, 0); + tree.flush_active_memtable(0)?; + + let before = tree.metrics().filter_queries(); + let before_hits = tree.metrics().io_skipped_by_filter(); + // "mmm_0000" has 4-byte prefix "mmm_" which is between "AAA_" and "zzz_" + // and absent from the filter + let _ = tree.contains_key(b"mmm_0000", u64::MAX)?; + let 
after = tree.metrics().filter_queries(); + let after_hits = tree.metrics().io_skipped_by_filter(); + assert!( + after > before, + "prefix filter should be consulted on blob tree tables \ + (before={before}, after={after})" + ); + assert!( + after_hits > before_hits, + "io_skipped_by_filter should have increased" + ); + } + + Ok(()) +} + +/// BlobTree prefix() forward and reverse iteration with separated values. +#[test] +fn test_prefix_filter_blob_tree_prefix_iter() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let big_value = vec![b'x'; 2_000]; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .with_kv_separation(Some(KvSeparationOptions::default())) + .open()?; + + for i in 0..20u32 { + tree.insert(format!("aaa_{:04}", i).as_bytes(), &big_value, 0); + } + for i in 0..10u32 { + tree.insert(format!("bbb_{:04}", i).as_bytes(), &big_value, 0); + } + tree.flush_active_memtable(0)?; + + // Forward iteration with value resolution. + let items: Vec<_> = tree + .prefix(b"aaa_", SeqNo::MAX, None) + .map(|g| g.into_inner().unwrap()) + .collect(); + assert_eq!(items.len(), 20); + for (_, v) in &items { + assert_eq!(v.len(), 2_000); + } + + // Reverse iteration. + let rev_keys: Vec<_> = tree + .prefix(b"aaa_", SeqNo::MAX, None) + .rev() + .map(|g| g.key().unwrap()) + .collect(); + assert_eq!(rev_keys.len(), 20); + assert!(rev_keys[0] > rev_keys[19]); + + // Non-existent prefix. + assert_eq!(tree.prefix(b"zzz_", SeqNo::MAX, None).count(), 0); + + Ok(()) +} + +/// BlobTree range scans with prefix filter after flush. 
+#[test] +fn test_prefix_filter_blob_tree_range() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let big_value = vec![b'x'; 2_000]; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .with_kv_separation(Some(KvSeparationOptions::default())) + .open()?; + + for i in 0..50u32 { + tree.insert(format!("aaa_{:04}", i).as_bytes(), &big_value, 0); + } + tree.flush_active_memtable(0)?; + + let count = tree + .range::<&[u8], _>(&b"aaa_0000"[..]..=&b"aaa_0049"[..], u64::MAX, None) + .count(); + assert_eq!(count, 50); + + let count = tree + .range::<&[u8], _>(&b"zzz_0000"[..]..=&b"zzz_9999"[..], u64::MAX, None) + .count(); + assert_eq!(count, 0); + + Ok(()) +} + +/// BlobTree compaction preserves prefix filter functionality. +#[test] +fn test_prefix_filter_blob_tree_compaction() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let big_value = vec![b'x'; 2_000]; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .with_kv_separation(Some(KvSeparationOptions::default())) + .open()?; + + for batch in 0..3u32 { + for i in 0..30u32 { + let key = format!("aa{}_{:04}", batch, i); + tree.insert(key.as_bytes(), &big_value, 0); + } + tree.flush_active_memtable(0)?; + } + + tree.major_compact(u64::MAX, 0)?; + + for batch in 0..3u32 { + for i in 0..30u32 { + let key = format!("aa{}_{:04}", batch, i); + assert!(tree.contains_key(key.as_bytes(), u64::MAX)?); + } + } + assert!(!tree.contains_key(b"zzz_0000", u64::MAX)?); + + // NOTE: We do not assert on filter_queries after major compaction because + // data lands at the last level, which uses partitioned filters with an + // unpinned TLI by default. The filter block is not accessible in that + // configuration, so the metric may not increment. 
+ + Ok(()) +} + +/// BlobTree recovery: prefix filter works after close and reopen. +#[test] +fn test_prefix_filter_blob_tree_recovery() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let path = folder.path(); + let big_value = vec![b'x'; 2_000]; + + { + let tree = Config::new( + path, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .with_kv_separation(Some(KvSeparationOptions::default())) + .open()?; + + for i in 0..50u32 { + tree.insert(format!("aaa_{:04}", i).as_bytes(), &big_value, 0); + } + tree.flush_active_memtable(0)?; + } + + // Reopen with the same extractor. + { + let tree = Config::new( + path, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .with_kv_separation(Some(KvSeparationOptions::default())) + .open()?; + + for i in 0..50u32 { + let key = format!("aaa_{:04}", i); + assert!(tree.contains_key(key.as_bytes(), u64::MAX)?); + } + assert!(!tree.contains_key(b"zzz_0000", u64::MAX)?); + + #[cfg(feature = "metrics")] + { + // Create a new wide-range segment so an absent prefix falls within it + tree.insert(b"AAA_0000", &big_value, 0); + tree.insert(b"zzz_0000", &big_value, 0); + tree.flush_active_memtable(0)?; + + let before = tree.metrics().filter_queries(); + let before_hits = tree.metrics().io_skipped_by_filter(); + // "mmm_0001" has absent prefix "mmm_" within [AAA_..zzz_] + let _ = tree.contains_key(b"mmm_0001", u64::MAX)?; + let after = tree.metrics().filter_queries(); + let after_hits = tree.metrics().io_skipped_by_filter(); + assert!(after > before, "filter queries should have increased"); + assert!( + after_hits > before_hits, + "io_skipped_by_filter should have increased" + ); + } + } + + Ok(()) +} + +/// BlobTree with a mix of inline (small) and separated (large) values. +/// Both paths must produce correct prefix filter entries. 
+#[test] +fn test_prefix_filter_blob_tree_mixed_values() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .with_kv_separation(Some(KvSeparationOptions::default())) + .open()?; + + let small_value = b"tiny"; + let big_value = vec![b'x'; 2_000]; + + // Small values (stored inline). + for i in 0..20u32 { + tree.insert(format!("sml_{:04}", i).as_bytes(), small_value, 0); + } + // Large values (stored in blob files). + for i in 0..20u32 { + tree.insert(format!("big_{:04}", i).as_bytes(), &big_value, 0); + } + tree.flush_active_memtable(0)?; + + for i in 0..20u32 { + assert!(tree.contains_key(format!("sml_{:04}", i).as_bytes(), u64::MAX)?); + assert!(tree.contains_key(format!("big_{:04}", i).as_bytes(), u64::MAX)?); + } + assert!(!tree.contains_key(b"zzz_0000", u64::MAX)?); + + // Both prefixes work via the prefix() API. + assert_eq!(tree.prefix(b"sml_", SeqNo::MAX, None).count(), 20); + assert_eq!(tree.prefix(b"big_", SeqNo::MAX, None).count(), 20); + assert_eq!(tree.prefix(b"zzz_", SeqNo::MAX, None).count(), 0); + + Ok(()) +} + +/// BlobTree with deletions: tombstones interact correctly with prefix filter. +#[test] +fn test_prefix_filter_blob_tree_deletions() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let big_value = vec![b'x'; 2_000]; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .with_kv_separation(Some(KvSeparationOptions::default())) + .open()?; + + for i in 0..40u32 { + tree.insert(format!("aaa_{:04}", i).as_bytes(), &big_value, 0); + } + tree.flush_active_memtable(0)?; + + // Delete even-numbered keys. 
+ for i in (0..40u32).step_by(2) { + tree.remove(format!("aaa_{:04}", i).as_bytes(), 1); + } + tree.flush_active_memtable(0)?; + + // Even keys gone, odd keys remain. + for i in 0..40u32 { + let key = format!("aaa_{:04}", i); + if i % 2 == 0 { + assert!(!tree.contains_key(key.as_bytes(), u64::MAX)?); + } else { + assert!(tree.contains_key(key.as_bytes(), u64::MAX)?); + } + } + + assert_eq!(tree.prefix(b"aaa_", SeqNo::MAX, None).count(), 20); + + Ok(()) +} + +/// BlobTree MVCC: snapshot reads at specific sequence numbers with prefix filter. +#[test] +fn test_prefix_filter_blob_tree_mvcc() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let big_value = vec![b'x'; 2_000]; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .with_kv_separation(Some(KvSeparationOptions::default())) + .open()?; + + // Batch 1: seqno 0..49 + for i in 0..50u32 { + tree.insert(format!("aaa_{:04}", i).as_bytes(), &big_value, i as SeqNo); + } + tree.flush_active_memtable(0)?; + + // Batch 2: seqno 50..99 + for i in 50..100u32 { + tree.insert(format!("aaa_{:04}", i).as_bytes(), &big_value, i as SeqNo); + } + tree.flush_active_memtable(0)?; + + // Snapshot at seqno 50: only first 50 keys visible. + assert_eq!(tree.prefix(b"aaa_", 50, None).count(), 50); + + // Snapshot at latest: all 100 visible. + assert_eq!(tree.prefix(b"aaa_", SeqNo::MAX, None).count(), 100); + + Ok(()) +} + +/// BlobTree extractor compatibility change: opening with a differently-named +/// extractor gracefully degrades (old tables bypass the filter, new tables use it). +#[test] +fn test_prefix_filter_blob_tree_extractor_change() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let path = folder.path(); + let big_value = vec![b'x'; 2_000]; + + // Use custom extractors with distinct names so the system detects incompatibility. 
+ let extractor_v1 = Arc::new(TestPrefixExtractor::new(4, "extractor_v1")); + let extractor_v2 = Arc::new(TestPrefixExtractor::new(6, "extractor_v2")); + + { + let tree = Config::new( + path, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(extractor_v1) + .with_kv_separation(Some(KvSeparationOptions::default())) + .open()?; + + for i in 0..20u32 { + tree.insert(format!("aaa_{:04}", i).as_bytes(), &big_value, 0); + } + tree.flush_active_memtable(0)?; + } + + // Reopen with a differently-named extractor. + { + let tree = Config::new( + path, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(extractor_v2) + .with_kv_separation(Some(KvSeparationOptions::default())) + .open()?; + + // Old data must still be readable despite incompatible extractor. + for i in 0..20u32 { + let key = format!("aaa_{:04}", i); + assert!(tree.contains_key(key.as_bytes(), u64::MAX)?); + } + + // Insert new data with the new extractor, verify coexistence. + for i in 0..10u32 { + tree.insert(format!("bbbbbb_{:04}", i).as_bytes(), &big_value, 1); + } + tree.flush_active_memtable(0)?; + + for i in 0..10u32 { + let key = format!("bbbbbb_{:04}", i); + assert!(tree.contains_key(key.as_bytes(), u64::MAX)?); + } + } + + Ok(()) +} + +/// Standard tree ingestion produces tables with working prefix filters. 
+#[test] +fn test_prefix_filter_ingestion() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .open()?; + + let mut ingestion = tree.ingestion()?; + for i in 0..50u32 { + ingestion.write(format!("aaa_{:04}", i).as_bytes(), b"value")?; + } + for i in 0..50u32 { + ingestion.write(format!("bbb_{:04}", i).as_bytes(), b"value")?; + } + ingestion.finish()?; + + for i in 0..50u32 { + assert!(tree.contains_key(format!("aaa_{:04}", i).as_bytes(), u64::MAX)?); + assert!(tree.contains_key(format!("bbb_{:04}", i).as_bytes(), u64::MAX)?); + } + assert!(!tree.contains_key(b"zzz_0000", u64::MAX)?); + + assert_eq!(tree.prefix(b"aaa_", SeqNo::MAX, None).count(), 50); + assert_eq!(tree.prefix(b"zzz_", SeqNo::MAX, None).count(), 0); + + #[cfg(feature = "metrics")] + { + let before = tree.metrics().filter_queries(); + let before_hits = tree.metrics().io_skipped_by_filter(); + // "abc_0000" has absent prefix "abc_" within [aaa_0000..bbb_0049] + let _ = tree.contains_key(b"abc_0000", u64::MAX)?; + let after = tree.metrics().filter_queries(); + let after_hits = tree.metrics().io_skipped_by_filter(); + assert!(after > before, "filter queries should have increased"); + assert!( + after_hits > before_hits, + "io_skipped_by_filter should have increased" + ); + } + + Ok(()) +} + +/// BlobTree ingestion produces tables with working prefix filters. 
+#[test] +fn test_prefix_filter_blob_tree_ingestion() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let big_value = vec![b'x'; 2_000]; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .with_kv_separation(Some(KvSeparationOptions::default())) + .open()?; + + let mut ingestion = tree.ingestion()?; + for i in 0..50u32 { + ingestion.write(format!("aaa_{:04}", i).as_bytes(), &big_value)?; + } + for i in 0..50u32 { + ingestion.write(format!("bbb_{:04}", i).as_bytes(), &big_value)?; + } + ingestion.finish()?; + + for i in 0..50u32 { + assert!(tree.contains_key(format!("aaa_{:04}", i).as_bytes(), u64::MAX)?); + assert!(tree.contains_key(format!("bbb_{:04}", i).as_bytes(), u64::MAX)?); + } + assert!(!tree.contains_key(b"zzz_0000", u64::MAX)?); + + assert_eq!(tree.prefix(b"aaa_", SeqNo::MAX, None).count(), 50); + assert_eq!(tree.prefix(b"zzz_", SeqNo::MAX, None).count(), 0); + + #[cfg(feature = "metrics")] + { + let before = tree.metrics().filter_queries(); + let before_hits = tree.metrics().io_skipped_by_filter(); + // "abc_0000" has absent prefix "abc_" within [aaa_0000..bbb_0049] + let _ = tree.contains_key(b"abc_0000", u64::MAX)?; + let after = tree.metrics().filter_queries(); + let after_hits = tree.metrics().io_skipped_by_filter(); + assert!(after > before, "filter queries should have increased"); + assert!( + after_hits > before_hits, + "io_skipped_by_filter should have increased" + ); + } + + Ok(()) +} + +/// Ingestion with tombstones and weak tombstones alongside prefix filter. +#[test] +fn test_prefix_filter_ingestion_tombstones() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .open()?; + + // Seed some data. 
+ for i in 0..30u32 { + tree.insert(format!("aaa_{:04}", i).as_bytes(), b"v", 0); + } + tree.flush_active_memtable(0)?; + + // Ingest tombstones for the first 10 keys and weak tombstones for the next 5. + let mut ingestion = tree.ingestion()?; + for i in 0..10u32 { + ingestion.write_tombstone(format!("aaa_{:04}", i).as_bytes())?; + } + for i in 10..15u32 { + ingestion.write_weak_tombstone(format!("aaa_{:04}", i).as_bytes())?; + } + ingestion.finish()?; + + // First 10 deleted. + for i in 0..10u32 { + assert!(!tree.contains_key(format!("aaa_{:04}", i).as_bytes(), u64::MAX)?); + } + // 10..14 deleted by weak tombstone (shadowing the original insert). + for i in 10..15u32 { + assert!(!tree.contains_key(format!("aaa_{:04}", i).as_bytes(), u64::MAX)?); + } + // 15..29 still alive. + for i in 15..30u32 { + assert!(tree.contains_key(format!("aaa_{:04}", i).as_bytes(), u64::MAX)?); + } + + Ok(()) +} + +/// Weak tombstones correctly interact with prefix filter during flush and compaction. +#[test] +fn test_prefix_filter_weak_tombstones() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .open()?; + + for i in 0..20u32 { + tree.insert(format!("aaa_{:04}", i).as_bytes(), b"v", 0); + } + tree.flush_active_memtable(0)?; + + // Weak-delete half the keys. + for i in 0..10u32 { + tree.remove_weak(format!("aaa_{:04}", i).as_bytes(), 1); + } + tree.flush_active_memtable(0)?; + + // Before compaction: weak tombstones shadow the values. + for i in 0..10u32 { + assert!(!tree.contains_key(format!("aaa_{:04}", i).as_bytes(), u64::MAX)?); + } + for i in 10..20u32 { + assert!(tree.contains_key(format!("aaa_{:04}", i).as_bytes(), u64::MAX)?); + } + + // After compaction: weak tombstones resolved, prefix filter still correct. 
+ tree.major_compact(u64::MAX, 2)?; + + for i in 0..10u32 { + assert!(!tree.contains_key(format!("aaa_{:04}", i).as_bytes(), u64::MAX)?); + } + for i in 10..20u32 { + assert!(tree.contains_key(format!("aaa_{:04}", i).as_bytes(), u64::MAX)?); + } + + assert_eq!(tree.prefix(b"aaa_", SeqNo::MAX, None).count(), 10); + + Ok(()) +} + +/// drop_range correctly interacts with prefix filter. After dropping tables +/// that are fully contained in a range, queries for those keys return empty. +/// Each prefix must be in a separate table for drop_range to drop it. +#[test] +fn test_prefix_filter_drop_range() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .open()?; + + // Flush each prefix into its own table so drop_range can fully contain one. + for i in 0..30u32 { + tree.insert(format!("aaa_{:04}", i).as_bytes(), b"v", 0); + } + tree.flush_active_memtable(0)?; + + for i in 0..30u32 { + tree.insert(format!("bbb_{:04}", i).as_bytes(), b"v", 0); + } + tree.flush_active_memtable(0)?; + + assert_eq!(tree.table_count(), 2); + + // Drop the table fully contained in the "aaa_" range. + tree.drop_range::<&[u8], _>(&b"aaa_"[..]..&b"aab_"[..])?; + + // "aaa_" prefix should be empty. + assert_eq!(tree.prefix(b"aaa_", SeqNo::MAX, None).count(), 0); + + // "bbb_" prefix should be untouched. + assert_eq!(tree.prefix(b"bbb_", SeqNo::MAX, None).count(), 30); + + Ok(()) +} + +/// Partitioned filters combined with prefix extractor. +/// Forces partitioned filters at all levels and pins the filter index +/// so that prefix lookups can load the correct partition. 
+#[test] +fn test_prefix_filter_partitioned() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .filter_block_partitioning_policy(PinningPolicy::all(true)) + // Pin the filter block TLI so the partitioned filter can be looked up. + .filter_block_pinning_policy(PinningPolicy::all(true)) + .open()?; + + for i in 0..100u32 { + tree.insert(format!("aaa_{:04}", i).as_bytes(), b"v", 0); + } + for i in 0..100u32 { + tree.insert(format!("bbb_{:04}", i).as_bytes(), b"v", 0); + } + tree.flush_active_memtable(0)?; + + for i in 0..100u32 { + assert!(tree.contains_key(format!("aaa_{:04}", i).as_bytes(), u64::MAX)?); + assert!(tree.contains_key(format!("bbb_{:04}", i).as_bytes(), u64::MAX)?); + } + assert!(!tree.contains_key(b"zzz_0000", u64::MAX)?); + + assert_eq!(tree.prefix(b"aaa_", SeqNo::MAX, None).count(), 100); + assert_eq!(tree.prefix(b"bbb_", SeqNo::MAX, None).count(), 100); + assert_eq!(tree.prefix(b"zzz_", SeqNo::MAX, None).count(), 0); + + Ok(()) +} + +/// Partitioned filters on a BlobTree with prefix extractor. 
+#[test] +fn test_prefix_filter_blob_tree_partitioned() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let big_value = vec![b'x'; 2_000]; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .filter_block_partitioning_policy(PinningPolicy::all(true)) + .filter_block_pinning_policy(PinningPolicy::all(true)) + .with_kv_separation(Some(KvSeparationOptions::default())) + .open()?; + + for i in 0..50u32 { + tree.insert(format!("aaa_{:04}", i).as_bytes(), &big_value, 0); + } + tree.flush_active_memtable(0)?; + + for i in 0..50u32 { + assert!(tree.contains_key(format!("aaa_{:04}", i).as_bytes(), u64::MAX)?); + } + assert!(!tree.contains_key(b"zzz_0000", u64::MAX)?); + + assert_eq!(tree.prefix(b"aaa_", SeqNo::MAX, None).count(), 50); + assert_eq!(tree.prefix(b"zzz_", SeqNo::MAX, None).count(), 0); + + Ok(()) +} + +/// size_of() goes through the point-read path; verify it interacts with +/// the prefix filter correctly. +#[test] +fn test_prefix_filter_size_of() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .open()?; + + tree.insert(b"aaa_0001", b"hello", 0); + tree.flush_active_memtable(0)?; + + assert_eq!(tree.size_of(b"aaa_0001", u64::MAX)?, Some(5)); + assert_eq!(tree.size_of(b"zzz_0000", u64::MAX)?, None); + + Ok(()) +} + +/// BlobTree size_of() with prefix filter. 
+#[test] +fn test_prefix_filter_blob_tree_size_of() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let big_value = vec![b'x'; 2_000]; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .with_kv_separation(Some(KvSeparationOptions::default())) + .open()?; + + tree.insert(b"aaa_0001", &big_value, 0); + tree.insert(b"aaa_0002", b"tiny", 0); + tree.flush_active_memtable(0)?; + + assert_eq!(tree.size_of(b"aaa_0001", u64::MAX)?, Some(2_000)); + assert_eq!(tree.size_of(b"aaa_0002", u64::MAX)?, Some(4)); + assert_eq!(tree.size_of(b"zzz_0000", u64::MAX)?, None); + + Ok(()) +} + +/// first_key_value and last_key_value use unbounded iteration; the prefix +/// filter should not interfere. +#[test] +fn test_prefix_filter_first_last_kv() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .open()?; + + for i in 0..20u32 { + tree.insert(format!("aaa_{:04}", i).as_bytes(), b"v", 0); + } + for i in 0..20u32 { + tree.insert(format!("zzz_{:04}", i).as_bytes(), b"v", 0); + } + tree.flush_active_memtable(0)?; + + let first = tree + .first_key_value(SeqNo::MAX, None) + .unwrap() + .key() + .unwrap(); + assert_eq!(&*first, b"aaa_0000"); + + let last = tree + .last_key_value(SeqNo::MAX, None) + .unwrap() + .key() + .unwrap(); + assert_eq!(&*last, b"zzz_0019"); + + Ok(()) +} + +/// Reverse range iteration with prefix filter on a standard tree. 
+#[test] +fn test_prefix_filter_reverse_range() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .open()?; + + for i in 0..30u32 { + tree.insert(format!("aaa_{:04}", i).as_bytes(), b"v", 0); + } + tree.flush_active_memtable(0)?; + + let keys: Vec<_> = tree + .range::<&[u8], _>(&b"aaa_0000"[..]..=&b"aaa_0029"[..], SeqNo::MAX, None) + .rev() + .map(|g| g.key().unwrap()) + .collect(); + assert_eq!(keys.len(), 30); + assert_eq!(&*keys[0], b"aaa_0029"); + assert_eq!(&*keys[29], b"aaa_0000"); + + Ok(()) +} + +/// Reverse range iteration on a BlobTree with prefix filter. +#[test] +fn test_prefix_filter_blob_tree_reverse_range() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let big_value = vec![b'x'; 2_000]; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .with_kv_separation(Some(KvSeparationOptions::default())) + .open()?; + + for i in 0..30u32 { + tree.insert(format!("aaa_{:04}", i).as_bytes(), &big_value, 0); + } + tree.flush_active_memtable(0)?; + + let keys: Vec<_> = tree + .range::<&[u8], _>(&b"aaa_0000"[..]..=&b"aaa_0029"[..], SeqNo::MAX, None) + .rev() + .map(|g| g.key().unwrap()) + .collect(); + assert_eq!(keys.len(), 30); + assert_eq!(&*keys[0], b"aaa_0029"); + assert_eq!(&*keys[29], b"aaa_0000"); + + Ok(()) +} + +/// Test FullKeyExtractor through the flush/point-read/range pipeline. 
+#[test] +fn test_full_key_extractor_pipeline() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FullKeyExtractor)) + .open()?; + + tree.insert(b"alpha", b"v1", 0); + tree.insert(b"beta", b"v2", 0); + tree.insert(b"gamma", b"v3", 0); + tree.flush_active_memtable(0)?; + + // Point reads + assert_eq!(&*tree.get(b"alpha", SeqNo::MAX)?.unwrap(), b"v1"); + assert_eq!(&*tree.get(b"beta", SeqNo::MAX)?.unwrap(), b"v2"); + assert!(tree.get(b"missing", SeqNo::MAX)?.is_none()); + + // Prefix query + let keys: Vec<_> = tree + .prefix(b"alpha", SeqNo::MAX, None) + .map(|g| g.key().unwrap()) + .collect(); + assert_eq!(keys.len(), 1); + assert_eq!(&*keys[0], b"alpha"); + + // Range query + let keys: Vec<_> = tree + .range::<&[u8], _>(&b"a"[..]..&b"c"[..], SeqNo::MAX, None) + .map(|g| g.key().unwrap()) + .collect(); + assert_eq!(keys.len(), 2); + assert_eq!(&*keys[0], b"alpha"); + assert_eq!(&*keys[1], b"beta"); + + Ok(()) +} + +/// Test FixedLengthExtractor through the tree pipeline. +/// Keys shorter than the required length are out-of-domain (extract returns empty). 
+#[test] +fn test_fixed_length_extractor_pipeline() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedLengthExtractor::new(4))) + .open()?; + + // Normal keys (len >= 4) + tree.insert(b"aaaa_001", b"v1", 0); + tree.insert(b"aaaa_002", b"v2", 0); + tree.insert(b"bbbb_001", b"v3", 0); + + // Short keys (len < 4) — out of domain for FixedLengthExtractor + tree.insert(b"ab", b"short1", 0); + tree.insert(b"zz", b"short2", 0); + + tree.flush_active_memtable(0)?; + + // Point reads work for all keys + assert_eq!(&*tree.get(b"aaaa_001", SeqNo::MAX)?.unwrap(), b"v1"); + assert_eq!(&*tree.get(b"ab", SeqNo::MAX)?.unwrap(), b"short1"); + + // Prefix query for normal-length key + let keys: Vec<_> = tree + .prefix(b"aaaa_001", SeqNo::MAX, None) + .map(|g| g.key().unwrap()) + .collect(); + assert!(!keys.is_empty()); + assert!(keys.iter().any(|k| &**k == b"aaaa_001")); + + // Range query still returns all keys in range + let keys: Vec<_> = tree + .range::<&[u8], _>(&b"a"[..]..&b"c"[..], SeqNo::MAX, None) + .map(|g| g.key().unwrap()) + .collect(); + // Should contain: ab, aaaa_001, aaaa_002, bbbb_001 + assert_eq!(keys.len(), 4); + + Ok(()) +} + +/// Test FixedLengthExtractor boundary: key length exactly == extractor length. 
+#[test] +fn test_fixed_length_extractor_boundary() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedLengthExtractor::new(3))) + .open()?; + + // Exactly 3 bytes — on boundary + tree.insert(b"abc", b"v1", 0); + // More than 3 bytes + tree.insert(b"abcdef", b"v2", 0); + // Less than 3 bytes — out of domain + tree.insert(b"ab", b"v3", 0); + + tree.flush_active_memtable(0)?; + + assert_eq!(&*tree.get(b"abc", SeqNo::MAX)?.unwrap(), b"v1"); + assert_eq!(&*tree.get(b"abcdef", SeqNo::MAX)?.unwrap(), b"v2"); + assert_eq!(&*tree.get(b"ab", SeqNo::MAX)?.unwrap(), b"v3"); + + // Prefix query with boundary-length key + let keys: Vec<_> = tree + .prefix(b"abc", SeqNo::MAX, None) + .map(|g| g.key().unwrap()) + .collect(); + // "abc" and "abcdef" share prefix "abc" + assert_eq!(keys.len(), 2); + + Ok(()) +} + +/// Test RunReader forward lazy per-table prefix skip. +/// Creates 4 tables with distinct prefixes in a single run (L1), +/// then queries a prefix that only exists in one table — the others should be skipped. 
+#[test] +fn test_run_reader_forward_lazy_prefix_skip() -> lsm_tree::Result<()> { + use lsm_tree::compaction::Leveled; + + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .open()?; + + // Create 4 separate L0 tables with distinct prefixes + for prefix in &[b"aaaa", b"bbbb", b"cccc", b"dddd"] { + for i in 0..5u32 { + let key = format!("{}_{:04}", std::str::from_utf8(*prefix).unwrap(), i); + tree.insert(key.as_bytes(), b"value", 0); + } + tree.flush_active_memtable(0)?; + } + + // Compact to L1 so they form a single run + tree.compact(Arc::new(Leveled::default()), 0)?; + + // Query only "bbbb" prefix — tables for aaaa, cccc, dddd should be skipped lazily + let keys: Vec<_> = tree + .prefix(b"bbbb", SeqNo::MAX, None) + .map(|g| g.key().unwrap()) + .collect(); + assert_eq!(keys.len(), 5); + for k in &keys { + assert!(k.starts_with(b"bbbb")); + } + + // Query "dddd" prefix (last table) — forward iteration should skip aaaa, bbbb, cccc + let keys: Vec<_> = tree + .prefix(b"dddd", SeqNo::MAX, None) + .map(|g| g.key().unwrap()) + .collect(); + assert_eq!(keys.len(), 5); + for k in &keys { + assert!(k.starts_with(b"dddd")); + } + + Ok(()) +} + +/// Test RunReader reverse lazy per-table prefix skip. +/// Same setup as forward test but iterates in reverse. 
+#[test] +fn test_run_reader_reverse_lazy_prefix_skip() -> lsm_tree::Result<()> { + use lsm_tree::compaction::Leveled; + + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .open()?; + + // Create 4 separate L0 tables with distinct prefixes + for prefix in &[b"aaaa", b"bbbb", b"cccc", b"dddd"] { + for i in 0..5u32 { + let key = format!("{}_{:04}", std::str::from_utf8(*prefix).unwrap(), i); + tree.insert(key.as_bytes(), b"value", 0); + } + tree.flush_active_memtable(0)?; + } + + // Compact to L1 + tree.compact(Arc::new(Leveled::default()), 0)?; + + // Reverse prefix query for "aaaa" — should skip bbbb, cccc, dddd tables + let keys: Vec<_> = tree + .prefix(b"aaaa", SeqNo::MAX, None) + .rev() + .map(|g| g.key().unwrap()) + .collect(); + assert_eq!(keys.len(), 5); + assert_eq!(&*keys[0], b"aaaa_0004"); + assert_eq!(&*keys[4], b"aaaa_0000"); + + Ok(()) +} + +/// Test RunReader with unbounded ranges and an extractor configured. 
+#[test] +fn test_run_reader_unbounded_range_with_extractor() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .open()?; + + tree.insert(b"aaaa_001", b"v1", 0); + tree.insert(b"bbbb_001", b"v2", 0); + tree.flush_active_memtable(0)?; + + // Fully unbounded range (both sides Unbounded) + let keys: Vec<_> = tree + .range::<&[u8], _>(.., SeqNo::MAX, None) + .map(|g| g.key().unwrap()) + .collect(); + assert_eq!(keys.len(), 2); + + // Open-start range (Unbounded..Included) + let keys: Vec<_> = tree + .range::<&[u8], _>(..=&b"aaaa_999"[..], SeqNo::MAX, None) + .map(|g| g.key().unwrap()) + .collect(); + assert_eq!(keys.len(), 1); + assert_eq!(&*keys[0], b"aaaa_001"); + + // Open-end range (Included..Unbounded) + let keys: Vec<_> = tree + .range::<&[u8], _>(&b"bbbb"[..].., SeqNo::MAX, None) + .map(|g| g.key().unwrap()) + .collect(); + assert_eq!(keys.len(), 1); + assert_eq!(&*keys[0], b"bbbb_001"); + + Ok(()) +} + +/// Test RunReader with a cross-prefix range (start and end extract to different prefixes). 
+#[test] +fn test_run_reader_cross_prefix_range() -> lsm_tree::Result<()> { + use lsm_tree::compaction::Leveled; + + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .open()?; + + for prefix in &[b"aaaa", b"bbbb", b"cccc"] { + for i in 0..3u32 { + let key = format!("{}_{:04}", std::str::from_utf8(*prefix).unwrap(), i); + tree.insert(key.as_bytes(), b"value", 0); + } + tree.flush_active_memtable(0)?; + } + tree.compact(Arc::new(Leveled::default()), 0)?; + + // Range from "aaaa" to "cccc" — different prefixes, common_prefix should be None + let keys: Vec<_> = tree + .range::<&[u8], _>(&b"aaaa_0000"[..]..&b"cccc_0000"[..], SeqNo::MAX, None) + .map(|g| g.key().unwrap()) + .collect(); + // Should include all aaaa_* and bbbb_* keys (cccc_0000 is excluded by range end) + assert_eq!(keys.len(), 6); + + Ok(()) +} + +/// Test single-table run prefix skip. +/// Creates a table with a wide key range but missing a specific prefix, +/// then queries for that absent prefix. The key range overlaps the query, +/// but the prefix filter excludes it so no results are returned. 
+#[test] +fn test_single_table_range_prefix_skip() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(3))) + .open()?; + + // Single table with wide key range: "aaa" through "zzz", but NO "mmm" prefix + for p in [b"aaa", b"bbb", b"ccc", b"yyy", b"zzz"] { + for i in 0..3u32 { + let key = format!("{}_{:02}", std::str::from_utf8(p).unwrap(), i); + tree.insert(key.as_bytes(), b"value", 0); + } + } + tree.flush_active_memtable(0)?; + + // Query for prefix "mmm" — table key range (aaa..zzz) overlaps, but filter excludes "mmm" + let keys: Vec<_> = tree + .range::<&[u8], _>(&b"mmm_00"[..]..&b"mmm_99"[..], SeqNo::MAX, None) + .map(|g| g.key().unwrap()) + .collect(); + assert_eq!(keys.len(), 0); + + Ok(()) +} + +/// Test should_skip_range_by_prefix_filter where start prefix matches min_key prefix +/// but end prefix differs, and the table contains the start prefix. +/// The filter reports the prefix exists, so the table is NOT skipped. +#[test] +fn test_skip_range_start_prefix_matches_min_key() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(3))) + .open()?; + + // Wide key range table: prefixes "aaa" through "zzz", but not "mmm" + for p in [b"aaa", b"bbb", b"yyy", b"zzz"] { + for i in 0..3u32 { + let key = format!("{}_{:02}", std::str::from_utf8(p).unwrap(), i); + tree.insert(key.as_bytes(), b"value", 0); + } + } + tree.flush_active_memtable(0)?; + + // Range where start prefix ("aaa") matches min_key prefix ("aaa"), + // end prefix ("bbb") differs. Start prefix IS in the filter, + // so table is NOT skipped. 
+ let keys: Vec<_> = tree + .range::<&[u8], _>(&b"aaa_00"[..]..&b"bbb_99"[..], SeqNo::MAX, None) + .map(|g| g.key().unwrap()) + .collect(); + assert_eq!(keys.len(), 6); // aaa_00..02 + bbb_00..02 + + Ok(()) +} + +/// Test should_skip_range_by_prefix_filter where start prefix matches min_key prefix +/// but the start key itself is absent from the filter. The filter excludes the start key, +/// so the table IS skipped. This exercises the fallback path in should_skip_range_by_prefix_filter. +#[test] +fn test_skip_range_start_prefix_matches_min_key_but_absent() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(3))) + .open()?; + + // Table with wide key range: "aaa" and "zzz" present, but NOT "mmm" + for p in [b"aaa", b"zzz"] { + for i in 0..5u32 { + let key = format!("{}_{:02}", std::str::from_utf8(p).unwrap(), i); + tree.insert(key.as_bytes(), b"value", 0); + } + } + tree.flush_active_memtable(0)?; + + // Range where start prefix ("aaa") matches min_key prefix ("aaa"), + // end prefix ("mmm") differs. But we're querying with start key "aaa_xx" + // which has prefix "aaa" that IS in the filter. So the table won't be skipped. + // (The fallback path is: start prefix == min_key prefix AND filter says it might exist) + let keys: Vec<_> = tree + .range::<&[u8], _>(&b"aaa_00"[..]..&b"mmm_99"[..], SeqNo::MAX, None) + .map(|g| g.key().unwrap()) + .collect(); + assert_eq!(keys.len(), 5); // aaa_00..04 + + Ok(()) +} + +/// Test should_skip_range_by_prefix_filter where start prefix differs from min_key prefix. 
+#[test] +fn test_skip_range_start_prefix_differs_from_min_key() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .open()?; + + // Table with "aaaa" prefix only + for i in 0..5u32 { + let key = format!("aaaa_{:04}", i); + tree.insert(key.as_bytes(), b"value", 0); + } + tree.flush_active_memtable(0)?; + + // Range where start prefix ("bbbb") differs from min_key prefix ("aaaa"), + // and end prefix ("cccc") also differs. Neither matches. + let keys: Vec<_> = tree + .range::<&[u8], _>(&b"bbbb_0000"[..]..&b"cccc_0000"[..], SeqNo::MAX, None) + .map(|g| g.key().unwrap()) + .collect(); + assert_eq!(keys.len(), 0); + + Ok(()) +} + +/// Test maybe_contains_prefix with an incompatible extractor name. +/// The table was written with "fixed_prefix:4" but we reopen with a differently-named extractor. +#[test] +fn test_maybe_contains_prefix_incompatible_extractor() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + // Write data with "fixed_prefix:4" extractor + { + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .open()?; + + tree.insert(b"aaaa_001", b"v1", 0); + tree.flush_active_memtable(0)?; + } + + // Reopen with a differently-named extractor + { + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(TestPrefixExtractor::new(4, "different_name"))) + .open()?; + + // Point read should still work — incompatible extractor means filter is bypassed + assert_eq!(&*tree.get(b"aaaa_001", SeqNo::MAX)?.unwrap(), b"v1"); + + // Prefix query with non-existent prefix should still return nothing + // (data just isn't there, filter cannot help but doesn't hurt) + let keys: 
Vec<_> = tree + .prefix(b"zzzz", SeqNo::MAX, None) + .map(|g| g.key().unwrap()) + .collect(); + assert_eq!(keys.len(), 0); + } + + Ok(()) +} + +/// Test partitioned filter with unpinned TLI (top-level index). +#[test] +fn test_partitioned_filter_unpinned_tli() -> lsm_tree::Result<()> { + use lsm_tree::config::PinningPolicy; + + let folder = tempfile::tempdir()?; + + let mut config = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .filter_policy(FilterPolicy::all(FilterPolicyEntry::Bloom( + BloomConstructionPolicy::BitsPerKey(10.0), + ))) + .filter_block_pinning_policy(PinningPolicy::all(false)) + .filter_block_partitioning_policy(PinningPolicy::all(true)) + .index_block_pinning_policy(PinningPolicy::all(false)); + + // Unpin the top-level filter index so we exercise the unpinned TLI code path + config.top_level_filter_block_pinning_policy = PinningPolicy::all(false); + + let tree = config.open()?; + + for i in 0..50u32 { + let key = format!("aaaa_{:04}", i); + tree.insert(key.as_bytes(), b"value", 0); + } + tree.flush_active_memtable(0)?; + + // Point reads should still work through the unpinned TLI path + assert_eq!(&*tree.get(b"aaaa_0000", SeqNo::MAX)?.unwrap(), b"value"); + assert_eq!(&*tree.get(b"aaaa_0049", SeqNo::MAX)?.unwrap(), b"value"); + assert!(tree.get(b"zzzz_0000", SeqNo::MAX)?.is_none()); + + // Prefix query + let keys: Vec<_> = tree + .prefix(b"aaaa", SeqNo::MAX, None) + .map(|g| g.key().unwrap()) + .collect(); + assert_eq!(keys.len(), 50); + + Ok(()) +} + +/// Test partitioned filter prefix spill (register_bytes causing filter partition flush). +/// Uses a small partition size and many unique prefixes to trigger the spill path. 
+#[test] +fn test_partitioned_filter_prefix_spill() -> lsm_tree::Result<()> { + use lsm_tree::config::{BlockSizePolicy, PinningPolicy}; + + let folder = tempfile::tempdir()?; + + let mut config = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .filter_policy(FilterPolicy::all(FilterPolicyEntry::Bloom( + BloomConstructionPolicy::BitsPerKey(10.0), + ))) + .filter_block_partitioning_policy(PinningPolicy::all(true)); + + // Set very small partition size to force spills + config.filter_block_partition_size_policy = BlockSizePolicy::all(128); + config.index_block_partition_size_policy = BlockSizePolicy::all(128); + + let tree = config.open()?; + + // Insert many unique prefixes to force filter partition spills + for p in 0..200u32 { + let key = format!("{:04}_data", p); + tree.insert(key.as_bytes(), b"value", 0); + } + tree.flush_active_memtable(0)?; + + // Verify data is still accessible + assert_eq!(&*tree.get(b"0000_data", SeqNo::MAX)?.unwrap(), b"value"); + assert_eq!(&*tree.get(b"0199_data", SeqNo::MAX)?.unwrap(), b"value"); + assert!(tree.get(b"9999_data", SeqNo::MAX)?.is_none()); + + Ok(()) +} + +/// Test prefix with all 0xFF bytes -> prefix_upper_range returns Unbounded. 
+#[test] +fn test_prefix_all_0xff() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(3))) + .open()?; + + // Insert keys with all-0xFF prefix + let prefix = vec![0xFF, 0xFF, 0xFF]; + for i in 0..5u8 { + let mut key = prefix.clone(); + key.push(i); + tree.insert(&key, b"value", 0); + } + + // Also insert some non-0xFF keys + tree.insert(b"aaa_001", b"other", 0); + tree.flush_active_memtable(0)?; + + // Prefix query with all-0xFF prefix should work (upper bound is Unbounded) + let keys: Vec<_> = tree + .prefix(&prefix, SeqNo::MAX, None) + .map(|g| g.key().unwrap()) + .collect(); + assert_eq!(keys.len(), 5); + for k in &keys { + assert!(k.starts_with(&prefix)); + } + + Ok(()) +} + +/// Test BlobTree with partitioned filter + prefix extractor flush. +#[test] +fn test_blob_tree_partitioned_filter_flush() -> lsm_tree::Result<()> { + use lsm_tree::config::PinningPolicy; + + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .filter_policy(FilterPolicy::all(FilterPolicyEntry::Bloom( + BloomConstructionPolicy::BitsPerKey(10.0), + ))) + .filter_block_partitioning_policy(PinningPolicy::all(true)) + .with_kv_separation(Some(KvSeparationOptions::default().separation_threshold(1))) + .open()?; + + let big_value = vec![b'x'; 1_000]; + for i in 0..20u32 { + let key = format!("pref_{:04}", i); + tree.insert(key.as_bytes(), &big_value, 0); + } + tree.flush_active_memtable(0)?; + + // Point reads + assert_eq!( + &*tree.get(b"pref_0000", SeqNo::MAX)?.unwrap(), + &big_value[..] 
+ ); + assert!(tree.get(b"miss_0000", SeqNo::MAX)?.is_none()); + + // Prefix query + let keys: Vec<_> = tree + .prefix(b"pref", SeqNo::MAX, None) + .map(|g| g.key().unwrap()) + .collect(); + assert_eq!(keys.len(), 20); + + Ok(()) +} + +/// Test MAX_UPFRONT_CHECKS exceeded path: >10 tables in a run with absent prefix. +#[test] +fn test_run_reader_max_upfront_checks_exceeded() -> lsm_tree::Result<()> { + use lsm_tree::compaction::Leveled; + + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .open()?; + + // Create 12 separate L0 tables with distinct prefixes + for p in 0..12u32 { + let prefix = format!("{:04}", p); + for i in 0..3u32 { + let key = format!("{}_{:04}", prefix, i); + tree.insert(key.as_bytes(), b"value", 0); + } + tree.flush_active_memtable(0)?; + } + + // Compact to L1 so they form a single sorted run + tree.compact(Arc::new(Leveled::default()), 0)?; + + // Query an absent prefix — upfront check will try up to 10 tables, + // exceed the limit, and fall through to lazy per-table checking + let keys: Vec<_> = tree + .prefix(b"zzzz", SeqNo::MAX, None) + .map(|g| g.key().unwrap()) + .collect(); + assert_eq!(keys.len(), 0); + + Ok(()) +} + +/// Test short keys with a prefix extractor that has a longer prefix length. +/// FixedPrefixExtractor returns the full key when it's shorter than prefix length. 
+#[test] +fn test_short_keys_with_prefix_extractor() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(8))) + .open()?; + + // All keys shorter than prefix length (8) + tree.insert(b"a", b"v1", 0); + tree.insert(b"ab", b"v2", 0); + tree.insert(b"abc", b"v3", 0); + tree.insert(b"abcdefg", b"v4", 0); // still 7 < 8 + tree.flush_active_memtable(0)?; + + // All should be readable + assert_eq!(&*tree.get(b"a", SeqNo::MAX)?.unwrap(), b"v1"); + assert_eq!(&*tree.get(b"abcdefg", SeqNo::MAX)?.unwrap(), b"v4"); + + // Range query returns all + let keys: Vec<_> = tree + .range::<&[u8], _>(.., SeqNo::MAX, None) + .map(|g| g.key().unwrap()) + .collect(); + assert_eq!(keys.len(), 4); + + Ok(()) +} + +/// tree.prefix() converts the prefix to a range via prefix_to_range, which +/// increments the last byte: prefix "h" becomes Included("h")..Excluded("i"). +/// With FixedPrefixExtractor::new(1), extract_first("h") = "h" and +/// extract_first("i") = "i", so the start and end prefixes differ. The filter +/// layer must still recognize this as a single-prefix query and consult the +/// filter. This test uses a single-table run. +#[test] +#[cfg(feature = "metrics")] +fn test_prefix_query_filter_used_single_table_exact_len() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(1))) + .open()?; + + // Insert keys with prefixes "a" and "z" + tree.insert(b"abc", b"1", 0); + tree.insert(b"abd", b"2", 0); + tree.insert(b"zde", b"3", 0); + tree.flush_active_memtable(0)?; + + // Query for prefix "h" — no keys with this prefix exist. + // The filter should be consulted and should skip the table. 
+ let before = tree.metrics().filter_queries(); + + let keys: Vec<_> = tree + .prefix(b"h", SeqNo::MAX, None) + .map(|g| g.key().unwrap()) + .collect(); + assert_eq!(keys.len(), 0, "no keys with prefix 'h' should be found"); + + let after = tree.metrics().filter_queries(); + assert!( + after > before, + "filter should be consulted for prefix query 'h' (single table, extractor len=1)" + ); + + Ok(()) +} + +/// Same scenario as above but with FixedPrefixExtractor::new(3) and prefix +/// query "abc". The range becomes Included("abc")..Excluded("abd"), and +/// extract_first("abc") = "abc", extract_first("abd") = "abd" — they differ. +/// The filter must still be used. Keys span from "aaa" to "zzz" so that the +/// query range overlaps the table's key range. High BPK to avoid false +/// positives in the small filter. +#[test] +#[cfg(feature = "metrics")] +fn test_prefix_query_filter_used_single_table_3byte_prefix() -> lsm_tree::Result<()> { + use lsm_tree::config::{BloomConstructionPolicy, FilterPolicy, FilterPolicyEntry}; + + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(3))) + .filter_policy(FilterPolicy::all(FilterPolicyEntry::Bloom( + BloomConstructionPolicy::BitsPerKey(10.0), + ))) + .open()?; + + // Insert keys with prefixes "aaa" and "zzz" so the table's key range + // covers "aaa".."zzz", making the query for "abc" overlap the range. + tree.insert(b"aaa_001", b"1", 0); + tree.insert(b"aaa_002", b"2", 0); + tree.insert(b"zzz_001", b"3", 0); + tree.flush_active_memtable(0)?; + + // Query for prefix "abc" — between "aaa" and "zzz" but not in the filter. 
+ let before = tree.metrics().filter_queries(); + + let keys: Vec<_> = tree + .prefix(b"abc", SeqNo::MAX, None) + .map(|g| g.key().unwrap()) + .collect(); + assert_eq!(keys.len(), 0, "no keys with prefix 'abc' should be found"); + + let after = tree.metrics().filter_queries(); + assert!( + after > before, + "filter should be consulted for prefix query 'abc' (single table, extractor len=3)" + ); + + Ok(()) +} + +/// FixedLengthExtractor has the same issue: prefix "h" with extractor +/// length 1 causes extract_first("h") != extract_first("i"). Keys span +/// from "a" to "z" so the query range overlaps the table. +#[test] +#[cfg(feature = "metrics")] +fn test_prefix_query_filter_used_fixed_length_extractor() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedLengthExtractor::new(1))) + .open()?; + + // Keys span "a" to "z" so the table's key range covers the query for "h" + tree.insert(b"a_one", b"1", 0); + tree.insert(b"a_two", b"2", 0); + tree.insert(b"z_one", b"3", 0); + tree.flush_active_memtable(0)?; + + // Query for prefix "h" — between "a" and "z" but not in the filter + let before = tree.metrics().filter_queries(); + + let keys: Vec<_> = tree + .prefix(b"h", SeqNo::MAX, None) + .map(|g| g.key().unwrap()) + .collect(); + assert_eq!(keys.len(), 0); + + let after = tree.metrics().filter_queries(); + assert!( + after > before, + "filter should be consulted for prefix query 'h' (FixedLengthExtractor len=1)" + ); + + Ok(()) +} + +/// Multi-table run: with enough data flushed and compacted into a single level +/// with multiple tables, RunReader's upfront pruning and lazy per-table skip +/// should both use the filter for a prefix query whose length equals the +/// extractor length. 
+#[test] +#[cfg(feature = "metrics")] +fn test_prefix_query_filter_used_multi_table_run() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(1))) + .open()?; + + // Insert many keys across two prefixes to produce multiple tables after compaction + for i in 0..500u32 { + let key = format!("a{i:04}"); + tree.insert(key.as_bytes(), b"v", 0); + } + for i in 0..500u32 { + let key = format!("z{i:04}"); + tree.insert(key.as_bytes(), b"v", 0); + } + tree.flush_active_memtable(0)?; + tree.major_compact(u64::MAX, SeqNo::MAX)?; + + // Query for prefix "m" — between "a" and "z", not in any table + let before = tree.metrics().filter_queries(); + + let keys: Vec<_> = tree + .prefix(b"m", SeqNo::MAX, None) + .map(|g| g.key().unwrap()) + .collect(); + assert_eq!(keys.len(), 0, "no keys with prefix 'm' should be found"); + + let after = tree.metrics().filter_queries(); + assert!( + after > before, + "filter should be consulted for prefix query 'm' (multi-table run, extractor len=1)" + ); + + Ok(()) +} + +/// Ensure that existing prefixes still return correct results — no false +/// negatives from the optimization. 
+#[test] +fn test_prefix_query_existing_prefix_still_found() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(1))) + .open()?; + + tree.insert(b"abc", b"1", 0); + tree.insert(b"abd", b"2", 0); + tree.insert(b"zde", b"3", 0); + tree.flush_active_memtable(0)?; + + // Query for prefix "a" — should find 2 keys + let keys: Vec<_> = tree + .prefix(b"a", SeqNo::MAX, None) + .map(|g| g.key().unwrap()) + .collect(); + assert_eq!(keys.len(), 2, "should find 2 keys with prefix 'a'"); + + // Query for prefix "z" — should find 1 key + let keys: Vec<_> = tree + .prefix(b"z", SeqNo::MAX, None) + .map(|g| g.key().unwrap()) + .collect(); + assert_eq!(keys.len(), 1, "should find 1 key with prefix 'z'"); + + Ok(()) +} + +/// Prefix query where the prefix is longer than the extractor length should +/// already work (both bounds extract to the same prefix). Verify this still +/// holds after the fix. +#[test] +#[cfg(feature = "metrics")] +fn test_prefix_query_longer_than_extractor_still_uses_filter() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(3))) + .open()?; + + tree.insert(b"abc_001", b"1", 0); + tree.insert(b"abc_002", b"2", 0); + tree.insert(b"xyz_001", b"3", 0); + tree.flush_active_memtable(0)?; + + // Query for prefix "def_" — 4 bytes, extractor is 3. Both "def_" and + // "def`" (or similar successor) extract to "def". Filter should be used. 
+ let before = tree.metrics().filter_queries(); + + let keys: Vec<_> = tree + .prefix(b"def_", SeqNo::MAX, None) + .map(|g| g.key().unwrap()) + .collect(); + assert_eq!(keys.len(), 0); + + let after = tree.metrics().filter_queries(); + assert!( + after > before, + "filter should be used for prefix 'def_' (longer than extractor len=3)" + ); + + Ok(()) +} + +/// The all-0xFF edge case: prefix_upper_range returns Unbounded for a prefix +/// of all 0xFF bytes. The table's max key is below 0xFF, so the filter +/// partition for 0xFF may not exist. The filter cannot be consulted in this +/// case, but correctness is preserved — no keys are returned. +#[test] +fn test_prefix_query_all_0xff_prefix_correctness() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(1))) + .open()?; + + tree.insert([0x00, 0x01], b"v1", 0); + tree.insert([0xFE, 0x01], b"v2", 0); + tree.flush_active_memtable(0)?; + + // Query for prefix [0xFF] — no keys should be found + let keys: Vec<_> = tree + .prefix([0xFF], SeqNo::MAX, None) + .map(|g| g.key().unwrap()) + .collect(); + assert_eq!(keys.len(), 0, "no keys with prefix 0xFF should be found"); + + Ok(()) +} + +/// FixedPrefixExtractor with a prefix query SHORTER than the extractor length. +/// The filter was built with 4-byte prefixes, but the query uses a 3-byte prefix. +/// The filter must not skip the table — the shorter prefix is a prefix of keys +/// that exist in the table. 
+#[test] +fn test_prefix_query_shorter_than_extractor_no_false_negative() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .open()?; + + // Insert keys with 4-byte prefix "user" + tree.insert(b"user_alice", b"v1", 0); + tree.insert(b"user_bob", b"v2", 0); + tree.flush_active_memtable(0)?; + + // Query with 3-byte prefix "use" — shorter than extractor length 4. + // Both keys start with "use" and MUST be returned. + let results: Vec<_> = tree + .prefix(b"use", SeqNo::MAX, None) + .map(|g| g.key().unwrap()) + .collect(); + assert_eq!( + results.len(), + 2, + "both user_alice and user_bob start with 'use' and must be found" + ); + + Ok(()) +} + +/// FullKeyExtractor with tree.prefix() — the hint is the prefix, not a full key. +/// The filter was built with full-key hashes, but the probe uses the prefix. +/// The table must NOT be skipped. +#[test] +fn test_prefix_query_full_key_extractor_no_false_negative() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(lsm_tree::prefix::FullKeyExtractor)) + .open()?; + + tree.insert(b"abc_001", b"v1", 0); + tree.insert(b"abc_002", b"v2", 0); + tree.flush_active_memtable(0)?; + + let results: Vec<_> = tree + .prefix(b"abc", SeqNo::MAX, None) + .map(|g| g.key().unwrap()) + .collect(); + assert_eq!( + results.len(), + 2, + "both abc_001 and abc_002 start with 'abc' and must be found" + ); + + Ok(()) +} + +/// After reopening with a different prefix extractor length, old tables retain +/// their original extractor name. 
The RunReader lazy loop calls +/// `probe_prefix_filter` directly (bypassing `should_skip_range_by_prefix_filter`) +/// when a validated_prefix_hint is available, so it must check +/// `prefix_filter_allowed` per-table. Otherwise, probing an old table's filter +/// with the wrong extractor can yield a false negative. +/// +/// This test writes enough data with extractor length 3 to produce multiple +/// L1 tables, then reopens with extractor length 4 and issues a prefix query +/// whose range spans 3+ tables to exercise the lazy loop. +#[test] +fn test_prefix_query_mixed_extractor_run_no_false_negative() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let seqno = SequenceNumberCounter::default(); + let version = SequenceNumberCounter::default(); + + // Phase 1: extractor length 3. Write many "abcd*" keys and compact to L1 + // with a small target size to produce multiple tables. + { + let tree = Config::new(&folder, seqno.clone(), version.clone()) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(3))) + .open()?; + + for i in 0..2000u32 { + let key = format!("abcd{i:06}"); + tree.insert(key.as_bytes(), b"v1", seqno.next()); + } + tree.flush_active_memtable(0)?; + // Small target size → many small tables in the last level + tree.major_compact(512, SeqNo::MAX)?; + + // Verify we got multiple tables + assert!( + tree.table_count() >= 3, + "need 3+ tables to exercise the lazy loop, got {}", + tree.table_count() + ); + } + + // Phase 2: reopen with extractor length 4 (different name). Don't write + // anything — just query. All existing tables have extractor "fixed_prefix:3". + { + let tree = Config::new(&folder, seqno.clone(), version.clone()) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .open()?; + + // Prefix query "abcd" (4 bytes) with current extractor length 4. + // validated_prefix_hint = Some("abcd") (guard passes: extract_first("abcd") + // = Some("abcd"), extract_first("abcd\0") = Some("abcd")). 
+ // + // The lazy loop hits tables with extractor "fixed_prefix:3". Without + // prefix_filter_allowed check: probe_prefix_filter("abcd", ext4) + // → extract("abcd") = Some("abcd") → hash("abcd") ≠ hash("abc") + // → Some(false) → table skipped → data lost. + assert_eq!( + tree.prefix(b"abcd", SeqNo::MAX, None).count(), + 2000, + "all 2000 'abcd*' keys must be found after reopen with different extractor" + ); + } + + Ok(()) +} + +// ================================================================ +// Multi-prefix extractor tests +// ================================================================ + +/// A multi-prefix extractor that returns both a 3-byte and a 6-byte prefix +/// for keys >= 6 bytes, only a 3-byte prefix for keys 3-5 bytes, and nothing +/// for shorter keys. This creates the interleaved hash pattern: +/// key1: [hash("abc"), hash("abc123")] +/// key2: [hash("abc"), hash("abc456")] +/// where hash("abc") is duplicated non-adjacently in the buffer. +struct HierarchicalPrefixExtractor; + +impl PrefixExtractor for HierarchicalPrefixExtractor { + fn extract<'a>(&self, key: &'a [u8]) -> Box + 'a> { + if key.len() >= 6 { + Box::new(vec![&key[..3], &key[..6]].into_iter()) + } else if key.len() >= 3 { + Box::new(std::iter::once(&key[..3])) + } else { + Box::new(std::iter::empty()) + } + } + + fn name(&self) -> &str { + "hierarchical_prefix" + } +} + +/// Basic correctness: point reads work with a multi-prefix extractor. 
+#[test] +fn test_multi_prefix_point_reads() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(HierarchicalPrefixExtractor)) + .open()?; + + tree.insert(b"abc123_data1", b"v1", 0); + tree.insert(b"abc123_data2", b"v2", 0); + tree.insert(b"abc456_data1", b"v3", 0); + tree.insert(b"def789_data1", b"v4", 0); + tree.flush_active_memtable(0)?; + + assert_eq!(&*tree.get(b"abc123_data1", SeqNo::MAX)?.unwrap(), b"v1"); + assert_eq!(&*tree.get(b"abc123_data2", SeqNo::MAX)?.unwrap(), b"v2"); + assert_eq!(&*tree.get(b"abc456_data1", SeqNo::MAX)?.unwrap(), b"v3"); + assert_eq!(&*tree.get(b"def789_data1", SeqNo::MAX)?.unwrap(), b"v4"); + assert!(tree.get(b"abc999_data1", SeqNo::MAX)?.is_none()); + assert!(tree.get(b"zzz000_data1", SeqNo::MAX)?.is_none()); + + Ok(()) +} + +/// Range queries with multi-prefix extractor: query a range that shares +/// the 3-byte prefix "abc" but spans different 6-byte prefixes. 
+#[test] +fn test_multi_prefix_range_queries() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(HierarchicalPrefixExtractor)) + .open()?; + + for i in 0..50u32 { + let key = format!("abc{i:03}_data"); + tree.insert(key.as_bytes(), b"v", 0); + } + for i in 0..30u32 { + let key = format!("def{i:03}_data"); + tree.insert(key.as_bytes(), b"v", 0); + } + tree.flush_active_memtable(0)?; + + // Range within the "abc" 3-byte prefix + let count = tree + .range("abc000".as_bytes()..="abc999".as_bytes(), SeqNo::MAX, None) + .count(); + assert_eq!(count, 50, "should find all 50 abc keys"); + + // Range within the "def" 3-byte prefix + let count = tree + .range("def000".as_bytes()..="def999".as_bytes(), SeqNo::MAX, None) + .count(); + assert_eq!(count, 30, "should find all 30 def keys"); + + // Full range + let count = tree.range::<&[u8], _>(.., SeqNo::MAX, None).count(); + assert_eq!(count, 80, "should find all 80 keys"); + + Ok(()) +} + +/// Prefix scan with a multi-prefix extractor. The prefix_hint guard +/// should correctly determine whether the hint is usable for probing. +#[test] +fn test_multi_prefix_prefix_scan() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(HierarchicalPrefixExtractor)) + .open()?; + + tree.insert(b"abc123_x", b"v1", 0); + tree.insert(b"abc123_y", b"v2", 0); + tree.insert(b"abc456_x", b"v3", 0); + tree.insert(b"def789_x", b"v4", 0); + tree.flush_active_memtable(0)?; + + // Prefix "abc123" (6 bytes) — extract_first gives "abc", extract_first("abc123\0") + // gives "abc". Equal → hint is usable. Probe for hash("abc") should match. 
+ assert_eq!( + tree.prefix(b"abc123", SeqNo::MAX, None).count(), + 2, + "should find 2 keys with prefix abc123" + ); + + // Prefix "abc" (3 bytes) — extract_first gives "abc", extract_first("abc\0") + // gives "abc". Equal → hint usable. Probe for hash("abc") should match. + assert_eq!( + tree.prefix(b"abc", SeqNo::MAX, None).count(), + 3, + "should find 3 keys with 3-byte prefix abc" + ); + + // Prefix "def" — should find 1 key + assert_eq!( + tree.prefix(b"def", SeqNo::MAX, None).count(), + 1, + "should find 1 key with prefix def" + ); + + // Prefix "zzz" — no keys, should find 0 + assert_eq!( + tree.prefix(b"zzz", SeqNo::MAX, None).count(), + 0, + "should find 0 keys with prefix zzz" + ); + + Ok(()) +} + +/// The dedup optimization must correctly handle the interleaved hash pattern +/// from multi-prefix extractors. With keys sharing the same 3-byte prefix, +/// the hash buffer before dedup looks like: +/// [hash("abc"), hash("abc123"), hash("abc"), hash("abc456"), ...] +/// The sort+dedup at flush time must collapse these to unique hashes so the +/// filter is correctly sized. +#[test] +fn test_multi_prefix_dedup_correctness() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(HierarchicalPrefixExtractor)) + .open()?; + + // 100 keys all sharing 3-byte prefix "abc" but with different 6-byte prefixes. + // Without dedup: 200 hashes in buffer (100 × "abc" + 100 × "abc{NNN}"). + // With dedup: 101 unique hashes (1 × "abc" + 100 × "abc{NNN}"). 
+ for i in 0..100u32 { + let key = format!("abc{i:03}_data_value"); + tree.insert(key.as_bytes(), b"v", 0); + } + tree.flush_active_memtable(0)?; + + // All keys should be findable via point reads + for i in 0..100u32 { + let key = format!("abc{i:03}_data_value"); + assert!( + tree.get(key.as_bytes(), SeqNo::MAX)?.is_some(), + "key {key} should exist" + ); + } + + // Prefix scan for "abc" should find all 100 + assert_eq!( + tree.prefix(b"abc", SeqNo::MAX, None).count(), + 100, + "should find all 100 keys with prefix abc" + ); + + // A nonexistent prefix should find 0 + assert_eq!( + tree.prefix(b"zzz", SeqNo::MAX, None).count(), + 0, + "should find 0 keys with prefix zzz" + ); + + Ok(()) +} + +/// Recovery with a multi-prefix extractor: write, flush, reopen, verify. +#[test] +fn test_multi_prefix_recovery() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let seqno = SequenceNumberCounter::default(); + let version = SequenceNumberCounter::default(); + + { + let tree = Config::new(&folder, seqno.clone(), version.clone()) + .prefix_extractor(Arc::new(HierarchicalPrefixExtractor)) + .open()?; + + tree.insert(b"abc123_data", b"v1", seqno.next()); + tree.insert(b"abc456_data", b"v2", seqno.next()); + tree.insert(b"def789_data", b"v3", seqno.next()); + tree.flush_active_memtable(0)?; + } + + // Reopen with the same extractor + { + let tree = Config::new(&folder, seqno.clone(), version.clone()) + .prefix_extractor(Arc::new(HierarchicalPrefixExtractor)) + .open()?; + + assert_eq!(&*tree.get(b"abc123_data", SeqNo::MAX)?.unwrap(), b"v1"); + assert_eq!(&*tree.get(b"abc456_data", SeqNo::MAX)?.unwrap(), b"v2"); + assert_eq!(&*tree.get(b"def789_data", SeqNo::MAX)?.unwrap(), b"v3"); + + assert_eq!( + tree.prefix(b"abc", SeqNo::MAX, None).count(), + 2, + "should find 2 keys with prefix abc after recovery" + ); + } + + // Reopen without extractor — filter should be bypassed, all data accessible + { + let tree = Config::new(&folder, seqno.clone(), 
version.clone()).open()?; + + assert_eq!(&*tree.get(b"abc123_data", SeqNo::MAX)?.unwrap(), b"v1"); + assert_eq!(&*tree.get(b"abc456_data", SeqNo::MAX)?.unwrap(), b"v2"); + assert_eq!(&*tree.get(b"def789_data", SeqNo::MAX)?.unwrap(), b"v3"); + } + + Ok(()) +} + +/// Multi-prefix extractor after compaction: verify filters are correctly +/// rebuilt with deduped prefix hashes. +#[test] +fn test_multi_prefix_after_compaction() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(HierarchicalPrefixExtractor)) + .open()?; + + for i in 0..200u32 { + let key = format!("abc{i:03}_data_value"); + tree.insert(key.as_bytes(), b"v", 0); + } + for i in 0..100u32 { + let key = format!("def{i:03}_data_value"); + tree.insert(key.as_bytes(), b"v", 0); + } + tree.flush_active_memtable(0)?; + tree.major_compact(u64::MAX, SeqNo::MAX)?; + + // All data accessible after compaction + for i in 0..200u32 { + let key = format!("abc{i:03}_data_value"); + assert!( + tree.get(key.as_bytes(), SeqNo::MAX)?.is_some(), + "key {key} should exist after compaction" + ); + } + + assert_eq!( + tree.prefix(b"abc", SeqNo::MAX, None).count(), + 200, + "should find all 200 abc keys after compaction" + ); + assert_eq!( + tree.prefix(b"def", SeqNo::MAX, None).count(), + 100, + "should find all 100 def keys after compaction" + ); + assert_eq!( + tree.prefix(b"zzz", SeqNo::MAX, None).count(), + 0, + "should find 0 zzz keys after compaction" + ); + + Ok(()) +} + +/// extract_last gives the most specific prefix for the precomputed hash. +/// For HierarchicalPrefixExtractor(3,6), a query for "abc123" uses +/// hash("abc123") instead of hash("abc"). A table containing only +/// "abc999*" keys has hash("abc") but NOT hash("abc123"), so the more +/// specific hash allows skipping the table. 
+/// +/// This test writes data into two distinct 6-byte prefix groups, compacts +/// into multiple tables, then queries with a 6-byte prefix that doesn't +/// exist but whose 3-byte prefix does. +#[test] +fn test_multi_prefix_extract_last_correctness() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(HierarchicalPrefixExtractor)) + .filter_policy(lsm_tree::config::FilterPolicy::all( + lsm_tree::config::FilterPolicyEntry::Bloom( + lsm_tree::config::BloomConstructionPolicy::BitsPerKey(10.0), + ), + )) + .open()?; + + // Write keys in two 6-byte prefix groups under the same 3-byte prefix + for i in 0..100u32 { + tree.insert(format!("abc000_{i:04}").as_bytes(), b"v", 0); + } + for i in 0..100u32 { + tree.insert(format!("abc999_{i:04}").as_bytes(), b"v", 0); + } + tree.flush_active_memtable(0)?; + + // Existing 6-byte prefixes are found + assert_eq!( + tree.prefix(b"abc000", SeqNo::MAX, None).count(), + 100, + "should find all abc000 keys" + ); + assert_eq!( + tree.prefix(b"abc999", SeqNo::MAX, None).count(), + 100, + "should find all abc999 keys" + ); + + // 3-byte prefix finds all 200 + assert_eq!( + tree.prefix(b"abc", SeqNo::MAX, None).count(), + 200, + "should find all abc keys via 3-byte prefix" + ); + + // Absent 6-byte prefix under existing 3-byte prefix + assert_eq!( + tree.prefix(b"abc123", SeqNo::MAX, None).count(), + 0, + "abc123 was never written" + ); + assert_eq!( + tree.prefix(b"abc500", SeqNo::MAX, None).count(), + 0, + "abc500 was never written" + ); + + // Absent 3-byte prefix + assert_eq!( + tree.prefix(b"zzz", SeqNo::MAX, None).count(), + 0, + "zzz was never written" + ); + + Ok(()) +} + +/// extract_last with a 3-byte hint falls back to the first prefix since +/// HierarchicalPrefixExtractor only returns one prefix for keys < 6 bytes. +/// Verify correctness is maintained. 
+#[test] +fn test_multi_prefix_extract_last_short_hint() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(HierarchicalPrefixExtractor)) + .open()?; + + tree.insert(b"abc000_data", b"v1", 0); + tree.insert(b"abc999_data", b"v2", 0); + tree.insert(b"def000_data", b"v3", 0); + tree.flush_active_memtable(0)?; + + // 3-byte prefix queries (hint < 6 bytes → extract_last == extract_first) + assert_eq!(tree.prefix(b"abc", SeqNo::MAX, None).count(), 2); + assert_eq!(tree.prefix(b"def", SeqNo::MAX, None).count(), 1); + assert_eq!(tree.prefix(b"zzz", SeqNo::MAX, None).count(), 0); + + Ok(()) +} + +/// extract_last with single-prefix extractors is identical to extract_first. +/// Verify no regression. +#[test] +fn test_extract_last_single_prefix_no_change() -> lsm_tree::Result<()> { + for prefix_len in [1, 3, 4, 8] { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(prefix_len))) + .open()?; + + tree.insert(b"abcdefgh_001", b"v1", 0); + tree.insert(b"abcdefgh_002", b"v2", 0); + tree.insert(b"zzzzzzzzz_001", b"v3", 0); + tree.flush_active_memtable(0)?; + + // Existing prefix finds data + assert!(tree.prefix(b"abcdefgh", SeqNo::MAX, None).count() >= 2); + + // Point reads work + assert!(tree.get(b"abcdefgh_001", SeqNo::MAX)?.is_some()); + assert!(tree.get(b"nonexistent", SeqNo::MAX)?.is_none()); + } + + Ok(()) +} + +/// After compaction, extract_last still works correctly with multi-prefix. 
+#[test] +fn test_multi_prefix_extract_last_after_compaction() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(HierarchicalPrefixExtractor)) + .open()?; + + for i in 0..200u32 { + tree.insert(format!("abc{i:03}_data").as_bytes(), b"v", 0); + } + for i in 0..100u32 { + tree.insert(format!("def{i:03}_data").as_bytes(), b"v", 0); + } + tree.flush_active_memtable(0)?; + tree.major_compact(u64::MAX, SeqNo::MAX)?; + + // 6-byte prefix queries + assert_eq!(tree.prefix(b"abc000", SeqNo::MAX, None).count(), 1); + assert_eq!(tree.prefix(b"abc199", SeqNo::MAX, None).count(), 1); + assert_eq!(tree.prefix(b"abczzz", SeqNo::MAX, None).count(), 0); + + // 3-byte prefix queries + assert_eq!(tree.prefix(b"abc", SeqNo::MAX, None).count(), 200); + assert_eq!(tree.prefix(b"def", SeqNo::MAX, None).count(), 100); + assert_eq!(tree.prefix(b"zzz", SeqNo::MAX, None).count(), 0); + + Ok(()) +} + +/// The single-table path (should_skip_range_by_prefix_filter with hint) +/// also uses extract_last for more specific pruning. This test uses a +/// single flush without compaction — L0 tables are individual single-table +/// runs that go through the single-table path in range.rs, not RunReader. 
+#[test] +fn test_extract_last_single_table_path() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(HierarchicalPrefixExtractor)) + .open()?; + + // Single flush → single L0 table → single-table path + tree.insert(b"abc000_data1", b"v1", 0); + tree.insert(b"abc000_data2", b"v2", 0); + tree.insert(b"abc999_data1", b"v3", 0); + tree.flush_active_memtable(0)?; + + // Existing 6-byte prefixes found + assert_eq!(tree.prefix(b"abc000", SeqNo::MAX, None).count(), 2); + assert_eq!(tree.prefix(b"abc999", SeqNo::MAX, None).count(), 1); + + // Absent 6-byte prefix under existing 3-byte prefix. + // The single-table hint path uses extract_last → hash("abc123") + // which is absent from the filter, enabling the skip. + assert_eq!( + tree.prefix(b"abc123", SeqNo::MAX, None).count(), + 0, + "abc123 was never written (single-table path)" + ); + assert_eq!( + tree.prefix(b"abc500", SeqNo::MAX, None).count(), + 0, + "abc500 was never written (single-table path)" + ); + + // 3-byte prefix still works + assert_eq!(tree.prefix(b"abc", SeqNo::MAX, None).count(), 3); + assert_eq!(tree.prefix(b"zzz", SeqNo::MAX, None).count(), 0); + + Ok(()) +} + +/// Verify that the single-table extract_last path also works correctly +/// after multiple flushes (multiple L0 tables, each checked independently). 
+#[test] +fn test_extract_last_multiple_l0_tables() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(HierarchicalPrefixExtractor)) + .open()?; + + // Flush 1: abc000 keys + tree.insert(b"abc000_data1", b"v1", 0); + tree.insert(b"abc000_data2", b"v2", 0); + tree.flush_active_memtable(0)?; + + // Flush 2: abc999 keys + tree.insert(b"abc999_data1", b"v3", 0); + tree.flush_active_memtable(0)?; + + // Flush 3: def000 keys + tree.insert(b"def000_data1", b"v4", 0); + tree.flush_active_memtable(0)?; + + // Each L0 table is checked independently via single-table path + assert_eq!(tree.prefix(b"abc000", SeqNo::MAX, None).count(), 2); + assert_eq!(tree.prefix(b"abc999", SeqNo::MAX, None).count(), 1); + assert_eq!(tree.prefix(b"def000", SeqNo::MAX, None).count(), 1); + + // Absent 6-byte prefix — extract_last gives more specific hash + assert_eq!(tree.prefix(b"abc123", SeqNo::MAX, None).count(), 0); + assert_eq!(tree.prefix(b"abc500", SeqNo::MAX, None).count(), 0); + assert_eq!(tree.prefix(b"def999", SeqNo::MAX, None).count(), 0); + + // 3-byte prefixes + assert_eq!(tree.prefix(b"abc", SeqNo::MAX, None).count(), 3); + assert_eq!(tree.prefix(b"def", SeqNo::MAX, None).count(), 1); + assert_eq!(tree.prefix(b"zzz", SeqNo::MAX, None).count(), 0); + + Ok(()) +} + +// ================================================================ +// whole_key_filtering tests +// ================================================================ + +/// Helper: open a tree with given prefix extractor length and whole_key_filtering flag. 
+fn open_tree_wkf( + folder: &std::path::Path, + prefix_len: usize, + whole_key_filtering: bool, +) -> lsm_tree::Result { + Config::new( + folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(prefix_len))) + .whole_key_filtering(whole_key_filtering) + .filter_policy(lsm_tree::config::FilterPolicy::all( + lsm_tree::config::FilterPolicyEntry::Bloom( + lsm_tree::config::BloomConstructionPolicy::BitsPerKey(10.0), + ), + )) + .open() +} + +/// With whole_key_filtering=true (default), point reads of nonexistent keys +/// whose prefix IS in the table should be caught by the full-key Bloom. +#[test] +#[cfg(feature = "metrics")] +fn test_wkf_enabled_point_read_uses_full_key_bloom() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let tree = open_tree_wkf(folder.path(), 4, true)?; + + // All keys share prefix "aaaa" + for i in 0..100u32 { + let key = format!("aaaa_{i:04}"); + tree.insert(key.as_bytes(), b"v", 0); + } + tree.flush_active_memtable(0)?; + + let before_q = tree.metrics().filter_queries(); + let before_s = tree.metrics().io_skipped_by_filter(); + + // Look up a key that doesn't exist but shares the prefix "aaaa" + // and falls within the table's key range. + // With whole_key_filtering=true, the full-key Bloom should catch it. + assert!(tree.get(b"aaaa_0050x", SeqNo::MAX)?.is_none()); + + let after_q = tree.metrics().filter_queries(); + let after_s = tree.metrics().io_skipped_by_filter(); + + // The full-key Bloom should have caught the nonexistent key and + // incremented both counters (the key hash is not in the filter). + assert!( + after_s > before_s, + "full-key Bloom should skip the nonexistent key (whole_key_filtering=true)" + ); + assert!(after_q > before_q); + + Ok(()) +} + +/// With whole_key_filtering=false, point reads of nonexistent keys whose prefix +/// IS in the table will NOT be caught by the Bloom (no full-key hashes in filter). 
+/// The read falls through to the data blocks. +#[test] +#[cfg(feature = "metrics")] +fn test_wkf_disabled_point_read_no_full_key_bloom() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let tree = open_tree_wkf(folder.path(), 4, false)?; + + for i in 0..100u32 { + let key = format!("aaaa_{i:04}"); + tree.insert(key.as_bytes(), b"v", 0); + } + tree.flush_active_memtable(0)?; + + let before_s = tree.metrics().io_skipped_by_filter(); + + // Look up a nonexistent key with the same prefix. + // Without whole_key_filtering, the filter only has prefix hashes. + // The prefix "aaaa" IS in the filter → prefix check passes → data + // blocks are read → key not found. No Bloom skip. + assert!(tree.get(b"aaaa_9999", SeqNo::MAX)?.is_none()); + + let after_s = tree.metrics().io_skipped_by_filter(); + + // io_skipped_by_filter should NOT have increased — the prefix filter + // let the key through, and there's no full-key Bloom to catch it. + assert_eq!( + after_s, before_s, + "no Bloom skip expected (whole_key_filtering=false, prefix exists)" + ); + + Ok(()) +} + +/// Both modes correctly skip tables via prefix filter when the prefix +/// is absent (regardless of whole_key_filtering). +#[test] +#[cfg(feature = "metrics")] +fn test_wkf_both_modes_prefix_absent_skips() -> lsm_tree::Result<()> { + for wkf in [true, false] { + let folder = tempfile::tempdir()?; + let tree = open_tree_wkf(folder.path(), 4, wkf)?; + + tree.insert(b"aaaa_001", b"v", 0); + tree.insert(b"zzzz_001", b"v", 0); + tree.flush_active_memtable(0)?; + + let before_s = tree.metrics().io_skipped_by_filter(); + + // Prefix "mmmm" is absent from the filter in both modes. + assert!(tree.get(b"mmmm_001", SeqNo::MAX)?.is_none()); + + let after_s = tree.metrics().io_skipped_by_filter(); + + assert!( + after_s > before_s, + "prefix filter should skip absent prefix (whole_key_filtering={wkf})" + ); + } + + Ok(()) +} + +/// Prefix scans work correctly with both modes. 
+#[test] +fn test_wkf_prefix_scan_correctness() -> lsm_tree::Result<()> { + for wkf in [true, false] { + let folder = tempfile::tempdir()?; + let tree = open_tree_wkf(folder.path(), 4, wkf)?; + + for i in 0..50u32 { + tree.insert(format!("aaaa_{i:04}").as_bytes(), b"v", 0); + } + for i in 0..30u32 { + tree.insert(format!("bbbb_{i:04}").as_bytes(), b"v", 0); + } + tree.flush_active_memtable(0)?; + + assert_eq!( + tree.prefix(b"aaaa", SeqNo::MAX, None).count(), + 50, + "should find 50 aaaa keys (whole_key_filtering={wkf})" + ); + assert_eq!( + tree.prefix(b"bbbb", SeqNo::MAX, None).count(), + 30, + "should find 30 bbbb keys (whole_key_filtering={wkf})" + ); + assert_eq!( + tree.prefix(b"cccc", SeqNo::MAX, None).count(), + 0, + "should find 0 cccc keys (whole_key_filtering={wkf})" + ); + } + + Ok(()) +} + +/// Point reads of existing keys work correctly with both modes. +#[test] +fn test_wkf_point_read_existing_keys() -> lsm_tree::Result<()> { + for wkf in [true, false] { + let folder = tempfile::tempdir()?; + let tree = open_tree_wkf(folder.path(), 4, wkf)?; + + tree.insert(b"aaaa_001", b"v1", 0); + tree.insert(b"bbbb_001", b"v2", 0); + tree.flush_active_memtable(0)?; + + assert_eq!( + &*tree.get(b"aaaa_001", SeqNo::MAX)?.unwrap(), + b"v1", + "existing key should be found (whole_key_filtering={wkf})" + ); + assert_eq!( + &*tree.get(b"bbbb_001", SeqNo::MAX)?.unwrap(), + b"v2", + "existing key should be found (whole_key_filtering={wkf})" + ); + } + + Ok(()) +} + +/// Range queries work correctly with both modes. 
+#[test] +fn test_wkf_range_query_correctness() -> lsm_tree::Result<()> { + for wkf in [true, false] { + let folder = tempfile::tempdir()?; + let tree = open_tree_wkf(folder.path(), 4, wkf)?; + + for i in 0..100u32 { + tree.insert(format!("aaaa_{i:04}").as_bytes(), b"v", 0); + } + tree.flush_active_memtable(0)?; + + let count = tree + .range( + "aaaa_0010".as_bytes()..="aaaa_0020".as_bytes(), + SeqNo::MAX, + None, + ) + .count(); + assert_eq!( + count, 11, + "range should return 11 keys (whole_key_filtering={wkf})" + ); + + let count = tree.range::<&[u8], _>(.., SeqNo::MAX, None).count(); + assert_eq!( + count, 100, + "full range should return 100 keys (whole_key_filtering={wkf})" + ); + } + + Ok(()) +} + +/// whole_key_filtering has no effect when no prefix extractor is configured +/// (the full-key Bloom is always used). +#[test] +#[cfg(feature = "metrics")] +fn test_wkf_no_effect_without_extractor() -> lsm_tree::Result<()> { + for wkf in [true, false] { + let folder = tempfile::tempdir()?; + + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .whole_key_filtering(wkf) + .filter_policy(lsm_tree::config::FilterPolicy::all( + lsm_tree::config::FilterPolicyEntry::Bloom( + lsm_tree::config::BloomConstructionPolicy::BitsPerKey(10.0), + ), + )) + .open()?; + + tree.insert(b"key_001", b"v1", 0); + tree.insert(b"key_002", b"v2", 0); + tree.insert(b"key_999", b"v3", 0); + tree.flush_active_memtable(0)?; + + // Existing key found + assert!(tree.get(b"key_001", SeqNo::MAX)?.is_some()); + + // Nonexistent key within the table's key range, caught by full-key Bloom + let before_s = tree.metrics().io_skipped_by_filter(); + + assert!(tree.get(b"key_500", SeqNo::MAX)?.is_none()); + + let after_s = tree.metrics().io_skipped_by_filter(); + assert!( + after_s > before_s, + "full-key Bloom should work regardless of whole_key_filtering={wkf} without extractor" + ); + } + + Ok(()) +} + +/// After recovery, the 
whole_key_filtering behavior is preserved because +/// the filter on disk already has (or doesn't have) full-key hashes. +#[test] +fn test_wkf_recovery() -> lsm_tree::Result<()> { + for wkf in [true, false] { + let folder = tempfile::tempdir()?; + let seqno = SequenceNumberCounter::default(); + let version = SequenceNumberCounter::default(); + + { + let tree = Config::new(&folder, seqno.clone(), version.clone()) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .whole_key_filtering(wkf) + .open()?; + + for i in 0..50u32 { + tree.insert(format!("aaaa_{i:04}").as_bytes(), b"v", seqno.next()); + } + tree.flush_active_memtable(0)?; + } + + // Reopen with same settings + { + let tree = Config::new(&folder, seqno.clone(), version.clone()) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .whole_key_filtering(wkf) + .open()?; + + // All keys should be found + for i in 0..50u32 { + let key = format!("aaaa_{i:04}"); + assert!( + tree.get(key.as_bytes(), SeqNo::MAX)?.is_some(), + "key {key} should exist after recovery (whole_key_filtering={wkf})" + ); + } + + // Prefix scan works + assert_eq!( + tree.prefix(b"aaaa", SeqNo::MAX, None).count(), + 50, + "prefix scan should find 50 keys after recovery (whole_key_filtering={wkf})" + ); + } + } + + Ok(()) +} + +/// After compaction, the filter is rebuilt with the current whole_key_filtering +/// setting. Both modes produce correct results. 
+#[test] +fn test_wkf_after_compaction() -> lsm_tree::Result<()> { + for wkf in [true, false] { + let folder = tempfile::tempdir()?; + let tree = open_tree_wkf(folder.path(), 4, wkf)?; + + for i in 0..200u32 { + tree.insert(format!("aaaa_{i:04}").as_bytes(), b"v", 0); + } + tree.flush_active_memtable(0)?; + tree.major_compact(u64::MAX, SeqNo::MAX)?; + + // All keys found + for i in 0..200u32 { + let key = format!("aaaa_{i:04}"); + assert!( + tree.get(key.as_bytes(), SeqNo::MAX)?.is_some(), + "key {key} should exist after compaction (whole_key_filtering={wkf})" + ); + } + + // Prefix scan works + assert_eq!( + tree.prefix(b"aaaa", SeqNo::MAX, None).count(), + 200, + "prefix scan after compaction (whole_key_filtering={wkf})" + ); + } + + Ok(()) +} + +/// With whole_key_filtering=true, the filter is larger (has both prefix and +/// full-key hashes) but point reads of nonexistent same-prefix keys are +/// more efficient. With false, the filter is smaller but those reads hit +/// data blocks. Both modes are correct — only performance differs. 
+#[test] +fn test_wkf_correctness_large_dataset() -> lsm_tree::Result<()> { + for wkf in [true, false] { + let folder = tempfile::tempdir()?; + let tree = open_tree_wkf(folder.path(), 3, wkf)?; + + // Multiple prefixes + for prefix in [b"aaa", b"bbb", b"ccc", b"ddd"] { + for i in 0..100u32 { + let mut key = prefix.to_vec(); + key.extend_from_slice(format!("_{i:04}").as_bytes()); + tree.insert(&key, b"v", 0); + } + } + tree.flush_active_memtable(0)?; + + // All existing keys found + for prefix in [b"aaa", b"bbb", b"ccc", b"ddd"] { + for i in 0..100u32 { + let mut key = prefix.to_vec(); + key.extend_from_slice(format!("_{i:04}").as_bytes()); + assert!( + tree.get(&key, SeqNo::MAX)?.is_some(), + "key should exist (whole_key_filtering={wkf})" + ); + } + } + + // Nonexistent keys not found + assert!(tree.get(b"aaa_9999", SeqNo::MAX)?.is_none()); + assert!(tree.get(b"eee_0001", SeqNo::MAX)?.is_none()); + assert!(tree.get(b"zzz_0001", SeqNo::MAX)?.is_none()); + + // Prefix scans correct + assert_eq!(tree.prefix(b"aaa", SeqNo::MAX, None).count(), 100); + assert_eq!(tree.prefix(b"bbb", SeqNo::MAX, None).count(), 100); + assert_eq!(tree.prefix(b"eee", SeqNo::MAX, None).count(), 0); + } + + Ok(()) +} + +/// Tables written with `whole_key_filtering=false` contain only prefix hashes +/// (no full-key hashes). After reopening with `whole_key_filtering=true`, point +/// reads must continue to find every key — the read path must rely on the +/// table's own persisted filter contents, not the runtime config. 
+#[test] +fn test_wkf_persisted_across_reopen() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let seqno = SequenceNumberCounter::default(); + let version = SequenceNumberCounter::default(); + + // Phase 1: write with whole_key_filtering=false + { + let tree = Config::new(&folder, seqno.clone(), version.clone()) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(3))) + .whole_key_filtering(false) + .filter_policy(lsm_tree::config::FilterPolicy::all( + lsm_tree::config::FilterPolicyEntry::Bloom( + lsm_tree::config::BloomConstructionPolicy::BitsPerKey(10.0), + ), + )) + .open()?; + + tree.insert(b"abcXYZ", b"v1", seqno.next()); + tree.insert(b"abcDEF", b"v2", seqno.next()); + tree.flush_active_memtable(0)?; + + assert_eq!(&*tree.get(b"abcXYZ", SeqNo::MAX)?.unwrap(), b"v1"); + } + + // Phase 2: reopen with whole_key_filtering=true (default) + { + let tree = Config::new(&folder, seqno.clone(), version.clone()) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(3))) + .whole_key_filtering(true) + .open()?; + + // The key MUST be found. If it's not, we have data loss. + let result = tree.get(b"abcXYZ", SeqNo::MAX)?; + assert!( + result.is_some(), + "key abcXYZ exists but get returned None — data loss from wkf mismatch" + ); + assert_eq!(&*result.unwrap(), b"v1"); + + let result2 = tree.get(b"abcDEF", SeqNo::MAX)?; + assert!(result2.is_some(), "key abcDEF exists but get returned None"); + } + + Ok(()) +} + +/// Partition spills in the partitioned filter writer must occur on user-key +/// boundaries, never mid-key. With many keys, a small partition target, and +/// a prefix extractor + whole_key_filtering=true, every key produces multiple +/// hashes (prefix + full key). All of those hashes must end up in the same +/// partition so the partition's TLI key correctly maps to fully committed +/// hashes for that key. Otherwise point reads can miss keys whose full-key +/// hash lives in a different partition than the TLI lookup finds. 
+#[test] +fn test_partitioned_filter_no_midkey_spill() -> lsm_tree::Result<()> { + use lsm_tree::config::PartitioningPolicy; + + let folder = tempfile::tempdir()?; + let seqno = SequenceNumberCounter::default(); + + let tree = Config::new(&folder, seqno.clone(), SequenceNumberCounter::default()) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(3))) + .whole_key_filtering(true) + .filter_block_partitioning_policy(PartitioningPolicy::all(true)) + .filter_policy(lsm_tree::config::FilterPolicy::all( + lsm_tree::config::FilterPolicyEntry::Bloom( + lsm_tree::config::BloomConstructionPolicy::BitsPerKey(10.0), + ), + )) + .open()?; + + let n = 50000; + for i in 0..n { + let key = format!("abc{i:08}"); + tree.insert(key.as_bytes(), b"v", seqno.next()); + } + tree.flush_active_memtable(0)?; + tree.major_compact(u64::MAX, SeqNo::MAX)?; + + let mut lost = 0; + let mut sample_lost = String::new(); + for i in 0..n { + let key = format!("abc{i:08}"); + if tree.get(key.as_bytes(), SeqNo::MAX)?.is_none() { + lost += 1; + if sample_lost.is_empty() { + sample_lost = key; + } + } + } + + assert_eq!( + lost, 0, + "lost {} of {} keys (sample: {})", + lost, n, sample_lost + ); + + Ok(()) +} + +/// Regression: `Writer::use_prefix_extractor` calls `enable_dedup` on the +/// current filter writer, but `use_partitioned_filter` *replaces* the filter +/// writer afterwards. Without propagating dedup into the new partitioned +/// writer, every duplicate prefix hash gets re-buffered, which inflates +/// `bloom_hash_buffer.len()` (used as `n` for filter sizing) and triggers +/// premature partition spills. +/// +/// Insert many keys that share a single prefix with `whole_key_filtering=false`, +/// so the only hashes registered are duplicate prefix hashes. With dedup, +/// the on-disk filter is tiny (one unique hash). Without dedup, it grows +/// proportionally to the key count. 
+#[test] +fn test_partitioned_filter_dedup_with_prefix_extractor() -> lsm_tree::Result<()> { + use lsm_tree::config::PartitioningPolicy; + + let folder = tempfile::tempdir()?; + let seqno = SequenceNumberCounter::default(); + + let tree = Config::new(&folder, seqno.clone(), SequenceNumberCounter::default()) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .whole_key_filtering(false) + .filter_block_partitioning_policy(PartitioningPolicy::all(true)) + .filter_policy(FilterPolicy::all(FilterPolicyEntry::Bloom( + BloomConstructionPolicy::BitsPerKey(10.0), + ))) + .open()?; + + // 10000 keys all share the 4-byte prefix b"abcd". With WKF=false the only + // hashes registered are prefix hashes — all identical — so after dedup the + // filter holds one unique hash. + let n = 10_000; + for i in 0..n { + let key = format!("abcd{i:08}"); + tree.insert(key.as_bytes(), b"v", seqno.next()); + } + tree.flush_active_memtable(0)?; + tree.major_compact(u64::MAX, SeqNo::MAX)?; + + let filter_size = tree.filter_size(); + + // With dedup: ~one hash worth of bits per partition + TLI overhead. Even + // with framing/checksum/TLI a properly-deduped filter for one unique hash + // is well under 1 KiB. + // + // Without dedup: 10000 hashes × 10 bpk = 12 500 bytes of raw filter bits, + // plus partition framing and TLI entries → easily >10 KiB. + assert!( + filter_size < 1024, + "partitioned filter not deduped: {} bytes for {} duplicate prefix hashes (expected < 1 KiB)", + filter_size, + n, + ); + + // Sanity: reads still work. + for i in 0..100 { + let key = format!("abcd{i:08}"); + assert!( + tree.contains_key(key.as_bytes(), SeqNo::MAX)?, + "missing key {key}", + ); + } + + Ok(()) +} + +/// Same regression but exercising the `MultiWriter::rotate` path: compaction +/// with a small `target_size` to force rotation across multiple output tables. 
+#[test] +fn test_partitioned_filter_dedup_after_rotation() -> lsm_tree::Result<()> { + use lsm_tree::config::PartitioningPolicy; + + let folder = tempfile::tempdir()?; + let seqno = SequenceNumberCounter::default(); + + let tree = Config::new(&folder, seqno.clone(), SequenceNumberCounter::default()) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .whole_key_filtering(false) + .filter_block_partitioning_policy(PartitioningPolicy::all(true)) + .filter_policy(FilterPolicy::all(FilterPolicyEntry::Bloom( + BloomConstructionPolicy::BitsPerKey(10.0), + ))) + .open()?; + + let n = 20_000; + for i in 0..n { + let key = format!("abcd{i:08}"); + tree.insert(key.as_bytes(), &vec![b'x'; 64], seqno.next()); + } + tree.flush_active_memtable(0)?; + + // Force the compaction MultiWriter to rotate by passing a small target + // size — every output table that is created here goes through `rotate`, + // which is the path where the dedup propagation bug lives. + tree.major_compact(64 * 1024, SeqNo::MAX)?; + + // Filter across all rotated tables should still be deduped per-partition. + // Each rotated table has its own filter; total stays small relative to N. + let filter_size = tree.filter_size(); + assert!( + filter_size < 64 * 1024, + "filter across rotated tables not deduped: {} bytes for {} duplicate prefix hashes (expected < 64 KiB)", + filter_size, + n, + ); + + Ok(()) +} + +/// When the configured prefix extractor doesn't match the table's persisted +/// extractor name (different extractor, removed extractor, or legacy table +/// without one), the prefix filter is bypassed — but the full-key Bloom is +/// still valid whenever the table was written with `whole_key_filtering=true`. +/// `point_read_from_table` should consult that Bloom instead of falling +/// straight through to a data-block scan. 
+#[test] +#[cfg(feature = "metrics")] +fn test_full_key_bloom_used_on_extractor_mismatch() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let seqno = SequenceNumberCounter::default(); + + // Write tables with extractor X and WKF=true (so full-key hashes are in + // the filter). + { + let tree = Config::new(&folder, seqno.clone(), SequenceNumberCounter::default()) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(3))) + .whole_key_filtering(true) + .open()?; + + for i in 0..100 { + let key = format!("abc{i:05}"); + tree.insert(key.as_bytes(), b"v", seqno.next()); + } + tree.flush_active_memtable(0)?; + } + + // Reopen with a different extractor (length 5 vs 3) — names mismatch, so + // `prefix_filter_allowed` returns false on the existing tables. + let tree = Config::new(&folder, seqno.clone(), SequenceNumberCounter::default()) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(5))) + .whole_key_filtering(true) + .open()?; + + let queries_before = tree.metrics().filter_queries(); + + // Look up an absent key that lies *within* the table's key range so + // `get_for_key` doesn't prune before reaching `point_read_from_table`. + // The full-key Bloom should reject it — incrementing both + // `filter_queries` and `io_skipped_by_filter`. Without the optimization, + // the read would skip the filter entirely (no metric bump). + assert!(!tree.contains_key(b"abc00000_absent", u64::MAX)?); + + let queries_after = tree.metrics().filter_queries(); + assert!( + queries_after > queries_before, + "full-key Bloom not consulted on extractor mismatch: filter_queries unchanged ({} -> {})", + queries_before, + queries_after, + ); + + // Sanity: real keys still found. 
+ for i in 0..10 { + let key = format!("abc{i:05}"); + assert!(tree.contains_key(key.as_bytes(), u64::MAX)?); + } + + Ok(()) +} + +/// Same optimization for tables written without an extractor: legacy tables +/// store full-key hashes, and reopening with a configured extractor must +/// still be able to use the full-key Bloom. +#[test] +#[cfg(feature = "metrics")] +fn test_full_key_bloom_used_on_legacy_table_reopen() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let seqno = SequenceNumberCounter::default(); + + // Write legacy tables (no extractor configured). + { + let tree = Config::new(&folder, seqno.clone(), SequenceNumberCounter::default()).open()?; + + for i in 0..100 { + let key = format!("abc{i:05}"); + tree.insert(key.as_bytes(), b"v", seqno.next()); + } + tree.flush_active_memtable(0)?; + } + + // Reopen with an extractor — `prefix_filter_allowed` returns false because + // the table's persisted name is None but the current config is Some(_). + let tree = Config::new(&folder, seqno.clone(), SequenceNumberCounter::default()) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(3))) + .whole_key_filtering(true) + .open()?; + + let queries_before = tree.metrics().filter_queries(); + + assert!(!tree.contains_key(b"abc00000_absent", u64::MAX)?); + + let queries_after = tree.metrics().filter_queries(); + assert!( + queries_after > queries_before, + "full-key Bloom not consulted on legacy table: filter_queries unchanged ({} -> {})", + queries_before, + queries_after, + ); + + for i in 0..10 { + let key = format!("abc{i:05}"); + assert!(tree.contains_key(key.as_bytes(), u64::MAX)?); + } + + Ok(()) +} + +/// Mirror of the previous test: tree opened *without* an extractor reading +/// tables that were written *with* one. The full-key Bloom is still queryable. 
+#[test] +#[cfg(feature = "metrics")] +fn test_full_key_bloom_used_when_extractor_dropped() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let seqno = SequenceNumberCounter::default(); + + { + let tree = Config::new(&folder, seqno.clone(), SequenceNumberCounter::default()) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(3))) + .whole_key_filtering(true) + .open()?; + + for i in 0..100 { + let key = format!("abc{i:05}"); + tree.insert(key.as_bytes(), b"v", seqno.next()); + } + tree.flush_active_memtable(0)?; + } + + // Reopen without an extractor. + let tree = Config::new(&folder, seqno.clone(), SequenceNumberCounter::default()).open()?; + + let queries_before = tree.metrics().filter_queries(); + assert!(!tree.contains_key(b"abc00000_absent", u64::MAX)?); + let queries_after = tree.metrics().filter_queries(); + assert!( + queries_after > queries_before, + "full-key Bloom not consulted after dropping extractor: filter_queries unchanged ({} -> {})", + queries_before, + queries_after, + ); + + for i in 0..10 { + let key = format!("abc{i:05}"); + assert!(tree.contains_key(key.as_bytes(), u64::MAX)?); + } + + Ok(()) +} + +/// When the table was written with `whole_key_filtering=false` and a now- +/// mismatched extractor, the full-key Bloom is **not** valid (the filter +/// only contains prefix hashes). The read path must skip the filter entirely +/// in that case to avoid false negatives. 
+#[test] +#[cfg(feature = "metrics")] +fn test_full_key_bloom_skipped_when_table_wkf_false_and_mismatch() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let seqno = SequenceNumberCounter::default(); + + { + let tree = Config::new(&folder, seqno.clone(), SequenceNumberCounter::default()) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(3))) + .whole_key_filtering(false) + .open()?; + + for i in 0..100 { + let key = format!("abc{i:05}"); + tree.insert(key.as_bytes(), b"v", seqno.next()); + } + tree.flush_active_memtable(0)?; + } + + // Reopen with mismatched extractor. + let tree = Config::new(&folder, seqno.clone(), SequenceNumberCounter::default()) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(5))) + .whole_key_filtering(false) + .open()?; + + // No false negatives: every real key must still be found, even though the + // table's prefix-only filter is unusable with the new extractor. + for i in 0..100 { + let key = format!("abc{i:05}"); + assert!( + tree.contains_key(key.as_bytes(), u64::MAX)?, + "false negative on key {key}", + ); + } + + Ok(()) +} diff --git a/tests/prefix_filter_recovery.rs b/tests/prefix_filter_recovery.rs new file mode 100644 index 000000000..99fa6f4d9 --- /dev/null +++ b/tests/prefix_filter_recovery.rs @@ -0,0 +1,206 @@ +use lsm_tree::Guard; +use lsm_tree::SequenceNumberCounter; +use lsm_tree::{ + prefix::{FixedPrefixExtractor, PrefixExtractor}, + AbstractTree, Config, SeqNo, +}; +use std::sync::Arc; + +#[test] +fn test_prefix_filter_recovery() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + let prefix_len = 10; + + // Create and populate tree + { + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(prefix_len))) + .open()?; + + for i in 0..100 { + let key = format!("persistent_{:04}", i); + tree.insert(key.as_bytes(), b"value", 0); + } + // Sentinel with a different 10-byte 
prefix to widen the key range + tree.insert(b"zzzzzzzzzz_sentinel", b"value", 0); + + tree.flush_active_memtable(0)?; + } + + // Reopen tree and verify filter still works + { + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(prefix_len))) + .open()?; + + for i in 0..100 { + let key = format!("persistent_{:04}", i); + assert!(tree.contains_key(key.as_bytes(), u64::MAX)?); + } + + // Look up a key with a prefix absent from the filter but within the + // table's key range (between "persistent_*" and "zzzzzzzzzz_*"). + #[cfg(feature = "metrics")] + { + let initial_queries = tree.metrics().filter_queries(); + let initial_skips = tree.metrics().io_skipped_by_filter(); + + let non_existent = b"qqqqqqqqqq_0000"; + assert!(!tree.contains_key(non_existent, u64::MAX)?); + + let final_queries = tree.metrics().filter_queries(); + let final_skips = tree.metrics().io_skipped_by_filter(); + + assert!( + final_queries > initial_queries, + "filter should be consulted after recovery" + ); + assert!( + final_skips > initial_skips, + "filter should skip absent prefix after recovery" + ); + } + } + + Ok(()) +} + +/// Test prefix extractor name persistence across recovery cycles. 
+#[test] +fn test_prefix_extractor_name_persistence() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + // Write with "fixed_prefix:4" + { + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .open()?; + + tree.insert(b"aaaa_001", b"v1", 0); + tree.insert(b"bbbb_001", b"v2", 0); + tree.flush_active_memtable(0)?; + } + + // Reopen with same extractor — prefix filter should be used + { + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .open()?; + + assert_eq!(&*tree.get(b"aaaa_001", SeqNo::MAX)?.unwrap(), b"v1"); + assert_eq!(&*tree.get(b"bbbb_001", SeqNo::MAX)?.unwrap(), b"v2"); + + let keys: Vec<_> = tree + .prefix(b"aaaa", SeqNo::MAX, None) + .map(|g| g.key().unwrap()) + .collect(); + + assert_eq!(keys.len(), 1); + + #[cfg(feature = "metrics")] + { + let initial_queries = tree.metrics().filter_queries(); + + // Look up a key within the table's range whose prefix is absent + // from the filter ("abcd" is between "aaaa" and "bbbb"). 
+ assert!(tree.get(b"abcd_001", SeqNo::MAX)?.is_none()); + + let final_queries = tree.metrics().filter_queries(); + assert!(final_queries > initial_queries, "filter should be used"); + } + } + + // Reopen WITHOUT any extractor — filter should be bypassed but reads work + { + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .open()?; + + assert_eq!(&*tree.get(b"aaaa_001", SeqNo::MAX)?.unwrap(), b"v1"); + assert_eq!(&*tree.get(b"bbbb_001", SeqNo::MAX)?.unwrap(), b"v2"); + + #[cfg(feature = "metrics")] + { + let final_queries = tree.metrics().filter_queries(); + + // Without an extractor, prefix filter should be bypassed + assert_eq!(0, final_queries, "filter should not be used"); + } + } + + Ok(()) +} + +/// Test should_skip_range_by_prefix_filter with an incompatible extractor. +#[test] +fn test_skip_range_incompatible_extractor() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?; + + struct InvalidExtractor; + + impl PrefixExtractor for InvalidExtractor { + fn extract<'a>(&self, _key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> { + unreachable!() + } + + fn name(&self) -> &str { + "asd" + } + } + + // Write data with "fixed_prefix:4" extractor + { + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(FixedPrefixExtractor::new(4))) + .open()?; + + for i in 0..10u32 { + let key = format!("aaaa_{:04}", i); + tree.insert(key.as_bytes(), b"value", 0); + } + tree.flush_active_memtable(0)?; + } + + // Reopen with incompatible extractor name — range should not skip tables + { + let tree = Config::new( + &folder, + SequenceNumberCounter::default(), + SequenceNumberCounter::default(), + ) + .prefix_extractor(Arc::new(InvalidExtractor)) + .open()?; + + let keys: Vec<_> = tree + .range::<&[u8], _>(&b"aaaa_0000"[..]..&b"aaaa_9999"[..], SeqNo::MAX, None) + .map(|g| g.key().unwrap()) + .collect(); + + assert_eq!(keys.len(), 10); + } + 
Ok(()) +}