Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,14 @@ pub struct Metrics {
/// Number of IOs that were skipped due to filter
pub(crate) io_skipped_by_filter: AtomicUsize,

/// Number of segments skipped during prefix scans via
/// [`Tree::create_prefix`] where the per-table prefix bloom filter
/// returned `Ok(false)`. Counted in both single-table and
/// multi-table run paths of `TreeIter::create_range`.
///
/// Note: `BlobTree` prefix scans do not currently record this metric.
pub(crate) prefix_bloom_skips: AtomicUsize,

/// Number of data block bytes that were requested from OS or disk
pub(crate) data_block_io_requested: AtomicU64,

Expand Down Expand Up @@ -249,6 +257,14 @@ impl Metrics {
pub fn io_skipped_by_filter(&self) -> usize {
self.io_skipped_by_filter.load(Relaxed)
}

/// Number of segments skipped during [`Tree::create_prefix`] scans
/// by prefix bloom filters (single-table and multi-table run paths).
///
/// Note: `BlobTree` prefix scans do not currently record this metric.
pub fn prefix_bloom_skips(&self) -> usize {
self.prefix_bloom_skips.load(Relaxed)
}
}

#[cfg(test)]
Expand Down
26 changes: 24 additions & 2 deletions src/range.rs
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,13 @@ pub struct IterState {
/// When set, segments whose bloom filter reports no match for this
/// hash will be skipped entirely during the scan.
pub(crate) prefix_hash: Option<u64>,

/// Optional metrics handle for recording prefix-related statistics (e.g. bloom skips).
///
/// `None` when the caller does not wish to record metrics; this is
/// independent of whether the iterator uses a prefix.
#[cfg(feature = "metrics")]
pub(crate) metrics: Option<Arc<crate::Metrics>>,
}

type BoxedMerge<'a> = Box<dyn DoubleEndedIterator<Item = crate::Result<InternalValue>> + Send + 'a>;
Expand Down Expand Up @@ -233,6 +240,11 @@ impl TreeIter {
Ok(false) => {
// Prefix bloom says this segment has no matching keys
// — skip it entirely.
#[cfg(feature = "metrics")]
if let Some(m) = &lock.metrics {
m.prefix_bloom_skips
.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
}
Comment thread
polaz marked this conversation as resolved.
}
Ok(true) => {
single_tables.push(table.clone());
Expand Down Expand Up @@ -287,15 +299,25 @@ impl TreeIter {

// On I/O error reading the filter, include the
// table conservatively to avoid missing data.
table
let contains = table
.maybe_contains_prefix(prefix_hash)
.inspect_err(|e| {
log::debug!(
"prefix bloom check failed for table {:?}: {e}",
table.id(),
);
})
.unwrap_or(true)
.unwrap_or(true);

#[cfg(feature = "metrics")]
if !contains {
if let Some(m) = &lock.metrics {
m.prefix_bloom_skips
.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
}
}

contains
})
.cloned()
.collect();
Expand Down
4 changes: 4 additions & 0 deletions src/tree/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -822,6 +822,8 @@ impl Tree {
ephemeral,
merge_operator,
prefix_hash,
#[cfg(feature = "metrics")]
metrics: None,
};
Comment thread
polaz marked this conversation as resolved.

TreeIter::create_range(iter_state, bounds, seqno)
Expand Down Expand Up @@ -1327,6 +1329,8 @@ impl Tree {
ephemeral,
merge_operator: self.config.merge_operator.clone(),
prefix_hash,
#[cfg(feature = "metrics")]
metrics: Some(self.0.metrics.clone()),
};

TreeIter::create_range(iter_state, range, seqno).map(|item| match item {
Expand Down
118 changes: 118 additions & 0 deletions tests/tree_prefix_bloom.rs
Original file line number Diff line number Diff line change
Expand Up @@ -747,3 +747,121 @@ fn prefix_bloom_overlapping_l0_tables() -> lsm_tree::Result<()> {

Ok(())
}

/// Verifies the `prefix_bloom_skips` metric is incremented when bloom filters
/// reject prefixes that fall inside a table's key_range.
///
/// Creates a single L0 table (= single-table run, where bloom check applies)
/// with a wide key range, then performs many prefix scans for non-existent
/// prefixes. With 1000 keys and 10 bits-per-key the FP rate per query is ~1%,
/// so out of 24 distinct non-existent prefixes at least some must be rejected
/// by the bloom filter, producing a non-zero skip count.
Comment thread
polaz marked this conversation as resolved.
#[cfg(feature = "metrics")]
#[test]
fn prefix_bloom_skip_metrics() -> lsm_tree::Result<()> {
let folder = tempfile::tempdir()?;
let tree = tree_with_prefix_bloom(&folder)?;

// Insert 500 keys under "aaa:" and 500 under "zzz:" to create a single
// table with key_range [aaa:0000, zzz:0499] and a large bloom filter.
let mut seqno = 0u64;
for i in 0..500 {
tree.insert(format!("aaa:{i:04}"), "v", seqno);
seqno += 1;
}
for i in 0..500 {
tree.insert(format!("zzz:{i:04}"), "v", seqno);
seqno += 1;
}
tree.flush_active_memtable(0)?;

Comment thread
polaz marked this conversation as resolved.
// The test relies on the single-table prefix-bloom fast path
// (run.len() == 1) in TreeIter::create_range. Fail early if flush
// produces multiple tables so metric failures reflect a real bloom
// regression rather than layout differences.
assert_eq!(
tree.table_count(),
1,
"expected single-table run; flush produced {} tables",
tree.table_count()
);

assert_eq!(tree.metrics().prefix_bloom_skips(), 0);

// Scan for 24 non-existent prefixes that fall inside the key_range.
// Each prefix is a valid extractor boundary (ends with ':').
let scan_nonexistent_prefixes = || -> lsm_tree::Result<()> {
for c in b'b'..=b'y' {
let prefix = format!("{}:", c as char);
let results: Vec<_> = tree
.create_prefix(&prefix, seqno, None)
.collect::<Result<Vec<_>, _>>()?;
assert_eq!(results.len(), 0, "prefix '{prefix}' should match no keys");
}
Ok(())
};

// This is a probabilistic smoke test: we issue many lookups for prefixes
// that do not exist but fall within the table's key range. For any
// reasonable prefix-bloom configuration with a non-zero false-positive
// rate, we expect at least one of these lookups to be fully filtered by
// the bloom (counted as a skip). To make the test robust against rare
// all-false-positive runs or configuration changes, we retry the scan a
// generous number of times before failing.
const MAX_SCAN_ATTEMPTS: u32 = 20;

for _attempt in 0..MAX_SCAN_ATTEMPTS {
scan_nonexistent_prefixes()?;
if tree.metrics().prefix_bloom_skips() > 0 {
return Ok(());
}
}

let skips = tree.metrics().prefix_bloom_skips();
assert!(
skips > 0,
"expected at least one prefix bloom skip out of 24 non-existent prefix scans \
after {MAX_SCAN_ATTEMPTS} attempts, got {skips}"
);

Ok(())
}

/// Verifies `prefix_bloom_skips` stays at zero when no bloom filtering occurs.
///
/// Without a prefix extractor, no prefix hash is computed — the bloom check is
/// never reached, and the counter must remain zero.
#[cfg(feature = "metrics")]
#[test]
fn prefix_bloom_skip_metrics_zero_without_extractor() -> lsm_tree::Result<()> {
let folder = tempfile::tempdir()?;

let tree = Config::new(
&folder,
SequenceNumberCounter::default(),
SequenceNumberCounter::default(),
)
.open()?;

let tree = match tree {
lsm_tree::AnyTree::Standard(t) => t,
_ => panic!("expected standard tree"),
};

tree.insert("user:1", "v", 0);
tree.insert("user:2", "v", 1);
tree.flush_active_memtable(0)?;

let results: Vec<_> = tree
.create_prefix("user:", 2, None)
.collect::<Result<Vec<_>, _>>()?;
assert_eq!(results.len(), 2);

assert_eq!(
tree.metrics().prefix_bloom_skips(),
0,
"no bloom skips should occur without a prefix extractor"
);

Ok(())
}
Loading