Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
3121610
feat: custom key comparison / comparator
polaz Mar 22, 2026
bbaa9d0
perf(comparator): zero-alloc prefixed slice compare, static default
polaz Mar 22, 2026
97b230d
fix(comparator): use comparator for RT sort, suppression, equality doc
polaz Mar 22, 2026
01cb591
style: fix unused import warnings in merge and table/util
polaz Mar 22, 2026
0e53007
style: fix CI clippy lints (redundant pub(crate), doc backticks, stal…
polaz Mar 22, 2026
3f13b57
fix(comparator): doc example bytewise-equality invariant, unsafe reason
polaz Mar 22, 2026
83d9494
style: backtick RocksDB in doc comment
polaz Mar 22, 2026
7a44516
fix(comparator): restore key-range early reject in RT suppression, SA…
polaz Mar 22, 2026
45177ea
fix(memtable): account for SharedComparator in approximate_size
polaz Mar 22, 2026
dd10000
docs(merge): explain Arc-per-HeapItem overhead is negligible
polaz Mar 22, 2026
e7919ce
refactor(config): make comparator field pub(crate), document run lookup
polaz Mar 22, 2026
bdf6dfa
style: backtick key_range in doc comment
polaz Mar 22, 2026
7efb69d
docs(memtable): note interval tree lexicographic limitation for custo…
polaz Mar 22, 2026
ccf07aa
docs(index_block): note test coverage location for iter seek behavior
polaz Mar 22, 2026
47f7879
fix(comparator): RT decode validation, SAFETY docs, slow-path test
polaz Mar 22, 2026
ce06c00
docs(memtable): clarify Memtable::new is pub for host crate (fjall), …
polaz Mar 22, 2026
167485d
docs(tests): clarify RT decode test uses default comparator intention…
polaz Mar 22, 2026
6a474f1
test(comparator): add bounded range scan tests for reverse and u64 co…
polaz Mar 22, 2026
9045f27
perf(comparator): stack buffer for custom comparator prefix comparison
polaz Mar 22, 2026
7dfe07c
fix(comparator): remove lexicographic debug_assert in RangeTombstone:…
polaz Mar 22, 2026
4f3a441
style: fix clippy items_after_statements and indexing_slicing in stac…
polaz Mar 22, 2026
0efc392
docs(run): strengthen precondition doc for get_for_key_cmp ordering i…
polaz Mar 22, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/abstract_tree.rs
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ pub trait AbstractTree: sealed::Sealed {
.iter()
.map(|mt| mt.iter().map(Ok))
.collect::<Vec<_>>(),
self.tree_config().comparator.clone(),
);
// RT suppression is not needed here: flush writes both entries and RTs
// to the output tables. Suppression happens at read time, not write time.
Expand Down
1 change: 1 addition & 0 deletions src/blob_tree/ingest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,7 @@ impl<'a> BlobIngestion<'a> {
index.config.descriptor_table.clone(),
false,
false,
index.config.comparator.clone(),
#[cfg(feature = "metrics")]
index.metrics.clone(),
)
Expand Down
32 changes: 23 additions & 9 deletions src/blob_tree/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,12 @@ impl BlobTree {
key: &[u8],
seqno: SeqNo,
) -> crate::Result<Option<UserValue>> {
let Some(item) = crate::Tree::get_internal_entry_from_version(super_version, key, seqno)?
let Some(item) = crate::Tree::get_internal_entry_from_version(
super_version,
key,
seqno,
self.index.config.comparator.as_ref(),
)?
else {
return Ok(None);
};
Expand Down Expand Up @@ -256,6 +261,7 @@ impl AbstractTree for BlobTree {
seqno,
index,
None, // BlobTree does not use merge operators for prefix scans
self.index.config.comparator.clone(),
prefix_hash,
)
.map(move |kv| {
Expand All @@ -278,14 +284,21 @@ impl AbstractTree for BlobTree {
let tree = self.clone();

Box::new(
crate::Tree::create_internal_range(super_version.clone(), &range, seqno, index, None)
.map(move |kv| {
IterGuardImpl::Blob(Guard {
tree: tree.clone(),
version: super_version.version.clone(),
kv,
})
}),
crate::Tree::create_internal_range(
super_version.clone(),
&range,
seqno,
index,
None,
self.index.config.comparator.clone(),
)
.map(move |kv| {
IterGuardImpl::Blob(Guard {
tree: tree.clone(),
version: super_version.version.clone(),
kv,
})
}),
)
}

Expand Down Expand Up @@ -519,6 +532,7 @@ impl AbstractTree for BlobTree {
self.index.config.descriptor_table.clone(),
pin_filter,
pin_index,
self.index.config.comparator.clone(),
#[cfg(feature = "metrics")]
self.index.metrics.clone(),
)
Expand Down
1 change: 1 addition & 0 deletions src/compaction/flavour.rs
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,7 @@ impl StandardCompaction {
opts.config.descriptor_table.clone(),
pin_filter,
pin_index,
opts.config.comparator.clone(),
#[cfg(feature = "metrics")]
opts.metrics.clone(),
)
Expand Down
13 changes: 11 additions & 2 deletions src/compaction/worker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ fn create_compaction_stream<'a>(
to_compact: &[TableId],
eviction_seqno: SeqNo,
merge_operator: Option<Arc<dyn crate::merge_operator::MergeOperator>>,
comparator: crate::comparator::SharedComparator,
) -> crate::Result<Option<CompactionStream<'a, Merger<CompactionReader<'a>>>>> {
let mut readers: Vec<CompactionReader<'_>> = vec![];
let mut found = 0;
Expand All @@ -178,7 +179,7 @@ fn create_compaction_stream<'a>(

Ok(if found == to_compact.len() {
Some(
CompactionStream::new(Merger::new(readers), eviction_seqno)
CompactionStream::new(Merger::new(readers, comparator), eviction_seqno)
.with_merge_operator(merge_operator),
)
} else {
Expand Down Expand Up @@ -390,6 +391,7 @@ fn merge_tables(
&payload.table_ids.iter().copied().collect::<Vec<_>>(),
opts.mvcc_gc_watermark,
opts.config.merge_operator.clone(),
opts.config.comparator.clone(),
)?
else {
log::warn!(
Expand Down Expand Up @@ -704,7 +706,14 @@ mod tests {
tree.insert("a", "a", 0);
tree.flush_active_memtable(0)?;

assert!(create_compaction_stream(&tree.current_version(), &[666], 0, None)?.is_none());
assert!(create_compaction_stream(
&tree.current_version(),
&[666],
0,
None,
crate::comparator::default_comparator()
)?
.is_none());

Ok(())
}
Expand Down
104 changes: 104 additions & 0 deletions src/comparator.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
// Copyright (c) 2024-present, fjall-rs
// This source code is licensed under both the Apache 2.0 and MIT License
// (found in the LICENSE-* files in the repository)

use std::sync::Arc;

/// Trait for custom user key comparison.
///
/// Comparators must be safe across unwind boundaries since they are stored
/// in tree structures that may be referenced inside `catch_unwind` blocks.
///
/// Implementations must define a **strict total order** suitable for use in
/// sorted data structures (memtable skip list, SST block index, merge heap).
/// Specifically:
///
/// - **Totality**: for all `a`, `b`, exactly one of `Less`, `Equal`, `Greater` holds
/// - **Transitivity**: `a < b` and `b < c` implies `a < c`
/// - **Antisymmetry**: `compare(a, b) == Less` iff `compare(b, a) == Greater`
/// - **Reflexivity**: `compare(a, a) == Equal`
///
/// - **Bytewise equality**: `compare(a, b) == Equal` **must** imply `a == b`
/// byte-for-byte. Bloom filters and hash indexes operate on raw bytes;
/// if two byte-different keys compare as equal, hash-based lookups will
/// produce false negatives.
///
/// Violating these invariants corrupts the sort order and produces incorrect
/// query results.
///
/// # Important
///
/// Once a tree is created with a comparator, it must always be opened with the
/// same comparator. Using a different comparator on an existing tree will produce
/// incorrect results.
Comment thread
polaz marked this conversation as resolved.
///
/// # Examples
///
/// ```
/// use lsm_tree::UserComparator;
/// use std::cmp::Ordering;
///
/// /// Comparator that orders u64 keys stored as big-endian bytes.
/// struct U64Comparator;
///
/// impl UserComparator for U64Comparator {
/// fn compare(&self, a: &[u8], b: &[u8]) -> Ordering {
/// if a.len() == 8 && b.len() == 8 {
/// // Length checked, conversion cannot fail.
/// let a_u64 = u64::from_be_bytes(a.try_into().unwrap());
/// let b_u64 = u64::from_be_bytes(b.try_into().unwrap());
/// a_u64.cmp(&b_u64)
/// } else {
/// // Non-8-byte keys: fall back to lexicographic ordering
/// // to preserve the bytewise-equality invariant.
/// a.cmp(b)
/// }
/// }
/// }
/// ```
pub trait UserComparator: Send + Sync + std::panic::RefUnwindSafe + 'static {
/// Compares two user keys, returning their ordering.
fn compare(&self, a: &[u8], b: &[u8]) -> std::cmp::Ordering;

/// Returns `true` if this comparator is lexicographic byte ordering.
///
/// When `true`, internal optimizations can avoid allocations in
/// prefix-compressed block comparisons. Override only if your
/// comparator is truly equivalent to `a.cmp(b)` on raw bytes.
fn is_lexicographic(&self) -> bool {
false
}
}

/// Default comparator using lexicographic byte ordering.
///
/// This is the comparator used when no custom comparator is configured,
/// preserving backward compatibility with existing trees.
#[derive(Clone, Debug)]
pub struct DefaultUserComparator;

impl UserComparator for DefaultUserComparator {
#[inline]
fn compare(&self, a: &[u8], b: &[u8]) -> std::cmp::Ordering {
a.cmp(b)
}

#[inline]
fn is_lexicographic(&self) -> bool {
true
}
}

/// Shared reference to a [`UserComparator`].
pub type SharedComparator = Arc<dyn UserComparator>;

/// Returns the default comparator (lexicographic byte ordering).
///
/// Uses a shared static instance to avoid repeated allocations.
#[must_use]
pub fn default_comparator() -> SharedComparator {
// LazyLock creates the Arc once; subsequent calls just clone the Arc (ref-count bump).
static DEFAULT: std::sync::LazyLock<SharedComparator> =
std::sync::LazyLock::new(|| Arc::new(DefaultUserComparator));
DEFAULT.clone()
}
Comment thread
polaz marked this conversation as resolved.
43 changes: 40 additions & 3 deletions src/config/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,14 @@ pub use restart_interval::RestartIntervalPolicy;
pub type PartitioningPolicy = PinningPolicy;

use crate::{
compaction::filter::Factory, merge_operator::MergeOperator, path::absolute_path,
prefix::PrefixExtractor, version::DEFAULT_LEVEL_COUNT, AnyTree, BlobTree, Cache,
CompressionType, DescriptorTable, SequenceNumberCounter, SharedSequenceNumberGenerator, Tree,
compaction::filter::Factory,
comparator::{self, SharedComparator},
merge_operator::MergeOperator,
path::absolute_path,
prefix::PrefixExtractor,
version::DEFAULT_LEVEL_COUNT,
AnyTree, BlobTree, Cache, CompressionType, DescriptorTable, SequenceNumberCounter,
SharedSequenceNumberGenerator, Tree,
};
use std::{
path::{Path, PathBuf},
Expand Down Expand Up @@ -246,6 +251,15 @@ pub struct Config {
#[doc(hidden)]
pub kv_separation_opts: Option<KvSeparationOptions>,

/// Custom user key comparator.
///
/// When set, all key comparisons use this comparator instead of the
/// default lexicographic byte ordering. Once a tree is opened with a
/// comparator, it must always be re-opened with the same comparator.
// Not `pub` — use `Config::comparator()` builder method as the public API.
#[doc(hidden)]
pub(crate) comparator: SharedComparator,

/// The global sequence number generator
///
/// Should be shared between multiple trees of a database
Expand Down Expand Up @@ -311,6 +325,8 @@ impl Default for Config {
expect_point_read_hits: false,

kv_separation_opts: None,

comparator: comparator::default_comparator(),
}
}
}
Expand Down Expand Up @@ -522,6 +538,27 @@ impl Config {
self
}

/// Sets a custom user key comparator.
///
/// When configured, all key ordering (memtable, block index, merge,
/// range scans) uses this comparator instead of the default lexicographic
/// byte ordering.
///
/// # Important
///
/// Once a tree is created with a custom comparator, it **must** be
/// re-opened with the same comparator. Using a different comparator
/// on an existing tree produces incorrect results.
///
/// The comparator identity is **not** persisted to disk — the caller
/// is responsible for ensuring the same comparator is used across
/// open/close cycles (same approach as `RocksDB`).
#[must_use]
pub fn comparator(mut self, comparator: SharedComparator) -> Self {
self.comparator = comparator;
self
}
Comment thread
polaz marked this conversation as resolved.

/// Opens a tree using the config.
///
/// # Errors
Expand Down
16 changes: 15 additions & 1 deletion src/key.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// This source code is licensed under both the Apache 2.0 and MIT License
// (found in the LICENSE-* files in the repository)

use crate::{SeqNo, UserKey, ValueType};
use crate::{comparator::UserComparator, SeqNo, UserKey, ValueType};
use std::cmp::Reverse;

#[derive(Clone, Eq)]
Expand Down Expand Up @@ -56,6 +56,20 @@ impl InternalKey {
pub fn is_tombstone(&self) -> bool {
self.value_type.is_tombstone()
}

/// Compares two internal keys using a custom user key comparator.
///
/// User keys are compared via the given comparator; ties are broken
/// by sequence number in descending order (higher seqno = "smaller"
/// in sort order), matching the invariant of [`Ord for InternalKey`].
pub(crate) fn compare_with(
&self,
other: &Self,
cmp: &dyn UserComparator,
) -> std::cmp::Ordering {
cmp.compare(&self.user_key, &other.user_key)
.then_with(|| Reverse(self.seqno).cmp(&Reverse(other.seqno)))
}
}

impl PartialOrd for InternalKey {
Expand Down
3 changes: 3 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ mod abstract_tree;
#[doc(hidden)]
pub mod blob_tree;

mod comparator;

#[doc(hidden)]
mod cache;

Expand Down Expand Up @@ -185,6 +187,7 @@ pub use {
any_tree::AnyTree,
blob_tree::BlobTree,
cache::Cache,
comparator::{DefaultUserComparator, SharedComparator, UserComparator},
compression::CompressionType,
config::{Config, KvSeparationOptions, TreeType},
descriptor_table::DescriptorTable,
Expand Down
Loading
Loading