Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
fcedd42
perf: batch multi_get + PinnableSlice + WriteBatch
polaz Apr 5, 2026
56210bb
test: comprehensive integration tests for batch APIs
polaz Apr 5, 2026
6ef0bbe
refactor(table): extract check_bloom helper, fix batch safety
polaz Apr 5, 2026
c34bca7
fix(tree): hold read guard during batch insert, fix docs + bloom
polaz Apr 5, 2026
06e216e
fix(docs): correct PinnableSlice + WriteBatch semantics, fix filter miss
polaz Apr 5, 2026
10b260f
test(tree): add PinnableSlice + multi_get edge case tests
polaz Apr 5, 2026
23bc77e
fix(docs): align WriteBatch + get_pinned wording with visibility cont…
polaz Apr 5, 2026
698b1cf
fix(write_batch): document duplicate-key semantics, tighten visibility
polaz Apr 5, 2026
8db05fa
perf(blob_tree): batch multi_get for BlobTree, add batch_ops bench
polaz Apr 5, 2026
d2bd527
fix(blob_tree): add range tombstone suppression to batch multi_get
polaz Apr 5, 2026
431e975
fix(table): return global seqnos from Table::get and get_with_block
polaz Apr 5, 2026
8be9dbe
refactor(table): extract point_read_inner to DRY block-index walk
polaz Apr 5, 2026
ef9c21f
fix(tree): add debug_asserts for batch_get_from_tables index contract
polaz Apr 5, 2026
2b04c91
fix(tree): correct L0 fast-path seqno check, fix duplicate-key docs
polaz Apr 5, 2026
11c1f66
fix(docs): correct WriteBatch wording in README
polaz Apr 5, 2026
cb132fe
refactor(tree): unify table point-read walk via TablePointLookup trait
polaz Apr 5, 2026
6aa6d66
fix(write_batch): correct duplicate-key docs, add mixed-op debug_assert
polaz Apr 5, 2026
04af107
perf(tree): add L0 seqno ceiling skip to batch_get_from_tables
polaz Apr 5, 2026
80c3aa7
refactor(tree): extract resolve_pinned_entry helper for get_pinned
polaz Apr 5, 2026
766d34c
refactor(tree): unify resolve_entry via resolve_pinned_entry
polaz Apr 5, 2026
0fbf3f8
fix(write_batch): reject mixed-op duplicates unconditionally
polaz Apr 5, 2026
23da114
fix(docs): align WriteBatch mixed-op doc with unconditional validation
polaz Apr 5, 2026
a6af258
perf(tree): defer sort+hash after memtable phase, bitmap for L0 ceiling
polaz Apr 5, 2026
5b94ac4
refactor(table): reword filter_queries metrics annotation
polaz Apr 5, 2026
fbc99e4
fix(blob_tree): add merge resolution to batch multi_get path
polaz Apr 5, 2026
bebe140
fix(blob_tree): use resolve_merge_via_pipeline directly, not resolve_key
polaz Apr 5, 2026
d243003
perf(bench): reuse fixed keys to prevent memtable growth across itera…
polaz Apr 5, 2026
c6ac265
perf(table): saturating_add for global seqno, hash only remaining keys
polaz Apr 5, 2026
b86ca49
fix(blob_tree): return raw merge operand when operator absent
polaz Apr 5, 2026
3de18ea
perf(blob_tree): defer snapshot acquisition below empty-batch check
polaz Apr 5, 2026
dc56c17
perf(tree): pass (idx, hash) pairs to batch_get_from_tables
polaz Apr 6, 2026
2ba3f05
docs(copilot): add unit struct vs type alias rule
polaz Apr 6, 2026
238d13f
perf(bench): use constant seqno to prevent version accumulation
polaz Apr 6, 2026
a27fbdd
refactor(bench): extract setup_empty_tree helper for write benchmarks
polaz Apr 6, 2026
22e4042
perf(bench): use iter_batched for fresh tree per sample
polaz Apr 6, 2026
5297d52
perf(bench): add black_box to prevent optimizer elision
polaz Apr 6, 2026
a2c1f56
perf(tree): avoid clone in materialize validation, take miss_keys by …
polaz Apr 6, 2026
8f43698
refactor(error): reword MixedOperationBatch as ambiguous semantics
polaz Apr 6, 2026
8660343
refactor(tree): reword resolve_or_passthrough_pinned doc
polaz Apr 6, 2026
4a9d067
refactor(tree): update batch_get_from_tables doc and debug_assert
polaz Apr 6, 2026
5ee39fa
perf(tree): L1+ early exit after covering table miss in batch path
polaz Apr 6, 2026
70f973b
perf(tree): break L1+ single-key walk after covering table miss
polaz Apr 6, 2026
1422860
perf(tree): re-sort after L1+ covered_miss merge, DRY test helpers
polaz Apr 6, 2026
f17cd0a
test(pinnable_slice): improve test robustness
polaz Apr 6, 2026
6e025fb
test(multi_get): blob_tree batch RT suppression, merge, memtable-first
polaz Apr 6, 2026
05707d9
docs(write_batch): clarify mixed-op rejection rationale
polaz Apr 6, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ interval-heap = "0.0.5"
log = "0.4.27"
lz4_flex = { version = "0.13.0", optional = true, default-features = false }
zstd = { version = "0.13", optional = true, default-features = false }
structured-zstd = { version = "0.0.1", optional = true, default-features = false, features = ["std"] }
structured-zstd = { version = "0.0.6", optional = true, default-features = false, features = ["std"] }
Comment thread
polaz marked this conversation as resolved.
Outdated
quick_cache = { version = "0.6.16", default-features = false, features = [] }
Comment thread
polaz marked this conversation as resolved.
rustc-hash = "2.1.1"
self_cell = "1.2.0"
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
[![License](https://img.shields.io/badge/license-Apache--2.0-blue)](#license)

> LSM-tree engine for [CoordiNode](https://github.com/structured-world/coordinode), maintained by [Structured World Foundation](https://sw.foundation).
> Derivative work of [fjall-rs/lsm-tree](https://github.com/fjall-rs/lsm-tree), developed independently with diverging features: zstd dictionary compression, custom sequence number generators, multi_get, intra-L0 compaction, and security hardening.
> Derivative work of [fjall-rs/lsm-tree](https://github.com/fjall-rs/lsm-tree), developed independently with diverging features: zstd dictionary compression, custom sequence number generators, multi_get (batch-optimized), PinnableSlice zero-copy reads, WriteBatch atomic writes, intra-L0 compaction, and security hardening.
Comment thread
polaz marked this conversation as resolved.
Outdated

> [!IMPORTANT]
> This fork now introduces a fork-specific **disk format V4** compatibility boundary.
Expand Down
66 changes: 66 additions & 0 deletions src/abstract_tree.rs
Original file line number Diff line number Diff line change
Expand Up @@ -554,6 +554,43 @@ pub trait AbstractTree: sealed::Sealed {
/// Will return `Err` if an IO error occurs.
fn get<K: AsRef<[u8]>>(&self, key: K, seqno: SeqNo) -> crate::Result<Option<UserValue>>;

/// Retrieves an item from the tree as a [`PinnableSlice`].
///
/// When the value is found in the block cache, the returned
/// [`PinnableSlice::Pinned`] variant holds a reference to the cached
/// block, avoiding a data copy. Memtable and blob-resolved values use
/// the [`PinnableSlice::Owned`] variant.
///
/// The existing [`AbstractTree::get`] method is unaffected.
///
/// # Examples
///
/// ```
/// # let folder = tempfile::tempdir()?;
/// use lsm_tree::{AbstractTree, Config, Tree};
///
/// let tree = Config::new(folder, Default::default(), Default::default()).open()?;
/// tree.insert("a", "my_value", 0);
///
/// let item = tree.get_pinned("a", 1)?;
/// assert_eq!(item.as_ref().map(|v| v.as_ref()), Some("my_value".as_bytes()));
/// #
/// # Ok::<(), lsm_tree::Error>(())
/// ```
Comment thread
polaz marked this conversation as resolved.
///
/// # Errors
///
/// Will return `Err` if an IO error occurs.
fn get_pinned<K: AsRef<[u8]>>(
&self,
key: K,
seqno: SeqNo,
) -> crate::Result<Option<crate::PinnableSlice>> {
// Default: delegate to get() and wrap as Owned
self.get(key, seqno)
.map(|opt| opt.map(crate::PinnableSlice::owned))
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.

/// Returns `true` if the tree contains the specified key.
///
/// # Examples
Expand Down Expand Up @@ -656,6 +693,35 @@ pub trait AbstractTree: sealed::Sealed {
keys.into_iter().map(|key| self.get(key, seqno)).collect()
}

/// Applies a [`WriteBatch`] atomically with the given sequence number.
///
/// All entries in the batch share a single seqno, making them appear as
/// an atomic unit for MVCC reads. This is more efficient than individual
/// writes because the version-history lock and memtable size accounting
/// are performed only once for the entire batch.
///
/// Returns `(total_bytes_added, new_memtable_size)`.
///
/// # Examples
///
/// ```
/// # let folder = tempfile::tempdir()?;
/// use lsm_tree::{AbstractTree, Config, Tree, WriteBatch};
///
/// let tree = Config::new(folder, Default::default(), Default::default()).open()?;
///
/// let mut batch = WriteBatch::new();
/// batch.insert("key1", "value1");
/// batch.insert("key2", "value2");
/// batch.remove("key3");
///
/// let (bytes_added, _memtable_size) = tree.apply_batch(batch, 0);
/// assert!(bytes_added > 0);
/// #
/// # Ok::<(), lsm_tree::Error>(())
/// ```
fn apply_batch(&self, batch: crate::WriteBatch, seqno: SeqNo) -> (u64, u64);

/// Inserts a key-value pair into the tree.
///
/// If the key already exists, the item will be overwritten.
Expand Down
4 changes: 4 additions & 0 deletions src/blob_tree/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -667,6 +667,10 @@ impl AbstractTree for BlobTree {
self.index.get_highest_persisted_seqno()
}

fn apply_batch(&self, batch: crate::WriteBatch, seqno: SeqNo) -> (u64, u64) {
    // Delegates directly to the underlying index tree, which performs the
    // batched memtable write under a single seqno.
    // NOTE(review): batch values are not routed through blob separation
    // here — confirm that is intended for values above the blob threshold.
    self.index.apply_batch(batch, seqno)
}

fn insert<K: Into<UserKey>, V: Into<UserValue>>(
&self,
key: K,
Expand Down
5 changes: 5 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ pub(crate) mod metrics;
pub mod mvcc_stream;

mod path;
mod pinnable_slice;
mod prefix;

#[doc(hidden)]
Expand Down Expand Up @@ -161,6 +162,7 @@ pub mod util;

mod value;
mod value_type;
mod write_batch;

/// Integrity verification for SST and blob files.
pub mod verify;
Expand Down Expand Up @@ -196,6 +198,9 @@ pub use encryption::EncryptionProvider;
#[cfg(feature = "encryption")]
pub use encryption::Aes256GcmProvider;

pub use pinnable_slice::PinnableSlice;
pub use write_batch::WriteBatch;

pub use {
abstract_tree::AbstractTree,
any_tree::AnyTree,
Expand Down
53 changes: 53 additions & 0 deletions src/memtable/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,59 @@ impl Memtable {
self.items.is_empty() && self.range_tombstone_count() == 0
}

/// Inserts multiple items into the memtable in bulk.
///
/// More efficient than calling [`Memtable::insert`] in a loop because it
/// performs a single `fetch_add` for the total size and a single
/// `fetch_max` for the highest seqno.
///
/// Returns `(total_bytes_added, new_memtable_size)`.
#[doc(hidden)]
pub fn insert_batch(&self, items: Vec<InternalValue>) -> (u64, u64) {
if items.is_empty() {
let size = self
.approximate_size
.load(std::sync::atomic::Ordering::Acquire);
return (0, size);
}

let mut total_size: u64 = 0;
let mut max_seqno: u64 = 0;

let overhead =
std::mem::size_of::<InternalValue>() + std::mem::size_of::<SharedComparator>();

for item in &items {
#[expect(
clippy::expect_used,
reason = "keys are limited to 16-bit length + values are limited to 32-bit length"
)]
let item_size: u64 = (item.key.user_key.len() + item.value.len() + overhead)
.try_into()
.expect("should fit into u64");

total_size = total_size.saturating_add(item_size);

if item.key.seqno > max_seqno {
max_seqno = item.key.seqno;
}
}
Comment thread
polaz marked this conversation as resolved.

let size_before = self
.approximate_size
.fetch_add(total_size, std::sync::atomic::Ordering::AcqRel);

for item in items {
let key = InternalKey::new(item.key.user_key, item.key.seqno, item.key.value_type);
self.items.insert(&key, &item.value);
}
Comment thread
polaz marked this conversation as resolved.

self.highest_seqno
.fetch_max(max_seqno, std::sync::atomic::Ordering::AcqRel);

(total_size, size_before + total_size)
}

/// Inserts an item into the memtable
#[doc(hidden)]
pub fn insert(&self, item: InternalValue) -> (u64, u64) {
Expand Down
142 changes: 142 additions & 0 deletions src/pinnable_slice.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
// Copyright (c) 2024-present, fjall-rs
// This source code is licensed under both the Apache 2.0 and MIT License
// (found in the LICENSE-* files in the repository)

//! Zero-copy value reference that keeps the decompressed block buffer alive.
//!
//! [`PinnableSlice`] is inspired by `RocksDB`'s `PinnableSlice`
//! (`include/rocksdb/slice.h:179-263`). It wraps a value that was read from
//! the LSM tree and indicates whether the underlying data shares the
//! decompressed block buffer or is independently owned (e.g. from a memtable
//! or merge result).
//!
//! When the value comes from an on-disk data block, holding a
//! `PinnableSlice::Pinned` keeps the block's decompressed buffer alive
//! (via the refcounted [`Slice`] / `ByteView` backing) for the duration of
//! the reference. The value bytes are a sub-slice of that buffer — no copy
//! is performed. Note: this does **not** prevent the block cache from
//! evicting its entry; it only ensures the backing memory remains valid.
//!
//! Memtable and blob-resolved values use the `Owned` variant.

use crate::{Slice, UserValue, table::Block};

/// A value reference that may be pinned in the block cache.
///
/// Use [`PinnableSlice::as_ref`] to access the raw bytes regardless of variant.
///
/// # Lifetime
///
/// The `Pinned` variant holds a [`Block`] clone whose `data` field is a
/// refcounted [`Slice`]. As long as the `PinnableSlice` is alive, the
/// decompressed block buffer remains valid. Dropping it releases the
/// reference count on the underlying `ByteView` allocation.
#[derive(Clone)]
pub enum PinnableSlice {
    /// Value sharing the decompressed block buffer — zero copy.
    ///
    /// The [`Block`] keeps the decompressed data alive via refcounted
    /// `Slice` / `ByteView`. `value` is a sub-slice created via
    /// [`Slice::slice`], sharing the same backing allocation.
    Pinned {
        /// Keeps the block's decompressed buffer alive.
        ///
        /// Note: this does not prevent the block cache from evicting its
        /// entry; it only keeps the backing memory valid.
        _block: Block,
        /// Zero-copy sub-slice into the block's decompressed data.
        value: Slice,
    },

    /// Value owned independently — no block buffer is kept alive.
    ///
    /// Produced for memtable hits, blob-resolved values, and merge results.
    Owned(UserValue),
}

impl PinnableSlice {
    /// Creates a pinned value referencing data within a block cache entry.
    #[must_use]
    pub fn pinned(block: Block, value: Slice) -> Self {
        Self::Pinned {
            _block: block,
            value,
        }
    }

    /// Creates an owned value (not pinned in any cache).
    #[must_use]
    pub fn owned(value: UserValue) -> Self {
        Self::Owned(value)
    }

    /// Returns `true` if this value is pinned in the block cache.
    #[must_use]
    pub fn is_pinned(&self) -> bool {
        match self {
            Self::Pinned { .. } => true,
            Self::Owned(_) => false,
        }
    }

    /// Returns the raw value bytes.
    #[must_use]
    pub fn value(&self) -> &[u8] {
        self.as_ref()
    }

    /// Returns the length of the value in bytes.
    #[must_use]
    pub fn len(&self) -> usize {
        self.value().len()
    }

    /// Returns `true` if the value is empty.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Converts this `PinnableSlice` into an owned `UserValue`.
    ///
    /// For the `Pinned` variant, the `Block` is dropped, but the returned
    /// `Slice` continues to share the same `ByteView` backing allocation.
    /// The `Owned` variant is unwrapped directly.
    #[must_use]
    pub fn into_value(self) -> UserValue {
        match self {
            Self::Owned(v) => v,
            Self::Pinned { value, .. } => value,
        }
    }
}

impl std::fmt::Debug for PinnableSlice {
    // Shows only the variant and byte length; raw value bytes are never dumped.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::Owned(v) => f.debug_tuple("Owned").field(&v.len()).finish(),
            Self::Pinned { value, .. } => {
                f.debug_struct("Pinned").field("len", &value.len()).finish()
            }
        }
    }
}

impl AsRef<[u8]> for PinnableSlice {
    // Uniform byte access across both variants.
    fn as_ref(&self) -> &[u8] {
        match self {
            Self::Owned(v) => v.as_ref(),
            Self::Pinned { value, .. } => value.as_ref(),
        }
    }
}

impl PartialEq<[u8]> for PinnableSlice {
    // Byte-wise comparison against an unsized slice.
    fn eq(&self, other: &[u8]) -> bool {
        other == self.as_ref()
    }
}

impl PartialEq<&[u8]> for PinnableSlice {
    // Convenience comparison against a slice reference.
    fn eq(&self, other: &&[u8]) -> bool {
        *other == self.as_ref()
    }
}

impl From<PinnableSlice> for UserValue {
    // Consuming conversion; equivalent to calling `into_value`.
    fn from(slice: PinnableSlice) -> Self {
        slice.into_value()
    }
}
Loading
Loading