Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
85 commits
Select commit Hold shift + click to select a range
a3bc6c9
feat: add range tombstones (delete_range / delete_prefix)
polaz Mar 16, 2026
c3e80fc
Merge branch 'main' into feat/#16-feat-range-tombstones--deleterange-…
polaz Mar 16, 2026
ed272cf
fix: resolve all clippy warnings for range tombstone code
polaz Mar 16, 2026
718f2ba
fix(range-tombstone): validate bounds, fix RT-only flush and edge cases
polaz Mar 16, 2026
343f31c
fix(table): validate BlockType on range tombstone block load
polaz Mar 16, 2026
3e23f57
fix(range-tombstone): seqno visibility, decode hardening, lint attrs
polaz Mar 16, 2026
954c044
fix(lint): use cfg_attr(feature, expect) for metrics-gated arg count
polaz Mar 16, 2026
d53ecfe
fix(range-tombstone): propagate RTs before write loop, enforce u16 bo…
polaz Mar 16, 2026
cb60d63
test(range-tombstone): rotation, blob tree, table-skip, invalid interval
polaz Mar 16, 2026
e1db06d
fix(range-tombstone): RT-only SST persistence, pruning, lint attrs
polaz Mar 16, 2026
f0c90ea
style: fix rustfmt formatting in interval_tree
polaz Mar 17, 2026
1f87efa
Merge branch 'main' into feat/#16-feat-range-tombstones--deleterange-…
polaz Mar 17, 2026
334890c
fix(range-tombstone): preserve sentinel seqno bounds, soft-reject ove…
polaz Mar 17, 2026
5077e58
chore: remove .forge from git tracking
polaz Mar 17, 2026
6841260
chore: add .claude to .gitignore
polaz Mar 17, 2026
9522778
chore: merge main into feature branch
polaz Mar 17, 2026
df7c0f4
fix(range-tombstone): use #[expect] lints, optimize query_suppression
polaz Mar 17, 2026
9a67ff2
fix(interval-tree): remove unfired unnecessary_box_returns expects
polaz Mar 17, 2026
51c3429
fix(range-tombstone): preserve sentinel seqno bounds, soft-reject ove…
polaz Mar 17, 2026
86a985f
docs(test): clarify Guard import is a trait dependency for .key()
polaz Mar 17, 2026
e525c10
docs(test): clarify Vec<Vec<u8>> PartialEq coercion for assertions
polaz Mar 17, 2026
12a23c6
fix(range-tombstone): correct flush clipping and RT-only metadata range
polaz Mar 17, 2026
2dbe5c6
fix(range-tombstone): write all RTs in flush mode without overlap filter
polaz Mar 17, 2026
28d419f
test(range-tombstone): add disjoint RT flush and compaction tests
polaz Mar 17, 2026
e59eb29
fix(range-tombstone): use max RT seqno for sentinel to avoid seqno=0 …
polaz Mar 17, 2026
21249f2
docs(range-tombstone): clarify RT-only flush and sentinel design deci…
polaz Mar 18, 2026
6371dac
docs(ci): add design pattern exclusions to instruction files
polaz Mar 18, 2026
1e381fc
docs(ci): generalize design decision rules in instruction files
polaz Mar 18, 2026
317b9d8
fix(range-tombstone): dedup compaction RTs and warn on oversized keys
polaz Mar 18, 2026
a40eb7d
refactor(range-tombstone): tighten visibility and remove dead code
polaz Mar 18, 2026
d486a98
docs(range-tombstone): hide flush_to_tables_with_rt from public docs
polaz Mar 18, 2026
0123988
fix(range-tombstone): correct reverse sort comparator for sweep-line
polaz Mar 18, 2026
5a7308d
fix(range-tombstone): remove unused Reverse import
polaz Mar 18, 2026
6860597
fix(range-tombstone): use unique sentinel seqno in RT-only tables
polaz Mar 18, 2026
3462b39
docs(range-tombstone): annotate table-skip scan complexity
polaz Mar 18, 2026
dc8f68d
fix(range-tombstone): use MAX_SEQNO for sentinel to prevent cross-tab…
polaz Mar 18, 2026
8874e46
fix(range-tombstone): widen flush table key_range to cover RT span
polaz Mar 18, 2026
0d34769
docs(range-tombstone): clarify sentinel visibility at SeqNo::MAX
polaz Mar 18, 2026
ddda2c2
docs(range-tombstone): clarify compaction fallback for max-length keys
polaz Mar 18, 2026
a51facd
fix(range-tombstone): remove dead max_rt_seqno and add #[must_use]
polaz Mar 18, 2026
979d126
fix(range-tombstone): restore item counts after sentinel and widen co…
polaz Mar 18, 2026
bc508a0
fix(range-tombstone): use SeqNo::MAX for sentinel and short-circuit t…
polaz Mar 18, 2026
4371b2b
fix(range-tombstone): revert sentinel to MAX_SEQNO and tighten writer…
polaz Mar 18, 2026
8b720cc
fix(range-tombstone): remove key_range widening to preserve inclusive…
polaz Mar 18, 2026
7ec78e1
fix(range-tombstone): restore flush key_range widening for disjoint R…
polaz Mar 18, 2026
b383dcc
test(range-tombstone): add regression test for disjoint RT multi-comp…
polaz Mar 18, 2026
66b044b
fix(range-tombstone): use seqno 0 for sentinel to prevent merge domin…
polaz Mar 18, 2026
87a06f8
test(range-tombstone): add sentinel masking regression test
polaz Mar 18, 2026
0b9a096
fix(range-tombstone): use lowest RT seqno for sentinel instead of 0
polaz Mar 18, 2026
61c5268
docs(range-tombstone): document internal trait rationale for required…
polaz Mar 19, 2026
7452427
fix(range-tombstone): keep sentinel in item counts for on-disk consis…
polaz Mar 19, 2026
755e241
Potential fix for pull request finding
polaz Mar 19, 2026
b483a8b
Potential fix for pull request finding
polaz Mar 19, 2026
97e64b1
fix(range-tombstone): align rt-only sentinel key with lowest seqno rt
polaz Mar 19, 2026
d9ed474
refactor(api): seal AbstractTree internals
polaz Mar 19, 2026
edd92f5
fix(format): bump fork disk semantics to V4
polaz Mar 19, 2026
34ce53f
perf(range): remove redundant RT pre-scan
polaz Mar 19, 2026
a065720
test(range): harden range tombstone safety coverage
polaz Mar 19, 2026
c81c251
perf(range): filter memtable RTs by query overlap
polaz Mar 19, 2026
ece7a2a
style(test): format safety regression assertions
polaz Mar 19, 2026
e8d1c4a
fix(range): harden RT metadata and compaction safety
polaz Mar 19, 2026
e6c457b
fix(flush): keep RT-widened tables in disjoint L0 runs
polaz Mar 19, 2026
9ee4dcd
fix(range): harden max-key clipping and review regressions
polaz Mar 20, 2026
bc72a94
chore(lints): add reason to interval-tree test expect
polaz Mar 20, 2026
eb61d8f
docs(readme): document fork disk format V4 boundary
polaz Mar 20, 2026
2c0b836
fix(api): properly seal AbstractTree implementations
polaz Mar 20, 2026
a6a1436
test(range): add RT tamper coverage and memtable must_use
polaz Mar 20, 2026
61cfda2
fix(range-tombstone): dedup flush range tombstones before persistence
polaz Mar 20, 2026
210b69f
docs(ci): clarify call-site scope in review instructions
polaz Mar 20, 2026
642148d
test(range-tombstone): add compaction clip gap regression test
polaz Mar 20, 2026
7710935
test(range-tombstone): add RT-only table sentinel recovery regression…
polaz Mar 20, 2026
56addb8
test(range-tombstone): assert specific error type in RT block tamper …
polaz Mar 20, 2026
00b5b9a
test(range-tombstone): disable compression in multi-table flush test
polaz Mar 20, 2026
8b083ad
fix(range-tombstone): disable compression in flush rotation test and …
polaz Mar 20, 2026
42d4726
docs(ci): scope design-decision exclusion to Tier 3-4 only
polaz Mar 20, 2026
261bbf9
fix(range-tombstone): pin compression in compaction rotation test and…
polaz Mar 20, 2026
c863073
perf(range-tombstone): dedup collected RTs before range iteration filter
polaz Mar 20, 2026
1065361
docs(range-tombstone): document newest-first search order for point t…
polaz Mar 20, 2026
c46710e
docs(test): clarify owned File auto-borrow for read_u64 call
polaz Mar 20, 2026
5d7ab2d
test(recovery): assert InvalidVersion error type in manifest version …
polaz Mar 20, 2026
67e6a27
fix(range-tombstone): add test expect_used lint, RT diagram, metrics …
polaz Mar 20, 2026
1564cdd
docs(range-tombstone): link RT metrics TODO to issue #34
polaz Mar 20, 2026
0ed9bf0
docs(ci): scope design analysis to PR-visible call chain and preserve…
polaz Mar 20, 2026
9b1aab8
style(range-tombstone): rename filter tests to what_condition_expecte…
polaz Mar 20, 2026
387617f
test(range-tombstone): add structural assertions for compaction and r…
polaz Mar 20, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions .github/copilot-instructions.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,24 @@ cargo fmt --all -- --check # Format check
| `bytes_1` | Use `bytes` crate for Slice type |
| `metrics` | Expose prometheus metrics |

## Design Decision Analysis (CRITICAL)

**Before flagging any potential issue, trace the call chain visible in the PR diff:**

1. **Read the caller** — does the caller already handle the case you're about to flag? If a function returns a value that looks problematic in isolation, check call sites visible in the diff. The caller may handle the edge case explicitly (e.g., empty collections, `None` vs `Some(empty)`).
2. **Check type-level guarantees** — does the type system prevent the issue? Different enum variants, wrapper types, or visibility modifiers may make a "collision" or "misuse" structurally impossible.
3. **Read adjacent comments** — comments starting with "NOTE:", "Use X instead of Y because...", or explaining WHY a specific approach was chosen document deliberate design decisions. If the comment accurately describes the code behavior, the design is intentional.

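**Suppress a finding only when it is Tier 3/Tier 4 and the checks above show the case is already handled by the caller, ruled out by the type system, or covered by a documented design decision.** If the pattern still indicates a Tier 1 (logic/correctness) or Tier 2 (safety/crash) issue, flag it regardless of documented rationale or caller handling.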

## Architecture Notes

- `src/table/block/` — On-disk block format (header + compressed payload)
- `src/vlog/blob_file/` — Value log for large values (separate from LSM blocks)
- `src/compaction/` — Compaction strategies (leveled, FIFO, tiered)
- `src/seqno.rs` — Sequence number generator (MVCC versioning)
- `src/range_tombstone.rs` — Range tombstone data model and serialization
- `src/range_tombstone_filter.rs` — MVCC-aware range tombstone filtering for iterators
- `src/active_tombstone_set.rs` — Tracks active range tombstones during compaction
- `src/memtable/interval_tree.rs` — Interval tree for memtable range tombstone queries
- Compression is pluggable via `CompressionType` enum with `#[cfg(feature)]` variants
4 changes: 4 additions & 0 deletions .github/instructions/rust.instructions.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ Focus review effort on real bugs, not cosmetics. Stop after finding issues in hi

These are not actionable review findings. Do not raise them:

- **Caller-handled edge cases**: Before flagging a function for not handling an edge case (empty collection, `None` vs `Some(empty)`, missing guard), check call sites visible in the PR diff. If all visible callers already handle the case, the function's behavior is part of a deliberate contract — not a bug. Only flag if the edge case is truly unhandled end-to-end within the scope of the PR.
- **Type-system-prevented issues**: Before flagging a potential collision, overlap, or misuse, check whether distinct enum variants, wrapper types, or visibility modifiers make the issue structurally impossible. A `WeakTombstone` variant that never appears in user-facing merge paths cannot collide with user data regardless of key/seqno overlap.
- **Documented design decisions** (Tier 3-4 only): When code has a comment explaining WHY a specific approach was chosen, trust the documented reasoning for style and API design choices. Flag only if the comment contradicts the actual code behavior — not if you would have chosen a different approach. This exclusion does NOT apply to Tier 1 (logic bugs, data corruption) or Tier 2 (safety, crash recovery) — always flag those regardless of documentation.

- **Comment wording vs code behavior**: If a comment says "flush when full" but the threshold is checked with `>=` not `>`, the intent is clear — the boundary condition is a design choice. Do not suggest rewording comments to match exact comparison operators.
- **Comment precision**: "returns the block" when it technically returns `Result<Block>` — the comment conveys meaning, not type signature.
- **Magic numbers with context**: `4` in `assert_eq!(header.len(), 4, "expected u32 checksum")` — the assertion message provides the context. Do not suggest a named constant when the value is used once in a test with an explanatory message.
Expand Down
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@
> **Maintained fork** by [Structured World Foundation](https://sw.foundation) for the [CoordiNode](https://github.com/structured-world/coordinode) database engine.
> Based on [fjall-rs/lsm-tree](https://github.com/fjall-rs/lsm-tree). We contribute patches upstream and maintain additional features needed for CoordiNode (zstd compression, custom sequence number generators, batch get, intra-L0 compaction, security hardening).

> [!IMPORTANT]
> This fork now introduces a fork-specific **disk format V4** compatibility boundary.
> `V4` is a breaking on-disk change relative to `V3` because the fork persists new semantics such as range tombstones and merge operands.
> New code may continue reading supported `V3` databases, but databases written with these `V4` semantics must not be opened by older `V3` binaries.

A K.I.S.S. implementation of log-structured merge trees (LSM-trees/LSMTs) in Rust.

> [!NOTE]
Expand Down
94 changes: 90 additions & 4 deletions src/abstract_tree.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,22 @@ pub type RangeItem = crate::Result<KvPair>;

type FlushToTablesResult = (Vec<Table>, Option<Vec<BlobFile>>);

// Sealed on purpose: this trait is still public as a consumer-side bound
// (`&impl AbstractTree`), but external implementations are no longer part of
// the supported extension surface. Internal flush/version hooks keep evolving
// with crate-owned tree types and must not create downstream semver traps.
//
// `sealed` stays `pub` only so sibling modules in this crate can write
// `crate::abstract_tree::sealed::Sealed` in their impls. The parent module
// `abstract_tree` is not publicly exported from the crate root, so downstream
// crates still cannot name or implement this trait.
/// Module holding the marker trait that `AbstractTree` lists as a supertrait.
pub mod sealed {
Comment thread
polaz marked this conversation as resolved.
/// Marker trait with no methods; a type must implement it before it can
/// implement `AbstractTree`.
pub trait Sealed {}
}
Comment thread
polaz marked this conversation as resolved.

/// Generic Tree API
#[enum_dispatch::enum_dispatch]
Comment thread
polaz marked this conversation as resolved.
pub trait AbstractTree {
pub trait AbstractTree: sealed::Sealed {
/// Debug method for tracing the MVCC history of a key.
#[doc(hidden)]
fn print_trace(&self, key: &[u8]) -> crate::Result<()>;
Expand Down Expand Up @@ -76,7 +89,9 @@ pub trait AbstractTree {
_lock: &MutexGuard<'_, ()>,
seqno_threshold: SeqNo,
) -> crate::Result<Option<u64>> {
use crate::{compaction::stream::CompactionStream, merge::Merger};
use crate::{
compaction::stream::CompactionStream, merge::Merger, range_tombstone::RangeTombstone,
};

let version_history = self.get_version_history_lock();
let latest = version_history.latest_version();
Expand All @@ -93,6 +108,14 @@ pub trait AbstractTree {

let flushed_size = latest.sealed_memtables.iter().map(|mt| mt.size()).sum();

// Collect range tombstones from sealed memtables
let mut range_tombstones: Vec<RangeTombstone> = Vec::new();
for mt in latest.sealed_memtables.iter() {
range_tombstones.extend(mt.range_tombstones_sorted());
}
range_tombstones.sort();
Comment thread
polaz marked this conversation as resolved.
Comment thread
polaz marked this conversation as resolved.
range_tombstones.dedup();

let merger = Merger::new(
latest
.sealed_memtables
Expand All @@ -104,7 +127,22 @@ pub trait AbstractTree {

drop(version_history);

if let Some((tables, blob_files)) = self.flush_to_tables(stream)? {
// Clone needed: flush_to_tables_with_rt consumes the Vec, but on the
// RT-only path (no KV data, tables.is_empty()) we re-insert RTs into the
// active memtable. Flush is infrequent and RT count is small.
if let Some((tables, blob_files)) =
self.flush_to_tables_with_rt(stream, range_tombstones.clone())?
{
Comment thread
polaz marked this conversation as resolved.
// If no tables were produced (RT-only memtable), re-insert RTs
// into active memtable so they aren't lost
if tables.is_empty() && !range_tombstones.is_empty() {
let active = self.active_memtable();
for rt in &range_tombstones {
let _ =
active.insert_range_tombstone(rt.start.clone(), rt.end.clone(), rt.seqno);
}
}

self.register_tables(
&tables,
blob_files.as_deref(),
Expand Down Expand Up @@ -216,10 +254,26 @@ pub trait AbstractTree {
/// # Errors
///
/// Will return `Err` if an IO error occurs.
#[warn(clippy::type_complexity)]
fn flush_to_tables(
&self,
stream: impl Iterator<Item = crate::Result<InternalValue>>,
) -> crate::Result<Option<FlushToTablesResult>> {
// Default implementation: forward to the range-tombstone-aware variant
// with an empty tombstone list, so implementors only have to provide
// `flush_to_tables_with_rt`.
self.flush_to_tables_with_rt(stream, Vec::new())
}

/// Like [`AbstractTree::flush_to_tables`], but also writes range tombstones.
///
/// This is an internal extension hook on the crate's sealed tree types and
/// is hidden from generated documentation.
///
/// # Errors
///
/// Will return `Err` if an IO error occurs.
#[doc(hidden)]
fn flush_to_tables_with_rt(
&self,
stream: impl Iterator<Item = crate::Result<InternalValue>>,
range_tombstones: Vec<crate::range_tombstone::RangeTombstone>,
Comment thread
polaz marked this conversation as resolved.
) -> crate::Result<Option<FlushToTablesResult>>;
Comment thread
polaz marked this conversation as resolved.
Comment thread
polaz marked this conversation as resolved.
Comment thread
polaz marked this conversation as resolved.
Comment thread
polaz marked this conversation as resolved.
Comment thread
polaz marked this conversation as resolved.

/// Atomically registers flushed tables into the tree, removing their associated sealed memtables.
Expand Down Expand Up @@ -680,4 +734,36 @@ pub trait AbstractTree {
/// Will return `Err` if an IO error occurs.
#[doc(hidden)]
fn remove_weak<K: Into<UserKey>>(&self, key: K, seqno: SeqNo) -> (u64, u64);

/// Deletes all keys in the range `[start, end)` by inserting a range tombstone.
///
/// This is much more efficient than deleting keys individually when
/// removing a contiguous range of keys.
///
/// Returns the approximate size added to the memtable.
/// Returns 0 if `start >= end` (invalid interval is silently ignored).
///
/// This is a required method on the crate's sealed tree types.
fn remove_range<K: Into<UserKey>>(&self, start: K, end: K, seqno: SeqNo) -> u64;
Comment thread
polaz marked this conversation as resolved.

/// Deletes all keys with the given prefix by inserting a range tombstone.
///
/// This is sugar over [`AbstractTree::remove_range`] using prefix bounds.
///
/// Returns the approximate size added to the memtable.
/// Returns 0 for empty prefixes or all-`0xFF` prefixes (cannot form valid half-open range).
fn remove_prefix<K: AsRef<[u8]>>(&self, prefix: K, seqno: SeqNo) -> u64 {
use crate::range::prefix_to_range;
use std::ops::Bound;

// Translate the prefix into a (lower, upper) pair of key bounds.
let (lo, hi) = prefix_to_range(prefix.as_ref());

// Without an inclusive lower bound there is no start key for the
// tombstone, so nothing is inserted and 0 is returned (presumably this is
// the empty-prefix case — confirm against `prefix_to_range`).
let Bound::Included(start) = lo else { return 0 };

// Bound::Unbounded means the prefix is all 0xFF — no representable
// exclusive upper bound exists, so we cannot form a valid range tombstone.
let Bound::Excluded(end) = hi else { return 0 };

// Both bounds are concrete keys: delegate to the range delete and return
// whatever size it reports.
self.remove_range(start, end, seqno)
Comment thread
polaz marked this conversation as resolved.
Comment thread
coderabbitai[bot] marked this conversation as resolved.
}
}
Loading