Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
746a513
feat(compression): enable dictionary compression in pure Rust backend
polaz Apr 7, 2026
aacc1f1
fix(compression): normalize ZstdDictionary::id() to always return non…
polaz Apr 7, 2026
7650e6b
perf(compression): cache FrameCompressor in TLS for pure Rust dict path
polaz Apr 7, 2026
1aec535
perf(compression): reuse source Vec capacity in TLS compress_with_dict
polaz Apr 7, 2026
59bd591
refactor(compression): extract DICT_MAGIC to module-level constant
polaz Apr 7, 2026
9ce1517
test(compression): add cross-backend raw-content dict interop tests
polaz Apr 7, 2026
a8118f2
fix(compression): guard raw-content decompress against decompression …
polaz Apr 7, 2026
e765717
refactor(compression): extract decode_raw_content_bounded helper
polaz Apr 8, 2026
4f8b969
test(compression): add raw-content dict capacity guard tests
polaz Apr 8, 2026
e8c9fb3
refactor(compression): extract decompress dispatch into named function
polaz Apr 8, 2026
381957b
docs: note ZstdDict support in zstd-pure feature section
polaz Apr 8, 2026
4c52232
ci(codecov): add zstd-pure coverage run to merge report
polaz Apr 8, 2026
8887d18
fix(compression): use read_exact to drain FrameDecoder buffer
polaz Apr 8, 2026
f8c33a2
test(compression): add unit tests for strip_dict_id and error branches
polaz Apr 8, 2026
08bd0fa
test(compression): cover bounded_read Io paths; tighten unreachable b…
polaz Apr 8, 2026
ffd9512
test(compression): directly test decode_raw_content_bounded error paths
polaz Apr 8, 2026
ac31ce6
feat(compression): enable dictionary compression in pure Rust backend
polaz Apr 8, 2026
13a8f45
test(compression): add regression tests for empty raw-content dict ro…
polaz Apr 8, 2026
53befa2
fix(compression): allow empty raw-content frames at capacity=0 in bou…
polaz Apr 8, 2026
2bfa811
ci: add MSRV to test-zstd matrix
polaz Apr 8, 2026
3f47c25
docs(compression): remove C FFI references; rename cold bench to tls_hit
polaz Apr 8, 2026
299cfe2
docs(compression): correct DICT_MAGIC endian notation; expand ZstdDic…
polaz Apr 8, 2026
44bb55e
fix(compression): use checked_add for overflow guard; reuse cached di…
polaz Apr 8, 2026
f6dde9f
docs(compression): TLS cache is backend-internal; ZstdDictionary hold…
polaz Apr 8, 2026
a6200d3
build(deps): update structured-zstd 0.0.7 → 0.0.10
polaz Apr 8, 2026
28676bb
test(compression): add compaction path integration test for ZstdDict
polaz Apr 8, 2026
ef55325
docs(compression): clarify ZstdDictionary::new doc and test name
polaz Apr 8, 2026
c9c7ddf
docs(compression): standardize magic byte notation and fix minor doc …
polaz Apr 8, 2026
dd0275f
test(compression): assert L0 is empty after major_compact in zstd dic…
polaz Apr 9, 2026
5f2e3a0
build(deps): update structured-zstd 0.0.10 → 0.0.11
polaz Apr 9, 2026
eb34b7f
docs(compression): document UptoBytes one-block over-decode behaviour…
polaz Apr 9, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions .github/workflows/coordinode-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,26 @@ jobs:
working-directory: tools/db_bench
run: cargo check --all-features

test-zstd-pure:
needs: lint
timeout-minutes: 15
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v6
- uses: dtolnay/rust-toolchain@stable
Comment thread
polaz marked this conversation as resolved.
Outdated
with:
toolchain: stable
- uses: Swatinem/rust-cache@v2
with:
prefix-key: ubuntu-cargo-zstd-pure
- uses: taiki-e/install-action@nextest
- name: Run tests (zstd-pure backend, no C zstd)
# zstd_pure_dict integration tests are gated with
# #[cfg(all(feature = "zstd-pure", not(feature = "zstd")))], so they
# are skipped by --all-features. Run without "zstd" to exercise the
# pure Rust dictionary compression path independently.
run: cargo nextest run --profile ci --no-default-features --features zstd-pure,lz4
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated

cross:
needs: lint
timeout-minutes: 15
Expand Down
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,6 @@ can be decompressed by the other. When both `zstd` and `zstd-pure` are enabled,
the C FFI backend takes precedence.

**Current limitations:**
- Dictionary compression is not yet supported (dictionary decompression works)
- Decompression throughput is ~2–3.5× slower than the C reference

Comment thread
coderabbitai[bot] marked this conversation as resolved.
*Disabled by default.*
Expand Down
29 changes: 21 additions & 8 deletions src/compression/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ use std::sync::Arc;
/// This trait abstracts the zstd implementation behind a compile-time
/// selected backend. The C FFI backend (`zstd` feature) provides full
/// compression levels 1–22 and dictionary support. The pure Rust backend
/// (`zstd-pure` feature) provides compression levels 1–22 with no C
/// dependencies (dictionary compression not yet supported).
/// (`zstd-pure` feature) provides compression levels 1–22 and dictionary
/// support with no C dependencies.
///
/// Both backends produce RFC 8878 compliant zstd frames, so data
/// compressed by one can be decompressed by the other.
Expand All @@ -35,7 +35,13 @@ pub trait CompressionProvider {
/// Decompress a zstd frame, pre-allocating `capacity` bytes.
fn decompress(data: &[u8], capacity: usize) -> crate::Result<Vec<u8>>;

/// Compress `data` using a pre-trained dictionary.
/// Compress `data` using a zstd dictionary.
///
/// `dict_raw` may be either a finalized zstd dictionary (magic `0x37A430EC`
/// header, entropy tables, content — produced by `zstd --train` or
Comment thread
polaz marked this conversation as resolved.
Outdated
/// [`ZstdDictionary::raw`]) or a raw content dictionary (bare bytes used as
/// LZ77 history). Both the C FFI backend and the pure Rust backend accept
/// either representation.
Comment thread
polaz marked this conversation as resolved.
Outdated
fn compress_with_dict(data: &[u8], level: i32, dict_raw: &[u8]) -> crate::Result<Vec<u8>>;

/// Decompress a zstd frame that was compressed with a dictionary.
Expand Down Expand Up @@ -149,18 +155,25 @@ impl ZstdDictionary {
.get_or_init(|| zstd::dict::DecoderDictionary::copy(&self.raw))
}

/// Returns a 32-bit dictionary fingerprint (lower 32 bits of xxh3).
/// Returns a normalized 32-bit dictionary fingerprint (lower 32 bits of
/// xxh3, clamped to 1).
///
/// The ID is always ≥ 1 because zstd dict ID 0 means "no dictionary".
/// All callers — config validation, `CompressionType::ZstdDict`, and the
/// frame encoder/decoder — must use this method so that stored metadata and
/// frame headers agree on the same value.
///
/// Intended for display and external interop (e.g., matching against the
/// dict ID embedded in a zstd frame header). For internal cache keying
/// use [`id64`](ZstdDictionary::id64) to avoid hash collisions.
/// For internal cache keying use [`id64`](ZstdDictionary::id64) to avoid
Comment thread
polaz marked this conversation as resolved.
Outdated
/// hash collisions.
Comment thread
polaz marked this conversation as resolved.
Outdated
#[must_use]
#[expect(
clippy::cast_possible_truncation,
reason = "intentional: public API returns 32-bit fingerprint"
)]
pub fn id(&self) -> u32 {
self.id as u32
// id=0 means "no dictionary" in the zstd frame format; clamp to 1 so
// metadata, config validation, and frame headers all see the same value.
(self.id as u32).max(1)
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated
}

/// Returns the full 64-bit xxh3 fingerprint used as a collision-resistant
Expand Down
198 changes: 185 additions & 13 deletions src/compression/zstd_pure.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@
//! This backend requires no C compiler or system libraries — it compiles
//! with `cargo build` alone.
//!
//! # Limitations
//! # Notes
//!
//! - Dictionary compression is not yet supported (returns an error).
//! - Dictionary compression is supported via [`FrameCompressor::set_dictionary_from_bytes`].
Comment thread
polaz marked this conversation as resolved.
Outdated
//! - Dictionary decompression is supported.
//! - Decompression throughput is ~2–3.5x slower than the C reference.

Expand Down Expand Up @@ -75,12 +75,62 @@ impl CompressionProvider for ZstdPureProvider {
bounded_read(&mut decoder, capacity)
}

fn compress_with_dict(_data: &[u8], _level: i32, _dict_raw: &[u8]) -> crate::Result<Vec<u8>> {
Err(crate::Error::Io(std::io::Error::new(
std::io::ErrorKind::Unsupported,
"zstd dictionary compression is not yet supported by the pure Rust backend \
(structured-zstd); use the `zstd` feature for dictionary compression",
)))
fn compress_with_dict(data: &[u8], level: i32, dict_raw: &[u8]) -> crate::Result<Vec<u8>> {
use structured_zstd::decoding::Dictionary;
use structured_zstd::encoding::{CompressionLevel, FrameCompressor};

// `FrameCompressor::set_dictionary` accepts a parsed `Dictionary`.
//
// Two dictionary formats are supported:
//
// 1. **Finalized zstd dictionary** (magic `0x37A430EC` prefix): produced by
Comment thread
polaz marked this conversation as resolved.
Outdated
// `zstd --train` / `zstd::dict::from_continuous` and the C zstd library.
// Contains entropy tables (Huffman + FSE) that prime the compressor's
// coding state for better ratios. Parsed via `Dictionary::decode_dict`.
//
// 2. **Raw content dictionary** (no magic): a bare byte sequence used as
// LZ77 history to improve match distances on repetitive data. No entropy
// table seeding. Parsed via `Dictionary::from_raw_content`.
//
// The C backend's `Compressor::with_dictionary` transparently handles both
// formats. We replicate this behaviour here so that `ZstdDictionary` values
// created from raw training corpora (without a finalized header) also work.
//
// ID derivation for raw content dictionaries:
// - Use the lower 32 bits of the xxh3 hash of `dict_raw` (matching the
// formula in `ZstdDictionary::id()`), clamped to at least 1 because
// id=0 is rejected by `FrameCompressor::set_dictionary`.
// - Both compress and decompress derive the same ID from the same bytes,
// so the dict_id written into the frame header is consistent.
const DICT_MAGIC: [u8; 4] = [0x37, 0xA4, 0x30, 0xEC];
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated

let dictionary = if dict_raw.starts_with(&DICT_MAGIC) {
Dictionary::decode_dict(dict_raw)
.map_err(|e| crate::Error::Io(std::io::Error::other(e)))?
} else {
#[expect(
clippy::cast_possible_truncation,
reason = "intentional: lower 32 bits of xxh3 as dict id"
)]
let id = {
let h = xxhash_rust::xxh3::xxh3_64(dict_raw) as u32;
h.max(1) // id=0 is invalid; collision probability is negligible
};
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated
Dictionary::from_raw_content(id, dict_raw.to_vec())
.map_err(|e| crate::Error::Io(std::io::Error::other(e)))?
};
Comment thread
polaz marked this conversation as resolved.
Outdated

let mut compressor = FrameCompressor::new(CompressionLevel::from_level(level));
compressor
.set_dictionary(dictionary)
.map_err(|e| crate::Error::Io(std::io::Error::other(e)))?;

let mut output = Vec::new();
compressor.set_source(std::io::Cursor::new(data));
compressor.set_drain(&mut output);
compressor.compress();

Ok(output)
}

fn decompress_with_dict(
Expand Down Expand Up @@ -117,8 +167,20 @@ impl CompressionProvider for ZstdPureProvider {
// Re-initialise if this is the first call in this thread or if
// the dictionary has changed (different id64 → different table).
if !matches!(&*state, Some((id, _)) if *id == dict.id64()) {
let parsed = Dictionary::decode_dict(dict.raw())
.map_err(|e| crate::Error::Io(std::io::Error::other(e)))?;
// Mirror the format-detection logic in `compress_with_dict`:
// finalized dictionaries (magic `0x37A430EC`) are parsed with
// `decode_dict`; raw content bytes use `from_raw_content` with
// the same ID formula so the dict_id in the frame header matches.
const DICT_MAGIC: [u8; 4] = [0x37, 0xA4, 0x30, 0xEC];
let parsed = if dict.raw().starts_with(&DICT_MAGIC) {
Dictionary::decode_dict(dict.raw())
.map_err(|e| crate::Error::Io(std::io::Error::other(e)))?
} else {
// `dict.id()` already returns a normalized non-zero u32
// (lower 32 bits of xxh3, clamped to 1); use it directly.
Dictionary::from_raw_content(dict.id(), dict.raw().to_vec())
.map_err(|e| crate::Error::Io(std::io::Error::other(e)))?
};
let mut decoder = FrameDecoder::new();
decoder
.add_dict(parsed)
Expand Down Expand Up @@ -189,7 +251,6 @@ impl CompressionProvider for ZstdPureProvider {
}

#[cfg(test)]
#[expect(clippy::unwrap_used, reason = "test code")]
#[expect(clippy::expect_used, reason = "test code")]
mod tests {
use super::*;
Expand Down Expand Up @@ -258,8 +319,119 @@ mod tests {
let result = ZstdPureProvider::decompress_with_dict(COMPRESSED, &dict, too_small);
assert!(
matches!(result, Err(crate::Error::DecompressedSizeTooLarge { .. })),
"expected DecompressedSizeTooLarge but got {:?}",
result
"expected DecompressedSizeTooLarge but got {result:?}",
);
}

// --- compress_with_dict tests ---

#[test]
fn compress_with_dict_roundtrip_pure_to_pure() {
    // Core contract: data compressed by the pure backend with a dictionary
    // must decompress back to the original plaintext via the same backend.
    let dict = ZstdDictionary::new(DICT);

    let frame = ZstdPureProvider::compress_with_dict(PLAINTEXT, 3, DICT)
        .expect("compression with dict should succeed");

    // A valid zstd frame is never zero bytes long.
    assert!(!frame.is_empty(), "compressed output must not be empty");

    let restored =
        ZstdPureProvider::decompress_with_dict(&frame, &dict, PLAINTEXT.len() + 1)
            .expect("decompression with dict should succeed");

    assert_eq!(
        restored, PLAINTEXT,
        "round-tripped output must equal the original plaintext"
    );
}

#[test]
fn compress_with_dict_produces_zstd_magic() {
    // Every zstd frame opens with the little-endian magic 0xFD2FB528, i.e.
    // the byte sequence 0x28 0xB5 0x2F 0xFD; any other prefix means the
    // output is corrupt or not a zstd frame at all.
    const ZSTD_FRAME_MAGIC: [u8; 4] = [0x28, 0xB5, 0x2F, 0xFD];

    let frame = ZstdPureProvider::compress_with_dict(PLAINTEXT, 3, DICT)
        .expect("compression should succeed");

    assert!(
        frame.starts_with(&ZSTD_FRAME_MAGIC),
        "output must start with zstd magic 0xFD2FB528 (LE); got {:?}",
        frame.get(..4.min(frame.len()))
    );
}

#[test]
fn compress_with_dict_roundtrip_all_levels() {
    // The dictionary path must round-trip across the supported level range,
    // not just the default level.
    let dict = ZstdDictionary::new(DICT);

    for level in [1, 3, 9, 19] {
        let frame =
            ZstdPureProvider::compress_with_dict(PLAINTEXT, level, DICT).expect("compress");

        let restored =
            ZstdPureProvider::decompress_with_dict(&frame, &dict, PLAINTEXT.len() + 1)
                .expect("decompress");

        assert_eq!(
            restored, PLAINTEXT,
            "round-trip failed at compression level={level}"
        );
    }
}

#[test]
fn compress_with_dict_empty_dict_returns_error() {
    // An empty dictionary offers nothing to use as LZ77 history, so both
    // the finalized-format path and the raw-content path must refuse it
    // with an error rather than producing a frame.
    let outcome = ZstdPureProvider::compress_with_dict(PLAINTEXT, 3, b"");

    assert!(
        outcome.is_err(),
        "expected an error for empty dictionary, got Ok"
    );
}

#[test]
fn compress_with_dict_raw_content_dict_works() {
    // Bytes without the finalized-dict magic must be accepted as a raw
    // content dictionary and still yield a frame that decodes correctly.
    let raw_content_dict = b"this is raw content dictionary data for matching";
    let dict = ZstdDictionary::new(raw_content_dict);

    let frame = ZstdPureProvider::compress_with_dict(PLAINTEXT, 3, raw_content_dict)
        .expect("compression with raw content dict should succeed");

    let restored =
        ZstdPureProvider::decompress_with_dict(&frame, &dict, PLAINTEXT.len() + 1)
            .expect("decompression with raw content dict should succeed");

    assert_eq!(
        restored, PLAINTEXT,
        "round-trip with raw content dict must equal the original plaintext"
    );
}

#[test]
fn compress_with_dict_empty_plaintext_roundtrips() {
    // Edge case: an empty payload must still compress to a valid frame
    // that decodes back to zero bytes.
    let dict = ZstdDictionary::new(DICT);

    let frame = ZstdPureProvider::compress_with_dict(&[], 3, DICT)
        .expect("compression of empty payload should succeed");

    let restored = ZstdPureProvider::decompress_with_dict(&frame, &dict, 1)
        .expect("decompression of empty payload should succeed");

    assert!(
        restored.is_empty(),
        "decompressed output of empty payload must be empty"
    );
}
}
17 changes: 0 additions & 17 deletions src/config/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -508,23 +508,6 @@ impl Config {
/// at open time rather than at first block write/read.
#[cfg(zstd_any)]
fn validate_zstd_dictionary(&self) -> crate::Result<()> {
// The pure Rust backend does not support dictionary *compression*.
// Reject ZstdDict write policies upfront so Config::open() fails early
// instead of deferring to the first block spill.
// Dictionary *reading* remains available under zstd_any for opening
// tables written by the C FFI backend.
#[cfg(all(feature = "zstd-pure", not(feature = "zstd")))]
if self
.data_block_compression_policy
.iter()
.any(|ct| matches!(ct, CompressionType::ZstdDict { .. }))
{
return Err(crate::Error::Io(std::io::Error::new(
std::io::ErrorKind::Unsupported,
"zstd dictionary compression is not supported by the pure Rust backend",
)));
}

let dict_id = self.zstd_dictionary.as_ref().map(|d| d.id());

// NOTE: Only data block policies are validated. Index blocks never
Expand Down
Loading
Loading