diff --git a/Cargo.lock b/Cargo.lock index 2c1a294ee9..c400f3653d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1507,6 +1507,8 @@ dependencies = [ name = "gix-blame" version = "0.10.0" dependencies = [ + "criterion", + "gix", "gix-commitgraph", "gix-date", "gix-diff", @@ -1559,6 +1561,7 @@ dependencies = [ "gix-hash", "gix-testtools", "memmap2", + "murmur3", "nonempty", "serde", ] @@ -3549,6 +3552,12 @@ dependencies = [ "uuid", ] +[[package]] +name = "murmur3" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9252111cf132ba0929b6f8e030cac2a24b507f3a4d6db6fb2896f27b354c714b" + [[package]] name = "native-tls" version = "0.2.18" diff --git a/gitoxide-core/src/repository/log.rs b/gitoxide-core/src/repository/log.rs index ed4d1cb4fd..49299bf9fc 100644 --- a/gitoxide-core/src/repository/log.rs +++ b/gitoxide-core/src/repository/log.rs @@ -1,5 +1,4 @@ -use anyhow::bail; -use gix::bstr::{BString, ByteSlice}; +use gix::bstr::{BStr, BString, ByteSlice}; pub fn log(mut repo: gix::Repository, out: &mut dyn std::io::Write, path: Option) -> anyhow::Result<()> { repo.object_cache_size_if_unset(repo.compute_object_cache_size_for_tree_diffs(&**repo.index_or_empty()?)); @@ -25,8 +24,20 @@ fn log_all(repo: gix::Repository, out: &mut dyn std::io::Write) -> Result<(), an Ok(()) } -fn log_file(_repo: gix::Repository, _out: &mut dyn std::io::Write, _path: BString) -> anyhow::Result<()> { - bail!("File-based lookup isn't yet implemented in a way that is competitively fast"); +fn log_file(repo: gix::Repository, out: &mut dyn std::io::Write, path: BString) -> anyhow::Result<()> { + let path = gix::path::to_unix_separators_on_windows(path.as_bstr()).into_owned(); + let head = repo.head()?.peel_to_commit()?; + let cache = repo.commit_graph_if_enabled()?; + let topo = gix::traverse::commit::topo::Builder::from_iters(&repo.objects, [head.id], None::>) + .build()?; + + for info in topo { + let info = info?; + if commit_changes_path(&repo, cache.as_ref(), &info, path.as_ref())? { + write_info(&repo, &mut *out, &info)?; + } + } + Ok(()) } fn write_info( @@ -48,3 +59,41 @@ fn write_info( Ok(()) } + +fn commit_changes_path( + repo: &gix::Repository, + cache: Option<&gix::commitgraph::Graph>, + info: &gix::traverse::commit::Info, + path: &BStr, +) -> anyhow::Result { + let commit = repo.find_commit(info.id)?; + let commit_tree = commit.tree()?; + let commit_entry = lookup_path_entry(&commit_tree, path)?; + + if info.parent_ids.is_empty() { + return Ok(commit_entry.is_some()); + } + + for (index, parent_id) in info.parent_ids.iter().enumerate() { + if index == 0 && cache.and_then(|graph| graph.maybe_contains_path_by_id(info.id, path)) == Some(false) { + continue; + } + + let parent = repo.find_commit(*parent_id)?; + let parent_tree = parent.tree()?; + let parent_entry = lookup_path_entry(&parent_tree, path)?; + if commit_entry != parent_entry { + return Ok(true); + } + } + + Ok(false) +} + +fn lookup_path_entry( + tree: &gix::Tree<'_>, + path: &BStr, +) -> anyhow::Result> { + let entry = tree.lookup_entry(path.split(|b| *b == b'/'))?; + Ok(entry.map(|entry| (entry.mode(), entry.object_id()))) +} diff --git a/gix-blame/Cargo.toml b/gix-blame/Cargo.toml index fdbfc527f0..793d55c43b 100644 --- a/gix-blame/Cargo.toml +++ b/gix-blame/Cargo.toml @@ -10,6 +10,11 @@ authors = ["Christoph Rüßler ", "Sebastian Thi edition = "2021" rust-version = "1.82" +[[bench]] +name = "incremental-blame" +harness = false +path = "./benches/incremental_blame.rs" + [features] ## Enable support for the SHA-1 hash by forwarding the feature to dependencies. sha1 = [ @@ -38,6 +43,8 @@ smallvec = "1.15.1" thiserror = "2.0.18" [dev-dependencies] +criterion = "0.8.0" +gix = { path = "../gix", default-features = false, features = ["basic", "sha1"] } gix-ref = { path = "../gix-ref" } gix-filter = { path = "../gix-filter" } gix-fs = { path = "../gix-fs" } diff --git a/gix-blame/benches/incremental_blame.rs b/gix-blame/benches/incremental_blame.rs new file mode 100644 index 0000000000..cf045b6f21 --- /dev/null +++ b/gix-blame/benches/incremental_blame.rs @@ -0,0 +1,134 @@ +use std::{ + env, + ops::ControlFlow, + path::{Path, PathBuf}, + process::Command, +}; + +use criterion::{criterion_group, criterion_main, Criterion}; +use gix_blame::{BlameEntry, BlameRanges, BlameSink}; +use gix_object::bstr::BString; + +const DEFAULT_BENCH_PATH: &str = "gix-blame/src/file/function.rs"; + +struct DiscardSink; + +impl BlameSink for DiscardSink { + fn push(&mut self, _entry: BlameEntry) -> ControlFlow<()> { + ControlFlow::Continue(()) + } +} + +fn incremental_options() -> gix_blame::Options { + gix_blame::Options { + diff_algorithm: gix_diff::blob::Algorithm::Histogram, + ranges: BlameRanges::default(), + since: None, + rewrites: Some(gix_diff::Rewrites::default()), + debug_track_path: false, + } +} + +fn benchmark_incremental_blame(c: &mut Criterion) { + let repo_path = env::var_os("GIX_BLAME_BENCH_REPO").map_or_else( + || { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .parent() + .expect("gix-blame crate has a parent directory") + .to_path_buf() + }, + PathBuf::from, + ); + let source_file_path: BString = env::var("GIX_BLAME_BENCH_PATH") + .unwrap_or_else(|_| DEFAULT_BENCH_PATH.into()) + .into_bytes() + .into(); + let commit_spec = env::var("GIX_BLAME_BENCH_COMMIT").unwrap_or_else(|_| "HEAD".into()); + let git_dir = repo_path.join(".git"); + + if !has_commit_graph_cache(&git_dir) { + write_changed_path_commit_graph(&repo_path).expect("commit-graph should be writable for benchmark repository"); + } + + let repo = gix::open(&repo_path).expect("repository can be opened"); + let suspect = repo + .rev_parse_single(commit_spec.as_str()) + .expect("commit spec resolves to one object") + .detach(); + + let mut group = c.benchmark_group("gix-blame::incremental"); + group.bench_function("without-commit-graph", |b| { + let mut resource_cache = repo + .diff_resource_cache_for_tree_diff() + .expect("tree-diff resource cache can be created"); + + b.iter(|| { + let mut sink = DiscardSink; + gix_blame::incremental( + &repo, + suspect, + None, + &mut resource_cache, + source_file_path.as_ref(), + &mut sink, + incremental_options(), + ) + .expect("incremental blame should succeed"); + }); + }); + group.bench_function("with-commit-graph", |b| { + let mut resource_cache = repo + .diff_resource_cache_for_tree_diff() + .expect("tree-diff resource cache can be created"); + let cache = repo.commit_graph().expect("commit-graph can be loaded from repository"); + b.iter(|| { + let mut sink = DiscardSink; + gix_blame::incremental( + &repo, + suspect, + Some(&cache), + &mut resource_cache, + source_file_path.as_ref(), + &mut sink, + incremental_options(), + ) + .expect("incremental blame should succeed"); + }); + }); + group.finish(); +} + +fn has_commit_graph_cache(git_dir: &Path) -> bool { + let info_dir = git_dir.join("objects/info"); + info_dir.join("commit-graph").is_file() || info_dir.join("commit-graphs").is_dir() +} + +fn write_changed_path_commit_graph(worktree_path: &Path) -> std::io::Result<()> { + let config_status = Command::new("git") + .args(["config", "commitGraph.changedPathsVersion", "2"]) + .current_dir(worktree_path) + .status()?; + assert!( + config_status.success(), + "setting commitGraph.changedPathsVersion should succeed" + ); + + let write_status = Command::new("git") + .args([ + "commit-graph", + "write", + "--no-progress", + "--reachable", + "--changed-paths", + ]) + .current_dir(worktree_path) + .status()?; + assert!( + write_status.success(), + "writing changed-path commit-graph should succeed" + ); + Ok(()) +} + +criterion_group!(benches, benchmark_incremental_blame); +criterion_main!(benches); diff --git a/gix-blame/src/file/function.rs b/gix-blame/src/file/function.rs index cf6c0400fb..fbbcaec7a6 100644 --- a/gix-blame/src/file/function.rs +++ b/gix-blame/src/file/function.rs @@ -1,4 +1,4 @@ -use std::num::NonZeroU32; +use std::ops::ControlFlow; use gix_diff::{blob::intern::TokenSource, tree::Visit}; use gix_hash::ObjectId; @@ -9,8 +9,8 @@ use gix_object::{ use gix_traverse::commit::find as find_commit; use smallvec::SmallVec; -use super::{process_changes, Change, UnblamedHunk}; -use crate::{types::BlamePathEntry, BlameEntry, Error, Options, Outcome, Statistics}; +use super::{coalesce_blame_entries, process_changes, Change, UnblamedHunk}; +use crate::{types::BlamePathEntry, BlameEntry, BlameSink, Error, IncrementalOutcome, Options, Outcome, Statistics}; /// Produce a list of consecutive [`BlameEntry`] instances to indicate in which commits the ranges of the file /// at `suspect:` originated in. @@ -60,26 +60,21 @@ use crate::{types::BlamePathEntry, BlameEntry, Error, Options, Outcome, Statisti /// <---><----------><-------><-----><-------> /// <---><---><-----><-------><-----><-------> /// <---><---><-----><-------><-----><-><-><-> -pub fn file( +pub fn incremental( odb: impl gix_object::Find + gix_object::FindHeader, suspect: ObjectId, - cache: Option, + cache: Option<&gix_commitgraph::Graph>, resource_cache: &mut gix_diff::blob::Platform, file_path: &BStr, + sink: &mut impl BlameSink, options: Options, -) -> Result { - let _span = gix_trace::coarse!("gix_blame::file()", ?file_path, ?suspect); +) -> Result { + let _span = gix_trace::coarse!("gix_blame::incremental()", ?file_path, ?suspect); let mut stats = Statistics::default(); let (mut buf, mut buf2, mut buf3) = (Vec::new(), Vec::new(), Vec::new()); let blamed_file_entry_id = find_path_entry_in_commit( - &odb, - &suspect, - file_path, - cache.as_ref(), - &mut buf, - &mut buf2, - &mut stats, + &odb, &suspect, file_path, cache, &mut buf, &mut buf2, &mut stats, )? .ok_or_else(|| Error::FileMissing { file_path: file_path.to_owned(), @@ -90,7 +85,7 @@ pub fn file( // Binary or otherwise empty? if num_lines_in_blamed == 0 { - return Ok(Outcome::default()); + return Ok(IncrementalOutcome::default()); } let ranges_to_blame = options.ranges.to_zero_based_exclusive_ranges(num_lines_in_blamed); @@ -100,12 +95,11 @@ pub fn file( .collect::>(); let (mut buf, mut buf2) = (Vec::new(), Vec::new()); - let commit = find_commit(cache.as_ref(), &odb, &suspect, &mut buf)?; + let commit = find_commit(cache, &odb, &suspect, &mut buf)?; let mut queue: gix_revwalk::PriorityQueue = gix_revwalk::PriorityQueue::new(); queue.insert(commit.commit_time()?, suspect); - let mut out = Vec::new(); let mut diff_state = gix_diff::tree::State::default(); let mut previous_entry: Option<(ObjectId, ObjectId)> = None; let mut blame_path = if options.debug_track_path { @@ -113,6 +107,7 @@ pub fn file( } else { None }; + let mut stopped_early = false; 'outer: while let Some(suspect) = queue.pop_value() { stats.commits_traversed += 1; @@ -132,20 +127,23 @@ pub fn file( .clone() .unwrap_or_else(|| file_path.to_owned()); - let commit = find_commit(cache.as_ref(), &odb, &suspect, &mut buf)?; + let commit = find_commit(cache, &odb, &suspect, &mut buf)?; let commit_time = commit.commit_time()?; if let Some(since) = options.since { if commit_time < since.seconds { - if unblamed_to_out_is_done(&mut hunks_to_blame, &mut out, suspect) { - break 'outer; + match emit_unblamed_hunks(&mut hunks_to_blame, sink, suspect) { + ControlFlow::Continue(true) => break 'outer, + ControlFlow::Continue(false) => continue, + ControlFlow::Break(()) => { + stopped_early = true; + break 'outer; + } } - - continue; } } - let parent_ids: ParentIds = collect_parents(commit, &odb, cache.as_ref(), &mut buf2)?; + let parent_ids: ParentIds = collect_parents(commit, &odb, cache, &mut buf2)?; if parent_ids.is_empty() { if queue.is_empty() { @@ -154,31 +152,52 @@ pub fn file( // the remaining lines to it, even though we don’t explicitly check whether that is // true here. We could perhaps use diff-tree-to-tree to compare `suspect` against // an empty tree to validate this assumption. - if unblamed_to_out_is_done(&mut hunks_to_blame, &mut out, suspect) { - if let Some(ref mut blame_path) = blame_path { - let entry = previous_entry - .take() - .filter(|(id, _)| *id == suspect) - .map(|(_, entry)| entry); - - let blame_path_entry = BlamePathEntry { - source_file_path: current_file_path.clone(), - previous_source_file_path: None, - commit_id: suspect, - blob_id: entry.unwrap_or(ObjectId::null(gix_hash::Kind::Sha1)), - previous_blob_id: ObjectId::null(gix_hash::Kind::Sha1), - parent_index: 0, - }; - blame_path.push(blame_path_entry); - } + match emit_unblamed_hunks(&mut hunks_to_blame, sink, suspect) { + ControlFlow::Continue(true) => { + if let Some(ref mut blame_path) = blame_path { + let entry = previous_entry + .take() + .filter(|(id, _)| *id == suspect) + .map(|(_, entry)| entry); - break 'outer; + let blame_path_entry = BlamePathEntry { + source_file_path: current_file_path.clone(), + previous_source_file_path: None, + commit_id: suspect, + blob_id: entry.unwrap_or(ObjectId::null(gix_hash::Kind::Sha1)), + previous_blob_id: ObjectId::null(gix_hash::Kind::Sha1), + parent_index: 0, + }; + blame_path.push(blame_path_entry); + } + + break 'outer; + } + ControlFlow::Continue(false) => {} + ControlFlow::Break(()) => { + stopped_early = true; + break 'outer; + } } } // There is more, keep looking. continue; } + let first_parent_bloom_result = + maybe_changed_path_in_bloom_filter(cache, suspect, current_file_path.as_ref(), &mut stats); + + if first_parent_bloom_result == Some(false) { + let (first_parent_id, first_parent_commit_time) = parent_ids[0]; + pass_blame_from_to(suspect, first_parent_id, &mut hunks_to_blame); + queue.insert(first_parent_commit_time, first_parent_id); + previous_entry = previous_entry + .take() + .filter(|(id, _)| *id == suspect) + .map(|(_, entry)| (first_parent_id, entry)); + continue 'outer; + } + let mut entry = previous_entry .take() .filter(|(id, _)| *id == suspect) @@ -188,7 +207,7 @@ pub fn file( &odb, &suspect, current_file_path.as_ref(), - cache.as_ref(), + cache, &mut buf, &mut buf2, &mut stats, @@ -239,7 +258,7 @@ pub fn file( &odb, parent_id, current_file_path.as_ref(), - cache.as_ref(), + cache, &mut buf, &mut buf2, &mut stats, @@ -259,12 +278,13 @@ pub fn file( let more_than_one_parent = parent_ids.len() > 1; for (index, (parent_id, parent_commit_time)) in parent_ids.iter().enumerate() { queue.insert(*parent_commit_time, *parent_id); + let changes_for_file_path = tree_diff_at_file_path( &odb, current_file_path.as_ref(), suspect, *parent_id, - cache.as_ref(), + cache, &mut stats, &mut diff_state, resource_cache, @@ -274,6 +294,9 @@ pub fn file( options.rewrites, )?; let Some(modification) = changes_for_file_path else { + if index == 0 && first_parent_bloom_result == Some(true) { + stats.bloom_false_positive += 1; + } if more_than_one_parent { // None of the changes affected the file we’re currently blaming. // Copy blame to parent. @@ -292,20 +315,29 @@ pub fn file( // Do nothing under the assumption that this always (or almost always) // implies that the file comes from a different parent, compared to which // it was modified, not added. - } else if unblamed_to_out_is_done(&mut hunks_to_blame, &mut out, suspect) { - if let Some(ref mut blame_path) = blame_path { - let blame_path_entry = BlamePathEntry { - source_file_path: current_file_path.clone(), - previous_source_file_path: None, - commit_id: suspect, - blob_id: id, - previous_blob_id: ObjectId::null(gix_hash::Kind::Sha1), - parent_index: index, - }; - blame_path.push(blame_path_entry); + } else { + match emit_unblamed_hunks(&mut hunks_to_blame, sink, suspect) { + ControlFlow::Continue(true) => { + if let Some(ref mut blame_path) = blame_path { + let blame_path_entry = BlamePathEntry { + source_file_path: current_file_path.clone(), + previous_source_file_path: None, + commit_id: suspect, + blob_id: id, + previous_blob_id: ObjectId::null(gix_hash::Kind::Sha1), + parent_index: index, + }; + blame_path.push(blame_path_entry); + } + + break 'outer; + } + ControlFlow::Continue(false) => {} + ControlFlow::Break(()) => { + stopped_early = true; + break 'outer; + } } - - break 'outer; } } TreeDiffChange::Deletion => { @@ -322,7 +354,7 @@ pub fn file( options.diff_algorithm, &mut stats, )?; - hunks_to_blame = process_changes(hunks_to_blame, changes.clone(), suspect, *parent_id); + hunks_to_blame = process_changes(hunks_to_blame, &changes, suspect, *parent_id); if let Some(ref mut blame_path) = blame_path { let has_blame_been_passed = hunks_to_blame.iter().any(|hunk| hunk.has_suspect(parent_id)); @@ -354,7 +386,7 @@ pub fn file( options.diff_algorithm, &mut stats, )?; - hunks_to_blame = process_changes(hunks_to_blame, changes, suspect, *parent_id); + hunks_to_blame = process_changes(hunks_to_blame, &changes, suspect, *parent_id); let mut has_blame_been_passed = false; @@ -383,39 +415,65 @@ pub fn file( } } - hunks_to_blame.retain_mut(|unblamed_hunk| { - if unblamed_hunk.suspects.len() == 1 { - if let Some(entry) = BlameEntry::from_unblamed_hunk(unblamed_hunk, suspect) { - // At this point, we have copied blame for every hunk to a parent. Hunks - // that have only `suspect` left in `suspects` have not passed blame to any - // parent, and so they can be converted to a `BlameEntry` and moved to - // `out`. - out.push(entry); - return false; - } - } - unblamed_hunk.remove_blame(suspect); - true - }); + if let ControlFlow::Break(()) = emit_single_suspect_hunks(&mut hunks_to_blame, sink, suspect) { + stopped_early = true; + break 'outer; + } } - debug_assert_eq!( - hunks_to_blame, - vec![], - "only if there is no portion of the file left we have completed the blame" - ); + if !stopped_early { + debug_assert_eq!( + hunks_to_blame, + vec![], + "only if there is no portion of the file left we have completed the blame" + ); + } - // I don’t know yet whether it would make sense to use a data structure instead that preserves - // order on insertion. - out.sort_by(|a, b| a.start_in_blamed_file.cmp(&b.start_in_blamed_file)); - Ok(Outcome { - entries: coalesce_blame_entries(out), + Ok(IncrementalOutcome { blob: blamed_file_blob, statistics: stats, blame_path, }) } +/// Produce a list of consecutive [`BlameEntry`] instances to indicate in which commits the ranges of the file +/// at `suspect:` originated in. +/// +/// This is built on top of [`incremental()`], collecting entries into a [`Vec`] sink. +pub fn file( + odb: impl gix_object::Find + gix_object::FindHeader, + suspect: ObjectId, + cache: Option, + resource_cache: &mut gix_diff::blob::Platform, + file_path: &BStr, + options: Options, +) -> Result { + let mut entries = Vec::new(); + let IncrementalOutcome { + blob, + statistics, + blame_path, + } = incremental( + odb, + suspect, + cache.as_ref(), + resource_cache, + file_path, + &mut entries, + options, + )?; + + // Keep the stable output semantics of `file()` even though `incremental()` emits in generation order. + entries.sort_by(|a, b| a.start_in_blamed_file.cmp(&b.start_in_blamed_file)); + + Ok(Outcome { + entries: coalesce_blame_entries(entries), + blob, + statistics, + blame_path, + }) +} + /// Pass ownership of each unblamed hunk of `from` to `to`. /// /// This happens when `from` didn't actually change anything in the blamed file. @@ -425,73 +483,60 @@ fn pass_blame_from_to(from: ObjectId, to: ObjectId, hunks_to_blame: &mut Vec, - out: &mut Vec, + sink: &mut impl BlameSink, suspect: ObjectId, -) -> bool { +) -> ControlFlow<(), bool> { let mut without_suspect = Vec::new(); - out.extend(hunks_to_blame.drain(..).filter_map(|hunk| { - BlameEntry::from_unblamed_hunk(&hunk, suspect).or_else(|| { + let mut remaining_hunks = std::mem::take(hunks_to_blame).into_iter(); + while let Some(hunk) = remaining_hunks.next() { + if let Some(entry) = BlameEntry::from_unblamed_hunk(&hunk, suspect) { + if let ControlFlow::Break(()) = sink.push(entry) { + without_suspect.extend(remaining_hunks); + *hunks_to_blame = without_suspect; + return ControlFlow::Break(()); + } + } else { without_suspect.push(hunk); - None - }) - })); + } + } *hunks_to_blame = without_suspect; - hunks_to_blame.is_empty() + ControlFlow::Continue(hunks_to_blame.is_empty()) } -/// This function merges adjacent blame entries. It merges entries that are adjacent both in the -/// blamed file and in the source file that introduced them. This follows `git`’s -/// behaviour. `libgit2`, as of 2024-09-19, only checks whether two entries are adjacent in the -/// blamed file which can result in different blames in certain edge cases. See [the commit][1] -/// that introduced the extra check into `git` for context. See [this commit][2] for a way to test -/// for this behaviour in `git`. -/// -/// [1]: https://github.com/git/git/commit/c2ebaa27d63bfb7c50cbbdaba90aee4efdd45d0a -/// [2]: https://github.com/git/git/commit/6dbf0c7bebd1c71c44d786ebac0f2b3f226a0131 -fn coalesce_blame_entries(lines_blamed: Vec) -> Vec { - let len = lines_blamed.len(); - lines_blamed - .into_iter() - .fold(Vec::with_capacity(len), |mut acc, entry| { - let previous_entry = acc.last(); - - if let Some(previous_entry) = previous_entry { - let previous_blamed_range = previous_entry.range_in_blamed_file(); - let current_blamed_range = entry.range_in_blamed_file(); - let previous_source_range = previous_entry.range_in_source_file(); - let current_source_range = entry.range_in_source_file(); - if previous_entry.commit_id == entry.commit_id - && previous_blamed_range.end == current_blamed_range.start - // As of 2024-09-19, the check below only is in `git`, but not in `libgit2`. - && previous_source_range.end == current_source_range.start - { - let coalesced_entry = BlameEntry { - start_in_blamed_file: previous_blamed_range.start as u32, - start_in_source_file: previous_source_range.start as u32, - len: NonZeroU32::new((current_source_range.end - previous_source_range.start) as u32) - .expect("BUG: hunks are never zero-sized"), - commit_id: previous_entry.commit_id, - source_file_name: previous_entry.source_file_name.clone(), - }; - - acc.pop(); - acc.push(coalesced_entry); - } else { - acc.push(entry); +/// Convert hunks that still have `suspect` as their only candidate into [`BlameEntry`] values. +fn emit_single_suspect_hunks( + hunks_to_blame: &mut Vec, + sink: &mut impl BlameSink, + suspect: ObjectId, +) -> ControlFlow<()> { + let mut remaining_hunks = Vec::with_capacity(hunks_to_blame.len()); + let mut pending_hunks = std::mem::take(hunks_to_blame).into_iter(); + while let Some(mut unblamed_hunk) = pending_hunks.next() { + if unblamed_hunk.suspects.len() == 1 { + if let Some(entry) = BlameEntry::from_unblamed_hunk(&unblamed_hunk, suspect) { + // At this point, we have copied blame for every hunk to a parent. Hunks that have + // only `suspect` left in `suspects` have not passed blame to any parent, and so + // they can be converted to a `BlameEntry` and moved to the sink. + if let ControlFlow::Break(()) = sink.push(entry) { + remaining_hunks.extend(pending_hunks); + *hunks_to_blame = remaining_hunks; + return ControlFlow::Break(()); } - - acc - } else { - acc.push(entry); - - acc + continue; } - }) + } + unblamed_hunk.remove_blame(suspect); + remaining_hunks.push(unblamed_hunk); + } + + *hunks_to_blame = remaining_hunks; + ControlFlow::Continue(()) } /// The union of [`gix_diff::tree::recorder::Change`] and [`gix_diff::tree_with_rewrites::Change`], @@ -879,8 +924,111 @@ fn collect_parents( Ok(parent_ids) } +fn maybe_changed_path_in_bloom_filter( + cache: Option<&gix_commitgraph::Graph>, + commit_id: ObjectId, + path: &BStr, + stats: &mut Statistics, +) -> Option { + if let Some(cache) = cache { + let result = cache.maybe_contains_path_by_id(commit_id, path); + match result { + Some(false) => { + stats.bloom_queries += 1; + stats.bloom_definitely_not += 1; + } + Some(true) => { + stats.bloom_queries += 1; + stats.bloom_maybe += 1; + } + None => { + stats.bloom_filter_not_present += 1; + } + } + result + } else { + None + } +} + /// Return an iterator over tokens for use in diffing. These are usually lines, but it's important /// to unify them so the later access shows the right thing. pub(crate) fn tokens_for_diffing(data: &[u8]) -> impl TokenSource { gix_diff::blob::sources::byte_lines_with_terminator(data) } + +#[cfg(test)] +mod tests { + use gix_object::bstr::BString; + use gix_testtools::Result; + + use super::{file, incremental}; + use crate::{file::coalesce_blame_entries, BlameRanges, Options}; + + struct Fixture { + repo: gix::Repository, + resource_cache: gix_diff::blob::Platform, + suspect: gix_hash::ObjectId, + } + + impl Fixture { + fn new() -> Result { + let repo_path = gix_testtools::scripted_fixture_read_only("make_blame_repo.sh")?; + let repo = gix::open(&repo_path)?; + let suspect = repo.rev_parse_single("HEAD")?.detach(); + let resource_cache = repo.diff_resource_cache_for_tree_diff()?; + Ok(Self { + repo, + resource_cache, + suspect, + }) + } + } + + fn options() -> Options { + Options { + diff_algorithm: gix_diff::blob::Algorithm::Histogram, + ranges: BlameRanges::default(), + since: None, + rewrites: Some(gix_diff::Rewrites::default()), + debug_track_path: false, + } + } + + #[test] + fn vec_sink_can_reconstruct_file_entries() -> Result { + let Fixture { + repo, + mut resource_cache, + suspect, + } = Fixture::new()?; + let source_file_name: BString = "simple.txt".into(); + + let mut streamed_entries = Vec::new(); + incremental( + &repo, + suspect, + None, + &mut resource_cache, + source_file_name.as_ref(), + &mut streamed_entries, + options(), + )?; + + streamed_entries.sort_by(|a, b| a.start_in_blamed_file.cmp(&b.start_in_blamed_file)); + let streamed_entries = coalesce_blame_entries(streamed_entries); + + let file_entries = file( + &repo, + suspect, + None, + &mut resource_cache, + source_file_name.as_ref(), + options(), + )? + .entries; + + pretty_assertions::assert_eq!(streamed_entries, file_entries); + Ok(()) + } +} diff --git a/gix-blame/src/file/mod.rs b/gix-blame/src/file/mod.rs index 85b3f3a8f3..4e6cd561dc 100644 --- a/gix-blame/src/file/mod.rs +++ b/gix-blame/src/file/mod.rs @@ -8,6 +8,56 @@ use crate::types::{BlameEntry, Change, Either, LineRange, Offset, UnblamedHunk}; pub(super) mod function; +/// This function merges adjacent blame entries. It merges entries that are adjacent both in the +/// blamed file and in the source file that introduced them. This follows `git`'s +/// behaviour. `libgit2`, as of 2024-09-19, only checks whether two entries are adjacent in the +/// blamed file which can result in different blames in certain edge cases. See [the commit][1] +/// that introduced the extra check into `git` for context. See [this commit][2] for a way to test +/// for this behaviour in `git`. +/// +/// [1]: https://github.com/git/git/commit/c2ebaa27d63bfb7c50cbbdaba90aee4efdd45d0a +/// [2]: https://github.com/git/git/commit/6dbf0c7bebd1c71c44d786ebac0f2b3f226a0131 +pub(super) fn coalesce_blame_entries(lines_blamed: Vec) -> Vec { + let len = lines_blamed.len(); + lines_blamed + .into_iter() + .fold(Vec::with_capacity(len), |mut acc, entry| { + let previous_entry = acc.last(); + + if let Some(previous_entry) = previous_entry { + let previous_blamed_range = previous_entry.range_in_blamed_file(); + let current_blamed_range = entry.range_in_blamed_file(); + let previous_source_range = previous_entry.range_in_source_file(); + let current_source_range = entry.range_in_source_file(); + if previous_entry.commit_id == entry.commit_id + && previous_blamed_range.end == current_blamed_range.start + // As of 2024-09-19, the check below only is in `git`, but not in `libgit2`. + && previous_source_range.end == current_source_range.start + { + let coalesced_entry = BlameEntry { + start_in_blamed_file: previous_blamed_range.start as u32, + start_in_source_file: previous_source_range.start as u32, + len: NonZeroU32::new((current_source_range.end - previous_source_range.start) as u32) + .expect("BUG: hunks are never zero-sized"), + commit_id: previous_entry.commit_id, + source_file_name: previous_entry.source_file_name.clone(), + }; + + acc.pop(); + acc.push(coalesced_entry); + } else { + acc.push(entry); + } + + acc + } else { + acc.push(entry); + + acc + } + }) +} + /// Compare a section from a potential *Source File* (`hunk`) with a change from a diff and see if /// there is an intersection with `change`. Based on that intersection, we may generate a /// [`BlameEntry`] for `out` and/or split the `hunk` into multiple. @@ -322,24 +372,36 @@ fn process_change( /// Consume `hunks_to_blame` and `changes` to pair up matches ranges (also overlapping) with each other. /// Once a match is found, it's pushed onto `out`. /// -/// `process_changes` assumes that ranges coming from the same *Source File* can and do -/// occasionally overlap. If it were a desirable property of the blame algorithm as a whole to -/// never have two different lines from a *Blamed File* mapped to the same line in a *Source File*, -/// this property would need to be enforced at a higher level than `process_changes`. -/// Then the nested loops could potentially be flattened into one. +/// Uses a cursor to avoid restarting the changes iteration from the beginning for each hunk. +/// When suspect ranges overlap (rare, from merge blame convergence), the cursor resets to +/// maintain correctness. fn process_changes( hunks_to_blame: Vec, - changes: Vec, + changes: &[Change], suspect: ObjectId, parent: ObjectId, ) -> Vec { let mut new_hunks_to_blame = Vec::new(); + let mut offset_in_destination = Offset::Added(0); + let mut changes_pos = 0; + let mut last_suspect_end: Option = None; + + for hunk in hunks_to_blame { + if !hunk.has_suspect(&suspect) { + new_hunks_to_blame.push(hunk); + continue; + } - for mut hunk in hunks_to_blame.into_iter().map(Some) { - let mut offset_in_destination = Offset::Added(0); + let suspect_range = hunk.get_range(&suspect).expect("has_suspect was true"); + if last_suspect_end.is_some_and(|end| suspect_range.start < end) { + changes_pos = 0; + offset_in_destination = Offset::Added(0); + } + last_suspect_end = Some(suspect_range.end); - let mut changes_iter = changes.iter().cloned(); - let mut change = changes_iter.next(); + let mut hunk = Some(hunk); + let mut pos = changes_pos; + let mut change: Option = changes.get(pos).cloned(); loop { (hunk, change) = process_change( @@ -351,12 +413,17 @@ fn process_changes( change, ); - change = change.or_else(|| changes_iter.next()); + if change.is_none() { + pos += 1; + change = changes.get(pos).cloned(); + } if hunk.is_none() { break; } } + + changes_pos = pos; } new_hunks_to_blame } diff --git a/gix-blame/src/file/tests.rs b/gix-blame/src/file/tests.rs index 2e60524205..ef5b48f0c7 100644 --- a/gix-blame/src/file/tests.rs +++ b/gix-blame/src/file/tests.rs @@ -742,7 +742,7 @@ mod process_changes { fn nothing() { let suspect = zero_sha(); let parent = one_sha(); - let new_hunks_to_blame = process_changes(vec![], vec![], suspect, parent); + let new_hunks_to_blame = process_changes(vec![], &[], suspect, parent); assert_eq!(new_hunks_to_blame, []); } @@ -753,7 +753,7 @@ mod process_changes { let parent = one_sha(); let hunks_to_blame = vec![(0..4, suspect).into()]; let changes = vec![Change::AddedOrReplaced(0..4, 0)]; - let new_hunks_to_blame = process_changes(hunks_to_blame, changes, suspect, parent); + let new_hunks_to_blame = process_changes(hunks_to_blame, &changes, suspect, parent); assert_eq!(new_hunks_to_blame, [(0..4, suspect).into()]); } @@ -764,7 +764,7 @@ mod process_changes { let parent = one_sha(); let hunks_to_blame = vec![(0..6, suspect).into()]; let changes = vec![Change::AddedOrReplaced(0..4, 0), Change::Unchanged(4..6)]; - let new_hunks_to_blame = process_changes(hunks_to_blame, changes, suspect, parent); + let new_hunks_to_blame = process_changes(hunks_to_blame, &changes, suspect, parent); assert_eq!( new_hunks_to_blame, @@ -782,7 +782,7 @@ mod process_changes { Change::AddedOrReplaced(2..4, 0), Change::Unchanged(4..6), ]; - let new_hunks_to_blame = process_changes(hunks_to_blame, changes, suspect, parent); + let new_hunks_to_blame = process_changes(hunks_to_blame, &changes, suspect, parent); assert_eq!( new_hunks_to_blame, @@ -804,7 +804,7 @@ mod process_changes { Change::AddedOrReplaced(1..4, 0), Change::Unchanged(4..6), ]; - let new_hunks_to_blame = process_changes(hunks_to_blame, changes, suspect, parent); + let new_hunks_to_blame = process_changes(hunks_to_blame, &changes, suspect, parent); assert_eq!( new_hunks_to_blame, @@ -822,7 +822,7 @@ mod process_changes { let parent = one_sha(); let hunks_to_blame = vec![(0..6, suspect).into()]; let changes = vec![Change::AddedOrReplaced(0..1, 0)]; - let new_hunks_to_blame = process_changes(hunks_to_blame, changes, suspect, parent); + let new_hunks_to_blame = process_changes(hunks_to_blame, &changes, suspect, parent); assert_eq!( new_hunks_to_blame, @@ -836,7 +836,7 @@ mod process_changes { let parent = one_sha(); let hunks_to_blame = vec![(2..6, suspect, 0..4).into()]; let changes = vec![Change::AddedOrReplaced(0..1, 0)]; - let new_hunks_to_blame = process_changes(hunks_to_blame, changes, suspect, parent); + let new_hunks_to_blame = process_changes(hunks_to_blame, &changes, suspect, parent); assert_eq!( new_hunks_to_blame, @@ -850,7 +850,7 @@ mod process_changes { let parent = one_sha(); let hunks_to_blame = vec![(0..6, suspect).into()]; let changes = vec![Change::AddedOrReplaced(0..4, 3), Change::Unchanged(4..6)]; - let new_hunks_to_blame = process_changes(hunks_to_blame, changes, suspect, parent); + let new_hunks_to_blame = process_changes(hunks_to_blame, &changes, suspect, parent); assert_eq!( new_hunks_to_blame, @@ -864,7 +864,7 @@ mod process_changes { let parent = one_sha(); let hunks_to_blame = vec![(4..6, suspect, 3..5).into()]; let changes = vec![Change::AddedOrReplaced(0..3, 0), Change::Unchanged(3..5)]; - let new_hunks_to_blame = process_changes(hunks_to_blame, changes, suspect, parent); + let new_hunks_to_blame = process_changes(hunks_to_blame, &changes, suspect, parent); assert_eq!(new_hunks_to_blame, [(4..6, parent, 0..2).into()]); } @@ -875,7 +875,7 @@ mod process_changes { let parent = one_sha(); let hunks_to_blame = vec![(1..3, suspect, 0..2).into()]; let changes = vec![Change::AddedOrReplaced(0..1, 2)]; - let new_hunks_to_blame = process_changes(hunks_to_blame, changes, suspect, parent); + let new_hunks_to_blame = process_changes(hunks_to_blame, &changes, suspect, parent); assert_eq!( new_hunks_to_blame, @@ -893,7 +893,7 @@ mod process_changes { Change::Unchanged(2..3), Change::AddedOrReplaced(3..4, 0), ]; - let new_hunks_to_blame = process_changes(hunks_to_blame, changes, suspect, parent); + let new_hunks_to_blame = process_changes(hunks_to_blame, &changes, suspect, parent); assert_eq!( new_hunks_to_blame, @@ -915,7 +915,7 @@ mod process_changes { Change::AddedOrReplaced(16..17, 0), Change::Unchanged(17..37), ]; - let new_hunks_to_blame = process_changes(hunks_to_blame, changes, suspect, parent); + let new_hunks_to_blame = process_changes(hunks_to_blame, &changes, suspect, parent); assert_eq!( new_hunks_to_blame, @@ -938,7 +938,7 @@ mod process_changes { Change::AddedOrReplaced(6..9, 0), Change::Unchanged(9..11), ]; - let new_hunks_to_blame = process_changes(hunks_to_blame, changes, suspect, parent); + let new_hunks_to_blame = process_changes(hunks_to_blame, &changes, suspect, parent); assert_eq!( new_hunks_to_blame, @@ -958,7 +958,7 @@ mod process_changes { let parent = one_sha(); let hunks_to_blame = vec![(0..4, suspect).into(), (4..7, suspect).into()]; let changes = vec![Change::Deleted(0, 3), Change::AddedOrReplaced(0..4, 0)]; - let new_hunks_to_blame = process_changes(hunks_to_blame, changes, suspect, parent); + let new_hunks_to_blame = process_changes(hunks_to_blame, &changes, suspect, parent); assert_eq!( new_hunks_to_blame, @@ -972,7 +972,7 @@ mod process_changes { let parent = one_sha(); let hunks_to_blame = vec![(13..16, suspect).into(), (10..17, suspect).into()]; let changes = vec![Change::AddedOrReplaced(10..14, 0)]; - let new_hunks_to_blame = process_changes(hunks_to_blame, changes, suspect, parent); + let new_hunks_to_blame = process_changes(hunks_to_blame, &changes, suspect, parent); assert_eq!( new_hunks_to_blame, diff --git a/gix-blame/src/lib.rs b/gix-blame/src/lib.rs index 2a31c874f8..36e8aa85a7 100644 --- a/gix-blame/src/lib.rs +++ b/gix-blame/src/lib.rs @@ -17,7 +17,7 @@ mod error; pub use error::Error; mod types; -pub use types::{BlameEntry, BlamePathEntry, BlameRanges, Options, Outcome, Statistics}; +pub use types::{BlameEntry, BlamePathEntry, BlameRanges, BlameSink, IncrementalOutcome, Options, Outcome, Statistics}; mod file; -pub use file::function::file; +pub use file::function::{file, incremental}; diff --git a/gix-blame/src/types.rs b/gix-blame/src/types.rs index 79c49b7a62..2b65eefe1f 100644 --- a/gix-blame/src/types.rs +++ b/gix-blame/src/types.rs @@ -4,12 +4,29 @@ use smallvec::SmallVec; use std::ops::RangeInclusive; use std::{ num::NonZeroU32, - ops::{AddAssign, Range, SubAssign}, + ops::{AddAssign, ControlFlow, Range, SubAssign}, }; use crate::file::function::tokens_for_diffing; use crate::Error; +/// Receives [`BlameEntry`] values incrementally as they are discovered by +/// [`incremental()`](crate::incremental()). +pub trait BlameSink { + /// Receive a single blame chunk in generation order. + /// + /// Return [`ControlFlow::Break`] to stop streaming early while still allowing + /// [`incremental()`](crate::incremental()) to return the partial metadata gathered so far. + fn push(&mut self, entry: BlameEntry) -> ControlFlow<()>; +} + +impl BlameSink for Vec { + fn push(&mut self, entry: BlameEntry) -> ControlFlow<()> { + Vec::push(self, entry); + ControlFlow::Continue(()) + } +} + /// A type to represent one or more line ranges to blame in a file. /// /// It handles the conversion between git's 1-based inclusive ranges and the internal @@ -204,6 +221,21 @@ pub struct Outcome { pub blame_path: Option>, } +/// The outcome of [`incremental()`](crate::incremental()). +/// +/// It contains all non-entry information so callers can process [`BlameEntry`] instances +/// incrementally through a [`BlameSink`] while still receiving the metadata that was +/// previously available through [`Outcome`]. +#[derive(Debug, Default, Clone)] +pub struct IncrementalOutcome { + /// A buffer with the file content of the *Blamed File*, ready for tokenization. + pub blob: Vec, + /// Additional information about the amount of work performed to produce the blame. + pub statistics: Statistics, + /// Contains a log of all changes that affected the outcome of this blame. + pub blame_path: Option>, +} + /// Additional information about the performed operations. #[derive(Debug, Default, Copy, Clone)] pub struct Statistics { @@ -222,6 +254,16 @@ pub struct Statistics { /// The amount of blobs there were compared to each other to learn what changed between commits. /// Note that in order to diff a blob, one needs to load both versions from the database. pub blobs_diffed: usize, + /// Number of times changed-path Bloom filters were queried successfully. + pub bloom_queries: usize, + /// Number of queries where Bloom filters were unavailable for the commit. + pub bloom_filter_not_present: usize, + /// Number of Bloom answers that were `maybe`. + pub bloom_maybe: usize, + /// Number of Bloom answers that were `definitely not`. + pub bloom_definitely_not: usize, + /// Number of `maybe` Bloom answers that turned out not to affect the path. + pub bloom_false_positive: usize, } impl Outcome { diff --git a/gix-blame/tests/blame.rs b/gix-blame/tests/blame.rs index 8ac0564904..1f22539fe4 100644 --- a/gix-blame/tests/blame.rs +++ b/gix-blame/tests/blame.rs @@ -1,4 +1,8 @@ -use std::{collections::BTreeMap, path::PathBuf}; +use std::{ + collections::BTreeMap, + path::{Path, PathBuf}, + process::Command, +}; use gix_blame::BlameRanges; use gix_hash::ObjectId; @@ -351,6 +355,56 @@ fn diff_algorithm_parity() { } } +#[test] +fn changed_path_bloom_prefilter_keeps_blame_output_identical() -> gix_testtools::Result { + let worktree_path = fixture_path()?; + write_changed_path_commit_graph(&worktree_path)?; + + let source_file_name: gix_object::bstr::BString = "simple.txt".into(); + let options = gix_blame::Options { + diff_algorithm: gix_diff::blob::Algorithm::Histogram, + ranges: BlameRanges::default(), + since: None, + rewrites: Some(gix_diff::Rewrites::default()), + debug_track_path: false, + }; + + let Fixture { + odb, + mut resource_cache, + suspect, + } = Fixture::for_worktree_path(worktree_path.clone())?; + let without_bloom = gix_blame::file( + &odb, + suspect, + None, + &mut resource_cache, + source_file_name.as_ref(), + options.clone(), + )?; + + let Fixture { + odb, + mut resource_cache, + suspect, + } = Fixture::for_worktree_path(worktree_path.clone())?; + let cache = gix_commitgraph::at(worktree_path.join(".git/objects/info")) + .map_err(|err| std::io::Error::other(format!("loading commit-graph failed: {err}")))?; + let with_bloom = gix_blame::file( + &odb, + suspect, + Some(cache), + &mut resource_cache, + source_file_name.as_ref(), + options, + )?; + + pretty_assertions::assert_eq!(with_bloom.entries, without_bloom.entries); + assert!(with_bloom.statistics.bloom_queries > 0); + assert!(with_bloom.statistics.trees_diffed <= without_bloom.statistics.trees_diffed); + Ok(()) +} + #[test] fn file_that_was_added_in_two_branches() -> gix_testtools::Result { let worktree_path = gix_testtools::scripted_fixture_read_only("make_blame_two_roots_repo.sh")?; @@ -621,6 +675,159 @@ mod rename_tracking { } } +mod incremental { + use std::ops::ControlFlow; + + use gix_blame::{BlameEntry, BlameRanges, BlameSink}; + + use crate::Fixture; + + #[derive(Default)] + struct CountingSink { + chunks_received: usize, + } + + impl BlameSink for CountingSink { + fn push(&mut self, _entry: BlameEntry) -> ControlFlow<()> { + self.chunks_received += 1; + ControlFlow::Continue(()) + } + } + + struct BreakingSink { + chunks_received: usize, + stop_after: usize, + } + + impl BlameSink for BreakingSink { + fn push(&mut self, _entry: BlameEntry) -> ControlFlow<()> { + self.chunks_received += 1; + if self.chunks_received >= self.stop_after { + ControlFlow::Break(()) + } else { + ControlFlow::Continue(()) + } + } + } + + fn options() -> gix_blame::Options { + gix_blame::Options { + diff_algorithm: gix_diff::blob::Algorithm::Histogram, + ranges: BlameRanges::default(), + since: None, + rewrites: Some(gix_diff::Rewrites::default()), + debug_track_path: false, + } + } + + #[test] + fn streams_chunks_to_custom_sink() -> gix_testtools::Result { + let Fixture { + odb, + mut resource_cache, + suspect, + } = Fixture::new()?; + let source_file_name: gix_object::bstr::BString = "simple.txt".into(); + let mut sink = CountingSink::default(); + + gix_blame::incremental( + &odb, + suspect, + None, + &mut resource_cache, + source_file_name.as_ref(), + &mut sink, + options(), + )?; + + assert!( + sink.chunks_received > 0, + "incremental blame should stream at least one chunk" + ); + Ok(()) + } + + #[test] + fn sink_can_stop_streaming_early() -> gix_testtools::Result { + let Fixture { + odb, + mut resource_cache, + suspect, + } = Fixture::new()?; + let source_file_name: gix_object::bstr::BString = "simple.txt".into(); + let mut sink = BreakingSink { + chunks_received: 0, + stop_after: 1, + }; + + let outcome = gix_blame::incremental( + &odb, + suspect, + None, + &mut resource_cache, + source_file_name.as_ref(), + &mut sink, + options(), + )?; + + let Fixture { + odb, + mut resource_cache, + suspect, + } = Fixture::new()?; + let mut full_sink = CountingSink::default(); + gix_blame::incremental( + &odb, + suspect, + None, + &mut resource_cache, + source_file_name.as_ref(), + &mut full_sink, + options(), + )?; + + assert_eq!( + sink.chunks_received, 1, + "the sink should stop after the first streamed chunk" + ); + assert!( + full_sink.chunks_received > sink.chunks_received, + "the early-stopping sink should observe fewer chunks than a full run" + ); + assert!( + !outcome.blob.is_empty(), + "incremental blame should still return partial metadata on early stop" + ); + Ok(()) + } +} + +fn write_changed_path_commit_graph(worktree_path: &Path) -> gix_testtools::Result { + let config_status = Command::new("git") + .args(["config", "commitGraph.changedPathsVersion", "2"]) + .current_dir(worktree_path) + .status()?; + assert!( + config_status.success(), + "setting commitGraph.changedPathsVersion should succeed" + ); + + let write_status = Command::new("git") + .args([ + "commit-graph", + "write", + "--no-progress", + "--reachable", + "--changed-paths", + ]) + .current_dir(worktree_path) + .status()?; + assert!( + write_status.success(), + "writing changed-path commit-graph should succeed" + ); + Ok(()) +} fn fixture_path() -> gix_testtools::Result { gix_testtools::scripted_fixture_read_only("make_blame_repo.sh") } diff --git a/gix-commitgraph/Cargo.toml b/gix-commitgraph/Cargo.toml index ea950dea9d..7aafa41c73 100644 --- a/gix-commitgraph/Cargo.toml +++ b/gix-commitgraph/Cargo.toml @@ -32,6 +32,7 @@ nonempty = "0.12.0" serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"] } document-features = { version = "0.2.0", optional = true } +murmur3 = "0.5.2" [dev-dependencies] gix-testtools = { path = "../tests/tools" } diff --git a/gix-commitgraph/src/access.rs b/gix-commitgraph/src/access.rs index ad89c2ada0..41c78db139 100644 --- a/gix-commitgraph/src/access.rs +++ b/gix-commitgraph/src/access.rs @@ -1,4 +1,4 @@ -use crate::{file, file::Commit, File, Graph, Position}; +use crate::{file, file::Commit, BloomFilterSettings, File, Graph, Position}; /// Access impl Graph { @@ -52,6 +52,11 @@ impl Graph { pub fn num_commits(&self) -> u32 { self.files.iter().map(File::num_commits).sum() } + + /// Return changed-path Bloom filter settings used by the top-most compatible graph layer, if available. + pub fn bloom_filter_settings(&self) -> Option { + self.files.iter().rev().find_map(File::bloom_filter_settings) + } } /// Access fundamentals diff --git a/gix-commitgraph/src/bloom.rs b/gix-commitgraph/src/bloom.rs new file mode 100644 index 0000000000..be2823542d --- /dev/null +++ b/gix-commitgraph/src/bloom.rs @@ -0,0 +1,350 @@ +//! Query support for changed-path Bloom filters stored in commit-graph files. + +use std::io::Cursor; + +use bstr::BStr; + +use crate::{file, from_be_u32, BloomFilterSettings, File, Graph, Position}; + +const SEED0: u32 = 0x293a_e76f; +const SEED1: u32 = 0x7e64_6e2c; +const BITS_PER_WORD: u64 = 8; + +/// Precomputed hash positions for a path using Bloom filter settings. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct BloomKey { + h0: u32, + h1: u32, + num_hashes: u32, +} + +impl BloomKey { + /// Build a key for `path`. + /// + /// `path` must use `/` as separator, matching Git's changed-path Bloom filter expectations. + pub fn from_path(path: &BStr, settings: BloomFilterSettings) -> Self { + Self::from_bytes(path.as_ref(), settings) + } + + /// Build keys for `path` and each directory prefix. + /// + /// For `a/b/c`, this yields keys for `a/b/c`, `a/b`, and `a`. + /// `path` must use `/` as separator. + pub fn from_path_with_prefixes(path: &BStr, settings: BloomFilterSettings) -> Vec { + let bytes = path.as_ref(); + let mut out = vec![Self::from_bytes(bytes, settings)]; + + let mut idx = bytes.len(); + while idx > 0 { + idx -= 1; + if bytes[idx] == b'/' { + out.push(Self::from_bytes(&bytes[..idx], settings)); + } + } + out + } + + fn from_bytes(path: &[u8], settings: BloomFilterSettings) -> Self { + let (h0, h1) = match settings.hash_version { + 1 => (murmur3_v1(SEED0, path), murmur3_v1(SEED1, path)), + 2 => (murmur3_v2(SEED0, path), murmur3_v2(SEED1, path)), + version => panic!("BUG: unsupported Bloom hash version {version} should have been filtered earlier"), + }; + Self { + h0, + h1, + num_hashes: settings.num_hashes, + } + } + + /// Query whether this key may be contained in `filter_data`. + /// + /// Returns `None` if the filter is unusable (empty data), `Some(false)` on a definite miss, + /// and `Some(true)` on a possible hit. + pub fn contains(&self, filter_data: &[u8]) -> Option { + let modulo = (filter_data.len() as u64) * BITS_PER_WORD; + if modulo == 0 { + return None; + } + + for i in 0..self.num_hashes { + let hash = self.h0.wrapping_add(i.wrapping_mul(self.h1)); + let bit_pos = u64::from(hash) % modulo; + let byte_pos = (bit_pos / BITS_PER_WORD) as usize; + let mask = 1u8 << (bit_pos % BITS_PER_WORD); + if filter_data[byte_pos] & mask == 0 { + return Some(false); + } + } + Some(true) + } +} + +impl File { + /// Query if `path` may be present in the changed-path Bloom filter for commit `pos`. + /// + /// Checks the full path and every directory prefix against the filter, + /// matching Git's `bloom_filter_contains_vec()` behavior for reduced false positives. + pub fn maybe_contains_path(&self, pos: file::Position, path: &BStr) -> Option { + let (data, settings) = self.bloom_filter_at(pos)?; + let keys = BloomKey::from_path_with_prefixes(path, settings); + for key in &keys { + match key.contains(data) { + Some(false) => return Some(false), + None => return None, + Some(true) => {} + } + } + Some(true) + } + + /// Query if all `keys` may be present in the changed-path Bloom filter for commit `pos`. + /// + /// This corresponds to Git's `bloom_filter_contains_vec()` behavior. + pub fn maybe_contains_all_keys(&self, pos: file::Position, keys: &[BloomKey]) -> Option { + let (data, _settings) = self.bloom_filter_at(pos)?; + if keys.iter().all(|key| key.contains(data) == Some(true)) { + Some(true) + } else { + Some(false) + } + } + + fn bloom_filter_at(&self, pos: file::Position) -> Option<(&[u8], BloomFilterSettings)> { + let settings = self.bloom_filter_settings?; + let index_offset = self.bloom_filter_index_offset?; + let data_offset = self.bloom_filter_data_offset?; + if pos.0 >= self.num_commits() { + return None; + } + + let lex = pos.0 as usize; + let end = from_be_u32(&self.data[index_offset + lex * 4..][..4]); + let start = if lex == 0 { + 0 + } else { + from_be_u32(&self.data[index_offset + (lex - 1) * 4..][..4]) + }; + let start = start as usize; + let end = end as usize; + if start > end || end > self.bloom_filter_data_len { + return None; + } + let start = data_offset.checked_add(start)?; + let end = data_offset.checked_add(end)?; + self.data.get(start..end).map(|data| (data, settings)) + } +} + +impl Graph { + /// Query by commit id if `path` may be present in changed-path Bloom filters. + pub fn maybe_contains_path_by_id(&self, id: impl AsRef, path: &BStr) -> Option { + let pos = self.lookup(id)?; + self.maybe_contains_path(pos, path) + } + + /// Query by graph position if `path` may be present in changed-path Bloom filters. + pub fn maybe_contains_path(&self, pos: Position, path: &BStr) -> Option { + self.commit_at(pos).maybe_contains_path(path) + } +} + +pub(crate) fn murmur3_v1(seed: u32, data: &[u8]) -> u32 { + const C1: u32 = 0xcc9e_2d51; + const C2: u32 = 0x1b87_3593; + const R1: u32 = 15; + const R2: u32 = 13; + const M: u32 = 5; + const N: u32 = 0xe654_6b64; + + fn byte_to_u32(byte: u8) -> u32 { + u32::from_ne_bytes(i32::from(i8::from_ne_bytes([byte])).to_ne_bytes()) + } + + let mut seed = seed; + let chunks = data.chunks_exact(4); + let tail = chunks.remainder(); + for chunk in chunks { + let byte1 = byte_to_u32(chunk[0]); + let byte2 = byte_to_u32(chunk[1]) << 8; + let byte3 = byte_to_u32(chunk[2]) << 16; + let byte4 = byte_to_u32(chunk[3]) << 24; + let mut k = byte1 | byte2 | byte3 | byte4; + k = k.wrapping_mul(C1); + k = k.rotate_left(R1); + k = k.wrapping_mul(C2); + + seed ^= k; + seed = seed.rotate_left(R2).wrapping_mul(M).wrapping_add(N); + } + + let mut k1 = 0u32; + match tail.len() { + 3 => { + k1 ^= byte_to_u32(tail[2]) << 16; + k1 ^= byte_to_u32(tail[1]) << 8; + k1 ^= byte_to_u32(tail[0]); + } + 2 => { + k1 ^= byte_to_u32(tail[1]) << 8; + k1 ^= byte_to_u32(tail[0]); + } + 1 => { + k1 ^= byte_to_u32(tail[0]); + } + 0 => {} + _ => unreachable!("remainder is shorter than 4 bytes"), + } + if !tail.is_empty() { + k1 = k1.wrapping_mul(C1); + k1 = k1.rotate_left(R1); + k1 = k1.wrapping_mul(C2); + seed ^= k1; + } + + seed ^= data.len() as u32; + seed ^= seed >> 16; + seed = seed.wrapping_mul(0x85eb_ca6b); + seed ^= seed >> 13; + seed = seed.wrapping_mul(0xc2b2_ae35); + seed ^= seed >> 16; + seed +} + +pub(crate) fn murmur3_v2(seed: u32, data: &[u8]) -> u32 { + let mut reader = Cursor::new(data); + murmur3::murmur3_32(&mut reader, seed).expect("reading from memory does not fail") +} +#[cfg(test)] +mod tests { + use super::{murmur3_v2, BloomKey, BITS_PER_WORD}; + use crate::BloomFilterSettings; + use bstr::BStr; + + fn filter_bytes_for_path(path: &[u8], settings: BloomFilterSettings, len: usize) -> Vec { + let key = BloomKey::from_path(BStr::new(path), settings); + let mut out = vec![0u8; len]; + let modulo = (len as u64) * BITS_PER_WORD; + for i in 0..key.num_hashes { + let hash = key.h0.wrapping_add(i.wrapping_mul(key.h1)); + let bit_pos = u64::from(hash) % modulo; + let byte_pos = (bit_pos / BITS_PER_WORD) as usize; + out[byte_pos] |= 1u8 << (bit_pos % BITS_PER_WORD); + } + out + } + + #[test] + fn murmur3_known_vectors_match_git_and_reference_values() { + assert_eq!(murmur3_v2(0, b""), 0x0000_0000); + assert_eq!(murmur3_v2(0, b"Hello world!"), 0x627b_0c2c); + assert_eq!( + murmur3_v2(0, b"The quick brown fox jumps over the lazy dog"), + 0x2e4f_f723 + ); + } + + #[test] + fn murmur3_v2_matches_git_high_bit_vector() { + assert_eq!(murmur3_v2(0, b"\x99\xaa\xbb\xcc\xdd\xee\xff"), 0xa183_ccfd); + } + + #[test] + fn bloom_key_for_empty_path_matches_git_v1_vector() { + let settings = BloomFilterSettings { + hash_version: 1, + num_hashes: 7, + bits_per_entry: 10, + }; + let key = BloomKey::from_path(BStr::new(b""), settings); + assert_eq!( + (0..key.num_hashes) + .map(|i| key.h0.wrapping_add(i.wrapping_mul(key.h1))) + .collect::>(), + &[ + 0x5615_800c, + 0x5b96_6560, + 0x6117_4ab4, + 0x6698_3008, + 0x6c19_155c, + 0x7199_fab0, + 0x771a_e004 + ] + ); + } + + #[test] + fn bloom_key_for_empty_path_matches_git_v2_vector() { + let settings = BloomFilterSettings { + hash_version: 2, + num_hashes: 7, + bits_per_entry: 10, + }; + let key = BloomKey::from_path(BStr::new(b""), settings); + assert_eq!( + (0..key.num_hashes) + .map(|i| key.h0.wrapping_add(i.wrapping_mul(key.h1))) + .collect::>(), + &[ + 0x5615_800c, + 0x5b96_6560, + 0x6117_4ab4, + 0x6698_3008, + 0x6c19_155c, + 0x7199_fab0, + 0x771a_e004 + ] + ); + } + + #[test] + fn bloom_key_for_high_bit_path_differs_between_versions() { + let path = BStr::new(b"\xc2\xa2"); + let v1 = BloomKey::from_path( + path, + BloomFilterSettings { + hash_version: 1, + num_hashes: 7, + bits_per_entry: 10, + }, + ); + let v2 = BloomKey::from_path( + path, + BloomFilterSettings { + hash_version: 2, + num_hashes: 7, + bits_per_entry: 10, + }, + ); + assert_ne!(v1, v2); + } + + #[test] + fn bloom_filter_for_high_bit_path_matches_git_v1_and_v2_vectors() { + let path = b"\xc2\xa2"; + assert_eq!( + filter_bytes_for_path( + path, + BloomFilterSettings { + hash_version: 1, + num_hashes: 7, + bits_per_entry: 10, + }, + 2, + ), + vec![0x52, 0xa9] + ); + assert_eq!( + filter_bytes_for_path( + path, + BloomFilterSettings { + hash_version: 2, + num_hashes: 7, + bits_per_entry: 10, + }, + 2, + ), + vec![0xc0, 0x1f] + ); + } +} diff --git a/gix-commitgraph/src/file/access.rs b/gix-commitgraph/src/file/access.rs index 6ccb98cc32..064ac8e903 100644 --- a/gix-commitgraph/src/file/access.rs +++ b/gix-commitgraph/src/file/access.rs @@ -5,7 +5,7 @@ use std::{ use crate::{ file::{self, commit::Commit, COMMIT_DATA_ENTRY_SIZE_SANS_HASH}, - File, + BloomFilterSettings, File, }; /// Access @@ -107,6 +107,11 @@ impl File { pub fn path(&self) -> &Path { &self.path } + + /// Return changed-path Bloom filter settings if this file has a usable Bloom index and data pair. + pub fn bloom_filter_settings(&self) -> Option { + self.bloom_filter_settings + } } impl File { @@ -131,6 +136,13 @@ impl File { pub(crate) fn extra_edges_data(&self) -> Option<&[u8]> { Some(&self.data[self.extra_edges_list_range.clone()?]) } + + pub(crate) fn clear_bloom_filters(&mut self) { + self.bloom_filter_data_len = 0; + self.bloom_filter_data_offset = None; + self.bloom_filter_index_offset = None; + self.bloom_filter_settings = None; + } } impl Debug for File { diff --git a/gix-commitgraph/src/file/commit.rs b/gix-commitgraph/src/file/commit.rs index 29522bb5ca..d4f8a0ec5d 100644 --- a/gix-commitgraph/src/file/commit.rs +++ b/gix-commitgraph/src/file/commit.rs @@ -1,8 +1,10 @@ //! Low-level operations on individual commits. use crate::{ + bloom::BloomKey, file::{self, EXTENDED_EDGES_MASK, LAST_EXTENDED_EDGE_MASK, NO_PARENT}, - File, Position, + from_be_u32, File, Position, }; +use bstr::BStr; use gix_error::{message, Message}; use std::{ fmt::{Debug, Formatter}, @@ -22,11 +24,6 @@ pub struct Commit<'a> { root_tree_id: &'a gix_hash::oid, } -#[inline] -fn read_u32(b: &[u8]) -> u32 { - u32::from_be_bytes(b.try_into().unwrap()) -} - impl<'a> Commit<'a> { pub(crate) fn new(file: &'a File, pos: file::Position) -> Self { let bytes = file.commit_data_bytes(pos); @@ -34,12 +31,12 @@ impl<'a> Commit<'a> { file, pos, root_tree_id: gix_hash::oid::from_bytes_unchecked(&bytes[..file.hash_len]), - parent1: ParentEdge::from_raw(read_u32(&bytes[file.hash_len..][..4])), - parent2: ParentEdge::from_raw(read_u32(&bytes[file.hash_len + 4..][..4])), + parent1: ParentEdge::from_raw(from_be_u32(&bytes[file.hash_len..][..4])), + parent2: ParentEdge::from_raw(from_be_u32(&bytes[file.hash_len + 4..][..4])), // TODO: Add support for corrected commit date offset overflow. // See https://github.com/git/git/commit/e8b63005c48696a26f976f5f9b0ccaf1983e439d and // https://github.com/git/git/commit/f90fca638e99a031dce8e3aca72427b2f9b4bb38 for more details and hints at a test. - generation: read_u32(&bytes[file.hash_len + 8..][..4]) >> 2, + generation: from_be_u32(&bytes[file.hash_len + 8..][..4]) >> 2, commit_timestamp: u64::from_be_bytes(bytes[file.hash_len + 8..][..8].try_into().unwrap()) & 0x0003_ffff_ffff, } @@ -90,6 +87,16 @@ impl<'a> Commit<'a> { pub fn root_tree_id(&self) -> &gix_hash::oid { self.root_tree_id } + + /// Query if `path` may be present in this commit's changed-path Bloom filter. + pub fn maybe_contains_path(&self, path: &BStr) -> Option { + self.file.maybe_contains_path(self.pos, path) + } + + /// Query if all `keys` may be present in this commit's changed-path Bloom filter. + pub fn maybe_contains_all_keys(&self, keys: &[BloomKey]) -> Option { + self.file.maybe_contains_all_keys(self.pos, keys) + } } impl Debug for Commit<'_> { @@ -176,7 +183,7 @@ impl Iterator for Parents<'_> { }, ParentIteratorState::Extra(mut chunks) => { if let Some(chunk) = chunks.next() { - let extra_edge = read_u32(chunk); + let extra_edge = from_be_u32(chunk); match ExtraEdge::from_raw(extra_edge) { ExtraEdge::Internal(pos) => { self.state = ParentIteratorState::Extra(chunks); diff --git a/gix-commitgraph/src/file/init.rs b/gix-commitgraph/src/file/init.rs index b78e77a60a..8a51506827 100644 --- a/gix-commitgraph/src/file/init.rs +++ b/gix-commitgraph/src/file/init.rs @@ -4,10 +4,11 @@ use gix_error::{message, ErrorExt, Exn, Message, ResultExt}; use crate::{ file::{ - BASE_GRAPHS_LIST_CHUNK_ID, COMMIT_DATA_CHUNK_ID, COMMIT_DATA_ENTRY_SIZE_SANS_HASH, - EXTENDED_EDGES_LIST_CHUNK_ID, FAN_LEN, HEADER_LEN, OID_FAN_CHUNK_ID, OID_LOOKUP_CHUNK_ID, SIGNATURE, + BASE_GRAPHS_LIST_CHUNK_ID, BLOOM_FILTER_DATA_CHUNK_ID, BLOOM_FILTER_HEADER_SIZE, BLOOM_FILTER_INDEX_CHUNK_ID, + COMMIT_DATA_CHUNK_ID, COMMIT_DATA_ENTRY_SIZE_SANS_HASH, EXTENDED_EDGES_LIST_CHUNK_ID, FAN_LEN, HEADER_LEN, + OID_FAN_CHUNK_ID, OID_LOOKUP_CHUNK_ID, SIGNATURE, }, - File, + from_be_u32, BloomFilterSettings, File, }; const MIN_FILE_SIZE: usize = HEADER_LEN @@ -129,6 +130,34 @@ impl File { .or_raise(|| message("Error getting offset for OID lookup chunk"))?; let extra_edges_list_range = chunks.usize_offset_by_id(EXTENDED_EDGES_LIST_CHUNK_ID).ok(); + let bloom_filter_index_range = chunks.usize_offset_by_id(BLOOM_FILTER_INDEX_CHUNK_ID).ok(); + let bloom_filter_data_range = chunks.usize_offset_by_id(BLOOM_FILTER_DATA_CHUNK_ID).ok(); + + let mut bloom_filter_data_len = 0usize; + let mut bloom_filter_data_offset = None; + let mut bloom_filter_index_offset = None; + let mut bloom_filter_settings = None; + if let (Some(index_range), Some(data_range)) = + (bloom_filter_index_range.as_ref(), bloom_filter_data_range.as_ref()) + { + let expected_index_chunk_size = commit_data_count as usize * std::mem::size_of::(); + if index_range.len() == expected_index_chunk_size && data_range.len() >= BLOOM_FILTER_HEADER_SIZE { + let settings = BloomFilterSettings { + hash_version: from_be_u32(&data[data_range.start..][..4]), + num_hashes: from_be_u32(&data[data_range.start + 4..][..4]), + bits_per_entry: from_be_u32(&data[data_range.start + 8..][..4]), + }; + let bloom_data_payload_len = data_range.len() - BLOOM_FILTER_HEADER_SIZE; + if settings.is_supported() + && bloom_index_offsets_are_valid(&data[index_range.clone()], bloom_data_payload_len) + { + bloom_filter_settings = Some(settings); + bloom_filter_data_len = bloom_data_payload_len; + bloom_filter_data_offset = Some(data_range.start + BLOOM_FILTER_HEADER_SIZE); + bloom_filter_index_offset = Some(index_range.start); + } + } + } let trailer = &data[chunks.highest_offset() as usize..]; if trailer.len() != object_hash.len_in_bytes() { @@ -162,6 +191,10 @@ impl File { Ok(File { base_graph_count, base_graphs_list_offset, + bloom_filter_data_len, + bloom_filter_data_offset, + bloom_filter_index_offset, + bloom_filter_settings, commit_data_offset, data, extra_edges_list_range, @@ -201,3 +234,18 @@ fn read_fan(d: &[u8]) -> ([u32; FAN_LEN], usize) { } (fan, FAN_LEN * 4) } + +fn bloom_index_offsets_are_valid(index_data: &[u8], bloom_data_payload_len: usize) -> bool { + let mut previous = 0usize; + for offset_bytes in index_data.chunks_exact(std::mem::size_of::()) { + let offset = from_be_u32(offset_bytes) as usize; + if offset > bloom_data_payload_len { + return false; + } + if offset < previous { + return false; + } + previous = offset; + } + true +} diff --git a/gix-commitgraph/src/file/mod.rs b/gix-commitgraph/src/file/mod.rs index 93b90a012d..d64cc0873c 100644 --- a/gix-commitgraph/src/file/mod.rs +++ b/gix-commitgraph/src/file/mod.rs @@ -17,6 +17,9 @@ const SIGNATURE: &[u8] = b"CGPH"; type ChunkId = gix_chunk::Id; const BASE_GRAPHS_LIST_CHUNK_ID: ChunkId = *b"BASE"; +const BLOOM_FILTER_DATA_CHUNK_ID: ChunkId = *b"BDAT"; +const BLOOM_FILTER_INDEX_CHUNK_ID: ChunkId = *b"BIDX"; +const BLOOM_FILTER_HEADER_SIZE: usize = 3 * std::mem::size_of::(); const COMMIT_DATA_CHUNK_ID: ChunkId = *b"CDAT"; const EXTENDED_EDGES_LIST_CHUNK_ID: ChunkId = *b"EDGE"; const OID_FAN_CHUNK_ID: ChunkId = *b"OIDF"; diff --git a/gix-commitgraph/src/init.rs b/gix-commitgraph/src/init.rs index 27166e75db..2d6f19b4ef 100644 --- a/gix-commitgraph/src/init.rs +++ b/gix-commitgraph/src/init.rs @@ -1,4 +1,4 @@ -use crate::{File, Graph, MAX_COMMITS}; +use crate::{BloomFilterSettings, File, Graph, MAX_COMMITS}; use gix_error::{message, ErrorExt, Exn, Message, ResultExt}; use std::{ io::{BufRead, BufReader}, @@ -54,7 +54,7 @@ impl Graph { /// Create a new commit graph from a list of `files`. pub fn new(files: Vec) -> Result { - let files = nonempty::NonEmpty::from_vec(files) + let mut files = nonempty::NonEmpty::from_vec(files) .ok_or_else(|| message!("Commit-graph must contain at least one file"))?; let num_commits: u64 = files.iter().map(|f| u64::from(f.num_commits())).sum(); if num_commits > u64::from(MAX_COMMITS) { @@ -77,10 +77,28 @@ impl Graph { f1 = f2; } + disable_incompatible_bloom_layers(&mut files); + Ok(Self { files }) } } +fn disable_incompatible_bloom_layers(files: &mut nonempty::NonEmpty) { + let mut preferred: Option = None; + for file in files.iter_mut().rev() { + let Some(settings) = file.bloom_filter_settings else { + continue; + }; + if preferred.is_none() { + preferred = Some(settings); + continue; + } + if preferred != Some(settings) { + file.clear_bloom_filters(); + } + } +} + impl TryFrom<&Path> for Graph { type Error = Exn; diff --git a/gix-commitgraph/src/lib.rs b/gix-commitgraph/src/lib.rs index 3370a6bfc5..ae2f3cd5be 100644 --- a/gix-commitgraph/src/lib.rs +++ b/gix-commitgraph/src/lib.rs @@ -25,6 +25,10 @@ use std::path::Path; pub struct File { base_graph_count: u8, base_graphs_list_offset: Option, + bloom_filter_data_len: usize, + bloom_filter_data_offset: Option, + bloom_filter_index_offset: Option, + bloom_filter_settings: Option, commit_data_offset: usize, data: memmap2::Mmap, extra_edges_list_range: Option>, @@ -35,6 +39,28 @@ pub struct File { object_hash: gix_hash::Kind, } +/// Bloom filter parameters as stored in a commit-graph `BDAT` chunk header. +#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)] +pub struct BloomFilterSettings { + /// Version of the hash algorithm used by changed-path Bloom filters. + pub hash_version: u32, + /// Number of hashes computed for each key. + pub num_hashes: u32, + /// Number of bits per Bloom filter entry. + pub bits_per_entry: u32, +} + +impl BloomFilterSettings { + pub(crate) fn is_supported(&self) -> bool { + match self.hash_version { + // Git's changed-path Bloom filter v1 hashes are deprecated, but we still need to + // read them to match Git's historical commit-graph behavior. + 1 | 2 => true, + _ => false, + } + } +} + /// A complete commit graph. /// /// The data in the commit graph may come from a monolithic `objects/info/commit-graph` file, or it @@ -50,6 +76,7 @@ pub fn at(path: impl AsRef) -> Result> { } mod access; +pub mod bloom; pub mod file; /// pub mod init; @@ -67,6 +94,11 @@ pub const GENERATION_NUMBER_MAX: u32 = 0x3fff_ffff; /// The maximum number of commits that can be stored in a commit graph. pub const MAX_COMMITS: u32 = (1 << 30) + (1 << 29) + (1 << 28) - 1; +#[inline] +pub(crate) fn from_be_u32(b: &[u8]) -> u32 { + u32::from_be_bytes(b.try_into().unwrap()) +} + /// A generalized position for use in [`Graph`]. #[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd, Hash)] pub struct Position(pub u32); diff --git a/gix-commitgraph/tests/access/mod.rs b/gix-commitgraph/tests/access/mod.rs index a6ec1913da..726452878b 100644 --- a/gix-commitgraph/tests/access/mod.rs +++ b/gix-commitgraph/tests/access/mod.rs @@ -1,4 +1,20 @@ use crate::{check_common, graph_and_expected, graph_and_expected_named}; +use bstr::BStr; +use gix_testtools::{scripted_fixture_read_only, scripted_fixture_writable}; +use std::{fs, path::Path}; + +fn should_skip_path_v2_unsupported() -> bool { + // Git learned to read and write changed-path Bloom filter version 2 in 2.46. + // Older binaries ignore the config in these fixtures and keep producing v1 data. + // Use the raw version here instead of `should_skip_as_git_version_is_smaller_than()`, + // which intentionally never skips on CI. + *gix_testtools::GIT_VERSION < (2, 46, 0) +} + +fn fixture_changed_path_version(script_name: &str, layer: BloomLayer) -> Option { + let repo_path = scripted_fixture_read_only(script_name).expect("fixture available"); + bloom_hash_version(&repo_path, layer) +} #[test] fn single_parent() { @@ -111,3 +127,317 @@ fn two_parents() { assert_eq!(cg.commit_at(refs["parent2"].pos()).generation(), 1); assert_eq!(cg.commit_at(refs["child"].pos()).generation(), 2); } + +#[test] +fn changed_paths_v1_settings_are_read() { + assert_eq!( + fixture_changed_path_version("changed_paths_v1.sh", BloomLayer::Monolithic), + Some(1), + "fixture explicitly requests v1 filters" + ); + let (cg, _refs) = graph_and_expected("changed_paths_v1.sh", &["HEAD"]); + let settings = cg + .bloom_filter_settings() + .expect("changed-path Bloom settings are available"); + assert_eq!(settings.hash_version, 1, "fixture explicitly requests v1 filters"); + assert_eq!(settings.bits_per_entry, 10, "git default bits per entry"); + assert_eq!(settings.num_hashes, 7, "git default hash count"); +} + +#[test] +fn changed_paths_v2_settings_are_read() { + if should_skip_path_v2_unsupported() { + return; + } + assert_eq!( + fixture_changed_path_version("changed_paths_v2.sh", BloomLayer::Monolithic), + Some(2), + "fixture explicitly requests v2 filters" + ); + let (cg, _refs) = graph_and_expected("changed_paths_v2.sh", &["HEAD"]); + let settings = cg + .bloom_filter_settings() + .expect("changed-path Bloom settings are available"); + assert_eq!(settings.hash_version, 2, "fixture explicitly requests v2 filters"); + assert_eq!(settings.bits_per_entry, 10, "git default bits per entry"); + assert_eq!(settings.num_hashes, 7, "git default hash count"); +} + +#[test] +fn changed_paths_v1_maybe_contains_changed_paths() { + let (cg, refs) = graph_and_expected("changed_paths_v1.sh", &["HEAD"]); + assert_eq!( + cg.maybe_contains_path_by_id(refs["HEAD"].id(), BStr::new(b"dir/subdir/file")), + Some(true) + ); + assert_eq!( + cg.maybe_contains_path_by_id(refs["HEAD"].id(), BStr::new(b"other")), + Some(true) + ); +} + +#[test] +fn changed_paths_v2_maybe_contains_changed_paths() { + if should_skip_path_v2_unsupported() { + return; + } + let (cg, refs) = graph_and_expected("changed_paths_v2.sh", &["HEAD"]); + assert_eq!( + cg.maybe_contains_path_by_id(refs["HEAD"].id(), BStr::new(b"dir/subdir/file")), + Some(true) + ); + assert_eq!( + cg.maybe_contains_path_by_id(refs["HEAD"].id(), BStr::new(b"other")), + Some(true) + ); +} + +#[test] +fn incompatible_split_chain_prefers_top_layer_bloom_settings() { + if should_skip_path_v2_unsupported() { + return; + } + assert_eq!( + fixture_changed_path_version("split_chain_changed_paths_mismatch.sh", BloomLayer::Base), + Some(1), + "base layer should keep v1 settings" + ); + assert_eq!( + fixture_changed_path_version("split_chain_changed_paths_mismatch.sh", BloomLayer::Top), + Some(2), + "top layer should keep v2 settings" + ); + let (cg, _refs) = graph_and_expected("split_chain_changed_paths_mismatch.sh", &["c1", "c2"]); + let settings = cg + .bloom_filter_settings() + .expect("top layer has changed-path Bloom settings"); + assert_eq!(settings.hash_version, 2, "top layer uses v2 and should remain usable"); +} + +#[test] +fn incompatible_split_chain_disables_base_bloom_queries() { + if should_skip_path_v2_unsupported() { + return; + } + let (cg, refs) = graph_and_expected("split_chain_changed_paths_mismatch.sh", &["c1", "c2"]); + assert_eq!( + cg.maybe_contains_path_by_id(refs["c1"].id(), BStr::new(b"tracked")), + None, + "base layer Bloom data is cleared when the top layer uses incompatible settings" + ); + assert_eq!( + cg.maybe_contains_path_by_id(refs["c2"].id(), BStr::new(b"tracked")), + Some(true), + "top layer Bloom data remains usable" + ); +} + +#[test] +fn split_chain_uses_base_bloom_when_top_has_none() { + if should_skip_path_v2_unsupported() { + return; + } + assert_eq!( + fixture_changed_path_version("split_chain_top_without_bloom.sh", BloomLayer::Base), + Some(2), + "base layer should keep v2 settings" + ); + let (cg, _refs) = graph_and_expected("split_chain_top_without_bloom.sh", &["c1", "c2"]); + let settings = cg + .bloom_filter_settings() + .expect("base layer changed-path settings remain usable"); + assert_eq!(settings.hash_version, 2); +} + +#[test] +fn split_chain_uses_base_bloom_only_for_base_commits() { + let (cg, refs) = graph_and_expected("split_chain_top_without_bloom.sh", &["c1", "c2"]); + assert_eq!( + cg.maybe_contains_path_by_id(refs["c1"].id(), BStr::new(b"tracked")), + Some(true), + "base layer Bloom data remains usable" + ); + assert_eq!( + cg.maybe_contains_path_by_id(refs["c2"].id(), BStr::new(b"tracked")), + None, + "top layer without Bloom data should not answer Bloom queries" + ); +} + +#[test] +fn bloom_is_disabled_if_bidx_chunk_is_missing() { + let tmp = scripted_fixture_writable("changed_paths_v2.sh").expect("fixture available"); + mutate_commit_graph(tmp.path(), |data| { + let entries = parse_chunk_table(data); + let bidx = find_chunk_index(&entries, *b"BIDX").expect("BIDX present in fixture"); + set_chunk_id(data, bidx, *b"XIDX"); + }); + let graph = gix_commitgraph::Graph::from_info_dir(&info_dir(tmp.path())).expect("graph remains readable"); + assert!(graph.bloom_filter_settings().is_none(), "missing BIDX disables Bloom"); +} + +#[test] +fn bloom_is_disabled_if_bdat_chunk_is_missing() { + let tmp = scripted_fixture_writable("changed_paths_v2.sh").expect("fixture available"); + mutate_commit_graph(tmp.path(), |data| { + let entries = parse_chunk_table(data); + let bdat = find_chunk_index(&entries, *b"BDAT").expect("BDAT present in fixture"); + set_chunk_id(data, bdat, *b"XDAT"); + }); + let graph = gix_commitgraph::Graph::from_info_dir(&info_dir(tmp.path())).expect("graph remains readable"); + assert!(graph.bloom_filter_settings().is_none(), "missing BDAT disables Bloom"); +} + +#[test] +fn bloom_is_disabled_if_bidx_is_too_small() { + let tmp = scripted_fixture_writable("changed_paths_v2.sh").expect("fixture available"); + mutate_commit_graph(tmp.path(), |data| { + let entries = parse_chunk_table(data); + let bidx = find_chunk_index(&entries, *b"BIDX").expect("BIDX present in fixture"); + let bidx_offset = entries[bidx].offset; + set_chunk_offset(data, bidx + 1, bidx_offset + 4); + }); + let graph = gix_commitgraph::Graph::from_info_dir(&info_dir(tmp.path())).expect("graph remains readable"); + assert!(graph.bloom_filter_settings().is_none(), "too-small BIDX disables Bloom"); +} + +#[test] +fn bloom_is_disabled_if_bdat_is_too_small() { + let tmp = scripted_fixture_writable("changed_paths_v2.sh").expect("fixture available"); + mutate_commit_graph(tmp.path(), |data| { + let entries = parse_chunk_table(data); + let bdat = find_chunk_index(&entries, *b"BDAT").expect("BDAT present in fixture"); + let next_offset = entries[bdat + 1].offset; + set_chunk_offset(data, bdat, next_offset - 4); + }); + let graph = gix_commitgraph::Graph::from_info_dir(&info_dir(tmp.path())).expect("graph remains readable"); + assert!(graph.bloom_filter_settings().is_none(), "too-small BDAT disables Bloom"); +} + +#[test] +fn bloom_is_disabled_if_bidx_offsets_are_invalid() { + let tmp = scripted_fixture_writable("changed_paths_v2.sh").expect("fixture available"); + mutate_commit_graph(tmp.path(), |data| { + let entries = parse_chunk_table(data); + let bidx = find_chunk_index(&entries, *b"BIDX").expect("BIDX present in fixture"); + let start = entries[bidx].offset as usize; + data[start..start + 4].copy_from_slice(&u32::MAX.to_be_bytes()); + data[start + 4..start + 8].copy_from_slice(&1u32.to_be_bytes()); + }); + let graph = gix_commitgraph::Graph::from_info_dir(&info_dir(tmp.path())).expect("graph remains readable"); + assert!( + graph.bloom_filter_settings().is_none(), + "out-of-range and decreasing BIDX offsets disable Bloom" + ); +} + +#[test] +fn bloom_is_disabled_if_hash_version_is_unsupported() { + let tmp = scripted_fixture_writable("changed_paths_v2.sh").expect("fixture available"); + mutate_commit_graph(tmp.path(), |data| { + let entries = parse_chunk_table(data); + let bdat = find_chunk_index(&entries, *b"BDAT").expect("BDAT present in fixture"); + let bdat_offset = entries[bdat].offset as usize; + data[bdat_offset..bdat_offset + 4].copy_from_slice(&3u32.to_be_bytes()); + }); + let graph = gix_commitgraph::Graph::from_info_dir(&info_dir(tmp.path())).expect("graph remains readable"); + assert!( + graph.bloom_filter_settings().is_none(), + "unsupported hash versions disable Bloom so callers fall back safely" + ); +} + +#[derive(Clone, Copy)] +struct ChunkTableEntry { + id: [u8; 4], + offset: u64, +} + +#[derive(Clone, Copy)] +enum BloomLayer { + Monolithic, + Base, + Top, +} + +fn info_dir(repo_path: &Path) -> std::path::PathBuf { + repo_path.join(".git").join("objects").join("info") +} + +fn bloom_hash_version(repo_path: &Path, layer: BloomLayer) -> Option { + let graph_path = bloom_graph_path(repo_path, layer)?; + let data = fs::read(graph_path).expect("read commit-graph fixture"); + let entries = parse_chunk_table(&data); + let bdat = find_chunk_index(&entries, *b"BDAT")?; + let start = entries[bdat].offset as usize; + let bytes: [u8; 4] = data.get(start..start + 4)?.try_into().ok()?; + Some(u32::from_be_bytes(bytes)) +} + +fn bloom_graph_path(repo_path: &Path, layer: BloomLayer) -> Option { + let info_dir = info_dir(repo_path); + let monolithic = info_dir.join("commit-graph"); + if monolithic.is_file() { + return match layer { + BloomLayer::Monolithic => Some(monolithic), + BloomLayer::Base | BloomLayer::Top => None, + }; + } + + let chain_dir = info_dir.join("commit-graphs"); + let chain = fs::read_to_string(chain_dir.join("commit-graph-chain")).ok()?; + let graphs: Vec<_> = chain + .lines() + .map(|hash| chain_dir.join(format!("graph-{hash}.graph"))) + .collect(); + let graph_path = match layer { + BloomLayer::Monolithic => return None, + BloomLayer::Base => graphs.first()?, + BloomLayer::Top => graphs.last()?, + }; + Some(graph_path.clone()) +} + +#[allow(clippy::permissions_set_readonly_false)] +fn mutate_commit_graph(repo_path: &Path, mutate: impl FnOnce(&mut [u8])) { + let graph_path = info_dir(repo_path).join("commit-graph"); + let mut permissions = fs::metadata(&graph_path).expect("commit-graph metadata").permissions(); + permissions.set_readonly(false); + fs::set_permissions(&graph_path, permissions).expect("set commit-graph writable"); + let mut data = fs::read(&graph_path).expect("read commit-graph fixture"); + mutate(&mut data); + fs::write(graph_path, data).expect("rewrite mutated commit-graph"); +} + +fn parse_chunk_table(data: &[u8]) -> Vec { + let chunk_count = usize::from(data[6]); + let mut out = Vec::with_capacity(chunk_count + 1); + let table_start = 8; + for idx in 0..=chunk_count { + let entry_offset = table_start + idx * 12; + let id = data[entry_offset..entry_offset + 4] + .try_into() + .expect("chunk id has 4 bytes"); + let offset = u64::from_be_bytes( + data[entry_offset + 4..entry_offset + 12] + .try_into() + .expect("chunk offset has 8 bytes"), + ); + out.push(ChunkTableEntry { id, offset }); + } + out +} + +fn find_chunk_index(entries: &[ChunkTableEntry], id: [u8; 4]) -> Option { + entries.iter().position(|entry| entry.id == id) +} + +fn set_chunk_id(data: &mut [u8], chunk_index: usize, id: [u8; 4]) { + let entry_offset = 8 + chunk_index * 12; + data[entry_offset..entry_offset + 4].copy_from_slice(&id); +} + +fn set_chunk_offset(data: &mut [u8], chunk_index: usize, offset: u64) { + let entry_offset = 8 + chunk_index * 12 + 4; + data[entry_offset..entry_offset + 8].copy_from_slice(&offset.to_be_bytes()); +} diff --git a/gix-commitgraph/tests/fixtures/changed_paths_v1.sh b/gix-commitgraph/tests/fixtures/changed_paths_v1.sh new file mode 100644 index 0000000000..146e11e1ca --- /dev/null +++ b/gix-commitgraph/tests/fixtures/changed_paths_v1.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash +set -eu -o pipefail + +git init -q +git config user.name committer +git config user.email committer@example.com +git config commitGraph.changedPathsVersion 1 + +mkdir -p dir/subdir +echo one > dir/subdir/file +git add dir/subdir/file +git commit -q -m c1 + +echo two > dir/subdir/file +echo hello > other +git add dir/subdir/file other +git commit -q -m c2 + +# Keep this fixture distinct so cached outputs are regenerated with a Git that can emit v1 data. +git commit-graph write --no-progress --reachable --changed-paths diff --git a/gix-commitgraph/tests/fixtures/changed_paths_v2.sh b/gix-commitgraph/tests/fixtures/changed_paths_v2.sh new file mode 100755 index 0000000000..20ec0fc2aa --- /dev/null +++ b/gix-commitgraph/tests/fixtures/changed_paths_v2.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash +set -eu -o pipefail + +git init -q +git config user.name committer +git config user.email committer@example.com +git config commitGraph.changedPathsVersion 2 + +mkdir -p dir/subdir +echo one > dir/subdir/file +git add dir/subdir/file +git commit -q -m c1 + +echo two > dir/subdir/file +echo hello > other +git add dir/subdir/file other +git commit -q -m c2 + +# Keep this fixture distinct so cached outputs are regenerated with a Git that can emit v2 data. +git commit-graph write --no-progress --reachable --changed-paths diff --git a/gix-commitgraph/tests/fixtures/generated-archives/changed_paths_v1.tar b/gix-commitgraph/tests/fixtures/generated-archives/changed_paths_v1.tar new file mode 100644 index 0000000000..6f23731d9c Binary files /dev/null and b/gix-commitgraph/tests/fixtures/generated-archives/changed_paths_v1.tar differ diff --git a/gix-commitgraph/tests/fixtures/generated-archives/changed_paths_v2.tar b/gix-commitgraph/tests/fixtures/generated-archives/changed_paths_v2.tar new file mode 100644 index 0000000000..e76473c410 Binary files /dev/null and b/gix-commitgraph/tests/fixtures/generated-archives/changed_paths_v2.tar differ diff --git a/gix-commitgraph/tests/fixtures/generated-archives/split_chain_changed_paths_mismatch.tar b/gix-commitgraph/tests/fixtures/generated-archives/split_chain_changed_paths_mismatch.tar new file mode 100644 index 0000000000..b0ee1b7262 Binary files /dev/null and b/gix-commitgraph/tests/fixtures/generated-archives/split_chain_changed_paths_mismatch.tar differ diff --git a/gix-commitgraph/tests/fixtures/generated-archives/split_chain_top_without_bloom.tar b/gix-commitgraph/tests/fixtures/generated-archives/split_chain_top_without_bloom.tar new file mode 100644 index 0000000000..7a954fb0c2 Binary files /dev/null and b/gix-commitgraph/tests/fixtures/generated-archives/split_chain_top_without_bloom.tar differ diff --git a/gix-commitgraph/tests/fixtures/split_chain_changed_paths_mismatch.sh b/gix-commitgraph/tests/fixtures/split_chain_changed_paths_mismatch.sh new file mode 100755 index 0000000000..ab26e184f7 --- /dev/null +++ b/gix-commitgraph/tests/fixtures/split_chain_changed_paths_mismatch.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +set -eu -o pipefail + +git init -q +git config user.name committer +git config user.email committer@example.com + +echo one > tracked +git add tracked +git commit -q -m c1 +git branch c1 + +echo two > tracked +git add tracked +git commit -q -m c2 +git branch c2 + +# Keep this fixture distinct so cached outputs are regenerated with a Git that can emit both versions. +git show-ref -s c1 | git -c commitGraph.changedPathsVersion=1 commit-graph write \ + --no-progress --changed-paths --split=no-merge --stdin-commits +git show-ref -s c2 | git -c commitGraph.changedPathsVersion=2 commit-graph write \ + --no-progress --changed-paths --split=no-merge --stdin-commits diff --git a/gix-commitgraph/tests/fixtures/split_chain_top_without_bloom.sh b/gix-commitgraph/tests/fixtures/split_chain_top_without_bloom.sh new file mode 100755 index 0000000000..d82509ab2f --- /dev/null +++ b/gix-commitgraph/tests/fixtures/split_chain_top_without_bloom.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +set -eu -o pipefail + +git init -q +git config user.name committer +git config user.email committer@example.com + +echo one > tracked +git add tracked +git commit -q -m c1 +git branch c1 + +echo two > tracked +git add tracked +git commit -q -m c2 +git branch c2 + +# Keep this fixture distinct so cached outputs are regenerated with a Git that can emit v2 data. +git show-ref -s c1 | git -c commitGraph.changedPathsVersion=2 commit-graph write \ + --no-progress --changed-paths --split=no-merge --stdin-commits +git show-ref -s c2 | git commit-graph write \ + --no-progress --no-changed-paths --split=no-merge --stdin-commits