Skip to content
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions gitoxide-core/src/index/checkout.rs
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,11 @@ where
};
buf.clear();
// …but write nothing
Ok(Some(gix::objs::Data { kind, data: buf }))
Ok(Some(gix::objs::Data {
kind,
hash_len: id.kind().len_in_bytes(),
data: buf,
}))
} else {
self.db.try_find(id, buf)
}
Expand All @@ -201,10 +205,11 @@ where
struct Empty;

impl gix::objs::Find for Empty {
fn try_find<'a>(&self, _id: &gix::oid, buffer: &'a mut Vec<u8>) -> Result<Option<gix::objs::Data<'a>>, Error> {
fn try_find<'a>(&self, id: &gix::oid, buffer: &'a mut Vec<u8>) -> Result<Option<gix::objs::Data<'a>>, Error> {
buffer.clear();
Ok(Some(gix::objs::Data {
kind: gix::object::Kind::Blob,
hash_len: id.kind().len_in_bytes(),
data: buffer,
}))
}
Expand Down
1 change: 1 addition & 0 deletions gix-diff/tests/diff/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ mod util {
buffer.extend_from_slice(data);
Ok(Some(gix_object::Data {
kind: gix_object::Kind::Blob,
hash_len: id.kind().len_in_bytes(),
data: buffer.as_slice(),
}))
}
Expand Down
17 changes: 13 additions & 4 deletions gix-diff/tests/diff/tree_with_rewrites.rs
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,14 @@ fn empty_to_new_tree_without_rename_tracking() -> crate::Result {
{
let (lhs, rhs, mut cache, odb) = repo_with_trees(None, "c1 - initial")?;
let err = gix_diff::tree_with_rewrites(
TreeRefIter::from_bytes(&lhs),
TreeRefIter::from_bytes(&rhs),
TreeRefIter::from_bytes(
&lhs,
gix_testtools::hash_kind_from_env().unwrap_or_default().len_in_bytes(),
),
TreeRefIter::from_bytes(
&rhs,
gix_testtools::hash_kind_from_env().unwrap_or_default().len_in_bytes(),
),
&mut cache,
&mut Default::default(),
&odb,
Expand Down Expand Up @@ -1843,8 +1849,11 @@ mod util {
let (from, to, mut cache, odb) = repo_with_trees(lhs, rhs)?;
let mut out = Vec::new();
let rewrites_info = gix_diff::tree_with_rewrites(
TreeRefIter::from_bytes(&from),
TreeRefIter::from_bytes(&to),
// TODO(SHA256):
// Get hash from env. This requires updating the snapshot at the top of the file as
// well.
TreeRefIter::from_bytes(&from, gix_hash::Kind::Sha1.len_in_bytes()),
TreeRefIter::from_bytes(&to, gix_hash::Kind::Sha1.len_in_bytes()),
&mut cache,
&mut Default::default(),
&odb,
Expand Down
2 changes: 1 addition & 1 deletion gix-merge/src/tree/function.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ where
let (mut base_buf, mut side_buf) = (Vec::new(), Vec::new());
let ancestor_tree = objects.find_tree(base_tree, &mut base_buf)?;
let mut editor = tree::Editor::new(ancestor_tree.to_owned(), objects, base_tree.kind());
let ancestor_tree = gix_object::TreeRefIter::from_bytes(&base_buf);
let ancestor_tree = gix_object::TreeRefIter::from_bytes(&base_buf, base_tree.kind().len_in_bytes());
let tree_conflicts = options.tree_conflicts;

let mut our_changes = Vec::new();
Expand Down
1 change: 1 addition & 0 deletions gix-merge/tests/merge/blob/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ mod util {
buffer.extend_from_slice(data);
Ok(Some(gix_object::Data {
kind: gix_object::Kind::Blob,
hash_len: id.kind().len_in_bytes(),
data: buffer.as_slice(),
}))
}
Expand Down
18 changes: 16 additions & 2 deletions gix-object/benches/decode_objects.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,24 @@ fn parse_tag(c: &mut Criterion) {

fn parse_tree(c: &mut Criterion) {
c.bench_function("TreeRef()", |b| {
b.iter(|| black_box(gix_object::TreeRef::from_bytes(TREE)).unwrap());
b.iter(|| {
black_box(gix_object::TreeRef::from_bytes(
TREE,
gix_testtools::hash_kind_from_env().unwrap_or_default().len_in_bytes(),
))
.unwrap()
});
});
c.bench_function("TreeRefIter()", |b| {
b.iter(|| black_box(gix_object::TreeRefIter::from_bytes(TREE).count()));
b.iter(|| {
black_box(
gix_object::TreeRefIter::from_bytes(
TREE,
gix_testtools::hash_kind_from_env().unwrap_or_default().len_in_bytes(),
)
.count(),
)
});
});
}

Expand Down
1 change: 1 addition & 0 deletions gix-object/benches/edit_tree.rs
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ impl gix_object::Find for StorageOdb {
tree.write_to(buffer).expect("valid trees can always be serialized");
Ok(Some(gix_object::Data {
kind: gix_object::Kind::Tree,
hash_len: id.kind().len_in_bytes(),
data: &*buffer,
}))
}
Expand Down
8 changes: 4 additions & 4 deletions gix-object/src/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ use crate::{BlobRef, CommitRef, CommitRefIter, Data, Kind, ObjectRef, TagRef, Ta

impl<'a> Data<'a> {
/// Constructs a new data object from `kind` and `data`.
pub fn new(kind: Kind, data: &'a [u8]) -> Data<'a> {
Data { kind, data }
pub fn new(kind: Kind, hash_len: usize, data: &'a [u8]) -> Data<'a> {
Data { kind, hash_len, data }
}
/// Decodes the data in the backing slice into a [`ObjectRef`], allowing to access all of its data
/// conveniently. The cost of parsing an object is negligible.
Expand All @@ -14,7 +14,7 @@ impl<'a> Data<'a> {
/// using [`crate::ObjectRef::into_owned()`].
pub fn decode(&self) -> Result<ObjectRef<'a>, crate::decode::Error> {
Ok(match self.kind {
Kind::Tree => ObjectRef::Tree(TreeRef::from_bytes(self.data)?),
Kind::Tree => ObjectRef::Tree(TreeRef::from_bytes(self.data, self.hash_len)?),
Kind::Blob => ObjectRef::Blob(BlobRef { data: self.data }),
Kind::Commit => ObjectRef::Commit(CommitRef::from_bytes(self.data)?),
Kind::Tag => ObjectRef::Tag(TagRef::from_bytes(self.data)?),
Expand All @@ -25,7 +25,7 @@ impl<'a> Data<'a> {
/// `None` if this is not a tree object.
pub fn try_into_tree_iter(self) -> Option<TreeRefIter<'a>> {
match self.kind {
Kind::Tree => Some(TreeRefIter::from_bytes(self.data)),
Kind::Tree => Some(TreeRefIter::from_bytes(self.data, self.hash_len)),
_ => None,
}
}
Expand Down
6 changes: 6 additions & 0 deletions gix-object/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,9 @@ pub struct TreeRef<'a> {
/// A directory snapshot containing files (blobs), directories (trees) and submodules (commits), lazily evaluated.
#[derive(Default, PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)]
pub struct TreeRefIter<'a> {
/// TODO:
/// Document.
hash_len: usize,
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To my mind it's fine to use hash_len if it stays private (after all, that's what the iter really wants). One can even then consider keeping Kind instead as the function to get its len is const for sure and thus has no chance of being worse than the whole usize.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changed to hash_kind. (Aside: this is called object_hash in other places, I started wondering whether it should also be called that here.)

Using hash_kind instead of hash_len also has the advantage that it can be packed and doesn’t grow the size of either TreeRefIter or Data.

/// The directories and files contained in this tree.
data: &'a [u8],
}
Expand All @@ -289,6 +292,9 @@ impl Tree {
pub struct Data<'a> {
/// kind of object
pub kind: Kind,
/// TODO:
/// Document.
pub hash_len: usize,
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Definitely gix_hash::Kind here.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changed.

/// decoded, decompressed data, owned by a backing store.
pub data: &'a [u8],
}
Expand Down
8 changes: 4 additions & 4 deletions gix-object/src/object/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ pub enum LooseDecodeError {

impl<'a> ObjectRef<'a> {
/// Deserialize an object from a loose serialisation
pub fn from_loose(data: &'a [u8]) -> Result<ObjectRef<'a>, LooseDecodeError> {
pub fn from_loose(data: &'a [u8], hash_len: usize) -> Result<ObjectRef<'a>, LooseDecodeError> {
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's very uncommon to take a hash_len I think, and would expect this to be gix_hash::Kind as this is what it boils down to.
Let's do that instead unless there is a very good reason to use a hash_len.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That makes way more sense, in particular as TreeRefIter and Data don’t grow in size that way.

(The main reason I initially picked hash_len was that it felt more targeted and precise, but in hindsight I agree that gix_hash::Kind is a much better way to carry that information.)

let (kind, size, offset) = loose_header(data)?;

let body = &data[offset..]
Expand All @@ -200,13 +200,13 @@ impl<'a> ObjectRef<'a> {
message: "object data was shorter than its size declared in the header",
})?;

Ok(Self::from_bytes(kind, body)?)
Ok(Self::from_bytes(kind, hash_len, body)?)
}

/// Deserialize an object of `kind` from the given `data`.
pub fn from_bytes(kind: Kind, data: &'a [u8]) -> Result<ObjectRef<'a>, crate::decode::Error> {
pub fn from_bytes(kind: Kind, hash_len: usize, data: &'a [u8]) -> Result<ObjectRef<'a>, crate::decode::Error> {
Ok(match kind {
Kind::Tree => ObjectRef::Tree(TreeRef::from_bytes(data)?),
Kind::Tree => ObjectRef::Tree(TreeRef::from_bytes(data, hash_len)?),
Kind::Blob => ObjectRef::Blob(BlobRef { data }),
Kind::Commit => ObjectRef::Commit(CommitRef::from_bytes(data)?),
Kind::Tag => ObjectRef::Tag(TagRef::from_bytes(data)?),
Expand Down
48 changes: 27 additions & 21 deletions gix-object/src/tree/ref_iter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ where
return ControlFlow::Break(None);
};

let Some(entry) = TreeRefIter::from_bytes(tree.data)
let Some(entry) = TreeRefIter::from_bytes(tree.data, tree.hash_len)
.filter_map(Result::ok)
.find(|entry| component.eq(entry.filename))
else {
Expand All @@ -55,8 +55,8 @@ where

impl<'a> TreeRefIter<'a> {
/// Instantiate an iterator from the given tree data.
pub fn from_bytes(data: &'a [u8]) -> TreeRefIter<'a> {
TreeRefIter { data }
pub fn from_bytes(data: &'a [u8], hash_len: usize) -> TreeRefIter<'a> {
TreeRefIter { data, hash_len }
}

/// Follow a sequence of `path` components starting from this instance, and look them up in `odb` one by one using `buffer`
Expand All @@ -81,7 +81,7 @@ impl<'a> TreeRefIter<'a> {
buffer.extend_from_slice(self.data);

let mut iter = path.into_iter().peekable();
let mut data = crate::Data::new(crate::Kind::Tree, buffer);
let mut data = crate::Data::new(crate::Kind::Tree, self.hash_len, buffer);

loop {
data = match next_entry(&mut iter, data) {
Expand Down Expand Up @@ -123,11 +123,12 @@ impl<'a> TreeRefIter<'a> {

impl<'a> TreeRef<'a> {
/// Deserialize a Tree from `data`.
pub fn from_bytes(mut data: &'a [u8]) -> Result<TreeRef<'a>, crate::decode::Error> {
let input = &mut data;
match decode::tree.parse_next(input) {
pub fn from_bytes(data: &'a [u8], hash_len: usize) -> Result<TreeRef<'a>, crate::decode::Error> {
let state = decode::State { hash_len };
let mut input = decode::Stream { input: data, state };
match decode::tree.parse_next(&mut input) {
Ok(tag) => Ok(tag),
Err(err) => Err(crate::decode::Error::with_err(err, input)),
Err(err) => Err(crate::decode::Error::with_err(err, &input)),
}
}

Expand Down Expand Up @@ -190,7 +191,7 @@ impl<'a> Iterator for TreeRefIter<'a> {
if self.data.is_empty() {
return None;
}
match decode::fast_entry(self.data) {
match decode::fast_entry(self.data, self.hash_len) {
Some((data_left, entry)) => {
self.data = data_left;
Some(Ok(entry))
Expand Down Expand Up @@ -218,47 +219,52 @@ impl<'a> TryFrom<&'a [u8]> for tree::EntryMode {

mod decode {
use bstr::ByteSlice;
use winnow::{error::ParserError, prelude::*};
use winnow::{error::ParserError, prelude::*, Stateful};

use crate::{tree, tree::EntryRef, TreeRef};

pub fn fast_entry(i: &[u8]) -> Option<(&[u8], EntryRef<'_>)> {
pub fn fast_entry(i: &[u8], hash_len: usize) -> Option<(&[u8], EntryRef<'_>)> {
let (mode, i) = tree::EntryMode::extract_from_bytes(i)?;
let (filename, i) = i.split_at(i.find_byte(0)?);
let i = &i[1..];
const HASH_LEN_FIXME: usize = 20; // TODO(SHA256): know actual/desired length or we may overshoot
let (oid, i) = match i.len() {
len if len < HASH_LEN_FIXME => return None,
_ => i.split_at(20),
len if len < hash_len => return None,
_ => i.split_at(hash_len),
};
Some((
i,
EntryRef {
mode,
filename: filename.as_bstr(),
oid: gix_hash::oid::try_from_bytes(oid).expect("we counted exactly 20 bytes"),
oid: gix_hash::oid::try_from_bytes(oid)
.unwrap_or_else(|_| panic!("we counted exactly {hash_len} bytes")),
},
))
}

pub fn tree<'a, E: ParserError<&'a [u8]>>(i: &mut &'a [u8]) -> ModalResult<TreeRef<'a>, E> {
let mut i = &**i;
#[derive(Debug)]
pub struct State {
pub hash_len: usize,
}

pub type Stream<'is> = Stateful<&'is [u8], State>;

pub fn tree<'a, E: ParserError<&'a [u8]>>(stream: &mut Stream<'a>) -> ModalResult<TreeRef<'a>, E> {
Comment on lines +247 to +254
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd love it if object_hash: gix_hash::Kind could be a parameter here with everything else unchanged.
Probably this was first attempted and this is what it needs to be for Winnow, but… honestly, maybe the slow version can even be removed in favor of fast_decode. I wonder if anything speaks against that.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As you suspected, that’s what I tried first. :-) But winnow places certain constraints on what it accepts as a parsing function. Just looking at tree locally, I don’t see why it shouldn’t be possible to rewrite it not to need winnow anymore. I haven’t tried yet, but could, either in this PR or in a follow-up.

let mut i = stream.input;

// Calculate an estimate of the amount of entries to reduce
// the amount of allocations necessary.
// Note that this assumes that we want speed over fitting Vecs, this is a trade-off.
// TODO(SHA256): know actual/desired length for reduced overallocation
const HASH_LEN_FIXME: usize = 20;
const AVERAGE_FILENAME_LEN: usize = 24;
const AVERAGE_MODE_LEN: usize = 6;
const ENTRY_DELIMITER_LEN: usize = 2; // space + trailing zero
const AVERAGE_TREE_ENTRIES: usize = 16 * 2; // prevent overallocation beyond what's meaningful or what could be dangerous
let average_entry_len = ENTRY_DELIMITER_LEN + HASH_LEN_FIXME + AVERAGE_MODE_LEN + AVERAGE_FILENAME_LEN;
let average_entry_len = ENTRY_DELIMITER_LEN + stream.state.hash_len + AVERAGE_MODE_LEN + AVERAGE_FILENAME_LEN;
let upper_bound = i.len() / average_entry_len;
let mut out = Vec::with_capacity(upper_bound.min(AVERAGE_TREE_ENTRIES));

while !i.is_empty() {
let Some((rest, entry)) = fast_entry(i) else {
let Some((rest, entry)) = fast_entry(i, stream.state.hash_len) else {
#[allow(clippy::unit_arg)]
return Err(winnow::error::ErrMode::from_input(&i));
};
Expand Down
Loading
Loading