-
-
Notifications
You must be signed in to change notification settings - Fork 447
feat!: add hash_kind to TreeRefIter and Data
#2497
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 7 commits
88ac70d
2bd3d04
ba73edd
63bc607
bedeb17
260d529
5b1d9b8
43ce8e1
81b9efb
cf256bd
94d703b
c9949ab
bfc5fd9
0cb881e
7dedb58
7383f02
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -263,6 +263,9 @@ pub struct TreeRef<'a> { | |
| /// A directory snapshot containing files (blobs), directories (trees) and submodules (commits), lazily evaluated. | ||
| #[derive(Default, PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] | ||
| pub struct TreeRefIter<'a> { | ||
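| /// The length in bytes of each object id stored in `data`; | ||
| /// entries are split at this length when they are decoded during iteration. | ||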
| hash_len: usize, | ||
| /// The directories and files contained in this tree. | ||
| data: &'a [u8], | ||
| } | ||
|
|
@@ -289,6 +292,9 @@ impl Tree { | |
| pub struct Data<'a> { | ||
| /// kind of object | ||
| pub kind: Kind, | ||
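| /// The length in bytes of the object ids contained in `data`, | ||
| /// which is needed to decode the entries when `data` holds a tree. | ||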
| pub hash_len: usize, | ||
|
||
| /// decoded, decompressed data, owned by a backing store. | ||
| pub data: &'a [u8], | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -191,7 +191,7 @@ pub enum LooseDecodeError { | |
|
|
||
| impl<'a> ObjectRef<'a> { | ||
| /// Deserialize an object from a loose serialisation | ||
| pub fn from_loose(data: &'a [u8]) -> Result<ObjectRef<'a>, LooseDecodeError> { | ||
| pub fn from_loose(data: &'a [u8], hash_len: usize) -> Result<ObjectRef<'a>, LooseDecodeError> { | ||
|
||
| let (kind, size, offset) = loose_header(data)?; | ||
|
|
||
| let body = &data[offset..] | ||
|
|
@@ -200,13 +200,13 @@ impl<'a> ObjectRef<'a> { | |
| message: "object data was shorter than its size declared in the header", | ||
| })?; | ||
|
|
||
| Ok(Self::from_bytes(kind, body)?) | ||
| Ok(Self::from_bytes(kind, hash_len, body)?) | ||
| } | ||
|
|
||
| /// Deserialize an object of `kind` from the given `data`. | ||
| pub fn from_bytes(kind: Kind, data: &'a [u8]) -> Result<ObjectRef<'a>, crate::decode::Error> { | ||
| pub fn from_bytes(kind: Kind, hash_len: usize, data: &'a [u8]) -> Result<ObjectRef<'a>, crate::decode::Error> { | ||
| Ok(match kind { | ||
| Kind::Tree => ObjectRef::Tree(TreeRef::from_bytes(data)?), | ||
| Kind::Tree => ObjectRef::Tree(TreeRef::from_bytes(data, hash_len)?), | ||
| Kind::Blob => ObjectRef::Blob(BlobRef { data }), | ||
| Kind::Commit => ObjectRef::Commit(CommitRef::from_bytes(data)?), | ||
| Kind::Tag => ObjectRef::Tag(TagRef::from_bytes(data)?), | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -39,7 +39,7 @@ where | |
| return ControlFlow::Break(None); | ||
| }; | ||
|
|
||
| let Some(entry) = TreeRefIter::from_bytes(tree.data) | ||
| let Some(entry) = TreeRefIter::from_bytes(tree.data, tree.hash_len) | ||
| .filter_map(Result::ok) | ||
| .find(|entry| component.eq(entry.filename)) | ||
| else { | ||
|
|
@@ -55,8 +55,8 @@ where | |
|
|
||
| impl<'a> TreeRefIter<'a> { | ||
| /// Instantiate an iterator from the given tree data. | ||
| pub fn from_bytes(data: &'a [u8]) -> TreeRefIter<'a> { | ||
| TreeRefIter { data } | ||
| pub fn from_bytes(data: &'a [u8], hash_len: usize) -> TreeRefIter<'a> { | ||
| TreeRefIter { data, hash_len } | ||
| } | ||
|
|
||
| /// Follow a sequence of `path` components starting from this instance, and look them up in `odb` one by one using `buffer` | ||
|
|
@@ -81,7 +81,7 @@ impl<'a> TreeRefIter<'a> { | |
| buffer.extend_from_slice(self.data); | ||
|
|
||
| let mut iter = path.into_iter().peekable(); | ||
| let mut data = crate::Data::new(crate::Kind::Tree, buffer); | ||
| let mut data = crate::Data::new(crate::Kind::Tree, self.hash_len, buffer); | ||
|
|
||
| loop { | ||
| data = match next_entry(&mut iter, data) { | ||
|
|
@@ -123,11 +123,12 @@ impl<'a> TreeRefIter<'a> { | |
|
|
||
| impl<'a> TreeRef<'a> { | ||
| /// Deserialize a Tree from `data`. | ||
| pub fn from_bytes(mut data: &'a [u8]) -> Result<TreeRef<'a>, crate::decode::Error> { | ||
| let input = &mut data; | ||
| match decode::tree.parse_next(input) { | ||
| pub fn from_bytes(data: &'a [u8], hash_len: usize) -> Result<TreeRef<'a>, crate::decode::Error> { | ||
| let state = decode::State { hash_len }; | ||
| let mut input = decode::Stream { input: data, state }; | ||
| match decode::tree.parse_next(&mut input) { | ||
| Ok(tag) => Ok(tag), | ||
| Err(err) => Err(crate::decode::Error::with_err(err, input)), | ||
| Err(err) => Err(crate::decode::Error::with_err(err, &input)), | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -190,7 +191,7 @@ impl<'a> Iterator for TreeRefIter<'a> { | |
| if self.data.is_empty() { | ||
| return None; | ||
| } | ||
| match decode::fast_entry(self.data) { | ||
| match decode::fast_entry(self.data, self.hash_len) { | ||
| Some((data_left, entry)) => { | ||
| self.data = data_left; | ||
| Some(Ok(entry)) | ||
|
|
@@ -218,47 +219,52 @@ impl<'a> TryFrom<&'a [u8]> for tree::EntryMode { | |
|
|
||
| mod decode { | ||
| use bstr::ByteSlice; | ||
| use winnow::{error::ParserError, prelude::*}; | ||
| use winnow::{error::ParserError, prelude::*, Stateful}; | ||
|
|
||
| use crate::{tree, tree::EntryRef, TreeRef}; | ||
|
|
||
| pub fn fast_entry(i: &[u8]) -> Option<(&[u8], EntryRef<'_>)> { | ||
| pub fn fast_entry(i: &[u8], hash_len: usize) -> Option<(&[u8], EntryRef<'_>)> { | ||
| let (mode, i) = tree::EntryMode::extract_from_bytes(i)?; | ||
| let (filename, i) = i.split_at(i.find_byte(0)?); | ||
| let i = &i[1..]; | ||
| const HASH_LEN_FIXME: usize = 20; // TODO(SHA256): know actual/desired length or we may overshoot | ||
| let (oid, i) = match i.len() { | ||
| len if len < HASH_LEN_FIXME => return None, | ||
| _ => i.split_at(20), | ||
| len if len < hash_len => return None, | ||
| _ => i.split_at(hash_len), | ||
| }; | ||
| Some(( | ||
| i, | ||
| EntryRef { | ||
| mode, | ||
| filename: filename.as_bstr(), | ||
| oid: gix_hash::oid::try_from_bytes(oid).expect("we counted exactly 20 bytes"), | ||
| oid: gix_hash::oid::try_from_bytes(oid) | ||
| .unwrap_or_else(|_| panic!("we counted exactly {hash_len} bytes")), | ||
| }, | ||
| )) | ||
| } | ||
|
|
||
| pub fn tree<'a, E: ParserError<&'a [u8]>>(i: &mut &'a [u8]) -> ModalResult<TreeRef<'a>, E> { | ||
| let mut i = &**i; | ||
| #[derive(Debug)] | ||
| pub struct State { | ||
| pub hash_len: usize, | ||
| } | ||
|
|
||
| pub type Stream<'is> = Stateful<&'is [u8], State>; | ||
|
|
||
| pub fn tree<'a, E: ParserError<&'a [u8]>>(stream: &mut Stream<'a>) -> ModalResult<TreeRef<'a>, E> { | ||
|
Comment on lines
+247
to
+254
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd love it if
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As you suspected, that’s what I tried first. :-) But |
||
| let mut i = stream.input; | ||
|
|
||
| // Calculate an estimate of the amount of entries to reduce | ||
| // the amount of allocations necessary. | ||
| // Note that this assumes that we want speed over fitting Vecs, this is a trade-off. | ||
| // TODO(SHA256): know actual/desired length for reduced overallocation | ||
| const HASH_LEN_FIXME: usize = 20; | ||
| const AVERAGE_FILENAME_LEN: usize = 24; | ||
| const AVERAGE_MODE_LEN: usize = 6; | ||
| const ENTRY_DELIMITER_LEN: usize = 2; // space + trailing zero | ||
| const AVERAGE_TREE_ENTRIES: usize = 16 * 2; // prevent overallocation beyond what's meaningful or what could be dangerous | ||
| let average_entry_len = ENTRY_DELIMITER_LEN + HASH_LEN_FIXME + AVERAGE_MODE_LEN + AVERAGE_FILENAME_LEN; | ||
| let average_entry_len = ENTRY_DELIMITER_LEN + stream.state.hash_len + AVERAGE_MODE_LEN + AVERAGE_FILENAME_LEN; | ||
| let upper_bound = i.len() / average_entry_len; | ||
| let mut out = Vec::with_capacity(upper_bound.min(AVERAGE_TREE_ENTRIES)); | ||
|
|
||
| while !i.is_empty() { | ||
| let Some((rest, entry)) = fast_entry(i) else { | ||
| let Some((rest, entry)) = fast_entry(i, stream.state.hash_len) else { | ||
| #[allow(clippy::unit_arg)] | ||
| return Err(winnow::error::ErrMode::from_input(&i)); | ||
| }; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
To my mind it's fine to use `hash_len` if it stays private (after all, that's what the iter really wants). One can even then consider keeping `Kind` instead, as the function to get its length is const for sure and thus has no chance of being worse than the whole `usize`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Changed to `hash_kind`. (Aside: this is called `object_hash` in other places; I started wondering whether it should also be called that here.)
Using `hash_kind` instead of `hash_len` also has the advantage that it can be packed and doesn’t grow the size of either `TreeRefIter` or `Data`.