diff --git a/gix-pack/src/cache/delta/mod.rs b/gix-pack/src/cache/delta/mod.rs index 435008edff2..c923b5d1078 100644 --- a/gix-pack/src/cache/delta/mod.rs +++ b/gix-pack/src/cache/delta/mod.rs @@ -17,184 +17,14 @@ pub mod traverse; /// pub mod from_offsets; -/// An item stored within the [`Tree`] whose data is stored in a pack file, identified by -/// the offset of its first (`offset`) and last (`next_offset`) bytes. -/// -/// It represents either a root entry, or one that relies on a base to be resolvable, -/// alongside associated `data` `T`. -pub struct Item { - /// The offset into the pack file at which the pack entry's data is located. - pub offset: crate::data::Offset, - /// The offset of the next item in the pack file. - pub next_offset: crate::data::Offset, - /// Data to store with each Item, effectively data associated with each entry in a pack. - pub data: T, - /// Indices into our Tree's `items`, one for each pack entry that depends on us. - /// - /// Limited to u32 as that's the maximum amount of objects in a pack. - children: Vec, -} +/// Tree datastructure +// kept in separate module to encapsulate unsafety (it has field invariants) +mod tree; -/// Identify what kind of node we have last seen -enum NodeKind { - Root, - Child, -} - -/// A tree that allows one-time iteration over all nodes and their children, consuming it in the process, -/// while being shareable among threads without a lock. -/// It does this by making the guarantee that iteration only happens once. -pub struct Tree { - /// The root nodes, i.e. base objects - root_items: Vec>, - /// The child nodes, i.e. those that rely a base object, like ref and ofs delta objects - child_items: Vec>, - /// The last encountered node was either a root or a child. - last_seen: Option, - /// Future child offsets, associating their offset into the pack with their index in the items array. - /// (parent_offset, child_index) - future_child_offsets: Vec<(crate::data::Offset, usize)>, -} - -impl Tree { - /// Instantiate a empty tree capable of storing `num_objects` amounts of items. - pub fn with_capacity(num_objects: usize) -> Result { - Ok(Tree { - root_items: Vec::with_capacity(num_objects / 2), - child_items: Vec::with_capacity(num_objects / 2), - last_seen: None, - future_child_offsets: Vec::new(), - }) - } - - fn num_items(&self) -> usize { - self.root_items.len() + self.child_items.len() - } - - fn assert_is_incrementing_and_update_next_offset(&mut self, offset: crate::data::Offset) -> Result<(), Error> { - let items = match &self.last_seen { - Some(NodeKind::Root) => &mut self.root_items, - Some(NodeKind::Child) => &mut self.child_items, - None => return Ok(()), - }; - let item = &mut items.last_mut().expect("last seen won't lie"); - if offset <= item.offset { - return Err(Error::InvariantIncreasingPackOffset { - last_pack_offset: item.offset, - pack_offset: offset, - }); - } - item.next_offset = offset; - Ok(()) - } - - fn set_pack_entries_end_and_resolve_ref_offsets( - &mut self, - pack_entries_end: crate::data::Offset, - ) -> Result<(), traverse::Error> { - if !self.future_child_offsets.is_empty() { - for (parent_offset, child_index) in self.future_child_offsets.drain(..) { - if let Ok(i) = self.child_items.binary_search_by_key(&parent_offset, |i| i.offset) { - self.child_items[i].children.push(child_index as u32); - } else if let Ok(i) = self.root_items.binary_search_by_key(&parent_offset, |i| i.offset) { - self.root_items[i].children.push(child_index as u32); - } else { - return Err(traverse::Error::OutOfPackRefDelta { - base_pack_offset: parent_offset, - }); - } - } - } - - self.assert_is_incrementing_and_update_next_offset(pack_entries_end) - .expect("BUG: pack now is smaller than all previously seen entries"); - Ok(()) - } - - /// Add a new root node, one that only has children but is not a child itself, at the given pack `offset` and associate - /// custom `data` with it. - pub fn add_root(&mut self, offset: crate::data::Offset, data: T) -> Result<(), Error> { - self.assert_is_incrementing_and_update_next_offset(offset)?; - self.last_seen = NodeKind::Root.into(); - self.root_items.push(Item { - offset, - next_offset: 0, - data, - children: Default::default(), - }); - Ok(()) - } - - /// Add a child of the item at `base_offset` which itself resides at pack `offset` and associate custom `data` with it. - pub fn add_child( - &mut self, - base_offset: crate::data::Offset, - offset: crate::data::Offset, - data: T, - ) -> Result<(), Error> { - self.assert_is_incrementing_and_update_next_offset(offset)?; - - let next_child_index = self.child_items.len(); - if let Ok(i) = self.child_items.binary_search_by_key(&base_offset, |i| i.offset) { - self.child_items[i].children.push(next_child_index as u32); - } else if let Ok(i) = self.root_items.binary_search_by_key(&base_offset, |i| i.offset) { - self.root_items[i].children.push(next_child_index as u32); - } else { - self.future_child_offsets.push((base_offset, next_child_index)); - } - - self.last_seen = NodeKind::Child.into(); - self.child_items.push(Item { - offset, - next_offset: 0, - data, - children: Default::default(), - }); - Ok(()) - } -} +pub use tree::{Item, Tree}; #[cfg(test)] mod tests { - mod tree { - mod from_offsets_in_pack { - use std::sync::atomic::AtomicBool; - - use crate as pack; - - const SMALL_PACK_INDEX: &str = "objects/pack/pack-a2bf8e71d8c18879e499335762dd95119d93d9f1.idx"; - const SMALL_PACK: &str = "objects/pack/pack-a2bf8e71d8c18879e499335762dd95119d93d9f1.pack"; - - const INDEX_V1: &str = "objects/pack/pack-c0438c19fb16422b6bbcce24387b3264416d485b.idx"; - const PACK_FOR_INDEX_V1: &str = "objects/pack/pack-c0438c19fb16422b6bbcce24387b3264416d485b.pack"; - - use gix_testtools::fixture_path; - - #[test] - fn v1() -> Result<(), Box> { - tree(INDEX_V1, PACK_FOR_INDEX_V1) - } - - #[test] - fn v2() -> Result<(), Box> { - tree(SMALL_PACK_INDEX, SMALL_PACK) - } - - fn tree(index_path: &str, pack_path: &str) -> Result<(), Box> { - let idx = pack::index::File::at(fixture_path(index_path), gix_hash::Kind::Sha1)?; - crate::cache::delta::Tree::from_offsets_in_pack( - &fixture_path(pack_path), - idx.sorted_offsets().into_iter(), - &|ofs| *ofs, - &|id| idx.lookup(id).map(|index| idx.pack_offset_at_index(index)), - &mut gix_features::progress::Discard, - &AtomicBool::new(false), - gix_hash::Kind::Sha1, - )?; - Ok(()) - } - } - } #[test] fn size_of_pack_tree_item() { diff --git a/gix-pack/src/cache/delta/traverse/mod.rs b/gix-pack/src/cache/delta/traverse/mod.rs index ee348162709..ea0c9c93be9 100644 --- a/gix-pack/src/cache/delta/traverse/mod.rs +++ b/gix-pack/src/cache/delta/traverse/mod.rs @@ -132,10 +132,11 @@ where let object_progress = OwnShared::new(Mutable::new(object_progress)); let start = std::time::Instant::now(); - let child_items = ItemSliceSync::new(&mut self.child_items); + let (mut root_items, mut child_items_vec) = self.take_root_and_child(); + let child_items = ItemSliceSync::new(&mut child_items_vec); let child_items = &child_items; in_parallel_with_slice( - &mut self.root_items, + &mut root_items, thread_limit, { { @@ -154,16 +155,22 @@ where }, { move |node, state, threads_left, should_interrupt| { - resolve::deltas( - object_counter.clone(), - size_counter.clone(), - node, - state, - resolve_data, - object_hash.len_in_bytes(), - threads_left, - should_interrupt, - ) + // SAFETY: This invariant is upheld since `child_items` and `node` come from the same Tree. + // This means we can rely on Tree's invariant that node.children will be the only `children` array in + // for nodes in this tree that will contain any of those children. + #[allow(unsafe_code)] + unsafe { + resolve::deltas( + object_counter.clone(), + size_counter.clone(), + node, + state, + resolve_data, + object_hash.len_in_bytes(), + threads_left, + should_interrupt, + ) + } } }, || (!should_interrupt.load(Ordering::Relaxed)).then(|| std::time::Duration::from_millis(50)), @@ -174,8 +181,8 @@ where size_progress.show_throughput(start); Ok(Outcome { - roots: self.root_items, - children: self.child_items, + roots: root_items, + children: child_items_vec, }) } } diff --git a/gix-pack/src/cache/delta/traverse/resolve.rs b/gix-pack/src/cache/delta/traverse/resolve.rs index aad68ee274c..92bff6b03e8 100644 --- a/gix-pack/src/cache/delta/traverse/resolve.rs +++ b/gix-pack/src/cache/delta/traverse/resolve.rs @@ -19,17 +19,25 @@ mod root { /// An item returned by `iter_root_chunks`, allowing access to the `data` stored alongside nodes in a [`Tree`]. pub(crate) struct Node<'a, T: Send> { + // SAFETY INVARIANT: see Node::new(). That function is the only one used + // to create or modify these fields. item: &'a mut Item, child_items: &'a ItemSliceSync<'a, Item>, } impl<'a, T: Send> Node<'a, T> { - /// SAFETY: The child_items must be unique among between users of the `ItemSliceSync`. + /// SAFETY: `item.children` must uniquely reference elements in child_items that no other currently alive + /// item does. All child_items must also have unique children, unless the child_item is itself `item`, + /// in which case no other live item should reference it in its `item.children`. + /// + /// This safety invariant can be reliably upheld by making sure `item` comes from a Tree and `child_items` + /// was constructed using that Tree's child_items. This works since Tree has this invariant as well: all + /// child_items are referenced at most once (really, exactly once) by a node in the tree. + /// + /// Note that this invariant is a bit more relaxed than that on `deltas()`, because this function can be called + /// for traversal within a child item, which happens in into_child_iter() #[allow(unsafe_code)] - pub(in crate::cache::delta::traverse) unsafe fn new( - item: &'a mut Item, - child_items: &'a ItemSliceSync<'a, Item>, - ) -> Self { + pub(super) unsafe fn new(item: &'a mut Item, child_items: &'a ItemSliceSync<'a, Item>) -> Self { Node { item, child_items } } } @@ -52,7 +60,7 @@ mod root { /// Returns true if this node has children, e.g. is not a leaf in the tree. pub fn has_children(&self) -> bool { - !self.item.children.is_empty() + !self.item.children().is_empty() } /// Transform this `Node` into an iterator over its children. @@ -60,18 +68,20 @@ mod root { /// Children are `Node`s referring to pack entries whose base object is this pack entry. pub fn into_child_iter(self) -> impl Iterator> + 'a { let children = self.child_items; - // SAFETY: The index is a valid index into the children array. - // SAFETY: The resulting mutable pointer cannot be yielded by any other node. #[allow(unsafe_code)] - self.item.children.iter().map(move |&index| Node { - item: unsafe { children.get_mut(index as usize) }, - child_items: children, + self.item.children().iter().map(move |&index| { + // SAFETY: Due to the invariant on new(), we can rely on these indices + // being unique. + let item = unsafe { children.get_mut(index as usize) }; + // SAFETY: Since every child_item is also required to uphold the uniqueness guarantee, + // creating a Node with one of the child_items that we are allowed access to is still fine. + unsafe { Node::new(item, children) } }) } } } -pub(in crate::cache::delta::traverse) struct State<'items, F, MBFN, T: Send> { +pub(super) struct State<'items, F, MBFN, T: Send> { pub delta_bytes: Vec, pub fully_resolved_delta_bytes: Vec, pub progress: Box, @@ -80,8 +90,15 @@ pub(in crate::cache::delta::traverse) struct State<'items, F, MBFN, T: Send> { pub child_items: &'items ItemSliceSync<'items, Item>, } -#[allow(clippy::too_many_arguments)] -pub(in crate::cache::delta::traverse) fn deltas( +/// SAFETY: `item.children` must uniquely reference elements in child_items that no other currently alive +/// item does. All child_items must also have unique children. +/// +/// This safety invariant can be reliably upheld by making sure `item` comes from a Tree and `child_items` +/// was constructed using that Tree's child_items. This works since Tree has this invariant as well: all +/// child_items are referenced at most once (really, exactly once) by a node in the tree. +#[allow(clippy::too_many_arguments, unsafe_code)] +#[deny(unsafe_op_in_unsafe_fn)] // this is a big function, require unsafe for the one small unsafe op we have +pub(super) unsafe fn deltas( objects: gix_features::progress::StepShared, size: gix_features::progress::StepShared, item: &mut Item, @@ -121,7 +138,7 @@ where // each node is a base, and its children always start out as deltas which become a base after applying them. // These will be pushed onto our stack until all are processed let root_level = 0; - // SAFETY: The child items are unique, as `item` is the root of a tree of dependent child items. + // SAFETY: This invariant is required from the caller #[allow(unsafe_code)] let root_node = unsafe { root::Node::new(item, child_items) }; let mut nodes: Vec<_> = vec![(root_level, root_node)]; diff --git a/gix-pack/src/cache/delta/traverse/util.rs b/gix-pack/src/cache/delta/traverse/util.rs index 030383a7a61..35d5642c759 100644 --- a/gix-pack/src/cache/delta/traverse/util.rs +++ b/gix-pack/src/cache/delta/traverse/util.rs @@ -16,21 +16,21 @@ use std::marker::PhantomData; /// more than one base. And that's what one would really have to do for two threads to encounter the same child. /// /// Thus I believe it's impossible for this data structure to end up in a place where it violates its assumption. -pub(in crate::cache::delta::traverse) struct ItemSliceSync<'a, T> +pub(super) struct ItemSliceSync<'a, T> where T: Send, { items: *mut T, #[cfg(debug_assertions)] len: usize, - phantom: PhantomData<&'a T>, + phantom: PhantomData<&'a mut T>, } impl<'a, T> ItemSliceSync<'a, T> where T: Send, { - pub(in crate::cache::delta::traverse) fn new(items: &'a mut [T]) -> Self { + pub(super) fn new(items: &'a mut [T]) -> Self { ItemSliceSync { items: items.as_mut_ptr(), #[cfg(debug_assertions)] @@ -41,21 +41,24 @@ where // SAFETY: The index must point into the slice and must not be reused concurrently. #[allow(unsafe_code)] - pub(in crate::cache::delta::traverse) unsafe fn get_mut(&self, index: usize) -> &'a mut T { + pub(super) unsafe fn get_mut(&self, index: usize) -> &'a mut T { #[cfg(debug_assertions)] if index >= self.len { panic!("index out of bounds: the len is {} but the index is {index}", self.len); } - // SAFETY: The index is within the slice - // SAFETY: The children array is alive by the 'a lifetime. + // SAFETY: + // - The index is within the slice (required by documentation) + // - We have mutable access to `items` as ensured by Self::new() + // - This is the only method on this type giving access to items + // - The documentation requires that this access is unique unsafe { &mut *self.items.add(index) } } } -// SAFETY: T is `Send`, and we only use the pointer for creating new pointers. +// SAFETY: This is logically an &mut T, which is Send if T is Send +// (note: this is different from &T, which also needs T: Sync) #[allow(unsafe_code)] unsafe impl Send for ItemSliceSync<'_, T> where T: Send {} -// SAFETY: T is `Send`, and as long as the user follows the contract of -// `get_mut()`, we only ever access one T at a time. +// SAFETY: This is logically an &mut T, which is Sync if T is Sync #[allow(unsafe_code)] unsafe impl Sync for ItemSliceSync<'_, T> where T: Send {} diff --git a/gix-pack/src/cache/delta/tree.rs b/gix-pack/src/cache/delta/tree.rs new file mode 100644 index 00000000000..f26cf5a0564 --- /dev/null +++ b/gix-pack/src/cache/delta/tree.rs @@ -0,0 +1,250 @@ +use super::{traverse, Error}; +/// An item stored within the [`Tree`] whose data is stored in a pack file, identified by +/// the offset of its first (`offset`) and last (`next_offset`) bytes. +/// +/// It represents either a root entry, or one that relies on a base to be resolvable, +/// alongside associated `data` `T`. +pub struct Item { + /// The offset into the pack file at which the pack entry's data is located. + pub offset: crate::data::Offset, + /// The offset of the next item in the pack file. + pub next_offset: crate::data::Offset, + /// Data to store with each Item, effectively data associated with each entry in a pack. + pub data: T, + /// Indices into our Tree's `items`, one for each pack entry that depends on us. + /// + /// Limited to u32 as that's the maximum amount of objects in a pack. + // SAFETY INVARIANT: + // - only one Item in a tree may have any given child index. `future_child_offsets` + // should also not contain any indices found in `children`.\ + // - These indices should be in bounds for tree.child_items + children: Vec, +} + +impl Item { + /// Get the children + // (we don't want to expose mutable access) + pub fn children(&self) -> &[u32] { + &self.children + } +} + +/// Identify what kind of node we have last seen +enum NodeKind { + Root, + Child, +} + +/// A tree that allows one-time iteration over all nodes and their children, consuming it in the process, +/// while being shareable among threads without a lock. +/// It does this by making the guarantee that iteration only happens once. +pub struct Tree { + /// The root nodes, i.e. base objects + // SAFETY invariant: see Item.children + root_items: Vec>, + /// The child nodes, i.e. those that rely a base object, like ref and ofs delta objects + // SAFETY invariant: see Item.children + child_items: Vec>, + /// The last encountered node was either a root or a child. + last_seen: Option, + /// Future child offsets, associating their offset into the pack with their index in the items array. + /// (parent_offset, child_index) + // SAFETY invariant: + // - None of these child indices should already have parents + // i.e. future_child_offsets[i].1 should never be also found + // in Item.children. Indices should be found here at most once. + // - These indices should be in bounds for tree.child_items. + future_child_offsets: Vec<(crate::data::Offset, usize)>, +} + +impl Tree { + /// Instantiate a empty tree capable of storing `num_objects` amounts of items. + pub fn with_capacity(num_objects: usize) -> Result { + Ok(Tree { + root_items: Vec::with_capacity(num_objects / 2), + child_items: Vec::with_capacity(num_objects / 2), + last_seen: None, + future_child_offsets: Vec::new(), + }) + } + + pub(super) fn num_items(&self) -> usize { + self.root_items.len() + self.child_items.len() + } + + /// Returns self's root and child items. + /// + /// You can rely on them following the same `children` invariants as they did in the tree + pub(super) fn take_root_and_child(self) -> (Vec>, Vec>) { + (self.root_items, self.child_items) + } + + pub(super) fn assert_is_incrementing_and_update_next_offset( + &mut self, + offset: crate::data::Offset, + ) -> Result<(), Error> { + let items = match &self.last_seen { + Some(NodeKind::Root) => &mut self.root_items, + Some(NodeKind::Child) => &mut self.child_items, + None => return Ok(()), + }; + let item = &mut items.last_mut().expect("last seen won't lie"); + if offset <= item.offset { + return Err(Error::InvariantIncreasingPackOffset { + last_pack_offset: item.offset, + pack_offset: offset, + }); + } + item.next_offset = offset; + Ok(()) + } + + pub(super) fn set_pack_entries_end_and_resolve_ref_offsets( + &mut self, + pack_entries_end: crate::data::Offset, + ) -> Result<(), traverse::Error> { + if !self.future_child_offsets.is_empty() { + for (parent_offset, child_index) in self.future_child_offsets.drain(..) { + // SAFETY invariants upheld: + // - We are draining from future_child_offsets and adding to children, keeping things the same. + // - We can rely on the `future_child_offsets` invariant to be sure that `children` is + // not getting any indices that are already in use in `children` elsewhere + // - The indices are in bounds for child_items since they were in bounds for future_child_offsets, + // we can carry over the invariant. + if let Ok(i) = self.child_items.binary_search_by_key(&parent_offset, |i| i.offset) { + self.child_items[i].children.push(child_index as u32); + } else if let Ok(i) = self.root_items.binary_search_by_key(&parent_offset, |i| i.offset) { + self.root_items[i].children.push(child_index as u32); + } else { + return Err(traverse::Error::OutOfPackRefDelta { + base_pack_offset: parent_offset, + }); + } + } + } + + self.assert_is_incrementing_and_update_next_offset(pack_entries_end) + .expect("BUG: pack now is smaller than all previously seen entries"); + Ok(()) + } + + /// Add a new root node, one that only has children but is not a child itself, at the given pack `offset` and associate + /// custom `data` with it. + pub fn add_root(&mut self, offset: crate::data::Offset, data: T) -> Result<(), Error> { + self.assert_is_incrementing_and_update_next_offset(offset)?; + self.last_seen = NodeKind::Root.into(); + self.root_items.push(Item { + offset, + next_offset: 0, + data, + // SAFETY INVARIANT upheld: there are no children + children: Default::default(), + }); + Ok(()) + } + + /// Add a child of the item at `base_offset` which itself resides at pack `offset` and associate custom `data` with it. + pub fn add_child( + &mut self, + base_offset: crate::data::Offset, + offset: crate::data::Offset, + data: T, + ) -> Result<(), Error> { + self.assert_is_incrementing_and_update_next_offset(offset)?; + + let next_child_index = self.child_items.len(); + // SAFETY INVARIANT upheld: + // - This is one of two methods that modifies `children` and future_child_offsets. Out + // of the two, it is the only one that produces new indices in the system. + // - This always pushes next_child_index to *either* `children` or `future_child_offsets`, + // maintaining the cross-field invariant there. + // - This method will always push to child_items (at the end), incrementing + // future values of next_child_index. This means next_child_index is always + // unique for this method call. + // - As the only method producing new indices, this is the only time + // next_child_index will be added to children/future_child_offsets, upholding the invariant. + // - Since next_child_index will always be a valid index by the end of this method, + // this always produces valid in-bounds indices, upholding the bounds invariant. + + if let Ok(i) = self.child_items.binary_search_by_key(&base_offset, |i| i.offset) { + self.child_items[i].children.push(next_child_index as u32); + } else if let Ok(i) = self.root_items.binary_search_by_key(&base_offset, |i| i.offset) { + self.root_items[i].children.push(next_child_index as u32); + } else { + self.future_child_offsets.push((base_offset, next_child_index)); + } + + self.last_seen = NodeKind::Child.into(); + self.child_items.push(Item { + offset, + next_offset: 0, + data, + // SAFETY INVARIANT upheld: there are no children + children: Default::default(), + }); + Ok(()) + } +} + +#[cfg(test)] +mod tests { + mod from_offsets_in_pack { + use std::sync::atomic::AtomicBool; + + use crate as pack; + + const SMALL_PACK_INDEX: &str = "objects/pack/pack-a2bf8e71d8c18879e499335762dd95119d93d9f1.idx"; + const SMALL_PACK: &str = "objects/pack/pack-a2bf8e71d8c18879e499335762dd95119d93d9f1.pack"; + + const INDEX_V1: &str = "objects/pack/pack-c0438c19fb16422b6bbcce24387b3264416d485b.idx"; + const PACK_FOR_INDEX_V1: &str = "objects/pack/pack-c0438c19fb16422b6bbcce24387b3264416d485b.pack"; + + use gix_testtools::fixture_path; + + #[test] + fn v1() -> Result<(), Box> { + tree(INDEX_V1, PACK_FOR_INDEX_V1) + } + + #[test] + fn v2() -> Result<(), Box> { + tree(SMALL_PACK_INDEX, SMALL_PACK) + } + + fn tree(index_path: &str, pack_path: &str) -> Result<(), Box> { + let idx = pack::index::File::at(fixture_path(index_path), gix_hash::Kind::Sha1)?; + crate::cache::delta::Tree::from_offsets_in_pack( + &fixture_path(pack_path), + idx.sorted_offsets().into_iter(), + &|ofs| *ofs, + &|id| idx.lookup(id).map(|index| idx.pack_offset_at_index(index)), + &mut gix_features::progress::Discard, + &AtomicBool::new(false), + gix_hash::Kind::Sha1, + )?; + Ok(()) + } + } + + #[test] + fn size_of_pack_tree_item() { + use super::Item; + assert_eq!(std::mem::size_of::<[Item<()>; 7_500_000]>(), 300_000_000); + } + + #[test] + fn size_of_pack_verify_data_structure() { + use super::Item; + pub struct EntryWithDefault { + _index_entry: crate::index::Entry, + _kind: gix_object::Kind, + _object_size: u64, + _decompressed_size: u64, + _compressed_size: u64, + _header_size: u16, + _level: u16, + } + + assert_eq!(std::mem::size_of::<[Item; 7_500_000]>(), 840_000_000); + } +}