Skip to content

Properly track safety invariants #1237

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jan 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
178 changes: 4 additions & 174 deletions gix-pack/src/cache/delta/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,184 +17,14 @@ pub mod traverse;
///
pub mod from_offsets;

/// An item stored within the [`Tree`] whose data is stored in a pack file, identified by
/// the offset of its first (`offset`) and last (`next_offset`) bytes.
///
/// It represents either a root entry, or one that relies on a base to be resolvable,
/// alongside associated `data` `T`.
pub struct Item<T> {
/// The offset into the pack file at which the pack entry's data is located.
pub offset: crate::data::Offset,
/// The offset of the next item in the pack file.
pub next_offset: crate::data::Offset,
/// Data to store with each Item, effectively data associated with each entry in a pack.
pub data: T,
/// Indices into our Tree's `items`, one for each pack entry that depends on us.
///
/// Limited to u32 as that's the maximum amount of objects in a pack.
children: Vec<u32>,
}
/// Tree datastructure
// kept in separate module to encapsulate unsafety (it has field invariants)
mod tree;

/// Identify what kind of node we have last seen
enum NodeKind {
Root,
Child,
}

/// A tree that allows one-time iteration over all nodes and their children, consuming it in the process,
/// while being shareable among threads without a lock.
/// It does this by making the guarantee that iteration only happens once.
pub struct Tree<T> {
/// The root nodes, i.e. base objects
root_items: Vec<Item<T>>,
/// The child nodes, i.e. those that rely a base object, like ref and ofs delta objects
child_items: Vec<Item<T>>,
/// The last encountered node was either a root or a child.
last_seen: Option<NodeKind>,
/// Future child offsets, associating their offset into the pack with their index in the items array.
/// (parent_offset, child_index)
future_child_offsets: Vec<(crate::data::Offset, usize)>,
}

impl<T> Tree<T> {
/// Instantiate a empty tree capable of storing `num_objects` amounts of items.
pub fn with_capacity(num_objects: usize) -> Result<Self, Error> {
Ok(Tree {
root_items: Vec::with_capacity(num_objects / 2),
child_items: Vec::with_capacity(num_objects / 2),
last_seen: None,
future_child_offsets: Vec::new(),
})
}

fn num_items(&self) -> usize {
self.root_items.len() + self.child_items.len()
}

fn assert_is_incrementing_and_update_next_offset(&mut self, offset: crate::data::Offset) -> Result<(), Error> {
let items = match &self.last_seen {
Some(NodeKind::Root) => &mut self.root_items,
Some(NodeKind::Child) => &mut self.child_items,
None => return Ok(()),
};
let item = &mut items.last_mut().expect("last seen won't lie");
if offset <= item.offset {
return Err(Error::InvariantIncreasingPackOffset {
last_pack_offset: item.offset,
pack_offset: offset,
});
}
item.next_offset = offset;
Ok(())
}

fn set_pack_entries_end_and_resolve_ref_offsets(
&mut self,
pack_entries_end: crate::data::Offset,
) -> Result<(), traverse::Error> {
if !self.future_child_offsets.is_empty() {
for (parent_offset, child_index) in self.future_child_offsets.drain(..) {
if let Ok(i) = self.child_items.binary_search_by_key(&parent_offset, |i| i.offset) {
self.child_items[i].children.push(child_index as u32);
} else if let Ok(i) = self.root_items.binary_search_by_key(&parent_offset, |i| i.offset) {
self.root_items[i].children.push(child_index as u32);
} else {
return Err(traverse::Error::OutOfPackRefDelta {
base_pack_offset: parent_offset,
});
}
}
}

self.assert_is_incrementing_and_update_next_offset(pack_entries_end)
.expect("BUG: pack now is smaller than all previously seen entries");
Ok(())
}

/// Add a new root node, one that only has children but is not a child itself, at the given pack `offset` and associate
/// custom `data` with it.
pub fn add_root(&mut self, offset: crate::data::Offset, data: T) -> Result<(), Error> {
self.assert_is_incrementing_and_update_next_offset(offset)?;
self.last_seen = NodeKind::Root.into();
self.root_items.push(Item {
offset,
next_offset: 0,
data,
children: Default::default(),
});
Ok(())
}

/// Add a child of the item at `base_offset` which itself resides at pack `offset` and associate custom `data` with it.
pub fn add_child(
&mut self,
base_offset: crate::data::Offset,
offset: crate::data::Offset,
data: T,
) -> Result<(), Error> {
self.assert_is_incrementing_and_update_next_offset(offset)?;

let next_child_index = self.child_items.len();
if let Ok(i) = self.child_items.binary_search_by_key(&base_offset, |i| i.offset) {
self.child_items[i].children.push(next_child_index as u32);
} else if let Ok(i) = self.root_items.binary_search_by_key(&base_offset, |i| i.offset) {
self.root_items[i].children.push(next_child_index as u32);
} else {
self.future_child_offsets.push((base_offset, next_child_index));
}

self.last_seen = NodeKind::Child.into();
self.child_items.push(Item {
offset,
next_offset: 0,
data,
children: Default::default(),
});
Ok(())
}
}
pub use tree::{Item, Tree};

#[cfg(test)]
mod tests {
mod tree {
mod from_offsets_in_pack {
use std::sync::atomic::AtomicBool;

use crate as pack;

const SMALL_PACK_INDEX: &str = "objects/pack/pack-a2bf8e71d8c18879e499335762dd95119d93d9f1.idx";
const SMALL_PACK: &str = "objects/pack/pack-a2bf8e71d8c18879e499335762dd95119d93d9f1.pack";

const INDEX_V1: &str = "objects/pack/pack-c0438c19fb16422b6bbcce24387b3264416d485b.idx";
const PACK_FOR_INDEX_V1: &str = "objects/pack/pack-c0438c19fb16422b6bbcce24387b3264416d485b.pack";

use gix_testtools::fixture_path;

#[test]
fn v1() -> Result<(), Box<dyn std::error::Error>> {
tree(INDEX_V1, PACK_FOR_INDEX_V1)
}

#[test]
fn v2() -> Result<(), Box<dyn std::error::Error>> {
tree(SMALL_PACK_INDEX, SMALL_PACK)
}

fn tree(index_path: &str, pack_path: &str) -> Result<(), Box<dyn std::error::Error>> {
let idx = pack::index::File::at(fixture_path(index_path), gix_hash::Kind::Sha1)?;
crate::cache::delta::Tree::from_offsets_in_pack(
&fixture_path(pack_path),
idx.sorted_offsets().into_iter(),
&|ofs| *ofs,
&|id| idx.lookup(id).map(|index| idx.pack_offset_at_index(index)),
&mut gix_features::progress::Discard,
&AtomicBool::new(false),
gix_hash::Kind::Sha1,
)?;
Ok(())
}
}
}

#[test]
fn size_of_pack_tree_item() {
Expand Down
35 changes: 21 additions & 14 deletions gix-pack/src/cache/delta/traverse/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -132,10 +132,11 @@ where
let object_progress = OwnShared::new(Mutable::new(object_progress));

let start = std::time::Instant::now();
let child_items = ItemSliceSync::new(&mut self.child_items);
let (mut root_items, mut child_items_vec) = self.take_root_and_child();
let child_items = ItemSliceSync::new(&mut child_items_vec);
let child_items = &child_items;
in_parallel_with_slice(
&mut self.root_items,
&mut root_items,
thread_limit,
{
{
Expand All @@ -154,16 +155,22 @@ where
},
{
move |node, state, threads_left, should_interrupt| {
resolve::deltas(
object_counter.clone(),
size_counter.clone(),
node,
state,
resolve_data,
object_hash.len_in_bytes(),
threads_left,
should_interrupt,
)
// SAFETY: This invariant is upheld since `child_items` and `node` come from the same Tree.
// This means we can rely on Tree's invariant that node.children will be the only `children` array in
// for nodes in this tree that will contain any of those children.
#[allow(unsafe_code)]
unsafe {
resolve::deltas(
object_counter.clone(),
size_counter.clone(),
node,
state,
resolve_data,
object_hash.len_in_bytes(),
threads_left,
should_interrupt,
)
}
}
},
|| (!should_interrupt.load(Ordering::Relaxed)).then(|| std::time::Duration::from_millis(50)),
Expand All @@ -174,8 +181,8 @@ where
size_progress.show_throughput(start);

Ok(Outcome {
roots: self.root_items,
children: self.child_items,
roots: root_items,
children: child_items_vec,
})
}
}
47 changes: 32 additions & 15 deletions gix-pack/src/cache/delta/traverse/resolve.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,25 @@ mod root {

/// An item returned by `iter_root_chunks`, allowing access to the `data` stored alongside nodes in a [`Tree`].
pub(crate) struct Node<'a, T: Send> {
// SAFETY INVARIANT: see Node::new(). That function is the only one used
// to create or modify these fields.
item: &'a mut Item<T>,
child_items: &'a ItemSliceSync<'a, Item<T>>,
}

impl<'a, T: Send> Node<'a, T> {
/// SAFETY: The child_items must be unique among between users of the `ItemSliceSync`.
/// SAFETY: `item.children` must uniquely reference elements in child_items that no other currently alive
/// item does. All child_items must also have unique children, unless the child_item is itself `item`,
/// in which case no other live item should reference it in its `item.children`.
///
/// This safety invariant can be reliably upheld by making sure `item` comes from a Tree and `child_items`
/// was constructed using that Tree's child_items. This works since Tree has this invariant as well: all
/// child_items are referenced at most once (really, exactly once) by a node in the tree.
///
/// Note that this invariant is a bit more relaxed than that on `deltas()`, because this function can be called
/// for traversal within a child item, which happens in into_child_iter()
#[allow(unsafe_code)]
pub(in crate::cache::delta::traverse) unsafe fn new(
item: &'a mut Item<T>,
child_items: &'a ItemSliceSync<'a, Item<T>>,
) -> Self {
pub(super) unsafe fn new(item: &'a mut Item<T>, child_items: &'a ItemSliceSync<'a, Item<T>>) -> Self {
Node { item, child_items }
}
}
Expand All @@ -52,26 +60,28 @@ mod root {

/// Returns true if this node has children, e.g. is not a leaf in the tree.
pub fn has_children(&self) -> bool {
!self.item.children.is_empty()
!self.item.children().is_empty()
}

/// Transform this `Node` into an iterator over its children.
///
/// Children are `Node`s referring to pack entries whose base object is this pack entry.
pub fn into_child_iter(self) -> impl Iterator<Item = Node<'a, T>> + 'a {
let children = self.child_items;
// SAFETY: The index is a valid index into the children array.
// SAFETY: The resulting mutable pointer cannot be yielded by any other node.
#[allow(unsafe_code)]
self.item.children.iter().map(move |&index| Node {
item: unsafe { children.get_mut(index as usize) },
child_items: children,
self.item.children().iter().map(move |&index| {
// SAFETY: Due to the invariant on new(), we can rely on these indices
// being unique.
let item = unsafe { children.get_mut(index as usize) };
// SAFETY: Since every child_item is also required to uphold the uniqueness guarantee,
// creating a Node with one of the child_items that we are allowed access to is still fine.
unsafe { Node::new(item, children) }
})
}
}
}

pub(in crate::cache::delta::traverse) struct State<'items, F, MBFN, T: Send> {
pub(super) struct State<'items, F, MBFN, T: Send> {
pub delta_bytes: Vec<u8>,
pub fully_resolved_delta_bytes: Vec<u8>,
pub progress: Box<dyn Progress>,
Expand All @@ -80,8 +90,15 @@ pub(in crate::cache::delta::traverse) struct State<'items, F, MBFN, T: Send> {
pub child_items: &'items ItemSliceSync<'items, Item<T>>,
}

#[allow(clippy::too_many_arguments)]
pub(in crate::cache::delta::traverse) fn deltas<T, F, MBFN, E, R>(
/// SAFETY: `item.children` must uniquely reference elements in child_items that no other currently alive
/// item does. All child_items must also have unique children.
///
/// This safety invariant can be reliably upheld by making sure `item` comes from a Tree and `child_items`
/// was constructed using that Tree's child_items. This works since Tree has this invariant as well: all
/// child_items are referenced at most once (really, exactly once) by a node in the tree.
#[allow(clippy::too_many_arguments, unsafe_code)]
#[deny(unsafe_op_in_unsafe_fn)] // this is a big function, require unsafe for the one small unsafe op we have
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That is great!! I wasn't aware of this effective means of propagating unsafe the way it should be propagated without making it incredibly easy to embed even more unsafe calls or expressed differently, make the unsafe-review-surface even larger.

pub(super) unsafe fn deltas<T, F, MBFN, E, R>(
objects: gix_features::progress::StepShared,
size: gix_features::progress::StepShared,
item: &mut Item<T>,
Expand Down Expand Up @@ -121,7 +138,7 @@ where
// each node is a base, and its children always start out as deltas which become a base after applying them.
// These will be pushed onto our stack until all are processed
let root_level = 0;
// SAFETY: The child items are unique, as `item` is the root of a tree of dependent child items.
// SAFETY: This invariant is required from the caller
#[allow(unsafe_code)]
let root_node = unsafe { root::Node::new(item, child_items) };
let mut nodes: Vec<_> = vec![(root_level, root_node)];
Expand Down
21 changes: 12 additions & 9 deletions gix-pack/src/cache/delta/traverse/util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,21 +16,21 @@ use std::marker::PhantomData;
/// more than one base. And that's what one would really have to do for two threads to encounter the same child.
///
/// Thus I believe it's impossible for this data structure to end up in a place where it violates its assumption.
pub(in crate::cache::delta::traverse) struct ItemSliceSync<'a, T>
pub(super) struct ItemSliceSync<'a, T>
where
T: Send,
{
items: *mut T,
#[cfg(debug_assertions)]
len: usize,
phantom: PhantomData<&'a T>,
phantom: PhantomData<&'a mut T>,
}

impl<'a, T> ItemSliceSync<'a, T>
where
T: Send,
{
pub(in crate::cache::delta::traverse) fn new(items: &'a mut [T]) -> Self {
pub(super) fn new(items: &'a mut [T]) -> Self {
ItemSliceSync {
items: items.as_mut_ptr(),
#[cfg(debug_assertions)]
Expand All @@ -41,21 +41,24 @@ where

// SAFETY: The index must point into the slice and must not be reused concurrently.
#[allow(unsafe_code)]
pub(in crate::cache::delta::traverse) unsafe fn get_mut(&self, index: usize) -> &'a mut T {
pub(super) unsafe fn get_mut(&self, index: usize) -> &'a mut T {
#[cfg(debug_assertions)]
if index >= self.len {
panic!("index out of bounds: the len is {} but the index is {index}", self.len);
}
// SAFETY: The index is within the slice
// SAFETY: The children array is alive by the 'a lifetime.
// SAFETY:
// - The index is within the slice (required by documentation)
// - We have mutable access to `items` as ensured by Self::new()
// - This is the only method on this type giving access to items
// - The documentation requires that this access is unique
unsafe { &mut *self.items.add(index) }
}
}

// SAFETY: T is `Send`, and we only use the pointer for creating new pointers.
// SAFETY: This is logically an &mut T, which is Send if T is Send
// (note: this is different from &T, which also needs T: Sync)
#[allow(unsafe_code)]
unsafe impl<T> Send for ItemSliceSync<'_, T> where T: Send {}
// SAFETY: T is `Send`, and as long as the user follows the contract of
// `get_mut()`, we only ever access one T at a time.
// SAFETY: This is logically an &mut T, which is Sync if T is Sync
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This I never really understood as my intuition was that the bound should be Sync if T: Sync for perfect safety. But having to say it's Sync if T: Send seems to be too broad and also allows RefCells for instance which are !Sync.

Is this a shortcoming of the type system, or a shortcoming of the way ItemSliceSync is defined, or something entirely different (like me not understanding how Send + Sync truly works beyond some intuition from practice :D).

#[allow(unsafe_code)]
unsafe impl<T> Sync for ItemSliceSync<'_, T> where T: Send {}
Loading