-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Account for child Bucket size in OrderPreservingInterner
#4646
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
4408037
111b43c
189c3b5
270f79e
0f07726
e51602d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -343,8 +343,19 @@ impl Bucket { | |
| fn size(&self) -> usize { | ||
| std::mem::size_of::<Self>() | ||
| + self.slots.capacity() * std::mem::size_of::<Slot>() | ||
| // and account for the size of any embedded buckets in the slots | ||
| + self.slot_child_bucket_size() | ||
| + self.next.as_ref().map(|x| x.size()).unwrap_or_default() | ||
| } | ||
|
|
||
| /// returns the total size of any recursively allocated `Bucket`s | ||
| /// in self.slots. This does not include the size of the child Slot itself | ||
| fn slot_child_bucket_size(&self) -> usize { | ||
| self.slots | ||
| .iter() | ||
| .map(|slot| slot.child.as_ref().map(|x| x.size()).unwrap_or_default()) | ||
| .sum() | ||
| } | ||
| } | ||
|
|
||
| #[cfg(test)] | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would love some suggestions on how to test this -- what I tried is described on #4645
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I made a test in 111b43c It is basically a second implementation of size, which one could argue is unnecessary but I feel good by having a double check |
||
|
|
@@ -427,4 +438,86 @@ mod tests { | |
| interner.normalized_key(interned[3]) < interner.normalized_key(interned[2]) | ||
| ); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_intern_sizes() { | ||
| let mut interner = OrderPreservingInterner::default(); | ||
|
|
||
| // Intern a 1K values each 8 bytes large | ||
| let num_items = 1000; | ||
| let mut values: Vec<usize> = (0..num_items).collect(); | ||
| values.reverse(); | ||
|
|
||
| // intern these values 1 at a time (otherwise the interner | ||
| // will sort them first); | ||
| for v in values { | ||
| interner.intern([Some(v.to_be_bytes())]); | ||
| } | ||
|
|
||
| let reported = interner.size(); | ||
|
|
||
| // Figure out the expected size (this is a second | ||
| // implementation of size()) as a double check | ||
| let min_expected = BucketWalker::new() | ||
| .visit_bucket(interner.bucket.as_ref()) | ||
| .memory_estimate() | ||
| // hash table size | ||
| + interner.lookup.capacity() * std::mem::size_of::<Interned>() | ||
| // key/value storage | ||
| + interner.keys.buffer_size() | ||
| + interner.values.buffer_size(); | ||
|
|
||
| assert!( | ||
| reported > min_expected, | ||
| "reported size {reported} not larger than min expected size: {min_expected}" | ||
| ) | ||
| } | ||
|
|
||
| // Walks over the buckets / slots counting counting them all | ||
| struct BucketWalker { | ||
| num_buckets: usize, | ||
| num_slots: usize, | ||
| } | ||
|
|
||
| impl BucketWalker { | ||
| fn new() -> Self { | ||
| Self { | ||
| num_buckets: 0, | ||
| num_slots: 0, | ||
| } | ||
| } | ||
|
|
||
| // recursively visit the bucket and any slots/buckets contained | ||
| fn visit_bucket(mut self, bucket: &Bucket) -> Self { | ||
| self.num_buckets += 1; | ||
| let acc = bucket | ||
| .slots | ||
| .iter() | ||
| .fold(self, |acc, slot| acc.visit_slot(slot)); | ||
|
|
||
| if let Some(next) = bucket.next.as_ref() { | ||
| acc.visit_bucket(next.as_ref()) | ||
| } else { | ||
| acc | ||
| } | ||
| } | ||
|
|
||
| // recursively visit slot and any slots/buckets | ||
| fn visit_slot(mut self, slot: &Slot) -> Self { | ||
| self.num_slots += 1; | ||
| if let Some(child) = slot.child.as_ref() { | ||
| self.visit_bucket(child.as_ref()) | ||
| } else { | ||
| self | ||
| } | ||
| } | ||
|
|
||
| // estimate how much memory is used just for Buckets / Slots | ||
| // (an underestimate of the total memory used for the | ||
| // interner as it doesn't contain any actual values) | ||
| fn memory_estimate(self) -> usize { | ||
| self.num_buckets * std::mem::size_of::<Bucket>() | ||
| + self.num_slots * std::mem::size_of::<Slot>() | ||
| } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The problem is that a
Bucketmay contain otherBuckets in the childrenSlots:arrow-rs/arrow-row/src/interner.rs
Line 233 in b15838c
Their memory was not accounted for