/Users/andrewlamb/Software/arrow-rs/arrow-buffer/src/buffer/offset.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use crate::buffer::ScalarBuffer; |
19 | | use crate::{ArrowNativeType, MutableBuffer, OffsetBufferBuilder}; |
20 | | use std::ops::Deref; |
21 | | |
22 | | /// A non-empty buffer of monotonically increasing, positive integers. |
23 | | /// |
24 | | /// [`OffsetBuffer`] are used to represent ranges of offsets. An |
25 | | /// `OffsetBuffer` of `N+1` items contains `N` such ranges. The start |
26 | | /// offset for element `i` is `offsets[i]` and the end offset is |
27 | | /// `offsets[i+1]`. Equal offsets represent an empty range. |
28 | | /// |
29 | | /// # Example |
30 | | /// |
31 | | /// This example shows how 5 distinct ranges, are represented using a |
32 | | /// 6 entry `OffsetBuffer`. The first entry `(0, 3)` represents the |
33 | | /// three offsets `0, 1, 2`. The entry `(3,3)` represent no offsets |
34 | | /// (e.g. an empty list). |
35 | | /// |
36 | | /// ```text |
37 | | /// ┌───────┐ ┌───┐ |
38 | | /// │ (0,3) │ │ 0 │ |
39 | | /// ├───────┤ ├───┤ |
40 | | /// │ (3,3) │ │ 3 │ |
41 | | /// ├───────┤ ├───┤ |
42 | | /// │ (3,4) │ │ 3 │ |
43 | | /// ├───────┤ ├───┤ |
44 | | /// │ (4,5) │ │ 4 │ |
45 | | /// ├───────┤ ├───┤ |
46 | | /// │ (5,7) │ │ 5 │ |
47 | | /// └───────┘ ├───┤ |
48 | | /// │ 7 │ |
49 | | /// └───┘ |
50 | | /// |
51 | | /// Offsets Buffer |
52 | | /// Logical |
53 | | /// Offsets |
54 | | /// |
55 | | /// (offsets[i], |
56 | | /// offsets[i+1]) |
57 | | /// ``` |
58 | | #[derive(Debug, Clone, PartialEq, Eq)] |
59 | | pub struct OffsetBuffer<O: ArrowNativeType>(ScalarBuffer<O>); |
60 | | |
61 | | impl<O: ArrowNativeType> OffsetBuffer<O> { |
62 | | /// Create a new [`OffsetBuffer`] from the provided [`ScalarBuffer`] |
63 | | /// |
64 | | /// # Panics |
65 | | /// |
66 | | /// Panics if `buffer` is not a non-empty buffer containing |
67 | | /// monotonically increasing values greater than or equal to zero |
68 | | pub fn new(buffer: ScalarBuffer<O>) -> Self { |
69 | | assert!(!buffer.is_empty(), "offsets cannot be empty"); |
70 | | assert!( |
71 | | buffer[0] >= O::usize_as(0), |
72 | | "offsets must be greater than 0" |
73 | | ); |
74 | | assert!( |
75 | | buffer.windows(2).all(|w| w[0] <= w[1]), |
76 | | "offsets must be monotonically increasing" |
77 | | ); |
78 | | Self(buffer) |
79 | | } |
80 | | |
81 | | /// Create a new [`OffsetBuffer`] from the provided [`ScalarBuffer`] |
82 | | /// |
83 | | /// # Safety |
84 | | /// |
85 | | /// `buffer` must be a non-empty buffer containing monotonically increasing |
86 | | /// values greater than or equal to zero |
87 | 241k | pub unsafe fn new_unchecked(buffer: ScalarBuffer<O>) -> Self { |
88 | 241k | Self(buffer) |
89 | 241k | } |
90 | | |
91 | | /// Create a new [`OffsetBuffer`] containing a single 0 value |
92 | 0 | pub fn new_empty() -> Self { |
93 | 0 | let buffer = MutableBuffer::from_len_zeroed(std::mem::size_of::<O>()); |
94 | 0 | Self(buffer.into_buffer().into()) |
95 | 0 | } |
96 | | |
97 | | /// Create a new [`OffsetBuffer`] containing `len + 1` `0` values |
98 | 35 | pub fn new_zeroed(len: usize) -> Self { |
99 | 35 | let len_bytes = len |
100 | 35 | .checked_add(1) |
101 | 35 | .and_then(|o| o.checked_mul(std::mem::size_of::<O>())) |
102 | 35 | .expect("overflow"); |
103 | 35 | let buffer = MutableBuffer::from_len_zeroed(len_bytes); |
104 | 35 | Self(buffer.into_buffer().into()) |
105 | 35 | } |
106 | | |
107 | | /// Create a new [`OffsetBuffer`] from the iterator of slice lengths |
108 | | /// |
109 | | /// ``` |
110 | | /// # use arrow_buffer::OffsetBuffer; |
111 | | /// let offsets = OffsetBuffer::<i32>::from_lengths([1, 3, 5]); |
112 | | /// assert_eq!(offsets.as_ref(), &[0, 1, 4, 9]); |
113 | | /// ``` |
114 | | /// |
115 | | /// If you want to create an [`OffsetBuffer`] where all lengths are the same, |
116 | | /// consider using the faster [`OffsetBuffer::from_repeated_length`] instead. |
117 | | /// |
118 | | /// # Panics |
119 | | /// |
120 | | /// Panics on overflow |
121 | 10 | pub fn from_lengths<I>(lengths: I) -> Self |
122 | 10 | where |
123 | 10 | I: IntoIterator<Item = usize>, |
124 | | { |
125 | 10 | let iter = lengths.into_iter(); |
126 | 10 | let mut out = Vec::with_capacity(iter.size_hint().0 + 1); |
127 | 10 | out.push(O::usize_as(0)); |
128 | | |
129 | 10 | let mut acc = 0_usize; |
130 | 80.0k | for length80.0k in iter { |
131 | 80.0k | acc = acc.checked_add(length).expect("usize overflow"); |
132 | 80.0k | out.push(O::usize_as(acc)) |
133 | | } |
134 | | // Check for overflow |
135 | 10 | O::from_usize(acc).expect("offset overflow"); |
136 | 10 | Self(out.into()) |
137 | 10 | } |
138 | | |
139 | | /// Create a new [`OffsetBuffer`] where each slice has the same length |
140 | | /// `length`, repeated `n` times. |
141 | | /// |
142 | | /// |
143 | | /// Example |
144 | | /// ``` |
145 | | /// # use arrow_buffer::OffsetBuffer; |
146 | | /// let offsets = OffsetBuffer::<i32>::from_repeated_length(4, 3); |
147 | | /// assert_eq!(offsets.as_ref(), &[0, 4, 8, 12]); |
148 | | /// ``` |
149 | | /// |
150 | | /// # Panics |
151 | | /// |
152 | | /// Panics on overflow |
153 | 2 | pub fn from_repeated_length(length: usize, n: usize) -> Self { |
154 | 2 | if n == 0 { |
155 | 0 | return Self::new_empty(); |
156 | 2 | } |
157 | | |
158 | 2 | if length == 0 { |
159 | 0 | return Self::new_zeroed(n); |
160 | 2 | } |
161 | | |
162 | | // Check for overflow |
163 | | // Making sure we don't overflow usize or O when calculating the total length |
164 | 2 | length.checked_mul(n).expect("usize overflow"); |
165 | | |
166 | | // Check for overflow |
167 | 2 | O::from_usize(length * n).expect("offset overflow"); |
168 | | |
169 | 2 | let offsets = (0..=n) |
170 | 12 | .map2 (|index| O::usize_as(index * length)) |
171 | 2 | .collect::<Vec<O>>(); |
172 | | |
173 | 2 | Self(ScalarBuffer::from(offsets)) |
174 | 2 | } |
175 | | |
176 | | /// Get an Iterator over the lengths of this [`OffsetBuffer`] |
177 | | /// |
178 | | /// ``` |
179 | | /// # use arrow_buffer::{OffsetBuffer, ScalarBuffer}; |
180 | | /// let offsets = OffsetBuffer::<_>::new(ScalarBuffer::<i32>::from(vec![0, 1, 4, 9])); |
181 | | /// assert_eq!(offsets.lengths().collect::<Vec<usize>>(), vec![1, 3, 5]); |
182 | | /// ``` |
183 | | /// |
184 | | /// Empty [`OffsetBuffer`] will return an empty iterator |
185 | | /// ``` |
186 | | /// # use arrow_buffer::OffsetBuffer; |
187 | | /// let offsets = OffsetBuffer::<i32>::new_empty(); |
188 | | /// assert_eq!(offsets.lengths().count(), 0); |
189 | | /// ``` |
190 | | /// |
191 | | /// This can be used to merge multiple [`OffsetBuffer`]s to one |
192 | | /// ``` |
193 | | /// # use arrow_buffer::{OffsetBuffer, ScalarBuffer}; |
194 | | /// |
195 | | /// let buffer1 = OffsetBuffer::<i32>::from_lengths([2, 6, 3, 7, 2]); |
196 | | /// let buffer2 = OffsetBuffer::<i32>::from_lengths([1, 3, 5, 7, 9]); |
197 | | /// |
198 | | /// let merged = OffsetBuffer::<i32>::from_lengths( |
199 | | /// vec![buffer1, buffer2].iter().flat_map(|x| x.lengths()) |
200 | | /// ); |
201 | | /// |
202 | | /// assert_eq!(merged.lengths().collect::<Vec<_>>(), &[2, 6, 3, 7, 2, 1, 3, 5, 7, 9]); |
203 | | /// ``` |
204 | 80.0k | pub fn lengths(&self) -> impl ExactSizeIterator<Item = usize> + '_ { |
205 | 80.0k | self.0.windows(2)80.0k .map80.0k (|x| x[1].as_usize() - x[0].as_usize()) |
206 | 80.0k | } |
207 | | |
208 | | /// Free up unused memory. |
209 | 0 | pub fn shrink_to_fit(&mut self) { |
210 | 0 | self.0.shrink_to_fit(); |
211 | 0 | } |
212 | | |
213 | | /// Returns the inner [`ScalarBuffer`] |
214 | 0 | pub fn inner(&self) -> &ScalarBuffer<O> { |
215 | 0 | &self.0 |
216 | 0 | } |
217 | | |
218 | | /// Returns the inner [`ScalarBuffer`], consuming self |
219 | 240k | pub fn into_inner(self) -> ScalarBuffer<O> { |
220 | 240k | self.0 |
221 | 240k | } |
222 | | |
223 | | /// Returns a zero-copy slice of this buffer with length `len` and starting at `offset` |
224 | 222 | pub fn slice(&self, offset: usize, len: usize) -> Self { |
225 | 222 | Self(self.0.slice(offset, len.saturating_add(1))) |
226 | 222 | } |
227 | | |
228 | | /// Returns true if this [`OffsetBuffer`] is equal to `other`, using pointer comparisons |
229 | | /// to determine buffer equality. This is cheaper than `PartialEq::eq` but may |
230 | | /// return false when the arrays are logically equal |
231 | | #[inline] |
232 | 2 | pub fn ptr_eq(&self, other: &Self) -> bool { |
233 | 2 | self.0.ptr_eq(&other.0) |
234 | 2 | } |
235 | | } |
236 | | |
237 | | impl<T: ArrowNativeType> Deref for OffsetBuffer<T> { |
238 | | type Target = [T]; |
239 | | |
240 | | #[inline] |
241 | 1.46M | fn deref(&self) -> &Self::Target { |
242 | 1.46M | &self.0 |
243 | 1.46M | } |
244 | | } |
245 | | |
246 | | impl<T: ArrowNativeType> AsRef<[T]> for OffsetBuffer<T> { |
247 | | #[inline] |
248 | | fn as_ref(&self) -> &[T] { |
249 | | self |
250 | | } |
251 | | } |
252 | | |
253 | | impl<O: ArrowNativeType> From<OffsetBufferBuilder<O>> for OffsetBuffer<O> { |
254 | | fn from(value: OffsetBufferBuilder<O>) -> Self { |
255 | | value.finish() |
256 | | } |
257 | | } |
258 | | |
259 | | impl<O: ArrowNativeType> Default for OffsetBuffer<O> { |
260 | | fn default() -> Self { |
261 | | Self::new_empty() |
262 | | } |
263 | | } |
264 | | |
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the validating constructor over a plain vector of `i32` offsets.
    fn validated(values: Vec<i32>) -> OffsetBuffer<i32> {
        OffsetBuffer::new(ScalarBuffer::from(values))
    }

    // ---- `new` validation -------------------------------------------------

    #[test]
    #[should_panic(expected = "offsets cannot be empty")]
    fn empty_offsets() {
        let _ = validated(Vec::new());
    }

    #[test]
    #[should_panic(expected = "offsets must be greater than 0")]
    fn negative_offsets() {
        let _ = validated(vec![-1, 0, 1]);
    }

    #[test]
    fn offsets() {
        // A well-formed input must be accepted without panicking.
        let _ = validated(vec![0, 1, 2, 3]);

        // `new_zeroed(n)` holds `n + 1` zeros.
        let three_ranges = OffsetBuffer::<i32>::new_zeroed(3);
        assert_eq!(three_ranges.as_ref(), &[0, 0, 0, 0]);

        let no_ranges = OffsetBuffer::<i32>::new_zeroed(0);
        assert_eq!(no_ranges.as_ref(), &[0]);
    }

    #[test]
    #[should_panic(expected = "overflow")]
    fn offsets_new_zeroed_overflow() {
        let _ = OffsetBuffer::<i32>::new_zeroed(usize::MAX);
    }

    #[test]
    #[should_panic(expected = "offsets must be monotonically increasing")]
    fn non_monotonic_offsets() {
        let _ = validated(vec![1, 2, 0]);
    }

    // ---- `from_lengths` ---------------------------------------------------

    #[test]
    fn from_lengths() {
        let small = OffsetBuffer::<i32>::from_lengths([2, 6, 3, 7, 2]);
        assert_eq!(small.as_ref(), &[0, 2, 8, 11, 18, 20]);

        // Two halves of `i32::MAX` still fit into an `i32` offset.
        let half = i32::MAX / 2;
        let half_len = half as usize;
        let large = OffsetBuffer::<i32>::from_lengths([half_len, half_len]);
        assert_eq!(large.as_ref(), &[0, half, half * 2]);
    }

    #[test]
    #[should_panic(expected = "offset overflow")]
    fn from_lengths_offset_overflow() {
        let lengths = [i32::MAX as usize, 1];
        let _ = OffsetBuffer::<i32>::from_lengths(lengths);
    }

    #[test]
    #[should_panic(expected = "usize overflow")]
    fn from_lengths_usize_overflow() {
        let lengths = [usize::MAX, 1];
        let _ = OffsetBuffer::<i32>::from_lengths(lengths);
    }

    // ---- `from_repeated_length` overflow handling -------------------------

    #[test]
    #[should_panic(expected = "offset overflow")]
    fn from_repeated_lengths_offset_length_overflow() {
        let length = i32::MAX as usize / 4;
        let _ = OffsetBuffer::<i32>::from_repeated_length(length, 5);
    }

    #[test]
    #[should_panic(expected = "offset overflow")]
    fn from_repeated_lengths_offset_repeat_overflow() {
        let repeats = i32::MAX as usize + 1;
        let _ = OffsetBuffer::<i32>::from_repeated_length(1, repeats);
    }

    #[test]
    #[should_panic(expected = "offset overflow")]
    fn from_repeated_lengths_usize_length_overflow() {
        let _ = OffsetBuffer::<i32>::from_repeated_length(usize::MAX, 1);
    }

    #[test]
    #[should_panic(expected = "usize overflow")]
    fn from_repeated_lengths_usize_length_usize_overflow() {
        let _ = OffsetBuffer::<i32>::from_repeated_length(usize::MAX, 2);
    }

    #[test]
    #[should_panic(expected = "offset overflow")]
    fn from_repeated_lengths_usize_repeat_overflow() {
        let _ = OffsetBuffer::<i32>::from_repeated_length(1, usize::MAX);
    }

    // ---- `lengths` --------------------------------------------------------

    #[test]
    fn get_lengths() {
        let buffer = validated(vec![0, 1, 4, 9]);
        let lengths: Vec<usize> = buffer.lengths().collect();
        assert_eq!(lengths, vec![1, 3, 5]);
    }

    #[test]
    fn get_lengths_should_be_with_fixed_size() {
        let buffer = validated(vec![0, 1, 4, 9]);
        let lengths = buffer.lengths();
        assert_eq!(lengths.size_hint(), (3, Some(3)));
        assert_eq!(lengths.len(), 3);
    }

    #[test]
    fn get_lengths_from_empty_offset_buffer_should_be_empty_iterator() {
        let buffer = OffsetBuffer::<i32>::new_empty();
        let lengths: Vec<usize> = buffer.lengths().collect();
        assert_eq!(lengths, Vec::<usize>::new());
    }

    // ---- trait implementations --------------------------------------------

    #[test]
    fn impl_eq() {
        // Only compiles when `T: Eq`, which is the property under test.
        fn eq_via_trait<T: Eq>(lhs: &T, rhs: &T) -> bool {
            lhs.eq(rhs)
        }

        let first = validated(vec![0, 1, 4, 9]);
        let second = validated(vec![0, 1, 4, 9]);

        assert!(
            eq_via_trait(&first, &second),
            "OffsetBuffer should implement Eq."
        );
    }

    #[test]
    fn impl_default() {
        let buffer = OffsetBuffer::<i32>::default();
        assert_eq!(buffer.as_ref(), &[0]);
    }

    // ---- `from_repeated_length` results -----------------------------------

    #[test]
    fn from_repeated_length_basic() {
        // Three slices of length four.
        let repeated = OffsetBuffer::<i32>::from_repeated_length(4, 3);
        assert_eq!(repeated.as_ref(), &[0, 4, 8, 12]);

        // The lengths read back must match what was requested.
        let lengths = repeated.lengths().collect::<Vec<usize>>();
        assert_eq!(lengths, vec![4, 4, 4]);
    }

    #[test]
    fn from_repeated_length_single_repeat() {
        // A single slice of length five.
        let repeated = OffsetBuffer::<i32>::from_repeated_length(5, 1);
        assert_eq!(repeated.as_ref(), &[0, 5]);

        let lengths = repeated.lengths().collect::<Vec<usize>>();
        assert_eq!(lengths, vec![5]);
    }

    #[test]
    fn from_repeated_length_zero_repeats() {
        let repeated = OffsetBuffer::<i32>::from_repeated_length(10, 0);
        let expected = OffsetBuffer::<i32>::new_empty();
        assert_eq!(repeated, expected);
    }

    #[test]
    fn from_repeated_length_zero_length() {
        // Five empty slices: every offset stays at zero.
        let repeated = OffsetBuffer::<i32>::from_repeated_length(0, 5);
        assert_eq!(repeated.as_ref(), &[0, 0, 0, 0, 0, 0]);

        // Every length read back is zero as well.
        let lengths = repeated.lengths().collect::<Vec<usize>>();
        assert_eq!(lengths, vec![0, 0, 0, 0, 0]);
    }

    #[test]
    fn from_repeated_length_large_values() {
        // Larger inputs that remain within the range of `i32`.
        let repeated = OffsetBuffer::<i32>::from_repeated_length(1000, 100);
        assert_eq!(repeated[0], 0);

        // One hundred lengths, each exactly 1000.
        let lengths = repeated.lengths().collect::<Vec<usize>>();
        assert_eq!(lengths.len(), 100);
        assert!(lengths.iter().all(|len| *len == 1000));
    }

    #[test]
    fn from_repeated_length_unit_length() {
        // Ten slices of length one.
        let repeated = OffsetBuffer::<i32>::from_repeated_length(1, 10);
        assert_eq!(repeated.as_ref(), &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);

        let lengths = repeated.lengths().collect::<Vec<usize>>();
        assert_eq!(lengths, vec![1; 10]);
    }

    #[test]
    fn from_repeated_length_max_safe_values() {
        // A third of `i32::MAX`, repeated twice, stays below `i32::MAX`.
        let third_max = (i32::MAX / 3) as usize;
        let repeated = OffsetBuffer::<i32>::from_repeated_length(third_max, 2);
        let expected = [0, third_max as i32, (third_max * 2) as i32];
        assert_eq!(repeated.as_ref(), &expected);
    }
}