Coverage Report

Created: 2025-11-17 14:14

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/Users/andrewlamb/Software/arrow-rs/arrow-buffer/src/buffer/offset.rs
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
use crate::buffer::ScalarBuffer;
19
use crate::{ArrowNativeType, MutableBuffer, OffsetBufferBuilder};
20
use std::ops::Deref;
21
22
/// A non-empty buffer of monotonically increasing, non-negative integers.
23
///
24
/// [`OffsetBuffer`] are used to represent ranges of offsets. An
25
/// `OffsetBuffer` of `N+1` items contains `N` such ranges. The start
26
/// offset for element `i` is `offsets[i]` and the end offset is
27
/// `offsets[i+1]`. Equal offsets represent an empty range.
28
///
29
/// # Example
30
///
31
/// This example shows how 5 distinct ranges are represented using a
32
/// 6 entry `OffsetBuffer`. The first entry `(0, 3)` represents the
33
/// three offsets `0, 1, 2`. The entry `(3,3)` represents no offsets
34
/// (e.g. an empty list).
35
///
36
/// ```text
37
///   ┌───────┐                ┌───┐
38
///   │ (0,3) │                │ 0 │
39
///   ├───────┤                ├───┤
40
///   │ (3,3) │                │ 3 │
41
///   ├───────┤                ├───┤
42
///   │ (3,4) │                │ 3 │
43
///   ├───────┤                ├───┤
44
///   │ (4,5) │                │ 4 │
45
///   ├───────┤                ├───┤
46
///   │ (5,7) │                │ 5 │
47
///   └───────┘                ├───┤
48
///                            │ 7 │
49
///                            └───┘
50
///
51
///                        Offsets Buffer
52
///    Logical
53
///    Offsets
54
///
55
///  (offsets[i],
56
///   offsets[i+1])
57
/// ```
58
#[derive(Debug, Clone, PartialEq, Eq)]
59
pub struct OffsetBuffer<O: ArrowNativeType>(ScalarBuffer<O>);
60
61
impl<O: ArrowNativeType> OffsetBuffer<O> {
62
    /// Create a new [`OffsetBuffer`] from the provided [`ScalarBuffer`]
63
    ///
64
    /// # Panics
65
    ///
66
    /// Panics if `buffer` is not a non-empty buffer containing
67
    /// monotonically increasing values greater than or equal to zero
68
    pub fn new(buffer: ScalarBuffer<O>) -> Self {
69
        assert!(!buffer.is_empty(), "offsets cannot be empty");
70
        assert!(
71
            buffer[0] >= O::usize_as(0),
72
            "offsets must be greater than 0"
73
        );
74
        assert!(
75
            buffer.windows(2).all(|w| w[0] <= w[1]),
76
            "offsets must be monotonically increasing"
77
        );
78
        Self(buffer)
79
    }
80
81
    /// Create a new [`OffsetBuffer`] from the provided [`ScalarBuffer`]
82
    ///
83
    /// # Safety
84
    ///
85
    /// `buffer` must be a non-empty buffer containing monotonically increasing
86
    /// values greater than or equal to zero
87
241k
    pub unsafe fn new_unchecked(buffer: ScalarBuffer<O>) -> Self {
88
241k
        Self(buffer)
89
241k
    }
90
91
    /// Create a new [`OffsetBuffer`] containing a single 0 value
92
0
    pub fn new_empty() -> Self {
93
0
        let buffer = MutableBuffer::from_len_zeroed(std::mem::size_of::<O>());
94
0
        Self(buffer.into_buffer().into())
95
0
    }
96
97
    /// Create a new [`OffsetBuffer`] containing `len + 1` `0` values
98
35
    pub fn new_zeroed(len: usize) -> Self {
99
35
        let len_bytes = len
100
35
            .checked_add(1)
101
35
            .and_then(|o| o.checked_mul(std::mem::size_of::<O>()))
102
35
            .expect("overflow");
103
35
        let buffer = MutableBuffer::from_len_zeroed(len_bytes);
104
35
        Self(buffer.into_buffer().into())
105
35
    }
106
107
    /// Create a new [`OffsetBuffer`] from the iterator of slice lengths
108
    ///
109
    /// ```
110
    /// # use arrow_buffer::OffsetBuffer;
111
    /// let offsets = OffsetBuffer::<i32>::from_lengths([1, 3, 5]);
112
    /// assert_eq!(offsets.as_ref(), &[0, 1, 4, 9]);
113
    /// ```
114
    ///
115
    /// If you want to create an [`OffsetBuffer`] where all lengths are the same,
116
    /// consider using the faster [`OffsetBuffer::from_repeated_length`] instead.
117
    ///
118
    /// # Panics
119
    ///
120
    /// Panics on overflow
121
10
    pub fn from_lengths<I>(lengths: I) -> Self
122
10
    where
123
10
        I: IntoIterator<Item = usize>,
124
    {
125
10
        let iter = lengths.into_iter();
126
10
        let mut out = Vec::with_capacity(iter.size_hint().0 + 1);
127
10
        out.push(O::usize_as(0));
128
129
10
        let mut acc = 0_usize;
130
80.0k
        for length in iter {
131
80.0k
            acc = acc.checked_add(length).expect("usize overflow");
132
80.0k
            out.push(O::usize_as(acc))
133
        }
134
        // Check for overflow
135
10
        O::from_usize(acc).expect("offset overflow");
136
10
        Self(out.into())
137
10
    }
138
139
    /// Create a new [`OffsetBuffer`] where each slice has the same length
140
    /// `length`, repeated `n` times.
141
    ///
142
    ///
143
    /// # Example
144
    /// ```
145
    /// # use arrow_buffer::OffsetBuffer;
146
    /// let offsets = OffsetBuffer::<i32>::from_repeated_length(4, 3);
147
    /// assert_eq!(offsets.as_ref(), &[0, 4, 8, 12]);
148
    /// ```
149
    ///
150
    /// # Panics
151
    ///
152
    /// Panics on overflow
153
2
    pub fn from_repeated_length(length: usize, n: usize) -> Self {
154
2
        if n == 0 {
155
0
            return Self::new_empty();
156
2
        }
157
158
2
        if length == 0 {
159
0
            return Self::new_zeroed(n);
160
2
        }
161
162
        // Check for overflow
163
        // Make sure the total length `length * n` overflows neither `usize` nor `O`
164
2
        length.checked_mul(n).expect("usize overflow");
165
166
        // Check for overflow
167
2
        O::from_usize(length * n).expect("offset overflow");
168
169
2
        let offsets = (0..=n)
170
12
            .
map2
(|index| O::usize_as(index * length))
171
2
            .collect::<Vec<O>>();
172
173
2
        Self(ScalarBuffer::from(offsets))
174
2
    }
175
176
    /// Get an Iterator over the lengths of this [`OffsetBuffer`]
177
    ///
178
    /// ```
179
    /// # use arrow_buffer::{OffsetBuffer, ScalarBuffer};
180
    /// let offsets = OffsetBuffer::<_>::new(ScalarBuffer::<i32>::from(vec![0, 1, 4, 9]));
181
    /// assert_eq!(offsets.lengths().collect::<Vec<usize>>(), vec![1, 3, 5]);
182
    /// ```
183
    ///
184
    /// An empty [`OffsetBuffer`] returns an empty iterator
185
    /// ```
186
    /// # use arrow_buffer::OffsetBuffer;
187
    /// let offsets = OffsetBuffer::<i32>::new_empty();
188
    /// assert_eq!(offsets.lengths().count(), 0);
189
    /// ```
190
    ///
191
    /// This can be used to merge multiple [`OffsetBuffer`]s into one
192
    /// ```
193
    /// # use arrow_buffer::{OffsetBuffer, ScalarBuffer};
194
    ///
195
    /// let buffer1 = OffsetBuffer::<i32>::from_lengths([2, 6, 3, 7, 2]);
196
    /// let buffer2 = OffsetBuffer::<i32>::from_lengths([1, 3, 5, 7, 9]);
197
    ///
198
    /// let merged = OffsetBuffer::<i32>::from_lengths(
199
    ///     vec![buffer1, buffer2].iter().flat_map(|x| x.lengths())
200
    /// );
201
    ///
202
    /// assert_eq!(merged.lengths().collect::<Vec<_>>(), &[2, 6, 3, 7, 2, 1, 3, 5, 7, 9]);
203
    /// ```
204
80.0k
    pub fn lengths(&self) -> impl ExactSizeIterator<Item = usize> + '_ {
205
80.0k
        
self.0.windows(2).map(|x| x[1].as_usize() - x[0].as_usize())
206
80.0k
    }
207
208
    /// Free up unused memory.
209
0
    pub fn shrink_to_fit(&mut self) {
210
0
        self.0.shrink_to_fit();
211
0
    }
212
213
    /// Returns the inner [`ScalarBuffer`]
214
0
    pub fn inner(&self) -> &ScalarBuffer<O> {
215
0
        &self.0
216
0
    }
217
218
    /// Returns the inner [`ScalarBuffer`], consuming self
219
240k
    pub fn into_inner(self) -> ScalarBuffer<O> {
220
240k
        self.0
221
240k
    }
222
223
    /// Returns a zero-copy slice of this buffer with length `len` and starting at `offset`
224
222
    pub fn slice(&self, offset: usize, len: usize) -> Self {
225
222
        Self(self.0.slice(offset, len.saturating_add(1)))
226
222
    }
227
228
    /// Returns true if this [`OffsetBuffer`] is equal to `other`, using pointer comparisons
229
    /// to determine buffer equality. This is cheaper than `PartialEq::eq` but may
230
    /// return false when the buffers are logically equal
231
    #[inline]
232
2
    pub fn ptr_eq(&self, other: &Self) -> bool {
233
2
        self.0.ptr_eq(&other.0)
234
2
    }
235
}
236
237
impl<T: ArrowNativeType> Deref for OffsetBuffer<T> {
238
    type Target = [T];
239
240
    #[inline]
241
1.46M
    fn deref(&self) -> &Self::Target {
242
1.46M
        &self.0
243
1.46M
    }
244
}
245
246
impl<T: ArrowNativeType> AsRef<[T]> for OffsetBuffer<T> {
247
    #[inline]
248
    fn as_ref(&self) -> &[T] {
249
        self
250
    }
251
}
252
253
impl<O: ArrowNativeType> From<OffsetBufferBuilder<O>> for OffsetBuffer<O> {
254
    fn from(value: OffsetBufferBuilder<O>) -> Self {
255
        value.finish()
256
    }
257
}
258
259
impl<O: ArrowNativeType> Default for OffsetBuffer<O> {
260
    fn default() -> Self {
261
        Self::new_empty()
262
    }
263
}
264
265
#[cfg(test)]
266
mod tests {
267
    use super::*;
268
269
    #[test]
270
    #[should_panic(expected = "offsets cannot be empty")]
271
    fn empty_offsets() {
272
        OffsetBuffer::new(Vec::<i32>::new().into());
273
    }
274
275
    #[test]
276
    #[should_panic(expected = "offsets must be greater than 0")]
277
    fn negative_offsets() {
278
        OffsetBuffer::new(vec![-1, 0, 1].into());
279
    }
280
281
    #[test]
282
    fn offsets() {
283
        OffsetBuffer::new(vec![0, 1, 2, 3].into());
284
285
        let offsets = OffsetBuffer::<i32>::new_zeroed(3);
286
        assert_eq!(offsets.as_ref(), &[0; 4]);
287
288
        let offsets = OffsetBuffer::<i32>::new_zeroed(0);
289
        assert_eq!(offsets.as_ref(), &[0; 1]);
290
    }
291
292
    #[test]
293
    #[should_panic(expected = "overflow")]
294
    fn offsets_new_zeroed_overflow() {
295
        OffsetBuffer::<i32>::new_zeroed(usize::MAX);
296
    }
297
298
    #[test]
299
    #[should_panic(expected = "offsets must be monotonically increasing")]
300
    fn non_monotonic_offsets() {
301
        OffsetBuffer::new(vec![1, 2, 0].into());
302
    }
303
304
    #[test]
305
    fn from_lengths() {
306
        let buffer = OffsetBuffer::<i32>::from_lengths([2, 6, 3, 7, 2]);
307
        assert_eq!(buffer.as_ref(), &[0, 2, 8, 11, 18, 20]);
308
309
        let half_max = i32::MAX / 2;
310
        let buffer = OffsetBuffer::<i32>::from_lengths([half_max as usize, half_max as usize]);
311
        assert_eq!(buffer.as_ref(), &[0, half_max, half_max * 2]);
312
    }
313
314
    #[test]
315
    #[should_panic(expected = "offset overflow")]
316
    fn from_lengths_offset_overflow() {
317
        OffsetBuffer::<i32>::from_lengths([i32::MAX as usize, 1]);
318
    }
319
320
    #[test]
321
    #[should_panic(expected = "usize overflow")]
322
    fn from_lengths_usize_overflow() {
323
        OffsetBuffer::<i32>::from_lengths([usize::MAX, 1]);
324
    }
325
326
    #[test]
327
    #[should_panic(expected = "offset overflow")]
328
    fn from_repeated_lengths_offset_length_overflow() {
329
        OffsetBuffer::<i32>::from_repeated_length(i32::MAX as usize / 4, 5);
330
    }
331
332
    #[test]
333
    #[should_panic(expected = "offset overflow")]
334
    fn from_repeated_lengths_offset_repeat_overflow() {
335
        OffsetBuffer::<i32>::from_repeated_length(1, i32::MAX as usize + 1);
336
    }
337
338
    #[test]
339
    #[should_panic(expected = "offset overflow")]
340
    fn from_repeated_lengths_usize_length_overflow() {
341
        OffsetBuffer::<i32>::from_repeated_length(usize::MAX, 1);
342
    }
343
344
    #[test]
345
    #[should_panic(expected = "usize overflow")]
346
    fn from_repeated_lengths_usize_length_usize_overflow() {
347
        OffsetBuffer::<i32>::from_repeated_length(usize::MAX, 2);
348
    }
349
350
    #[test]
351
    #[should_panic(expected = "offset overflow")]
352
    fn from_repeated_lengths_usize_repeat_overflow() {
353
        OffsetBuffer::<i32>::from_repeated_length(1, usize::MAX);
354
    }
355
356
    #[test]
357
    fn get_lengths() {
358
        let offsets = OffsetBuffer::<i32>::new(ScalarBuffer::<i32>::from(vec![0, 1, 4, 9]));
359
        assert_eq!(offsets.lengths().collect::<Vec<usize>>(), vec![1, 3, 5]);
360
    }
361
362
    #[test]
363
    fn get_lengths_should_be_with_fixed_size() {
364
        let offsets = OffsetBuffer::<i32>::new(ScalarBuffer::<i32>::from(vec![0, 1, 4, 9]));
365
        let iter = offsets.lengths();
366
        assert_eq!(iter.size_hint(), (3, Some(3)));
367
        assert_eq!(iter.len(), 3);
368
    }
369
370
    #[test]
371
    fn get_lengths_from_empty_offset_buffer_should_be_empty_iterator() {
372
        let offsets = OffsetBuffer::<i32>::new_empty();
373
        assert_eq!(offsets.lengths().collect::<Vec<usize>>(), vec![]);
374
    }
375
376
    #[test]
377
    fn impl_eq() {
378
        fn are_equal<T: Eq>(a: &T, b: &T) -> bool {
379
            a.eq(b)
380
        }
381
382
        assert!(
383
            are_equal(
384
                &OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 1, 4, 9])),
385
                &OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 1, 4, 9]))
386
            ),
387
            "OffsetBuffer should implement Eq."
388
        );
389
    }
390
391
    #[test]
392
    fn impl_default() {
393
        let default = OffsetBuffer::<i32>::default();
394
        assert_eq!(default.as_ref(), &[0]);
395
    }
396
397
    #[test]
398
    fn from_repeated_length_basic() {
399
        // Basic case with length 4, repeated 3 times
400
        let buffer = OffsetBuffer::<i32>::from_repeated_length(4, 3);
401
        assert_eq!(buffer.as_ref(), &[0, 4, 8, 12]);
402
403
        // Verify the lengths are correct
404
        let lengths: Vec<usize> = buffer.lengths().collect();
405
        assert_eq!(lengths, vec![4, 4, 4]);
406
    }
407
408
    #[test]
409
    fn from_repeated_length_single_repeat() {
410
        // Length 5, repeated once
411
        let buffer = OffsetBuffer::<i32>::from_repeated_length(5, 1);
412
        assert_eq!(buffer.as_ref(), &[0, 5]);
413
414
        let lengths: Vec<usize> = buffer.lengths().collect();
415
        assert_eq!(lengths, vec![5]);
416
    }
417
418
    #[test]
419
    fn from_repeated_length_zero_repeats() {
420
        let buffer = OffsetBuffer::<i32>::from_repeated_length(10, 0);
421
        assert_eq!(buffer, OffsetBuffer::<i32>::new_empty());
422
    }
423
424
    #[test]
425
    fn from_repeated_length_zero_length() {
426
        // Zero length, repeated 5 times (all zeros)
427
        let buffer = OffsetBuffer::<i32>::from_repeated_length(0, 5);
428
        assert_eq!(buffer.as_ref(), &[0, 0, 0, 0, 0, 0]);
429
430
        // All lengths should be 0
431
        let lengths: Vec<usize> = buffer.lengths().collect();
432
        assert_eq!(lengths, vec![0, 0, 0, 0, 0]);
433
    }
434
435
    #[test]
436
    fn from_repeated_length_large_values() {
437
        // Test with larger values that don't overflow
438
        let buffer = OffsetBuffer::<i32>::from_repeated_length(1000, 100);
439
        assert_eq!(buffer[0], 0);
440
441
        // Verify all lengths are 1000
442
        let lengths: Vec<usize> = buffer.lengths().collect();
443
        assert_eq!(lengths.len(), 100);
444
        assert!(lengths.iter().all(|&len| len == 1000));
445
    }
446
447
    #[test]
448
    fn from_repeated_length_unit_length() {
449
        // Length 1, repeated multiple times
450
        let buffer = OffsetBuffer::<i32>::from_repeated_length(1, 10);
451
        assert_eq!(buffer.as_ref(), &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
452
453
        let lengths: Vec<usize> = buffer.lengths().collect();
454
        assert_eq!(lengths, vec![1; 10]);
455
    }
456
457
    #[test]
458
    fn from_repeated_length_max_safe_values() {
459
        // Test with maximum safe values for i32
460
        // i32::MAX / 3 ensures we don't overflow when repeated twice
461
        let third_max = (i32::MAX / 3) as usize;
462
        let buffer = OffsetBuffer::<i32>::from_repeated_length(third_max, 2);
463
        assert_eq!(
464
            buffer.as_ref(),
465
            &[0, third_max as i32, (third_max * 2) as i32]
466
        );
467
    }
468
}