Coverage Report

Created: 2025-11-17 14:14

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/Users/andrewlamb/Software/arrow-rs/arrow-array/src/array/run_array.rs
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
use std::any::Any;
19
use std::sync::Arc;
20
21
use arrow_buffer::{ArrowNativeType, BooleanBufferBuilder, NullBuffer, RunEndBuffer};
22
use arrow_data::{ArrayData, ArrayDataBuilder};
23
use arrow_schema::{ArrowError, DataType, Field};
24
25
use crate::{
26
    Array, ArrayAccessor, ArrayRef, PrimitiveArray,
27
    builder::StringRunBuilder,
28
    make_array,
29
    run_iterator::RunArrayIter,
30
    types::{Int16Type, Int32Type, Int64Type, RunEndIndexType},
31
};
32
33
/// An array of [run-end encoded values](https://arrow.apache.org/docs/format/Columnar.html#run-end-encoded-layout)
34
///
35
/// This encoding is variation on [run-length encoding (RLE)](https://en.wikipedia.org/wiki/Run-length_encoding)
36
/// and is good for representing data containing same values repeated consecutively.
37
///
38
/// [`RunArray`] contains `run_ends` array and `values` array of same length.
39
/// The `run_ends` array stores the indexes at which the run ends. The `values` array
40
/// stores the value of each run. Below example illustrates how a logical array is represented in
41
/// [`RunArray`]
42
///
43
///
44
/// ```text
45
/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─┐
46
///   ┌─────────────────┐  ┌─────────┐       ┌─────────────────┐
47
/// │ │        A        │  │    2    │ │     │        A        │
48
///   ├─────────────────┤  ├─────────┤       ├─────────────────┤
49
/// │ │        D        │  │    3    │ │     │        A        │    run length of 'A' = runs_ends[0] - 0 = 2
50
///   ├─────────────────┤  ├─────────┤       ├─────────────────┤
51
/// │ │        B        │  │    6    │ │     │        D        │    run length of 'D' = run_ends[1] - run_ends[0] = 1
52
///   └─────────────────┘  └─────────┘       ├─────────────────┤
53
/// │        values          run_ends  │     │        B        │
54
///                                          ├─────────────────┤
55
/// └ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─┘     │        B        │
56
///                                          ├─────────────────┤
57
///                RunArray                  │        B        │    run length of 'B' = run_ends[2] - run_ends[1] = 3
58
///               length = 3                 └─────────────────┘
59
///
60
///                                             Logical array
61
///                                                Contents
62
/// ```
63
pub struct RunArray<R: RunEndIndexType> {
64
    data_type: DataType,
65
    run_ends: RunEndBuffer<R::Native>,
66
    values: ArrayRef,
67
}
68
69
impl<R: RunEndIndexType> Clone for RunArray<R> {
70
12
    fn clone(&self) -> Self {
71
12
        Self {
72
12
            data_type: self.data_type.clone(),
73
12
            run_ends: self.run_ends.clone(),
74
12
            values: self.values.clone(),
75
12
        }
76
12
    }
77
}
78
79
impl<R: RunEndIndexType> RunArray<R> {
80
    /// Calculates the logical length of the array encoded
81
    /// by the given run_ends array.
82
36
    pub fn logical_len(run_ends: &PrimitiveArray<R>) -> usize {
83
36
        let len = run_ends.len();
84
36
        if len == 0 {
85
0
            return 0;
86
36
        }
87
36
        run_ends.value(len - 1).as_usize()
88
36
    }
89
90
    /// Attempts to create RunArray using given run_ends (index where a run ends)
91
    /// and the values (value of the run). Returns an error if the given data is not compatible
92
    /// with RunEndEncoded specification.
93
36
    pub fn try_new(run_ends: &PrimitiveArray<R>, values: &dyn Array) -> Result<Self, ArrowError> {
94
36
        let run_ends_type = run_ends.data_type().clone();
95
36
        let values_type = values.data_type().clone();
96
36
        let ree_array_type = DataType::RunEndEncoded(
97
36
            Arc::new(Field::new("run_ends", run_ends_type, false)),
98
36
            Arc::new(Field::new("values", values_type, true)),
99
36
        );
100
36
        let len = RunArray::logical_len(run_ends);
101
36
        let builder = ArrayDataBuilder::new(ree_array_type)
102
36
            .len(len)
103
36
            .add_child_data(run_ends.to_data())
104
36
            .add_child_data(values.to_data());
105
106
        // `build_unchecked` is used to avoid recursive validation of child arrays.
107
36
        let array_data = unsafe { builder.build_unchecked() };
108
109
        // Safety: `validate_data` checks below
110
        //    1. The given array data has exactly two child arrays.
111
        //    2. The first child array (run_ends) has valid data type.
112
        //    3. run_ends array does not have null values
113
        //    4. run_ends array has non-zero and strictly increasing values.
114
        //    5. The length of run_ends array and values array are the same.
115
36
        array_data.validate_data()
?0
;
116
117
36
        Ok(array_data.into())
118
36
    }
119
120
    /// Returns a reference to [`RunEndBuffer`]
121
48
    pub fn run_ends(&self) -> &RunEndBuffer<R::Native> {
122
48
        &self.run_ends
123
48
    }
124
125
    /// Returns a reference to values array
126
    ///
127
    /// Note: any slicing of this [`RunArray`] array is not applied to the returned array
128
    /// and must be handled separately
129
61
    pub fn values(&self) -> &ArrayRef {
130
61
        &self.values
131
61
    }
132
133
    /// Returns the physical index at which the array slice starts.
134
1
    pub fn get_start_physical_index(&self) -> usize {
135
1
        self.run_ends.get_start_physical_index()
136
1
    }
137
138
    /// Returns the physical index at which the array slice ends.
139
    pub fn get_end_physical_index(&self) -> usize {
140
        self.run_ends.get_end_physical_index()
141
    }
142
143
    /// Downcast this [`RunArray`] to a [`TypedRunArray`]
144
    ///
145
    /// ```
146
    /// use arrow_array::{Array, ArrayAccessor, RunArray, StringArray, types::Int32Type};
147
    ///
148
    /// let orig = [Some("a"), Some("b"), None];
149
    /// let run_array = RunArray::<Int32Type>::from_iter(orig);
150
    /// let typed = run_array.downcast::<StringArray>().unwrap();
151
    /// assert_eq!(typed.value(0), "a");
152
    /// assert_eq!(typed.value(1), "b");
153
    /// assert!(typed.values().is_null(2));
154
    /// ```
155
    ///
156
    pub fn downcast<V: 'static>(&self) -> Option<TypedRunArray<'_, R, V>> {
157
        let values = self.values.as_any().downcast_ref()?;
158
        Some(TypedRunArray {
159
            run_array: self,
160
            values,
161
        })
162
    }
163
164
    /// Returns index to the physical array for the given index to the logical array.
165
    /// This function adjusts the input logical index based on `ArrayData::offset`
166
    /// Performs a binary search on the run_ends array for the input index.
167
    ///
168
    /// The result is arbitrary if `logical_index >= self.len()`
169
30
    pub fn get_physical_index(&self, logical_index: usize) -> usize {
170
30
        self.run_ends.get_physical_index(logical_index)
171
30
    }
172
173
    /// Returns the physical indices of the input logical indices. Returns error if any of the logical
174
    /// index cannot be converted to physical index. The logical indices are sorted and iterated along
175
    /// with run_ends array to find matching physical index. The approach used here was chosen over
176
    /// finding physical index for each logical index using binary search using the function
177
    /// `get_physical_index`. Running benchmarks on both approaches showed that the approach used here
178
    /// scaled well for larger inputs.
179
    /// See <https://github.com/apache/arrow-rs/pull/3622#issuecomment-1407753727> for more details.
180
    #[inline]
181
1
    pub fn get_physical_indices<I>(&self, logical_indices: &[I]) -> Result<Vec<usize>, ArrowError>
182
1
    where
183
1
        I: ArrowNativeType,
184
    {
185
1
        let len = self.run_ends().len();
186
1
        let offset = self.run_ends().offset();
187
188
1
        let indices_len = logical_indices.len();
189
190
1
        if indices_len == 0 {
191
0
            return Ok(vec![]);
192
1
        }
193
194
        // `ordered_indices` store index into `logical_indices` and can be used
195
        // to iterate `logical_indices` in sorted order.
196
1
        let mut ordered_indices: Vec<usize> = (0..indices_len).collect();
197
198
        // Instead of sorting `logical_indices` directly, sort the `ordered_indices`
199
        // whose values are index of `logical_indices`
200
13
        
ordered_indices1
.
sort_unstable_by1
(|lhs, rhs| {
201
13
            logical_indices[*lhs]
202
13
                .partial_cmp(&logical_indices[*rhs])
203
13
                .unwrap()
204
13
        });
205
206
        // Return early if all the logical indices cannot be converted to physical indices.
207
1
        let largest_logical_index = logical_indices[*ordered_indices.last().unwrap()].as_usize();
208
1
        if largest_logical_index >= len {
209
0
            return Err(ArrowError::InvalidArgumentError(format!(
210
0
                "Cannot convert all logical indices to physical indices. The logical index cannot be converted is {largest_logical_index}.",
211
0
            )));
212
1
        }
213
214
        // Skip some physical indices based on offset.
215
1
        let skip_value = self.get_start_physical_index();
216
217
1
        let mut physical_indices = vec![0; indices_len];
218
219
1
        let mut ordered_index = 0_usize;
220
6
        for (physical_index, run_end) in 
self.run_ends.values()1
.
iter1
().
enumerate1
().
skip1
(
skip_value1
)
221
        {
222
            // Get the run end index (relative to offset) of current physical index
223
6
            let run_end_value = run_end.as_usize() - offset;
224
225
            // All the `logical_indices` that are less than current run end index
226
            // belongs to current physical index.
227
13
            while ordered_index < indices_len
228
12
                && logical_indices[ordered_indices[ordered_index]].as_usize() < run_end_value
229
7
            {
230
7
                physical_indices[ordered_indices[ordered_index]] = physical_index;
231
7
                ordered_index += 1;
232
7
            }
233
        }
234
235
        // If there are input values >= run_ends.last_value then we'll not be able to convert
236
        // all logical indices to physical indices.
237
1
        if ordered_index < logical_indices.len() {
238
0
            let logical_index = logical_indices[ordered_indices[ordered_index]].as_usize();
239
0
            return Err(ArrowError::InvalidArgumentError(format!(
240
0
                "Cannot convert all logical indices to physical indices. The logical index cannot be converted is {logical_index}.",
241
0
            )));
242
1
        }
243
1
        Ok(physical_indices)
244
1
    }
245
246
    /// Returns a zero-copy slice of this array with the indicated offset and length.
247
1
    pub fn slice(&self, offset: usize, length: usize) -> Self {
248
1
        Self {
249
1
            data_type: self.data_type.clone(),
250
1
            run_ends: self.run_ends.slice(offset, length),
251
1
            values: self.values.clone(),
252
1
        }
253
1
    }
254
}
255
256
impl<R: RunEndIndexType> From<ArrayData> for RunArray<R> {
257
    // The method assumes the caller already validated the data using `ArrayData::validate_data()`
258
48
    fn from(data: ArrayData) -> Self {
259
48
        match data.data_type() {
260
48
            DataType::RunEndEncoded(_, _) => {}
261
            _ => {
262
0
                panic!(
263
0
                    "Invalid data type for RunArray. The data type should be DataType::RunEndEncoded"
264
                );
265
            }
266
        }
267
268
        // Safety
269
        // ArrayData is valid
270
48
        let child = &data.child_data()[0];
271
48
        assert_eq!(child.data_type(), &R::DATA_TYPE, 
"Incorrect run ends type"0
);
272
48
        let run_ends = unsafe {
273
48
            let scalar = child.buffers()[0].clone().into();
274
48
            RunEndBuffer::new_unchecked(scalar, data.offset(), data.len())
275
        };
276
277
48
        let values = make_array(data.child_data()[1].clone());
278
48
        Self {
279
48
            data_type: data.data_type().clone(),
280
48
            run_ends,
281
48
            values,
282
48
        }
283
48
    }
284
}
285
286
impl<R: RunEndIndexType> From<RunArray<R>> for ArrayData {
287
12
    fn from(array: RunArray<R>) -> Self {
288
12
        let len = array.run_ends.len();
289
12
        let offset = array.run_ends.offset();
290
291
12
        let run_ends = ArrayDataBuilder::new(R::DATA_TYPE)
292
12
            .len(array.run_ends.values().len())
293
12
            .buffers(vec![array.run_ends.into_inner().into_inner()]);
294
295
12
        let run_ends = unsafe { run_ends.build_unchecked() };
296
297
12
        let builder = ArrayDataBuilder::new(array.data_type)
298
12
            .len(len)
299
12
            .offset(offset)
300
12
            .child_data(vec![run_ends, array.values.to_data()]);
301
302
12
        unsafe { builder.build_unchecked() }
303
12
    }
304
}
305
306
impl<T: RunEndIndexType> Array for RunArray<T> {
307
29
    fn as_any(&self) -> &dyn Any {
308
29
        self
309
29
    }
310
311
12
    fn to_data(&self) -> ArrayData {
312
12
        self.clone().into()
313
12
    }
314
315
0
    fn into_data(self) -> ArrayData {
316
0
        self.into()
317
0
    }
318
319
50
    fn data_type(&self) -> &DataType {
320
50
        &self.data_type
321
50
    }
322
323
1
    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
324
1
        Arc::new(self.slice(offset, length))
325
1
    }
326
327
24
    fn len(&self) -> usize {
328
24
        self.run_ends.len()
329
24
    }
330
331
0
    fn is_empty(&self) -> bool {
332
0
        self.run_ends.is_empty()
333
0
    }
334
335
0
    fn shrink_to_fit(&mut self) {
336
0
        self.run_ends.shrink_to_fit();
337
0
        self.values.shrink_to_fit();
338
0
    }
339
340
0
    fn offset(&self) -> usize {
341
0
        self.run_ends.offset()
342
0
    }
343
344
0
    fn nulls(&self) -> Option<&NullBuffer> {
345
0
        None
346
0
    }
347
348
0
    fn logical_nulls(&self) -> Option<NullBuffer> {
349
0
        let len = self.len();
350
0
        let nulls = self.values.logical_nulls()?;
351
0
        let mut out = BooleanBufferBuilder::new(len);
352
0
        let offset = self.run_ends.offset();
353
0
        let mut valid_start = 0;
354
0
        let mut last_end = 0;
355
0
        for (idx, end) in self.run_ends.values().iter().enumerate() {
356
0
            let end = end.as_usize();
357
0
            if end < offset {
358
0
                continue;
359
0
            }
360
0
            let end = (end - offset).min(len);
361
0
            if nulls.is_null(idx) {
362
0
                if valid_start < last_end {
363
0
                    out.append_n(last_end - valid_start, true);
364
0
                }
365
0
                out.append_n(end - last_end, false);
366
0
                valid_start = end;
367
0
            }
368
0
            last_end = end;
369
0
            if end == len {
370
0
                break;
371
0
            }
372
        }
373
0
        if valid_start < len {
374
0
            out.append_n(len - valid_start, true)
375
0
        }
376
        // Sanity check
377
0
        assert_eq!(out.len(), len);
378
0
        Some(out.finish().into())
379
0
    }
380
381
0
    fn is_nullable(&self) -> bool {
382
0
        !self.is_empty() && self.values.is_nullable()
383
0
    }
384
385
0
    fn get_buffer_memory_size(&self) -> usize {
386
0
        self.run_ends.inner().inner().capacity() + self.values.get_buffer_memory_size()
387
0
    }
388
389
0
    fn get_array_memory_size(&self) -> usize {
390
0
        std::mem::size_of::<Self>()
391
0
            + self.run_ends.inner().inner().capacity()
392
0
            + self.values.get_array_memory_size()
393
0
    }
394
}
395
396
impl<R: RunEndIndexType> std::fmt::Debug for RunArray<R> {
397
0
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
398
0
        writeln!(
399
0
            f,
400
0
            "RunArray {{run_ends: {:?}, values: {:?}}}",
401
0
            self.run_ends.values(),
402
            self.values
403
        )
404
0
    }
405
}
406
407
/// Constructs a `RunArray` from an iterator of optional strings.
408
///
409
/// # Example:
410
/// ```
411
/// use arrow_array::{RunArray, PrimitiveArray, StringArray, types::Int16Type};
412
///
413
/// let test = vec!["a", "a", "b", "c", "c"];
414
/// let array: RunArray<Int16Type> = test
415
///     .iter()
416
///     .map(|&x| if x == "b" { None } else { Some(x) })
417
///     .collect();
418
/// assert_eq!(
419
///     "RunArray {run_ends: [2, 3, 5], values: StringArray\n[\n  \"a\",\n  null,\n  \"c\",\n]}\n",
420
///     format!("{:?}", array)
421
/// );
422
/// ```
423
impl<'a, T: RunEndIndexType> FromIterator<Option<&'a str>> for RunArray<T> {
424
2
    fn from_iter<I: IntoIterator<Item = Option<&'a str>>>(iter: I) -> Self {
425
2
        let it = iter.into_iter();
426
2
        let (lower, _) = it.size_hint();
427
2
        let mut builder = StringRunBuilder::with_capacity(lower, 256);
428
9
        
it2
.
for_each2
(|i| {
429
9
            builder.append_option(i);
430
9
        });
431
432
2
        builder.finish()
433
2
    }
434
}
435
436
/// Constructs a `RunArray` from an iterator of strings.
437
///
438
/// # Example:
439
///
440
/// ```
441
/// use arrow_array::{RunArray, PrimitiveArray, StringArray, types::Int16Type};
442
///
443
/// let test = vec!["a", "a", "b", "c"];
444
/// let array: RunArray<Int16Type> = test.into_iter().collect();
445
/// assert_eq!(
446
///     "RunArray {run_ends: [2, 3, 4], values: StringArray\n[\n  \"a\",\n  \"b\",\n  \"c\",\n]}\n",
447
///     format!("{:?}", array)
448
/// );
449
/// ```
450
impl<'a, T: RunEndIndexType> FromIterator<&'a str> for RunArray<T> {
451
2
    fn from_iter<I: IntoIterator<Item = &'a str>>(iter: I) -> Self {
452
2
        let it = iter.into_iter();
453
2
        let (lower, _) = it.size_hint();
454
2
        let mut builder = StringRunBuilder::with_capacity(lower, 256);
455
9
        
it2
.
for_each2
(|i| {
456
9
            builder.append_value(i);
457
9
        });
458
459
2
        builder.finish()
460
2
    }
461
}
462
463
///
464
/// A [`RunArray`] with `i16` run ends
465
///
466
/// # Example: Using `collect`
467
/// ```
468
/// # use arrow_array::{Array, Int16RunArray, Int16Array, StringArray};
469
/// # use std::sync::Arc;
470
///
471
/// let array: Int16RunArray = vec!["a", "a", "b", "c", "c"].into_iter().collect();
472
/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
473
/// assert_eq!(array.run_ends().values(), &[2, 3, 5]);
474
/// assert_eq!(array.values(), &values);
475
/// ```
476
pub type Int16RunArray = RunArray<Int16Type>;
477
478
///
479
/// A [`RunArray`] with `i32` run ends
480
///
481
/// # Example: Using `collect`
482
/// ```
483
/// # use arrow_array::{Array, Int32RunArray, Int32Array, StringArray};
484
/// # use std::sync::Arc;
485
///
486
/// let array: Int32RunArray = vec!["a", "a", "b", "c", "c"].into_iter().collect();
487
/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
488
/// assert_eq!(array.run_ends().values(), &[2, 3, 5]);
489
/// assert_eq!(array.values(), &values);
490
/// ```
491
pub type Int32RunArray = RunArray<Int32Type>;
492
493
///
494
/// A [`RunArray`] with `i64` run ends
495
///
496
/// # Example: Using `collect`
497
/// ```
498
/// # use arrow_array::{Array, Int64RunArray, Int64Array, StringArray};
499
/// # use std::sync::Arc;
500
///
501
/// let array: Int64RunArray = vec!["a", "a", "b", "c", "c"].into_iter().collect();
502
/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
503
/// assert_eq!(array.run_ends().values(), &[2, 3, 5]);
504
/// assert_eq!(array.values(), &values);
505
/// ```
506
pub type Int64RunArray = RunArray<Int64Type>;
507
508
/// A [`RunArray`] typed typed on its child values array
509
///
510
/// Implements [`ArrayAccessor`] and [`IntoIterator`] allowing fast access to its elements
511
///
512
/// ```
513
/// use arrow_array::{RunArray, StringArray, types::Int32Type};
514
///
515
/// let orig = ["a", "b", "a", "b"];
516
/// let ree_array = RunArray::<Int32Type>::from_iter(orig);
517
///
518
/// // `TypedRunArray` allows you to access the values directly
519
/// let typed = ree_array.downcast::<StringArray>().unwrap();
520
///
521
/// for (maybe_val, orig) in typed.into_iter().zip(orig) {
522
///     assert_eq!(maybe_val.unwrap(), orig)
523
/// }
524
/// ```
525
pub struct TypedRunArray<'a, R: RunEndIndexType, V> {
526
    /// The run array
527
    run_array: &'a RunArray<R>,
528
529
    /// The values of the run_array
530
    values: &'a V,
531
}
532
533
// Manually implement `Clone` to avoid `V: Clone` type constraint
534
impl<R: RunEndIndexType, V> Clone for TypedRunArray<'_, R, V> {
535
    fn clone(&self) -> Self {
536
        *self
537
    }
538
}
539
540
impl<R: RunEndIndexType, V> Copy for TypedRunArray<'_, R, V> {}
541
542
impl<R: RunEndIndexType, V> std::fmt::Debug for TypedRunArray<'_, R, V> {
543
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
544
        writeln!(f, "TypedRunArray({:?})", self.run_array)
545
    }
546
}
547
548
impl<'a, R: RunEndIndexType, V> TypedRunArray<'a, R, V> {
549
    /// Returns the run_ends of this [`TypedRunArray`]
550
    pub fn run_ends(&self) -> &'a RunEndBuffer<R::Native> {
551
        self.run_array.run_ends()
552
    }
553
554
    /// Returns the values of this [`TypedRunArray`]
555
    pub fn values(&self) -> &'a V {
556
        self.values
557
    }
558
559
    /// Returns the run array of this [`TypedRunArray`]
560
    pub fn run_array(&self) -> &'a RunArray<R> {
561
        self.run_array
562
    }
563
}
564
565
impl<R: RunEndIndexType, V: Sync> Array for TypedRunArray<'_, R, V> {
566
    fn as_any(&self) -> &dyn Any {
567
        self.run_array
568
    }
569
570
    fn to_data(&self) -> ArrayData {
571
        self.run_array.to_data()
572
    }
573
574
    fn into_data(self) -> ArrayData {
575
        self.run_array.into_data()
576
    }
577
578
    fn data_type(&self) -> &DataType {
579
        self.run_array.data_type()
580
    }
581
582
    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
583
        Arc::new(self.run_array.slice(offset, length))
584
    }
585
586
    fn len(&self) -> usize {
587
        self.run_array.len()
588
    }
589
590
    fn is_empty(&self) -> bool {
591
        self.run_array.is_empty()
592
    }
593
594
    fn offset(&self) -> usize {
595
        self.run_array.offset()
596
    }
597
598
    fn nulls(&self) -> Option<&NullBuffer> {
599
        self.run_array.nulls()
600
    }
601
602
    fn logical_nulls(&self) -> Option<NullBuffer> {
603
        self.run_array.logical_nulls()
604
    }
605
606
    fn logical_null_count(&self) -> usize {
607
        self.run_array.logical_null_count()
608
    }
609
610
    fn is_nullable(&self) -> bool {
611
        self.run_array.is_nullable()
612
    }
613
614
    fn get_buffer_memory_size(&self) -> usize {
615
        self.run_array.get_buffer_memory_size()
616
    }
617
618
    fn get_array_memory_size(&self) -> usize {
619
        self.run_array.get_array_memory_size()
620
    }
621
}
622
623
// Array accessor converts the index of logical array to the index of the physical array
624
// using binary search. The time complexity is O(log N) where N is number of runs.
625
impl<'a, R, V> ArrayAccessor for TypedRunArray<'a, R, V>
626
where
627
    R: RunEndIndexType,
628
    V: Sync + Send,
629
    &'a V: ArrayAccessor,
630
    <&'a V as ArrayAccessor>::Item: Default,
631
{
632
    type Item = <&'a V as ArrayAccessor>::Item;
633
634
    fn value(&self, logical_index: usize) -> Self::Item {
635
        assert!(
636
            logical_index < self.len(),
637
            "Trying to access an element at index {} from a TypedRunArray of length {}",
638
            logical_index,
639
            self.len()
640
        );
641
        unsafe { self.value_unchecked(logical_index) }
642
    }
643
644
    unsafe fn value_unchecked(&self, logical_index: usize) -> Self::Item {
645
        let physical_index = self.run_array.get_physical_index(logical_index);
646
        unsafe { self.values().value_unchecked(physical_index) }
647
    }
648
}
649
650
impl<'a, R, V> IntoIterator for TypedRunArray<'a, R, V>
651
where
652
    R: RunEndIndexType,
653
    V: Sync + Send,
654
    &'a V: ArrayAccessor,
655
    <&'a V as ArrayAccessor>::Item: Default,
656
{
657
    type Item = Option<<&'a V as ArrayAccessor>::Item>;
658
    type IntoIter = RunArrayIter<'a, R, V>;
659
660
    fn into_iter(self) -> Self::IntoIter {
661
        RunArrayIter::new(self)
662
    }
663
}
664
665
#[cfg(test)]
666
mod tests {
667
    use rand::Rng;
668
    use rand::rng;
669
    use rand::seq::SliceRandom;
670
671
    use super::*;
672
    use crate::builder::PrimitiveRunBuilder;
673
    use crate::cast::AsArray;
674
    use crate::types::{Int8Type, UInt32Type};
675
    use crate::{Int16Array, Int32Array, StringArray};
676
677
    fn build_input_array(size: usize) -> Vec<Option<i32>> {
678
        // The input array is created by shuffling and repeating
679
        // the seed values random number of times.
680
        let mut seed: Vec<Option<i32>> = vec![
681
            None,
682
            None,
683
            None,
684
            Some(1),
685
            Some(2),
686
            Some(3),
687
            Some(4),
688
            Some(5),
689
            Some(6),
690
            Some(7),
691
            Some(8),
692
            Some(9),
693
        ];
694
        let mut result: Vec<Option<i32>> = Vec::with_capacity(size);
695
        let mut ix = 0;
696
        let mut rng = rng();
697
        // run length can go up to 8. Cap the max run length for smaller arrays to size / 2.
698
        let max_run_length = 8_usize.min(1_usize.max(size / 2));
699
        while result.len() < size {
700
            // shuffle the seed array if all the values are iterated.
701
            if ix == 0 {
702
                seed.shuffle(&mut rng);
703
            }
704
            // repeat the items between 1 and 8 times. Cap the length for smaller sized arrays
705
            let num = max_run_length.min(rng.random_range(1..=max_run_length));
706
            for _ in 0..num {
707
                result.push(seed[ix]);
708
            }
709
            ix += 1;
710
            if ix == seed.len() {
711
                ix = 0
712
            }
713
        }
714
        result.resize(size, None);
715
        result
716
    }
717
718
    // Asserts that `logical_array[logical_indices[*]] == physical_array[physical_indices[*]]`
719
    fn compare_logical_and_physical_indices(
720
        logical_indices: &[u32],
721
        logical_array: &[Option<i32>],
722
        physical_indices: &[usize],
723
        physical_array: &PrimitiveArray<Int32Type>,
724
    ) {
725
        assert_eq!(logical_indices.len(), physical_indices.len());
726
727
        // check value in logical index in the logical_array matches physical index in physical_array
728
        logical_indices
729
            .iter()
730
            .map(|f| f.as_usize())
731
            .zip(physical_indices.iter())
732
            .for_each(|(logical_ix, physical_ix)| {
733
                let expected = logical_array[logical_ix];
734
                match expected {
735
                    Some(val) => {
736
                        assert!(physical_array.is_valid(*physical_ix));
737
                        let actual = physical_array.value(*physical_ix);
738
                        assert_eq!(val, actual);
739
                    }
740
                    None => {
741
                        assert!(physical_array.is_null(*physical_ix))
742
                    }
743
                };
744
            });
745
    }
746
    #[test]
747
    fn test_run_array() {
748
        // Construct a value array
749
        let value_data =
750
            PrimitiveArray::<Int8Type>::from_iter_values([10_i8, 11, 12, 13, 14, 15, 16, 17]);
751
752
        // Construct a run_ends array:
753
        let run_ends_values = [4_i16, 6, 7, 9, 13, 18, 20, 22];
754
        let run_ends_data =
755
            PrimitiveArray::<Int16Type>::from_iter_values(run_ends_values.iter().copied());
756
757
        // Construct a run ends encoded array from the above two
758
        let ree_array = RunArray::<Int16Type>::try_new(&run_ends_data, &value_data).unwrap();
759
760
        assert_eq!(ree_array.len(), 22);
761
        assert_eq!(ree_array.null_count(), 0);
762
763
        let values = ree_array.values();
764
        assert_eq!(value_data.into_data(), values.to_data());
765
        assert_eq!(&DataType::Int8, values.data_type());
766
767
        let run_ends = ree_array.run_ends();
768
        assert_eq!(run_ends.values(), &run_ends_values);
769
    }
770
771
    #[test]
772
    fn test_run_array_fmt_debug() {
773
        let mut builder = PrimitiveRunBuilder::<Int16Type, UInt32Type>::with_capacity(3);
774
        builder.append_value(12345678);
775
        builder.append_null();
776
        builder.append_value(22345678);
777
        let array = builder.finish();
778
        assert_eq!(
779
            "RunArray {run_ends: [1, 2, 3], values: PrimitiveArray<UInt32>\n[\n  12345678,\n  null,\n  22345678,\n]}\n",
780
            format!("{array:?}")
781
        );
782
783
        let mut builder = PrimitiveRunBuilder::<Int16Type, UInt32Type>::with_capacity(20);
784
        for _ in 0..20 {
785
            builder.append_value(1);
786
        }
787
        let array = builder.finish();
788
789
        assert_eq!(array.len(), 20);
790
        assert_eq!(array.null_count(), 0);
791
        assert_eq!(array.logical_null_count(), 0);
792
793
        assert_eq!(
794
            "RunArray {run_ends: [20], values: PrimitiveArray<UInt32>\n[\n  1,\n]}\n",
795
            format!("{array:?}")
796
        );
797
    }
798
799
    /// Builds run arrays by collecting string iterators, once with the `"b"`
    /// entry mapped to `None` and once with every entry present, and checks
    /// the `Debug` output and null counts.
    #[test]
    fn test_run_array_from_iter() {
        let inputs = vec!["a", "a", "b", "c"];

        // Replace "b" with a null while collecting.
        let with_null = inputs
            .iter()
            .copied()
            .map(|s| (s != "b").then_some(s))
            .collect::<RunArray<Int16Type>>();
        assert_eq!(
            "RunArray {run_ends: [2, 3, 4], values: StringArray\n[\n  \"a\",\n  null,\n  \"c\",\n]}\n",
            format!("{with_null:?}")
        );

        assert_eq!(with_null.len(), 4);
        assert_eq!(with_null.null_count(), 0);
        assert_eq!(with_null.logical_null_count(), 1);

        // Collect the plain strings, no nulls involved.
        let without_null = inputs.into_iter().collect::<RunArray<Int16Type>>();
        assert_eq!(
            "RunArray {run_ends: [2, 3, 4], values: StringArray\n[\n  \"a\",\n  \"b\",\n  \"c\",\n]}\n",
            format!("{without_null:?}")
        );
    }
821
822
    /// Four strings with no adjacent repeats must yield four runs, with run
    /// ends 1 through 4 and no nulls.
    #[test]
    fn test_run_array_run_ends_as_primitive_array() {
        let array = RunArray::<Int16Type>::from_iter(vec!["a", "b", "c", "a"]);

        assert_eq!(array.len(), 4);
        assert_eq!(array.null_count(), 0);
        assert_eq!(array.logical_null_count(), 0);

        let ends = array.run_ends();
        assert_eq!(&[1, 2, 3, 4], ends.values());
    }
834
835
    /// Collects optional strings containing isolated and consecutive nulls
    /// and checks the resulting run ends plus the null bookkeeping of both the
    /// run array and its values child.
    #[test]
    fn test_run_array_as_primitive_array_with_null() {
        let array = RunArray::<Int32Type>::from_iter(vec![
            Some("a"),
            None,
            Some("b"),
            None,
            None,
            Some("a"),
        ]);

        assert_eq!(array.len(), 6);
        assert_eq!(array.null_count(), 0);
        assert_eq!(array.logical_null_count(), 3);

        // The two adjacent nulls share one run (ending at 5).
        let ends = array.run_ends();
        assert_eq!(&[1, 2, 3, 5, 6], ends.values());

        let physical_values = array.values();
        assert_eq!(2, physical_values.null_count());
        assert_eq!(5, physical_values.len());
    }
851
852
    /// A sequence made only of nulls must be encoded as a single null run.
    #[test]
    fn test_run_array_all_nulls() {
        let array = RunArray::<Int32Type>::from_iter(std::iter::repeat(None::<&str>).take(3));

        assert_eq!(array.len(), 3);
        assert_eq!(array.null_count(), 0);
        assert_eq!(array.logical_null_count(), 3);

        let ends = array.run_ends();
        assert_eq!(3, ends.len());
        assert_eq!(&[3], ends.values());

        let physical_values = array.values();
        assert_eq!(1, physical_values.null_count());
    }
868
869
    /// Constructs a run array through `try_new` from explicit run ends and
    /// string values (one of them null) and verifies type, counts and `Debug`
    /// output.
    #[test]
    fn test_run_array_try_new() {
        let values = StringArray::from_iter([Some("foo"), Some("bar"), None, Some("baz")]);
        let run_ends = Int32Array::from_iter([Some(1), Some(2), Some(3), Some(4)]);

        let array = RunArray::<Int32Type>::try_new(&run_ends, &values).unwrap();
        assert_eq!(array.values().data_type(), &DataType::Utf8);

        assert_eq!(array.null_count(), 0);
        assert_eq!(array.logical_null_count(), 1);
        assert_eq!(array.len(), 4);
        assert_eq!(array.values().null_count(), 1);

        let rendered = format!("{array:?}");
        assert_eq!(
            "RunArray {run_ends: [1, 2, 3, 4], values: StringArray\n[\n  \"foo\",\n  \"bar\",\n  null,\n  \"baz\",\n]}\n",
            rendered
        );
    }
889
890
    /// The `Int16RunArray` alias can be collected from strings and yields the
    /// expected run ends and deduplicated values.
    #[test]
    fn test_run_array_int16_type_definition() {
        let array = Int16RunArray::from_iter(vec!["a", "a", "b", "c", "c"]);
        let expected_values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));

        assert_eq!(array.run_ends().values(), &[2, 3, 5]);
        assert_eq!(array.values(), &expected_values);
    }
897
898
    /// Empty strings are ordinary values: consecutive `""` entries form
    /// their own run instead of being dropped or treated as null.
    #[test]
    fn test_run_array_empty_string() {
        let array = Int16RunArray::from_iter(vec!["a", "a", "", "", "c"]);
        let expected_values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "", "c"]));

        assert_eq!(array.run_ends().values(), &[2, 4, 5]);
        assert_eq!(array.values(), &expected_values);
    }
905
906
    /// `try_new` must reject a run-ends array whose length differs from the
    /// values array length, with the exact error message checked here.
    #[test]
    fn test_run_array_length_mismatch() {
        // Four values but only three run ends.
        let values = StringArray::from_iter([Some("foo"), Some("bar"), None, Some("baz")]);
        let run_ends = Int32Array::from_iter([Some(1), Some(2), Some(3)]);

        let actual_message = RunArray::<Int32Type>::try_new(&run_ends, &values)
            .err()
            .unwrap()
            .to_string();
        let expected_message = ArrowError::InvalidArgumentError("The run_ends array length should be the same as values array length. Run_ends array length is 3, values array length is 4".to_string()).to_string();

        assert_eq!(expected_message, actual_message);
    }
917
918
    /// `try_new` must reject a run-ends array that contains a null entry.
    #[test]
    fn test_run_array_run_ends_with_null() {
        let values = StringArray::from_iter([Some("foo"), Some("bar"), Some("baz")]);
        // The middle run end is null.
        let run_ends = Int32Array::from_iter([Some(1), None, Some(3)]);

        let actual_message = RunArray::<Int32Type>::try_new(&run_ends, &values)
            .err()
            .unwrap()
            .to_string();
        let expected_message = ArrowError::InvalidArgumentError(
            "Found null values in run_ends array. The run_ends array should not have null values."
                .to_string(),
        )
        .to_string();

        assert_eq!(expected_message, actual_message);
    }
932
933
    /// `try_new` must reject run ends that are not strictly positive; here
    /// the first run end is zero.
    #[test]
    fn test_run_array_run_ends_with_zeroes() {
        let values = StringArray::from_iter([Some("foo"), Some("bar"), Some("baz")]);
        let run_ends = Int32Array::from_iter([Some(0), Some(1), Some(3)]);

        let actual_message = RunArray::<Int32Type>::try_new(&run_ends, &values)
            .err()
            .unwrap()
            .to_string();
        let expected_message = ArrowError::InvalidArgumentError("The values in run_ends array should be strictly positive. Found value 0 at index 0 that does not match the criteria.".to_string()).to_string();

        assert_eq!(expected_message, actual_message);
    }
944
945
    /// `try_new` must reject run ends that are not strictly increasing; here
    /// the last two run ends are equal.
    #[test]
    fn test_run_array_run_ends_non_increasing() {
        let values = StringArray::from_iter([Some("foo"), Some("bar"), Some("baz")]);
        let run_ends = Int32Array::from_iter([Some(1), Some(4), Some(4)]);

        let actual_message = RunArray::<Int32Type>::try_new(&run_ends, &values)
            .err()
            .unwrap()
            .to_string();
        let expected_message = ArrowError::InvalidArgumentError("The values in run_ends array should be strictly increasing. Found value 4 at index 2 with previous value 4 that does not match the criteria.".to_string()).to_string();

        assert_eq!(expected_message, actual_message);
    }
956
957
    /// Converting the `ArrayData` of an Int32-indexed run array into an
    /// Int64-indexed `RunArray` is expected to panic.
    #[test]
    #[should_panic(expected = "Incorrect run ends type")]
    fn test_run_array_run_ends_data_type_mismatch() {
        let int32_indexed: RunArray<Int32Type> = vec!["32"].into_iter().collect();
        let data = int32_indexed.into_data();
        let _ = RunArray::<Int64Type>::from(data);
    }
963
964
    /// Encodes a generated input into a run array and checks, for every
    /// logical index, that the typed accessor returns the input value, or
    /// that the matching physical slot is null when the input is `None`.
    #[test]
    fn test_ree_array_accessor() {
        let input_array = build_input_array(256);

        // Run-end encode the input.
        let mut encoder =
            PrimitiveRunBuilder::<Int16Type, Int32Type>::with_capacity(input_array.len());
        encoder.extend(input_array.iter().copied());
        let run_array = encoder.finish();
        let typed = run_array.downcast::<PrimitiveArray<Int32Type>>().unwrap();

        // Walk all logical positions and compare against the source data.
        for (logical_ix, expected) in input_array.iter().enumerate() {
            match expected {
                Some(value) => {
                    let actual = typed.value(logical_ix);
                    assert_eq!(*value, actual);
                }
                None => {
                    let physical_ix = run_array.get_physical_index(logical_ix);
                    assert!(typed.values().is_null(physical_ix));
                }
            }
        }
    }
986
987
    /// For a range of logical lengths, builds a run array from generated
    /// input, asks it to translate a shuffled list of logical indices (each
    /// index appearing twice) into physical indices, and checks that the
    /// values found at those physical indices match the original input.
    #[test]
    #[cfg_attr(miri, ignore)] // Takes too long
    fn test_get_physical_indices() {
        // Test logical lengths 0, 10, 20, ..., 240 (the range end of 250 is exclusive).
        for logical_len in (0..250).step_by(10) {
            let input_array = build_input_array(logical_len);

            // create run array using input_array; iterate by reference so the
            // input does not have to be cloned just to feed the builder.
            let mut builder = PrimitiveRunBuilder::<Int32Type, Int32Type>::new();
            builder.extend(input_array.iter().copied());

            let run_array = builder.finish();
            let physical_values_array = run_array.values().as_primitive::<Int32Type>();

            // create an array consisting of all the indices repeated twice and shuffled.
            let mut logical_indices: Vec<u32> = (0_u32..(logical_len as u32)).collect();
            // add same indices once more
            logical_indices.append(&mut logical_indices.clone());
            let mut rng = rng();
            logical_indices.shuffle(&mut rng);

            let physical_indices = run_array.get_physical_indices(&logical_indices).unwrap();

            assert_eq!(logical_indices.len(), physical_indices.len());

            // check value in logical index in the input_array matches physical index in typed_run_array
            compare_logical_and_physical_indices(
                &logical_indices,
                &input_array,
                &physical_indices,
                physical_values_array,
            );
        }
    }
1021
1022
    /// Exercises `get_physical_indices` on sliced run arrays. For every
    /// slice length from 1 to the full length, two slices are checked: one
    /// anchored at the start of the array and one anchored at its end.
    #[test]
    #[cfg_attr(miri, ignore)] // Takes too long
    fn test_get_physical_indices_sliced() {
        let total_len = 80;
        let input_array = build_input_array(total_len);

        // Run-end encode the input once; every slice below is taken from this array.
        let mut encoder =
            PrimitiveRunBuilder::<Int16Type, Int32Type>::with_capacity(input_array.len());
        encoder.extend(input_array.iter().copied());
        let run_array = encoder.finish();
        let physical_values_array = run_array.values().as_primitive::<Int32Type>();

        for slice_len in 1..=total_len {
            // Every index of the slice, listed twice, then shuffled.
            let upper = slice_len as u32;
            let mut logical_indices: Vec<u32> = (0_u32..upper).chain(0_u32..upper).collect();
            let mut shuffle_rng = rng();
            logical_indices.shuffle(&mut shuffle_rng);

            // First the slice starting at offset 0, then the slice ending at total_len.
            for offset in [0, total_len - slice_len] {
                // Matching window of the input the run array was built from.
                let sliced_input_array = &input_array[offset..offset + slice_len];

                // Slice the run array and round-trip it through ArrayData.
                let sliced_run_array: RunArray<Int16Type> =
                    run_array.slice(offset, slice_len).into_data().into();

                let physical_indices = sliced_run_array
                    .get_physical_indices(&logical_indices)
                    .unwrap();

                compare_logical_and_physical_indices(
                    &logical_indices,
                    sliced_input_array,
                    &physical_indices,
                    physical_values_array,
                );
            }
        }
    }
1087
1088
    /// Checks `logical_nulls` on a run array with alternating valid/null
    /// runs of length three, both for the whole array and for several slices
    /// (including an empty one).
    #[test]
    fn test_logical_nulls() {
        let run_ends = Int32Array::from(vec![3, 6, 9, 12]);
        let run_values = Int32Array::from(vec![Some(0), None, Some(1), None]);
        let array = RunArray::try_new(&run_ends, &run_values).unwrap();

        // Per-position validity of the full logical array (true = valid).
        let expected = [
            true, true, true, false, false, false, true, true, true, false, false, false,
        ];

        let full_nulls = array.logical_nulls().unwrap();
        assert_eq!(full_nulls.null_count(), 6);

        // (offset, length) pairs to slice with.
        for (offset, length) in [(0, 12), (0, 2), (2, 5), (3, 0), (3, 3), (3, 4), (4, 8)] {
            let sliced = array.slice(offset, length);
            let sliced_nulls = sliced.logical_nulls().unwrap();
            let validity = sliced_nulls.into_iter().collect::<Vec<_>>();
            assert_eq!(
                &validity,
                &expected[offset..offset + length],
                "{offset} {length}"
            );
        }
    }
1109
1110
    /// Two run arrays built independently from the same run ends and values
    /// must compare equal.
    #[test]
    fn test_run_array_eq_identical() {
        // Each call constructs a fresh array from the same inputs.
        let build = || {
            let run_ends = Int32Array::from(vec![2, 4, 6]);
            let values = StringArray::from(vec!["a", "b", "c"]);
            RunArray::<Int32Type>::try_new(&run_ends, &values).unwrap()
        };

        let first = build();
        let second = build();

        assert_eq!(first, second);
    }
1122
1123
    /// Run arrays with the same values but different run ends must not
    /// compare equal.
    #[test]
    fn test_run_array_ne_different_run_ends() {
        // Values are fixed; only the run ends vary between the two arrays.
        let build = |ends: Vec<i32>| {
            let run_ends = Int32Array::from(ends);
            let values = StringArray::from(vec!["a", "b", "c"]);
            RunArray::<Int32Type>::try_new(&run_ends, &values).unwrap()
        };

        let first = build(vec![2, 4, 6]);
        let second = build(vec![1, 4, 6]);

        assert_ne!(first, second);
    }
1135
1136
    /// Run arrays with the same run ends but different values must not
    /// compare equal.
    #[test]
    fn test_run_array_ne_different_values() {
        // Run ends are fixed; only the values vary between the two arrays.
        let build = |strings: Vec<&'static str>| {
            let run_ends = Int32Array::from(vec![2, 4, 6]);
            let values = StringArray::from(strings);
            RunArray::<Int32Type>::try_new(&run_ends, &values).unwrap()
        };

        let first = build(vec!["a", "b", "c"]);
        let second = build(vec!["a", "b", "d"]);

        assert_ne!(first, second);
    }
1148
1149
    /// Two run arrays built from the same run ends and the same values,
    /// including a null value, must compare equal.
    #[test]
    fn test_run_array_eq_with_nulls() {
        // Each call constructs a fresh array whose middle value is null.
        let build = || {
            let run_ends = Int32Array::from(vec![2, 4, 6]);
            let values = StringArray::from(vec![Some("a"), None, Some("c")]);
            RunArray::<Int32Type>::try_new(&run_ends, &values).unwrap()
        };

        let first = build();
        let second = build();

        assert_eq!(first, second);
    }
1161
1162
    /// Equality check for run arrays indexed by `Int16Type`.
    ///
    /// Despite the test name, both arrays built here use Int16 run ends; the
    /// test verifies that two identically constructed Int16-indexed run
    /// arrays compare equal.
    #[test]
    fn test_run_array_eq_different_run_end_types() {
        // Each call constructs a fresh Int16-indexed array from the same inputs.
        let build = || {
            let run_ends = Int16Array::from(vec![2_i16, 4, 6]);
            let values = StringArray::from(vec!["a", "b", "c"]);
            RunArray::<Int16Type>::try_new(&run_ends, &values).unwrap()
        };

        let first = build();
        let second = build();

        assert_eq!(first, second);
    }
1174
}