Coverage Report

Created: 2025-11-17 14:14

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/Users/andrewlamb/Software/arrow-rs/arrow-array/src/builder/generic_byte_run_builder.rs
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
use crate::types::bytes::ByteArrayNativeType;
19
use std::{any::Any, sync::Arc};
20
21
use crate::{
22
    ArrayRef, ArrowPrimitiveType, RunArray,
23
    types::{BinaryType, ByteArrayType, LargeBinaryType, LargeUtf8Type, RunEndIndexType, Utf8Type},
24
};
25
26
use super::{ArrayBuilder, GenericByteBuilder, PrimitiveBuilder};
27
28
use arrow_buffer::ArrowNativeType;
29
30
/// Builder for [`RunArray`] of [`GenericByteArray`](crate::array::GenericByteArray)
31
///
32
/// # Example:
33
///
34
/// ```
35
///
36
/// # use arrow_array::builder::GenericByteRunBuilder;
37
/// # use arrow_array::{GenericByteArray, BinaryArray};
38
/// # use arrow_array::types::{BinaryType, Int16Type};
39
/// # use arrow_array::{Array, Int16Array};
40
/// # use arrow_array::cast::AsArray;
41
///
42
/// let mut builder =
43
/// GenericByteRunBuilder::<Int16Type, BinaryType>::new();
44
/// builder.extend([Some(b"abc"), Some(b"abc"), None, Some(b"def")].into_iter());
45
/// builder.append_value(b"def");
46
/// builder.append_null();
47
/// let array = builder.finish();
48
///
49
/// assert_eq!(array.run_ends().values(), &[2, 3, 5, 6]);
50
///
51
/// let av = array.values();
52
///
53
/// assert!(!av.is_null(0));
54
/// assert!(av.is_null(1));
55
/// assert!(!av.is_null(2));
56
/// assert!(av.is_null(3));
57
///
58
/// // Values are polymorphic and so require a downcast.
59
/// let ava: &BinaryArray = av.as_binary();
60
///
61
/// assert_eq!(ava.value(0), b"abc");
62
/// assert_eq!(ava.value(2), b"def");
63
/// ```
64
#[derive(Debug)]
65
pub struct GenericByteRunBuilder<R, V>
66
where
67
    R: ArrowPrimitiveType,
68
    V: ByteArrayType,
69
{
70
    run_ends_builder: PrimitiveBuilder<R>,
71
    values_builder: GenericByteBuilder<V>,
72
    current_value: Vec<u8>,
73
    has_current_value: bool,
74
    current_run_end_index: usize,
75
    prev_run_end_index: usize,
76
}
77
78
impl<R, V> Default for GenericByteRunBuilder<R, V>
79
where
80
    R: ArrowPrimitiveType,
81
    V: ByteArrayType,
82
{
83
    fn default() -> Self {
84
        Self::new()
85
    }
86
}
87
88
impl<R, V> GenericByteRunBuilder<R, V>
89
where
90
    R: ArrowPrimitiveType,
91
    V: ByteArrayType,
92
{
93
    /// Creates a new `GenericByteRunBuilder`
94
    pub fn new() -> Self {
95
        Self {
96
            run_ends_builder: PrimitiveBuilder::new(),
97
            values_builder: GenericByteBuilder::<V>::new(),
98
            current_value: Vec::new(),
99
            has_current_value: false,
100
            current_run_end_index: 0,
101
            prev_run_end_index: 0,
102
        }
103
    }
104
105
    /// Creates a new `GenericByteRunBuilder` with the provided capacity
106
    ///
107
    /// `capacity`: the expected number of run-end encoded values.
108
    /// `data_capacity`: the expected number of bytes of run end encoded values
109
4
    pub fn with_capacity(capacity: usize, data_capacity: usize) -> Self {
110
4
        Self {
111
4
            run_ends_builder: PrimitiveBuilder::with_capacity(capacity),
112
4
            values_builder: GenericByteBuilder::<V>::with_capacity(capacity, data_capacity),
113
4
            current_value: Vec::new(),
114
4
            has_current_value: false,
115
4
            current_run_end_index: 0,
116
4
            prev_run_end_index: 0,
117
4
        }
118
4
    }
119
}
120
121
impl<R, V> ArrayBuilder for GenericByteRunBuilder<R, V>
122
where
123
    R: RunEndIndexType,
124
    V: ByteArrayType,
125
{
126
    /// Returns the builder as a non-mutable `Any` reference.
127
    fn as_any(&self) -> &dyn Any {
128
        self
129
    }
130
131
    /// Returns the builder as a mutable `Any` reference.
132
    fn as_any_mut(&mut self) -> &mut dyn Any {
133
        self
134
    }
135
136
    /// Returns the boxed builder as a box of `Any`.
137
    fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
138
        self
139
    }
140
141
    /// Returns the length of logical array encoded by
142
    /// the eventual runs array.
143
    fn len(&self) -> usize {
144
        self.current_run_end_index
145
    }
146
147
    /// Builds the array and reset this builder.
148
    fn finish(&mut self) -> ArrayRef {
149
        Arc::new(self.finish())
150
    }
151
152
    /// Builds the array without resetting the builder.
153
    fn finish_cloned(&self) -> ArrayRef {
154
        Arc::new(self.finish_cloned())
155
    }
156
}
157
158
impl<R, V> GenericByteRunBuilder<R, V>
159
where
160
    R: RunEndIndexType,
161
    V: ByteArrayType,
162
{
163
    /// Appends optional value to the logical array encoded by the RunArray.
164
9
    pub fn append_option(&mut self, input_value: Option<impl AsRef<V::Native>>) {
165
9
        match input_value {
166
6
            Some(value) => self.append_value(value),
167
3
            None => self.append_null(),
168
        }
169
9
    }
170
171
    /// Appends value to the logical array encoded by the RunArray.
172
15
    pub fn append_value(&mut self, input_value: impl AsRef<V::Native>) {
173
15
        let value: &[u8] = input_value.as_ref().as_ref();
174
15
        if !self.has_current_value {
175
5
            self.append_run_end();
176
5
            self.current_value.extend_from_slice(value);
177
5
            self.has_current_value = true;
178
10
        } else if self.current_value.as_slice() != value {
179
5
            self.append_run_end();
180
5
            self.current_value.clear();
181
5
            self.current_value.extend_from_slice(value);
182
5
        }
183
15
        self.current_run_end_index += 1;
184
15
    }
185
186
    /// Appends null to the logical array encoded by the RunArray.
187
3
    pub fn append_null(&mut self) {
188
3
        if self.has_current_value {
189
1
            self.append_run_end();
190
1
            self.current_value.clear();
191
1
            self.has_current_value = false;
192
2
        }
193
3
        self.current_run_end_index += 1;
194
3
    }
195
196
    /// Creates the RunArray and resets the builder.
197
    /// Panics if RunArray cannot be built.
198
4
    pub fn finish(&mut self) -> RunArray<R> {
199
        // write the last run end to the array.
200
4
        self.append_run_end();
201
202
        // reset the run end index to zero.
203
4
        self.current_value.clear();
204
4
        self.has_current_value = false;
205
4
        self.current_run_end_index = 0;
206
4
        self.prev_run_end_index = 0;
207
208
        // build the run encoded array by adding run_ends and values array as its children.
209
4
        let run_ends_array = self.run_ends_builder.finish();
210
4
        let values_array = self.values_builder.finish();
211
4
        RunArray::<R>::try_new(&run_ends_array, &values_array).unwrap()
212
4
    }
213
214
    /// Creates the RunArray and without resetting the builder.
215
    /// Panics if RunArray cannot be built.
216
    pub fn finish_cloned(&self) -> RunArray<R> {
217
        let mut run_ends_array = self.run_ends_builder.finish_cloned();
218
        let mut values_array = self.values_builder.finish_cloned();
219
220
        // Add current run if one exists
221
        if self.prev_run_end_index != self.current_run_end_index {
222
            let mut run_end_builder = run_ends_array.into_builder().unwrap();
223
            let mut values_builder = values_array.into_builder().unwrap();
224
            self.append_run_end_with_builders(&mut run_end_builder, &mut values_builder);
225
            run_ends_array = run_end_builder.finish();
226
            values_array = values_builder.finish();
227
        }
228
229
        RunArray::<R>::try_new(&run_ends_array, &values_array).unwrap()
230
    }
231
232
    // Appends the current run to the array.
233
15
    fn append_run_end(&mut self) {
234
        // empty array or the function called without appending any value.
235
15
        if self.prev_run_end_index == self.current_run_end_index {
236
3
            return;
237
12
        }
238
12
        let run_end_index = self.run_end_index_as_native();
239
12
        self.run_ends_builder.append_value(run_end_index);
240
12
        if self.has_current_value {
241
10
            let slice = self.current_value.as_slice();
242
10
            let native = unsafe {
243
10
                // Safety:
244
10
                // As self.current_value is created from V::Native. The value V::Native can be
245
10
                // built back from the bytes without validations
246
10
                V::Native::from_bytes_unchecked(slice)
247
10
            };
248
10
            self.values_builder.append_value(native);
249
10
        } else {
250
2
            self.values_builder.append_null();
251
2
        }
252
12
        self.prev_run_end_index = self.current_run_end_index;
253
15
    }
254
255
    // Similar to `append_run_end` but on custom builders.
256
    // Used in `finish_cloned` which is not suppose to mutate `self`.
257
    fn append_run_end_with_builders(
258
        &self,
259
        run_ends_builder: &mut PrimitiveBuilder<R>,
260
        values_builder: &mut GenericByteBuilder<V>,
261
    ) {
262
        let run_end_index = self.run_end_index_as_native();
263
        run_ends_builder.append_value(run_end_index);
264
        if self.has_current_value {
265
            let slice = self.current_value.as_slice();
266
            let native = unsafe {
267
                // Safety:
268
                // As self.current_value is created from V::Native. The value V::Native can be
269
                // built back from the bytes without validations
270
                V::Native::from_bytes_unchecked(slice)
271
            };
272
            values_builder.append_value(native);
273
        } else {
274
            values_builder.append_null();
275
        }
276
    }
277
278
12
    fn run_end_index_as_native(&self) -> R::Native {
279
12
        R::Native::from_usize(self.current_run_end_index).unwrap_or_else(|| 
{0
280
0
            panic!(
281
0
                "Cannot convert the value {} from `usize` to native form of arrow datatype {}",
282
                self.current_run_end_index,
283
0
                R::DATA_TYPE
284
            )
285
        })
286
12
    }
287
}
288
289
impl<R, V, S> Extend<Option<S>> for GenericByteRunBuilder<R, V>
290
where
291
    R: RunEndIndexType,
292
    V: ByteArrayType,
293
    S: AsRef<V::Native>,
294
{
295
    fn extend<T: IntoIterator<Item = Option<S>>>(&mut self, iter: T) {
296
        for elem in iter {
297
            self.append_option(elem);
298
        }
299
    }
300
}
301
302
/// Builder for [`RunArray`] of [`StringArray`](crate::array::StringArray)
303
///
304
/// ```
305
/// // Create a run-end encoded array with run-end indexes data type as `i16`.
306
/// // The encoded values are Strings.
307
///
308
/// # use arrow_array::builder::StringRunBuilder;
309
/// # use arrow_array::{Int16Array, StringArray};
310
/// # use arrow_array::types::Int16Type;
311
/// # use arrow_array::cast::AsArray;
312
/// #
313
/// let mut builder = StringRunBuilder::<Int16Type>::new();
314
///
315
/// // The builder builds the dictionary value by value
316
/// builder.append_value("abc");
317
/// builder.append_null();
318
/// builder.extend([Some("def"), Some("def"), Some("abc")]);
319
/// let array = builder.finish();
320
///
321
/// assert_eq!(array.run_ends().values(), &[1, 2, 4, 5]);
322
///
323
/// // Values are polymorphic and so require a downcast.
324
/// let av = array.values();
325
/// let ava: &StringArray = av.as_string::<i32>();
326
///
327
/// assert_eq!(ava.value(0), "abc");
328
/// assert!(av.is_null(1));
329
/// assert_eq!(ava.value(2), "def");
330
/// assert_eq!(ava.value(3), "abc");
331
///
332
/// ```
333
pub type StringRunBuilder<K> = GenericByteRunBuilder<K, Utf8Type>;
334
335
/// Builder for [`RunArray`] of [`LargeStringArray`](crate::array::LargeStringArray)
336
pub type LargeStringRunBuilder<K> = GenericByteRunBuilder<K, LargeUtf8Type>;
337
338
/// Builder for [`RunArray`] of [`BinaryArray`](crate::array::BinaryArray)
339
///
340
/// ```
341
/// // Create a run-end encoded array with run-end indexes data type as `i16`.
342
/// // The encoded data is binary values.
343
///
344
/// # use arrow_array::builder::BinaryRunBuilder;
345
/// # use arrow_array::{BinaryArray, Int16Array};
346
/// # use arrow_array::cast::AsArray;
347
/// # use arrow_array::types::Int16Type;
348
///
349
/// let mut builder = BinaryRunBuilder::<Int16Type>::new();
350
///
351
/// // The builder builds the dictionary value by value
352
/// builder.append_value(b"abc");
353
/// builder.append_null();
354
/// builder.extend([Some(b"def"), Some(b"def"), Some(b"abc")]);
355
/// let array = builder.finish();
356
///
357
/// assert_eq!(array.run_ends().values(), &[1, 2, 4, 5]);
358
///
359
/// // Values are polymorphic and so require a downcast.
360
/// let av = array.values();
361
/// let ava: &BinaryArray = av.as_binary();
362
///
363
/// assert_eq!(ava.value(0), b"abc");
364
/// assert!(av.is_null(1));
365
/// assert_eq!(ava.value(2), b"def");
366
/// assert_eq!(ava.value(3), b"abc");
367
///
368
/// ```
369
pub type BinaryRunBuilder<K> = GenericByteRunBuilder<K, BinaryType>;
370
371
/// Builder for [`RunArray`] of [`LargeBinaryArray`](crate::array::LargeBinaryArray)
372
pub type LargeBinaryRunBuilder<K> = GenericByteRunBuilder<K, LargeBinaryType>;
373
374
#[cfg(test)]
375
mod tests {
376
    use super::*;
377
378
    use crate::GenericByteArray;
379
    use crate::Int16RunArray;
380
    use crate::array::Array;
381
    use crate::cast::AsArray;
382
    use crate::types::{Int16Type, Int32Type};
383
384
    fn test_bytes_run_builder<T>(values: Vec<&T::Native>)
385
    where
386
        T: ByteArrayType,
387
        <T as ByteArrayType>::Native: PartialEq,
388
        <T as ByteArrayType>::Native: AsRef<<T as ByteArrayType>::Native>,
389
    {
390
        let mut builder = GenericByteRunBuilder::<Int16Type, T>::new();
391
        builder.append_value(values[0]);
392
        builder.append_value(values[0]);
393
        builder.append_value(values[0]);
394
        builder.append_null();
395
        builder.append_null();
396
        builder.append_value(values[1]);
397
        builder.append_value(values[1]);
398
        builder.append_value(values[2]);
399
        builder.append_value(values[2]);
400
        builder.append_value(values[2]);
401
        builder.append_value(values[2]);
402
        let array = builder.finish();
403
404
        assert_eq!(array.len(), 11);
405
        assert_eq!(array.null_count(), 0);
406
        assert_eq!(array.logical_null_count(), 2);
407
408
        assert_eq!(array.run_ends().values(), &[3, 5, 7, 11]);
409
410
        // Values are polymorphic and so require a downcast.
411
        let av = array.values();
412
        let ava: &GenericByteArray<T> = av.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
413
414
        assert_eq!(*ava.value(0), *values[0]);
415
        assert!(ava.is_null(1));
416
        assert_eq!(*ava.value(2), *values[1]);
417
        assert_eq!(*ava.value(3), *values[2]);
418
    }
419
420
    #[test]
421
    fn test_string_run_builder() {
422
        test_bytes_run_builder::<Utf8Type>(vec!["abc", "def", "ghi"]);
423
    }
424
425
    #[test]
426
    fn test_string_run_builder_with_empty_strings() {
427
        test_bytes_run_builder::<Utf8Type>(vec!["abc", "", "ghi"]);
428
    }
429
430
    #[test]
431
    fn test_binary_run_builder() {
432
        test_bytes_run_builder::<BinaryType>(vec![b"abc", b"def", b"ghi"]);
433
    }
434
435
    fn test_bytes_run_builder_finish_cloned<T>(values: Vec<&T::Native>)
436
    where
437
        T: ByteArrayType,
438
        <T as ByteArrayType>::Native: PartialEq,
439
        <T as ByteArrayType>::Native: AsRef<<T as ByteArrayType>::Native>,
440
    {
441
        let mut builder = GenericByteRunBuilder::<Int16Type, T>::new();
442
443
        builder.append_value(values[0]);
444
        builder.append_null();
445
        builder.append_value(values[1]);
446
        builder.append_value(values[1]);
447
        builder.append_value(values[0]);
448
        let mut array: Int16RunArray = builder.finish_cloned();
449
450
        assert_eq!(array.len(), 5);
451
        assert_eq!(array.null_count(), 0);
452
        assert_eq!(array.logical_null_count(), 1);
453
454
        assert_eq!(array.run_ends().values(), &[1, 2, 4, 5]);
455
456
        // Values are polymorphic and so require a downcast.
457
        let av = array.values();
458
        let ava: &GenericByteArray<T> = av.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
459
460
        assert_eq!(ava.value(0), values[0]);
461
        assert!(ava.is_null(1));
462
        assert_eq!(ava.value(2), values[1]);
463
        assert_eq!(ava.value(3), values[0]);
464
465
        // Append last value before `finish_cloned` (`value[0]`) again and ensure it has only
466
        // one entry in final output.
467
        builder.append_value(values[0]);
468
        builder.append_value(values[0]);
469
        builder.append_value(values[1]);
470
        array = builder.finish();
471
472
        assert_eq!(array.len(), 8);
473
        assert_eq!(array.null_count(), 0);
474
        assert_eq!(array.logical_null_count(), 1);
475
476
        assert_eq!(array.run_ends().values(), &[1, 2, 4, 7, 8]);
477
478
        // Values are polymorphic and so require a downcast.
479
        let av2 = array.values();
480
        let ava2: &GenericByteArray<T> =
481
            av2.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
482
483
        assert_eq!(ava2.value(0), values[0]);
484
        assert!(ava2.is_null(1));
485
        assert_eq!(ava2.value(2), values[1]);
486
        // The value appended before and after `finish_cloned` has only one entry.
487
        assert_eq!(ava2.value(3), values[0]);
488
        assert_eq!(ava2.value(4), values[1]);
489
    }
490
491
    #[test]
492
    fn test_string_run_builder_finish_cloned() {
493
        test_bytes_run_builder_finish_cloned::<Utf8Type>(vec!["abc", "def", "ghi"]);
494
    }
495
496
    #[test]
497
    fn test_binary_run_builder_finish_cloned() {
498
        test_bytes_run_builder_finish_cloned::<BinaryType>(vec![b"abc", b"def", b"ghi"]);
499
    }
500
501
    #[test]
502
    fn test_extend() {
503
        let mut builder = StringRunBuilder::<Int32Type>::new();
504
        builder.extend(["a", "a", "a", "", "", "b", "b"].into_iter().map(Some));
505
        builder.extend(["b", "cupcakes", "cupcakes"].into_iter().map(Some));
506
        let array = builder.finish();
507
508
        assert_eq!(array.len(), 10);
509
        assert_eq!(array.run_ends().values(), &[3, 5, 8, 10]);
510
511
        let str_array = array.values().as_string::<i32>();
512
        assert_eq!(str_array.value(0), "a");
513
        assert_eq!(str_array.value(1), "");
514
        assert_eq!(str_array.value(2), "b");
515
        assert_eq!(str_array.value(3), "cupcakes");
516
    }
517
}