Coverage Report

Created: 2025-11-17 14:14

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/Users/andrewlamb/Software/arrow-rs/arrow-data/src/transform/mod.rs
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
//! Low-level array data abstractions.
19
//!
20
//! Provides utilities for creating, manipulating, and converting Arrow arrays
21
//! made of primitive types, strings, and nested types.
22
23
use super::{ArrayData, ArrayDataBuilder, ByteView, data::new_buffers};
24
use crate::bit_mask::set_bits;
25
use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
26
use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer, bit_util, i256};
27
use arrow_schema::{ArrowError, DataType, IntervalUnit, UnionMode};
28
use half::f16;
29
use num_integer::Integer;
30
use std::mem;
31
32
mod boolean;
33
mod fixed_binary;
34
mod fixed_size_list;
35
mod list;
36
mod list_view;
37
mod null;
38
mod primitive;
39
mod run;
40
mod structure;
41
mod union;
42
mod utils;
43
mod variable_size;
44
45
type ExtendNullBits<'a> = Box<dyn Fn(&mut _MutableArrayData, usize, usize) + 'a>;
46
// function that extends `[start..start+len]` to the mutable array.
47
// this is dynamic because different data_types influence how buffers and children are extended.
48
type Extend<'a> = Box<dyn Fn(&mut _MutableArrayData, usize, usize, usize) + 'a>;
49
50
type ExtendNulls = Box<dyn Fn(&mut _MutableArrayData, usize)>;
51
52
/// A mutable [ArrayData] that knows how to freeze itself into an [ArrayData].
53
/// This is just a data container.
54
#[derive(Debug)]
55
struct _MutableArrayData<'a> {
56
    pub data_type: DataType,
57
    pub null_count: usize,
58
59
    pub len: usize,
60
    pub null_buffer: Option<MutableBuffer>,
61
62
    // arrow specification only allows up to 3 buffers (2 ignoring the nulls above).
63
    // Thus, we place them in the stack to avoid bounds checks and to get greater data locality.
64
    pub buffer1: MutableBuffer,
65
    pub buffer2: MutableBuffer,
66
    pub child_data: Vec<MutableArrayData<'a>>,
67
}
68
69
impl _MutableArrayData<'_> {
70
112
    fn null_buffer(&mut self) -> &mut MutableBuffer {
71
112
        self.null_buffer
72
112
            .as_mut()
73
112
            .expect("MutableArrayData not nullable")
74
112
    }
75
}
76
77
200
fn build_extend_null_bits(array: &ArrayData, use_nulls: bool) -> ExtendNullBits<'_> {
78
200
    if let Some(
nulls47
) = array.nulls() {
79
47
        let bytes = nulls.validity();
80
73
        
Box::new47
(move |mutable, start, len| {
81
73
            let mutable_len = mutable.len;
82
73
            let out = mutable.null_buffer();
83
73
            utils::resize_for_bits(out, mutable_len + len);
84
73
            mutable.null_count += set_bits(
85
73
                out.as_slice_mut(),
86
73
                bytes,
87
73
                mutable_len,
88
73
                nulls.offset() + start,
89
73
                len,
90
73
            );
91
73
        })
92
153
    } else if use_nulls {
93
37
        
Box::new28
(|mutable, _, len| {
94
37
            let mutable_len = mutable.len;
95
37
            let out = mutable.null_buffer();
96
37
            utils::resize_for_bits(out, mutable_len + len);
97
37
            let write_data = out.as_slice_mut();
98
47
            
(0..len)37
.
for_each37
(|i| {
99
47
                bit_util::set_bit(write_data, mutable_len + i);
100
47
            });
101
37
        })
102
    } else {
103
149
        
Box::new125
(|_, _, _| {})
104
    }
105
200
}
106
107
/// Efficiently create an [ArrayData] from one or more existing [ArrayData]s by
108
/// copying chunks.
109
///
110
/// The main use case of this struct is to perform unary operations to arrays of
111
/// arbitrary types, such as `filter` and `take`.
112
///
113
/// # Example
114
/// ```
115
/// use arrow_buffer::Buffer;
116
/// use arrow_data::ArrayData;
117
/// use arrow_data::transform::MutableArrayData;
118
/// use arrow_schema::DataType;
119
/// fn i32_array(values: &[i32]) -> ArrayData {
120
///   ArrayData::try_new(DataType::Int32, 5, None, 0, vec![Buffer::from_slice_ref(values)], vec![]).unwrap()
121
/// }
122
/// let arr1  = i32_array(&[1, 2, 3, 4, 5]);
123
/// let arr2  = i32_array(&[6, 7, 8, 9, 10]);
124
/// // Create a mutable array for copying values from arr1 and arr2, with a capacity for 6 elements
125
/// let capacity = 3 * std::mem::size_of::<i32>();
126
/// let mut mutable = MutableArrayData::new(vec![&arr1, &arr2], false, 10);
127
/// // Copy the first 3 elements from arr1
128
/// mutable.extend(0, 0, 3);
129
/// // Copy the last 3 elements from arr2
130
/// mutable.extend(1, 2, 4);
131
/// // Complete the MutableArrayData into a new ArrayData
132
/// let frozen = mutable.freeze();
133
/// assert_eq!(frozen, i32_array(&[1, 2, 3, 8, 9, 10]));
134
/// ```
135
pub struct MutableArrayData<'a> {
136
    /// Input arrays: the data being read FROM.
137
    ///
138
    /// Note this is "dead code" because all actual references to the arrays are
139
    /// stored in closures for extending values and nulls.
140
    #[allow(dead_code)]
141
    arrays: Vec<&'a ArrayData>,
142
143
    /// In progress output array: The data being written TO
144
    ///
145
    /// Note these fields are in a separate struct, [_MutableArrayData], as they
146
    /// cannot be in [MutableArrayData] itself due to mutability invariants (interior
147
    /// mutability): [MutableArrayData] contains a function that can only mutate
148
    /// [_MutableArrayData], not [MutableArrayData] itself
149
    data: _MutableArrayData<'a>,
150
151
    /// The child data of the `Array` in Dictionary arrays.
152
    ///
153
    /// This is not stored in `_MutableArrayData` because these values are
154
    /// constant and only needed at the end, when freezing [_MutableArrayData].
155
    dictionary: Option<ArrayData>,
156
157
    /// Variadic data buffers referenced by views.
158
    ///
159
    /// Note that this is not stored in `_MutableArrayData` because these values
160
    /// are constant and only needed at the end, when freezing
161
    /// [_MutableArrayData]
162
    variadic_data_buffers: Vec<Buffer>,
163
164
    /// function used to extend output array with values from input arrays.
165
    ///
166
    /// This function's lifetime is bound to the input arrays because it reads
167
    /// values from them.
168
    extend_values: Vec<Extend<'a>>,
169
170
    /// function used to extend the output array with nulls from input arrays.
171
    ///
172
    /// This function's lifetime is bound to the input arrays because it reads
173
    /// nulls from it.
174
    extend_null_bits: Vec<ExtendNullBits<'a>>,
175
176
    /// function used to extend the output array with null elements.
177
    ///
178
    /// This function is independent of the arrays and therefore has no lifetime.
179
    extend_nulls: ExtendNulls,
180
}
181
182
impl std::fmt::Debug for MutableArrayData<'_> {
183
0
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
184
        // ignores the closures.
185
0
        f.debug_struct("MutableArrayData")
186
0
            .field("data", &self.data)
187
0
            .finish()
188
0
    }
189
}
190
191
/// Builds an extend that adds `offset` to the source primitive
192
/// Additionally validates that `max` fits into the
193
/// underlying primitive, returning `None` if not.
194
17
fn build_extend_dictionary(array: &ArrayData, offset: usize, max: usize) -> Option<Extend<'_>> {
195
    macro_rules! validate_and_build {
196
        ($dt: ty) => {{
197
            let _: $dt = max.try_into().ok()?;
198
            let offset: $dt = offset.try_into().ok()?;
199
            Some(primitive::build_extend_with_offset(array, offset))
200
        }};
201
    }
202
17
    match array.data_type() {
203
17
        DataType::Dictionary(child_data_type, _) => match child_data_type.as_ref() {
204
0
            DataType::UInt8 => validate_and_build!(u8),
205
0
            DataType::UInt16 => validate_and_build!(u16),
206
0
            DataType::UInt32 => validate_and_build!(u32),
207
0
            DataType::UInt64 => validate_and_build!(u64),
208
7
            DataType::Int8 => validate_and_build!(i8),
209
0
            DataType::Int16 => validate_and_build!(i16),
210
10
            DataType::Int32 => validate_and_build!(i32),
211
0
            DataType::Int64 => validate_and_build!(i64),
212
0
            _ => unreachable!(),
213
        },
214
0
        _ => None,
215
    }
216
17
}
217
218
/// Builds an extend that adds `buffer_offset` to any buffer indices encountered
219
4
fn build_extend_view(array: &ArrayData, buffer_offset: u32) -> Extend<'_> {
220
4
    let views = array.buffer::<u128>(0);
221
4
    Box::new(
222
10
        move |mutable: &mut _MutableArrayData, _, start: usize, len: usize| {
223
10
            mutable
224
10
                .buffer1
225
10
                .extend(views[start..start + len].iter().map(|v| {
226
10
                    let len = *v as u32;
227
10
                    if len <= 12 {
228
8
                        return *v; // Stored inline
229
2
                    }
230
2
                    let mut view = ByteView::from(*v);
231
2
                    view.buffer_index += buffer_offset;
232
2
                    view.into()
233
10
                }))
234
10
        },
235
    )
236
4
}
237
238
179
fn build_extend(array: &ArrayData) -> Extend<'_> {
239
179
    match array.data_type() {
240
0
        DataType::Null => null::build_extend(array),
241
0
        DataType::Boolean => boolean::build_extend(array),
242
0
        DataType::UInt8 => primitive::build_extend::<u8>(array),
243
0
        DataType::UInt16 => primitive::build_extend::<u16>(array),
244
0
        DataType::UInt32 => primitive::build_extend::<u32>(array),
245
0
        DataType::UInt64 => primitive::build_extend::<u64>(array),
246
0
        DataType::Int8 => primitive::build_extend::<i8>(array),
247
2
        DataType::Int16 => primitive::build_extend::<i16>(array),
248
44
        DataType::Int32 => primitive::build_extend::<i32>(array),
249
6
        DataType::Int64 => primitive::build_extend::<i64>(array),
250
0
        DataType::Float32 => primitive::build_extend::<f32>(array),
251
5
        DataType::Float64 => primitive::build_extend::<f64>(array),
252
        DataType::Date32 | DataType::Time32(_) | DataType::Interval(IntervalUnit::YearMonth) => {
253
0
            primitive::build_extend::<i32>(array)
254
        }
255
        DataType::Date64
256
        | DataType::Time64(_)
257
        | DataType::Timestamp(_, _)
258
        | DataType::Duration(_)
259
30
        | DataType::Interval(IntervalUnit::DayTime) => primitive::build_extend::<i64>(array),
260
0
        DataType::Interval(IntervalUnit::MonthDayNano) => primitive::build_extend::<i128>(array),
261
0
        DataType::Decimal32(_, _) => primitive::build_extend::<i32>(array),
262
0
        DataType::Decimal64(_, _) => primitive::build_extend::<i64>(array),
263
30
        DataType::Decimal128(_, _) => primitive::build_extend::<i128>(array),
264
0
        DataType::Decimal256(_, _) => primitive::build_extend::<i256>(array),
265
29
        DataType::Utf8 | DataType::Binary => variable_size::build_extend::<i32>(array),
266
0
        DataType::LargeUtf8 | DataType::LargeBinary => variable_size::build_extend::<i64>(array),
267
0
        DataType::BinaryView | DataType::Utf8View => unreachable!("should use build_extend_view"),
268
5
        DataType::Map(_, _) | DataType::List(_) => list::build_extend::<i32>(array),
269
1
        DataType::LargeList(_) => list::build_extend::<i64>(array),
270
0
        DataType::ListView(_) => list_view::build_extend::<i32>(array),
271
0
        DataType::LargeListView(_) => list_view::build_extend::<i64>(array),
272
0
        DataType::Dictionary(_, _) => unreachable!("should use build_extend_dictionary"),
273
1
        DataType::Struct(_) => structure::build_extend(array),
274
0
        DataType::FixedSizeBinary(_) => fixed_binary::build_extend(array),
275
0
        DataType::Float16 => primitive::build_extend::<f16>(array),
276
6
        DataType::FixedSizeList(_, _) => fixed_size_list::build_extend(array),
277
8
        DataType::Union(_, mode) => match mode {
278
2
            UnionMode::Sparse => union::build_extend_sparse(array),
279
6
            UnionMode::Dense => union::build_extend_dense(array),
280
        },
281
12
        DataType::RunEndEncoded(_, _) => run::build_extend(array),
282
    }
283
179
}
284
285
112
fn build_extend_nulls(data_type: &DataType) -> ExtendNulls {
286
112
    Box::new(match 
data_type0
{
287
0
        DataType::Null => null::extend_nulls,
288
0
        DataType::Boolean => boolean::extend_nulls,
289
0
        DataType::UInt8 => primitive::extend_nulls::<u8>,
290
0
        DataType::UInt16 => primitive::extend_nulls::<u16>,
291
0
        DataType::UInt32 => primitive::extend_nulls::<u32>,
292
0
        DataType::UInt64 => primitive::extend_nulls::<u64>,
293
0
        DataType::Int8 => primitive::extend_nulls::<i8>,
294
1
        DataType::Int16 => primitive::extend_nulls::<i16>,
295
27
        DataType::Int32 => primitive::extend_nulls::<i32>,
296
3
        DataType::Int64 => primitive::extend_nulls::<i64>,
297
0
        DataType::Float32 => primitive::extend_nulls::<f32>,
298
5
        DataType::Float64 => primitive::extend_nulls::<f64>,
299
0
        DataType::Date32 | DataType::Time32(_) | DataType::Interval(IntervalUnit::YearMonth) => {
300
0
            primitive::extend_nulls::<i32>
301
0
        }
302
        DataType::Date64
303
        | DataType::Time64(_)
304
        | DataType::Timestamp(_, _)
305
        | DataType::Duration(_)
306
15
        | DataType::Interval(IntervalUnit::DayTime) => primitive::extend_nulls::<i64>,
307
0
        DataType::Interval(IntervalUnit::MonthDayNano) => primitive::extend_nulls::<i128>,
308
0
        DataType::Decimal32(_, _) => primitive::extend_nulls::<i32>,
309
0
        DataType::Decimal64(_, _) => primitive::extend_nulls::<i64>,
310
15
        DataType::Decimal128(_, _) => primitive::extend_nulls::<i128>,
311
0
        DataType::Decimal256(_, _) => primitive::extend_nulls::<i256>,
312
14
        DataType::Utf8 | DataType::Binary => variable_size::extend_nulls::<i32>,
313
0
        DataType::LargeUtf8 | DataType::LargeBinary => variable_size::extend_nulls::<i64>,
314
2
        DataType::BinaryView | DataType::Utf8View => primitive::extend_nulls::<u128>,
315
3
        DataType::Map(_, _) | DataType::List(_) => list::extend_nulls::<i32>,
316
1
        DataType::LargeList(_) => list::extend_nulls::<i64>,
317
0
        DataType::ListView(_) => list_view::extend_nulls::<i32>,
318
0
        DataType::LargeListView(_) => list_view::extend_nulls::<i64>,
319
8
        DataType::Dictionary(child_data_type, _) => match child_data_type.as_ref() {
320
0
            DataType::UInt8 => primitive::extend_nulls::<u8>,
321
0
            DataType::UInt16 => primitive::extend_nulls::<u16>,
322
0
            DataType::UInt32 => primitive::extend_nulls::<u32>,
323
0
            DataType::UInt64 => primitive::extend_nulls::<u64>,
324
3
            DataType::Int8 => primitive::extend_nulls::<i8>,
325
0
            DataType::Int16 => primitive::extend_nulls::<i16>,
326
5
            DataType::Int32 => primitive::extend_nulls::<i32>,
327
0
            DataType::Int64 => primitive::extend_nulls::<i64>,
328
0
            _ => unreachable!(),
329
        },
330
1
        DataType::Struct(_) => structure::extend_nulls,
331
0
        DataType::FixedSizeBinary(_) => fixed_binary::extend_nulls,
332
0
        DataType::Float16 => primitive::extend_nulls::<f16>,
333
4
        DataType::FixedSizeList(_, _) => fixed_size_list::extend_nulls,
334
7
        DataType::Union(_, mode) => match mode {
335
1
            UnionMode::Sparse => union::extend_nulls_sparse,
336
6
            UnionMode::Dense => union::extend_nulls_dense,
337
        },
338
6
        DataType::RunEndEncoded(_, _) => run::extend_nulls,
339
    })
340
112
}
341
342
0
fn preallocate_offset_and_binary_buffer<Offset: ArrowNativeType + Integer>(
343
0
    capacity: usize,
344
0
    binary_size: usize,
345
0
) -> [MutableBuffer; 2] {
346
    // offsets
347
0
    let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<Offset>());
348
    // safety: `unsafe` code assumes that this buffer is initialized with one element
349
0
    buffer.push(Offset::zero());
350
351
0
    [
352
0
        buffer,
353
0
        MutableBuffer::new(binary_size * mem::size_of::<u8>()),
354
0
    ]
355
0
}
356
357
/// Define capacities to pre-allocate for child data or data buffers.
358
#[derive(Debug, Clone)]
359
pub enum Capacities {
360
    /// Binary, LargeBinary, Utf8 and LargeUtf8 data types
361
    ///
362
    /// Defines
363
    /// * the capacity of the array offsets
364
    /// * the capacity of the binary/ str buffer
365
    Binary(usize, Option<usize>),
366
    /// List, LargeList, ListView, LargeListView and FixedSizeList data types
367
    ///
368
    /// Defines
369
    /// * the capacity of the array offsets
370
    /// * the capacity of the child data
371
    List(usize, Option<Box<Capacities>>),
372
    /// Struct type
373
    ///
374
    /// Defines
375
    /// * the capacity of the array
376
    /// * the capacities of the fields
377
    Struct(usize, Option<Vec<Capacities>>),
378
    /// Dictionary type
379
    ///
380
    /// Defines
381
    /// * the capacity of the array/keys
382
    /// * the capacity of the values
383
    Dictionary(usize, Option<Box<Capacities>>),
384
    /// Don't preallocate inner buffers and rely on array growth strategy
385
    Array(usize),
386
}
387
388
impl<'a> MutableArrayData<'a> {
389
    /// Returns a new [MutableArrayData] with capacity to `capacity` slots and
390
    /// specialized to create an [ArrayData] from multiple `arrays`.
391
    ///
392
    /// # Arguments
393
    /// * `arrays` - the source arrays to copy from
394
    /// * `use_nulls` - a flag used to optimize insertions
395
    ///   - `false` if the only source of nulls are the arrays themselves
396
    ///   - `true` if the user plans to call [MutableArrayData::extend_nulls].
397
    /// * `capacity` - the preallocated capacity of the output array, in number of slots (elements)
398
    ///
399
    /// Thus, if `use_nulls` is `false`,
400
    /// [MutableArrayData::extend_nulls] should not be used.
401
96
    pub fn new(arrays: Vec<&'a ArrayData>, use_nulls: bool, capacity: usize) -> Self {
402
96
        Self::with_capacities(arrays, use_nulls, Capacities::Array(capacity))
403
96
    }
404
405
    /// Similar to [MutableArrayData::new], but lets users define the
406
    /// preallocated capacities of the array with more granularity.
407
    ///
408
    /// See [MutableArrayData::new] for more information on the arguments.
409
    ///
410
    /// # Panics
411
    ///
412
    /// This function panics if the given `capacities` don't match the data type
413
    /// of `arrays`, or when a [Capacities] variant is not yet supported.
414
112
    pub fn with_capacities(
415
112
        arrays: Vec<&'a ArrayData>,
416
112
        use_nulls: bool,
417
112
        capacities: Capacities,
418
112
    ) -> Self {
419
112
        let data_type = arrays[0].data_type();
420
421
112
        for 
a88
in arrays.iter().skip(1) {
422
88
            assert_eq!(
423
                data_type,
424
88
                a.data_type(),
425
0
                "Arrays with inconsistent types passed to MutableArrayData"
426
            )
427
        }
428
429
        // if any of the arrays has nulls, insertions from any array require setting bits
430
        // as there is at least one array with nulls.
431
177
        let 
use_nulls112
=
use_nulls112
|
arrays.iter()112
.
any112
(|array| array.null_count() > 0);
432
433
        let mut array_capacity;
434
435
112
        let [buffer1, buffer2] = match (data_type, &capacities) {
436
            (
437
                DataType::LargeUtf8 | DataType::LargeBinary,
438
0
                Capacities::Binary(capacity, Some(value_cap)),
439
            ) => {
440
0
                array_capacity = *capacity;
441
0
                preallocate_offset_and_binary_buffer::<i64>(*capacity, *value_cap)
442
            }
443
0
            (DataType::Utf8 | DataType::Binary, Capacities::Binary(capacity, Some(value_cap))) => {
444
0
                array_capacity = *capacity;
445
0
                preallocate_offset_and_binary_buffer::<i32>(*capacity, *value_cap)
446
            }
447
112
            (_, Capacities::Array(capacity)) => {
448
112
                array_capacity = *capacity;
449
112
                new_buffers(data_type, *capacity)
450
            }
451
            (
452
                DataType::List(_)
453
                | DataType::LargeList(_)
454
                | DataType::ListView(_)
455
                | DataType::LargeListView(_)
456
                | DataType::FixedSizeList(_, _),
457
0
                Capacities::List(capacity, _),
458
            ) => {
459
0
                array_capacity = *capacity;
460
0
                new_buffers(data_type, *capacity)
461
            }
462
0
            _ => panic!("Capacities: {capacities:?} not yet supported"),
463
        };
464
465
112
        let child_data = match &data_type {
466
            DataType::Decimal32(_, _)
467
            | DataType::Decimal64(_, _)
468
            | DataType::Decimal128(_, _)
469
            | DataType::Decimal256(_, _)
470
            | DataType::Null
471
            | DataType::Boolean
472
            | DataType::UInt8
473
            | DataType::UInt16
474
            | DataType::UInt32
475
            | DataType::UInt64
476
            | DataType::Int8
477
            | DataType::Int16
478
            | DataType::Int32
479
            | DataType::Int64
480
            | DataType::Float16
481
            | DataType::Float32
482
            | DataType::Float64
483
            | DataType::Date32
484
            | DataType::Date64
485
            | DataType::Time32(_)
486
            | DataType::Time64(_)
487
            | DataType::Duration(_)
488
            | DataType::Timestamp(_, _)
489
            | DataType::Utf8
490
            | DataType::Binary
491
            | DataType::LargeUtf8
492
            | DataType::LargeBinary
493
            | DataType::BinaryView
494
            | DataType::Utf8View
495
            | DataType::Interval(_)
496
82
            | DataType::FixedSizeBinary(_) => vec![],
497
            DataType::Map(_, _)
498
            | DataType::List(_)
499
            | DataType::LargeList(_)
500
            | DataType::ListView(_)
501
            | DataType::LargeListView(_) => {
502
4
                let children = arrays
503
4
                    .iter()
504
6
                    .
map4
(|array| &array.child_data()[0])
505
4
                    .collect::<Vec<_>>();
506
507
4
                let capacities =
508
4
                    if let Capacities::List(
capacity0
,
ref child_capacities0
) = capacities {
509
0
                        child_capacities
510
0
                            .clone()
511
0
                            .map(|c| *c)
512
0
                            .unwrap_or(Capacities::Array(capacity))
513
                    } else {
514
4
                        Capacities::Array(array_capacity)
515
                    };
516
517
4
                vec![MutableArrayData::with_capacities(
518
4
                    children, use_nulls, capacities,
519
                )]
520
            }
521
            // the dictionary type just appends keys and clones the values.
522
8
            DataType::Dictionary(_, _) => vec![],
523
1
            DataType::Struct(fields) => match 
capacities0
{
524
0
                Capacities::Struct(capacity, Some(ref child_capacities)) => {
525
0
                    array_capacity = capacity;
526
0
                    (0..fields.len())
527
0
                        .zip(child_capacities)
528
0
                        .map(|(i, child_cap)| {
529
0
                            let child_arrays = arrays
530
0
                                .iter()
531
0
                                .map(|array| &array.child_data()[i])
532
0
                                .collect::<Vec<_>>();
533
0
                            MutableArrayData::with_capacities(
534
0
                                child_arrays,
535
0
                                use_nulls,
536
0
                                child_cap.clone(),
537
                            )
538
0
                        })
539
0
                        .collect::<Vec<_>>()
540
                }
541
0
                Capacities::Struct(capacity, None) => {
542
0
                    array_capacity = capacity;
543
0
                    (0..fields.len())
544
0
                        .map(|i| {
545
0
                            let child_arrays = arrays
546
0
                                .iter()
547
0
                                .map(|array| &array.child_data()[i])
548
0
                                .collect::<Vec<_>>();
549
0
                            MutableArrayData::new(child_arrays, use_nulls, capacity)
550
0
                        })
551
0
                        .collect::<Vec<_>>()
552
                }
553
1
                _ => (0..fields.len())
554
2
                    .
map1
(|i| {
555
2
                        let child_arrays = arrays
556
2
                            .iter()
557
2
                            .map(|array| &array.child_data()[i])
558
2
                            .collect::<Vec<_>>();
559
2
                        MutableArrayData::new(child_arrays, use_nulls, array_capacity)
560
2
                    })
561
1
                    .collect::<Vec<_>>(),
562
            },
563
            DataType::RunEndEncoded(_, _) => {
564
6
                let run_ends_child = arrays
565
6
                    .iter()
566
12
                    .
map6
(|array| &array.child_data()[0])
567
6
                    .collect::<Vec<_>>();
568
6
                let value_child = arrays
569
6
                    .iter()
570
12
                    .
map6
(|array| &array.child_data()[1])
571
6
                    .collect::<Vec<_>>();
572
6
                vec![
573
6
                    MutableArrayData::new(run_ends_child, false, array_capacity),
574
6
                    MutableArrayData::new(value_child, use_nulls, array_capacity),
575
                ]
576
            }
577
4
            DataType::FixedSizeList(_, size) => {
578
4
                let children = arrays
579
4
                    .iter()
580
6
                    .
map4
(|array| &array.child_data()[0])
581
4
                    .collect::<Vec<_>>();
582
4
                let capacities =
583
4
                    if let Capacities::List(
capacity0
,
ref child_capacities0
) = capacities {
584
0
                        child_capacities
585
0
                            .clone()
586
0
                            .map(|c| *c)
587
0
                            .unwrap_or(Capacities::Array(capacity * *size as usize))
588
                    } else {
589
4
                        Capacities::Array(array_capacity * *size as usize)
590
                    };
591
4
                vec![MutableArrayData::with_capacities(
592
4
                    children, use_nulls, capacities,
593
                )]
594
            }
595
7
            DataType::Union(fields, _) => (0..fields.len())
596
12
                .
map7
(|i| {
597
12
                    let child_arrays = arrays
598
12
                        .iter()
599
13
                        .
map12
(|array| &array.child_data()[i])
600
12
                        .collect::<Vec<_>>();
601
12
                    MutableArrayData::new(child_arrays, use_nulls, array_capacity)
602
12
                })
603
7
                .collect::<Vec<_>>(),
604
        };
605
606
        // Get the dictionary if any, and whether it is a concatenation of multiple dictionaries
607
112
        let (dictionary, dict_concat) = match &data_type {
608
            DataType::Dictionary(_, _) => {
609
                // If more than one dictionary, concatenate dictionaries together
610
8
                let dict_concat = !arrays
611
8
                    .windows(2)
612
9
                    .
all8
(|a| a[0].child_data()[0].ptr_eq(&a[1].child_data()[0]));
613
614
8
                match dict_concat {
615
2
                    false => (Some(arrays[0].child_data()[0].clone()), false),
616
                    true => {
617
6
                        if let Capacities::Dictionary(_, _) = capacities {
618
0
                            panic!("dictionary capacity not yet supported")
619
6
                        }
620
6
                        let dictionaries: Vec<_> =
621
13
                            
arrays.iter()6
.
map6
(|array| &array.child_data()[0]).
collect6
();
622
6
                        let lengths: Vec<_> = dictionaries
623
6
                            .iter()
624
13
                            .
map6
(|dictionary| dictionary.len())
625
6
                            .collect();
626
6
                        let capacity = lengths.iter().sum();
627
628
6
                        let mut mutable = MutableArrayData::new(dictionaries, false, capacity);
629
630
13
                        for (i, len) in 
lengths.iter()6
.
enumerate6
() {
631
13
                            mutable.extend(i, 0, *len)
632
                        }
633
634
6
                        (Some(mutable.freeze()), true)
635
                    }
636
                }
637
            }
638
104
            _ => (None, false),
639
        };
640
641
112
        let variadic_data_buffers = match &data_type {
642
2
            DataType::BinaryView | DataType::Utf8View => arrays
643
2
                .iter()
644
4
                .
flat_map2
(|x| x.buffers().iter().skip(1))
645
2
                .map(Buffer::clone)
646
2
                .collect(),
647
110
            _ => vec![],
648
        };
649
650
112
        let extend_nulls = build_extend_nulls(data_type);
651
652
112
        let extend_null_bits = arrays
653
112
            .iter()
654
200
            .
map112
(|array| build_extend_null_bits(array, use_nulls))
655
112
            .collect();
656
657
112
        let null_buffer = use_nulls.then(|| 
{41
658
41
            let null_bytes = bit_util::ceil(array_capacity, 8);
659
41
            MutableBuffer::from_len_zeroed(null_bytes)
660
41
        });
661
662
112
        let extend_values = match &data_type {
663
            DataType::Dictionary(_, _) => {
664
8
                let mut next_offset = 0;
665
8
                let extend_values: Result<Vec<_>, _> = arrays
666
8
                    .iter()
667
17
                    .
map8
(|array| {
668
17
                        let offset = next_offset;
669
17
                        let dict_len = array.child_data()[0].len();
670
671
17
                        if dict_concat {
672
13
                            next_offset += dict_len;
673
13
                        
}4
674
675
17
                        build_extend_dictionary(array, offset, offset + dict_len)
676
17
                            .ok_or(ArrowError::DictionaryKeyOverflowError)
677
17
                    })
678
8
                    .collect();
679
680
8
                extend_values.expect("MutableArrayData::new is infallible")
681
            }
682
            DataType::BinaryView | DataType::Utf8View => {
683
2
                let mut next_offset = 0u32;
684
2
                arrays
685
2
                    .iter()
686
4
                    .
map2
(|arr| {
687
4
                        let num_data_buffers = (arr.buffers().len() - 1) as u32;
688
4
                        let offset = next_offset;
689
4
                        next_offset = next_offset
690
4
                            .checked_add(num_data_buffers)
691
4
                            .expect("view buffer index overflow");
692
4
                        build_extend_view(arr, offset)
693
4
                    })
694
2
                    .collect()
695
            }
696
179
            _ => 
arrays.iter()102
.
map102
(|array| build_extend(array)).
collect102
(),
697
        };
698
699
112
        let data = _MutableArrayData {
700
112
            data_type: data_type.clone(),
701
112
            len: 0,
702
112
            null_count: 0,
703
112
            null_buffer,
704
112
            buffer1,
705
112
            buffer2,
706
112
            child_data,
707
112
        };
708
112
        Self {
709
112
            arrays,
710
112
            data,
711
112
            dictionary,
712
112
            variadic_data_buffers,
713
112
            extend_values,
714
112
            extend_null_bits,
715
112
            extend_nulls,
716
112
        }
717
112
    }
718
719
    /// Extends the in progress array with a region of the input arrays
720
    ///
721
    /// # Arguments
722
    /// * `index` - the index of array that you what to copy values from
723
    /// * `start` - the start index of the chunk (inclusive)
724
    /// * `end` - the end index of the chunk (exclusive)
725
    ///
726
    /// # Panic
727
    /// This function panics if there is an invalid index,
728
    /// i.e. `index` >= the number of source arrays
729
    /// or `end` > the length of the `index`th array
730
259
    pub fn extend(&mut self, index: usize, start: usize, end: usize) {
731
259
        let len = end - start;
732
259
        (self.extend_null_bits[index])(&mut self.data, start, len);
733
259
        (self.extend_values[index])(&mut self.data, index, start, len);
734
259
        self.data.len += len;
735
259
    }
736
737
    /// Extends the in progress array with null elements, ignoring the input arrays.
738
    ///
739
    /// # Panics
740
    ///
741
    /// Panics if [`MutableArrayData`] not created with `use_nulls` or nullable source arrays
742
2
    pub fn extend_nulls(&mut self, len: usize) {
743
2
        self.data.len += len;
744
2
        let bit_len = bit_util::ceil(self.data.len, 8);
745
2
        let nulls = self.data.null_buffer();
746
2
        nulls.resize(bit_len, 0);
747
2
        self.data.null_count += len;
748
2
        (self.extend_nulls)(&mut self.data, len);
749
2
    }
750
751
    /// Returns the current length
752
    #[inline]
753
11
    pub fn len(&self) -> usize {
754
11
        self.data.len
755
11
    }
756
757
    /// Returns true if len is 0
758
    #[inline]
759
    pub fn is_empty(&self) -> bool {
760
        self.data.len == 0
761
    }
762
763
    /// Returns the current null count
764
    #[inline]
765
    pub fn null_count(&self) -> usize {
766
        self.data.null_count
767
    }
768
769
    /// Creates a [ArrayData] from the in progress array, consuming `self`.
770
112
    pub fn freeze(self) -> ArrayData {
771
112
        unsafe { self.into_builder().build_unchecked() }
772
112
    }
773
774
    /// Consume self and returns the in progress array as [`ArrayDataBuilder`].
775
    ///
776
    /// This is useful for extending the default behavior of MutableArrayData.
777
112
    pub fn into_builder(self) -> ArrayDataBuilder {
778
112
        let data = self.data;
779
780
112
        let buffers = match data.data_type {
781
            DataType::Null
782
            | DataType::Struct(_)
783
            | DataType::FixedSizeList(_, _)
784
            | DataType::RunEndEncoded(_, _) => {
785
11
                vec![]
786
            }
787
            DataType::BinaryView | DataType::Utf8View => {
788
2
                let mut b = self.variadic_data_buffers;
789
2
                b.insert(0, data.buffer1.into());
790
2
                b
791
            }
792
            DataType::Utf8
793
            | DataType::Binary
794
            | DataType::LargeUtf8
795
            | DataType::LargeBinary
796
            | DataType::ListView(_)
797
            | DataType::LargeListView(_) => {
798
14
                vec![data.buffer1.into(), data.buffer2.into()]
799
            }
800
7
            DataType::Union(_, mode) => {
801
7
                match mode {
802
                    // Based on Union's DataTypeLayout
803
1
                    UnionMode::Sparse => vec![data.buffer1.into()],
804
6
                    UnionMode::Dense => vec![data.buffer1.into(), data.buffer2.into()],
805
                }
806
            }
807
78
            _ => vec![data.buffer1.into()],
808
        };
809
810
112
        let child_data = match data.data_type {
811
8
            DataType::Dictionary(_, _) => vec![self.dictionary.unwrap()],
812
104
            _ => data.child_data.into_iter().map(|x| 
x34
.
freeze34
()).collect(),
813
        };
814
815
112
        let nulls = match data.data_type {
816
            // RunEndEncoded and Null arrays cannot have top-level null bitmasks
817
6
            DataType::RunEndEncoded(_, _) | DataType::Null => None,
818
106
            _ => data
819
106
                .null_buffer
820
106
                .map(|nulls| 
{41
821
41
                    let bools = BooleanBuffer::new(nulls.into(), 0, data.len);
822
41
                    unsafe { NullBuffer::new_unchecked(bools, data.null_count) }
823
41
                })
824
106
                .filter(|n| 
n41
.
null_count41
() > 0),
825
        };
826
827
112
        ArrayDataBuilder::new(data.data_type)
828
112
            .offset(0)
829
112
            .len(data.len)
830
112
            .nulls(nulls)
831
112
            .buffers(buffers)
832
112
            .child_data(child_data)
833
112
    }
834
}
835
836
// See arrow/tests/array_transform.rs for tests of transform functionality
837
838
#[cfg(test)]
mod test {
    use super::*;
    use arrow_schema::Field;
    use std::sync::Arc;

    /// Checks the buffer capacities reserved by `with_capacities` for a list
    /// array and for its child.
    #[test]
    fn test_list_append_with_capacities() {
        let element_field = Arc::new(Field::new("element", DataType::Int64, false));
        let list_type = DataType::List(element_field);
        let source = ArrayData::new_empty(&list_type);

        let requested = Capacities::List(6, Some(Box::new(Capacities::Array(17))));
        let mutable = MutableArrayData::with_capacities(vec![&source], false, requested);

        // capacities are rounded up to multiples of 64 by MutableBuffer
        let parent_capacity = mutable.data.buffer1.capacity();
        let child_capacity = mutable.data.child_data[0].data.buffer1.capacity();
        assert_eq!(parent_capacity, 64);
        assert_eq!(child_capacity, 192);
    }
}