Coverage Report

Created: 2025-08-26 07:03

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/Users/andrewlamb/Software/arrow-rs/arrow-data/src/transform/mod.rs
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
//! Low-level array data abstractions.
19
//!
20
//! Provides utilities for creating, manipulating, and converting Arrow arrays
21
//! made of primitive types, strings, and nested types.
22
23
use super::{data::new_buffers, ArrayData, ArrayDataBuilder, ByteView};
24
use crate::bit_mask::set_bits;
25
use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
26
use arrow_buffer::{bit_util, i256, ArrowNativeType, Buffer, MutableBuffer};
27
use arrow_schema::{ArrowError, DataType, IntervalUnit, UnionMode};
28
use half::f16;
29
use num::Integer;
30
use std::mem;
31
32
mod boolean;
33
mod fixed_binary;
34
mod fixed_size_list;
35
mod list;
36
mod null;
37
mod primitive;
38
mod run;
39
mod structure;
40
mod union;
41
mod utils;
42
mod variable_size;
43
44
type ExtendNullBits<'a> = Box<dyn Fn(&mut _MutableArrayData, usize, usize) + 'a>;
45
// function that extends `[start..start+len]` to the mutable array.
46
// this is dynamic because different data_types influence how buffers and children are extended.
47
type Extend<'a> = Box<dyn Fn(&mut _MutableArrayData, usize, usize, usize) + 'a>;
48
49
type ExtendNulls = Box<dyn Fn(&mut _MutableArrayData, usize)>;
50
51
/// A mutable [ArrayData] that knows how to freeze itself into an [ArrayData].
52
/// This is just a data container.
53
#[derive(Debug)]
54
struct _MutableArrayData<'a> {
55
    pub data_type: DataType,
56
    pub null_count: usize,
57
58
    pub len: usize,
59
    pub null_buffer: Option<MutableBuffer>,
60
61
    // arrow specification only allows up to 3 buffers (2 ignoring the nulls above).
62
    // Thus, we place them in the stack to avoid bound checks and to gain greater data locality.
63
    pub buffer1: MutableBuffer,
64
    pub buffer2: MutableBuffer,
65
    pub child_data: Vec<MutableArrayData<'a>>,
66
}
67
68
impl _MutableArrayData<'_> {
69
47
    fn null_buffer(&mut self) -> &mut MutableBuffer {
70
47
        self.null_buffer
71
47
            .as_mut()
72
47
            .expect("MutableArrayData not nullable")
73
47
    }
74
}
75
76
51
fn build_extend_null_bits(array: &ArrayData, use_nulls: bool) -> ExtendNullBits<'_> {
77
51
    if let Some(
nulls12
) = array.nulls() {
78
12
        let bytes = nulls.validity();
79
12
        Box::new(move |mutable, start, len| {
80
12
            let mutable_len = mutable.len;
81
12
            let out = mutable.null_buffer();
82
12
            utils::resize_for_bits(out, mutable_len + len);
83
12
            mutable.null_count += set_bits(
84
12
                out.as_slice_mut(),
85
12
                bytes,
86
12
                mutable_len,
87
12
                nulls.offset() + start,
88
12
                len,
89
12
            );
90
12
        })
91
39
    } else if use_nulls {
92
35
        Box::new(|mutable, _, len| {
93
35
            let mutable_len = mutable.len;
94
35
            let out = mutable.null_buffer();
95
35
            utils::resize_for_bits(out, mutable_len + len);
96
35
            let write_data = out.as_slice_mut();
97
45
            
(0..len)35
.
for_each35
(|i| {
98
45
                bit_util::set_bit(write_data, mutable_len + i);
99
45
            });
100
35
        })
101
    } else {
102
4
        Box::new(|_, _, _| {})
103
    }
104
51
}
105
106
/// Efficiently create an [ArrayData] from one or more existing [ArrayData]s by
107
/// copying chunks.
108
///
109
/// The main use case of this struct is to perform unary operations to arrays of
110
/// arbitrary types, such as `filter` and `take`.
111
///
112
/// # Example
113
/// ```
114
/// use arrow_buffer::Buffer;
115
/// use arrow_data::ArrayData;
116
/// use arrow_data::transform::MutableArrayData;
117
/// use arrow_schema::DataType;
118
/// fn i32_array(values: &[i32]) -> ArrayData {
119
///   ArrayData::try_new(DataType::Int32, 5, None, 0, vec![Buffer::from_slice_ref(values)], vec![]).unwrap()
120
/// }
121
/// let arr1  = i32_array(&[1, 2, 3, 4, 5]);
122
/// let arr2  = i32_array(&[6, 7, 8, 9, 10]);
123
/// // Create a mutable array for copying values from arr1 and arr2, with a capacity for 6 elements
124
/// let capacity = 3 * std::mem::size_of::<i32>();
125
/// let mut mutable = MutableArrayData::new(vec![&arr1, &arr2], false, 10);
126
/// // Copy the first 3 elements from arr1
127
/// mutable.extend(0, 0, 3);
128
/// // Copy the last 3 elements from arr2
129
/// mutable.extend(1, 2, 4);
130
/// // Complete the MutableArrayData into a new ArrayData
131
/// let frozen = mutable.freeze();
132
/// assert_eq!(frozen, i32_array(&[1, 2, 3, 8, 9, 10]));
133
/// ```
134
pub struct MutableArrayData<'a> {
135
    /// Input arrays: the data being read FROM.
136
    ///
137
    /// Note this is "dead code" because all actual references to the arrays are
138
    /// stored in closures for extending values and nulls.
139
    #[allow(dead_code)]
140
    arrays: Vec<&'a ArrayData>,
141
142
    /// In progress output array: The data being written TO
143
    ///
144
    /// Note these fields are in a separate struct, [_MutableArrayData], as they
145
    /// cannot be in [MutableArrayData] itself due to mutability invariants (interior
146
    /// mutability): [MutableArrayData] contains a function that can only mutate
147
    /// [_MutableArrayData], not [MutableArrayData] itself
148
    data: _MutableArrayData<'a>,
149
150
    /// The child data of the `Array` in Dictionary arrays.
151
    ///
152
    /// This is not stored in `_MutableArrayData` because these values are
153
    /// constant and only needed at the end, when freezing [_MutableArrayData].
154
    dictionary: Option<ArrayData>,
155
156
    /// Variadic data buffers referenced by views.
157
    ///
158
    /// Note this is not stored in `_MutableArrayData` because these values
159
    /// are constant and only needed at the end, when freezing
160
    /// [_MutableArrayData]
161
    variadic_data_buffers: Vec<Buffer>,
162
163
    /// function used to extend output array with values from input arrays.
164
    ///
165
    /// This function's lifetime is bound to the input arrays because it reads
166
    /// values from them.
167
    extend_values: Vec<Extend<'a>>,
168
169
    /// function used to extend the output array with nulls from input arrays.
170
    ///
171
    /// This function's lifetime is bound to the input arrays because it reads
172
    /// nulls from it.
173
    extend_null_bits: Vec<ExtendNullBits<'a>>,
174
175
    /// function used to extend the output array with null elements.
176
    ///
177
    /// This function is independent of the arrays and therefore has no lifetime.
178
    extend_nulls: ExtendNulls,
179
}
180
181
impl std::fmt::Debug for MutableArrayData<'_> {
182
0
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
183
        // ignores the closures.
184
0
        f.debug_struct("MutableArrayData")
185
0
            .field("data", &self.data)
186
0
            .finish()
187
0
    }
188
}
189
190
/// Builds an extend that adds `offset` to the source primitive
191
/// Additionally validates that `max` fits into the
192
/// underlying primitive, returning `None` if not
193
0
fn build_extend_dictionary(array: &ArrayData, offset: usize, max: usize) -> Option<Extend<'_>> {
194
    macro_rules! validate_and_build {
195
        ($dt: ty) => {{
196
            let _: $dt = max.try_into().ok()?;
197
            let offset: $dt = offset.try_into().ok()?;
198
            Some(primitive::build_extend_with_offset(array, offset))
199
        }};
200
    }
201
0
    match array.data_type() {
202
0
        DataType::Dictionary(child_data_type, _) => match child_data_type.as_ref() {
203
0
            DataType::UInt8 => validate_and_build!(u8),
204
0
            DataType::UInt16 => validate_and_build!(u16),
205
0
            DataType::UInt32 => validate_and_build!(u32),
206
0
            DataType::UInt64 => validate_and_build!(u64),
207
0
            DataType::Int8 => validate_and_build!(i8),
208
0
            DataType::Int16 => validate_and_build!(i16),
209
0
            DataType::Int32 => validate_and_build!(i32),
210
0
            DataType::Int64 => validate_and_build!(i64),
211
0
            _ => unreachable!(),
212
        },
213
0
        _ => None,
214
    }
215
0
}
216
217
/// Builds an extend that adds `buffer_offset` to any buffer indices encountered
218
0
fn build_extend_view(array: &ArrayData, buffer_offset: u32) -> Extend<'_> {
219
0
    let views = array.buffer::<u128>(0);
220
0
    Box::new(
221
0
        move |mutable: &mut _MutableArrayData, _, start: usize, len: usize| {
222
0
            mutable
223
0
                .buffer1
224
0
                .extend(views[start..start + len].iter().map(|v| {
225
0
                    let len = *v as u32;
226
0
                    if len <= 12 {
227
0
                        return *v; // Stored inline
228
0
                    }
229
0
                    let mut view = ByteView::from(*v);
230
0
                    view.buffer_index += buffer_offset;
231
0
                    view.into()
232
0
                }))
233
0
        },
234
    )
235
0
}
236
237
51
fn build_extend(array: &ArrayData) -> Extend<'_> {
238
51
    match array.data_type() {
239
0
        DataType::Null => null::build_extend(array),
240
0
        DataType::Boolean => boolean::build_extend(array),
241
0
        DataType::UInt8 => primitive::build_extend::<u8>(array),
242
0
        DataType::UInt16 => primitive::build_extend::<u16>(array),
243
0
        DataType::UInt32 => primitive::build_extend::<u32>(array),
244
0
        DataType::UInt64 => primitive::build_extend::<u64>(array),
245
0
        DataType::Int8 => primitive::build_extend::<i8>(array),
246
0
        DataType::Int16 => primitive::build_extend::<i16>(array),
247
6
        DataType::Int32 => primitive::build_extend::<i32>(array),
248
0
        DataType::Int64 => primitive::build_extend::<i64>(array),
249
0
        DataType::Float32 => primitive::build_extend::<f32>(array),
250
3
        DataType::Float64 => primitive::build_extend::<f64>(array),
251
        DataType::Date32 | DataType::Time32(_) | DataType::Interval(IntervalUnit::YearMonth) => {
252
0
            primitive::build_extend::<i32>(array)
253
        }
254
        DataType::Date64
255
        | DataType::Time64(_)
256
        | DataType::Timestamp(_, _)
257
        | DataType::Duration(_)
258
0
        | DataType::Interval(IntervalUnit::DayTime) => primitive::build_extend::<i64>(array),
259
0
        DataType::Interval(IntervalUnit::MonthDayNano) => primitive::build_extend::<i128>(array),
260
0
        DataType::Decimal32(_, _) => primitive::build_extend::<i32>(array),
261
0
        DataType::Decimal64(_, _) => primitive::build_extend::<i64>(array),
262
0
        DataType::Decimal128(_, _) => primitive::build_extend::<i128>(array),
263
0
        DataType::Decimal256(_, _) => primitive::build_extend::<i256>(array),
264
9
        DataType::Utf8 | DataType::Binary => variable_size::build_extend::<i32>(array),
265
0
        DataType::LargeUtf8 | DataType::LargeBinary => variable_size::build_extend::<i64>(array),
266
0
        DataType::BinaryView | DataType::Utf8View => unreachable!("should use build_extend_view"),
267
12
        DataType::Map(_, _) | DataType::List(_) => list::build_extend::<i32>(array),
268
        DataType::ListView(_) | DataType::LargeListView(_) => {
269
0
            unimplemented!("ListView/LargeListView not implemented")
270
        }
271
0
        DataType::LargeList(_) => list::build_extend::<i64>(array),
272
0
        DataType::Dictionary(_, _) => unreachable!("should use build_extend_dictionary"),
273
15
        DataType::Struct(_) => structure::build_extend(array),
274
6
        DataType::FixedSizeBinary(_) => fixed_binary::build_extend(array),
275
0
        DataType::Float16 => primitive::build_extend::<f16>(array),
276
0
        DataType::FixedSizeList(_, _) => fixed_size_list::build_extend(array),
277
0
        DataType::Union(_, mode) => match mode {
278
0
            UnionMode::Sparse => union::build_extend_sparse(array),
279
0
            UnionMode::Dense => union::build_extend_dense(array),
280
        },
281
0
        DataType::RunEndEncoded(_, _) => run::build_extend(array),
282
    }
283
51
}
284
285
18
fn build_extend_nulls(data_type: &DataType) -> ExtendNulls {
286
18
    Box::new(match 
data_type0
{
287
0
        DataType::Null => null::extend_nulls,
288
0
        DataType::Boolean => boolean::extend_nulls,
289
0
        DataType::UInt8 => primitive::extend_nulls::<u8>,
290
0
        DataType::UInt16 => primitive::extend_nulls::<u16>,
291
0
        DataType::UInt32 => primitive::extend_nulls::<u32>,
292
0
        DataType::UInt64 => primitive::extend_nulls::<u64>,
293
0
        DataType::Int8 => primitive::extend_nulls::<i8>,
294
0
        DataType::Int16 => primitive::extend_nulls::<i16>,
295
2
        DataType::Int32 => primitive::extend_nulls::<i32>,
296
0
        DataType::Int64 => primitive::extend_nulls::<i64>,
297
0
        DataType::Float32 => primitive::extend_nulls::<f32>,
298
1
        DataType::Float64 => primitive::extend_nulls::<f64>,
299
0
        DataType::Date32 | DataType::Time32(_) | DataType::Interval(IntervalUnit::YearMonth) => {
300
0
            primitive::extend_nulls::<i32>
301
0
        }
302
        DataType::Date64
303
        | DataType::Time64(_)
304
        | DataType::Timestamp(_, _)
305
        | DataType::Duration(_)
306
0
        | DataType::Interval(IntervalUnit::DayTime) => primitive::extend_nulls::<i64>,
307
0
        DataType::Interval(IntervalUnit::MonthDayNano) => primitive::extend_nulls::<i128>,
308
0
        DataType::Decimal32(_, _) => primitive::extend_nulls::<i32>,
309
0
        DataType::Decimal64(_, _) => primitive::extend_nulls::<i64>,
310
0
        DataType::Decimal128(_, _) => primitive::extend_nulls::<i128>,
311
0
        DataType::Decimal256(_, _) => primitive::extend_nulls::<i256>,
312
3
        DataType::Utf8 | DataType::Binary => variable_size::extend_nulls::<i32>,
313
0
        DataType::LargeUtf8 | DataType::LargeBinary => variable_size::extend_nulls::<i64>,
314
0
        DataType::BinaryView | DataType::Utf8View => primitive::extend_nulls::<u128>,
315
4
        DataType::Map(_, _) | DataType::List(_) => list::extend_nulls::<i32>,
316
        DataType::ListView(_) | DataType::LargeListView(_) => {
317
0
            unimplemented!("ListView/LargeListView not implemented")
318
        }
319
0
        DataType::LargeList(_) => list::extend_nulls::<i64>,
320
0
        DataType::Dictionary(child_data_type, _) => match child_data_type.as_ref() {
321
0
            DataType::UInt8 => primitive::extend_nulls::<u8>,
322
0
            DataType::UInt16 => primitive::extend_nulls::<u16>,
323
0
            DataType::UInt32 => primitive::extend_nulls::<u32>,
324
0
            DataType::UInt64 => primitive::extend_nulls::<u64>,
325
0
            DataType::Int8 => primitive::extend_nulls::<i8>,
326
0
            DataType::Int16 => primitive::extend_nulls::<i16>,
327
0
            DataType::Int32 => primitive::extend_nulls::<i32>,
328
0
            DataType::Int64 => primitive::extend_nulls::<i64>,
329
0
            _ => unreachable!(),
330
        },
331
5
        DataType::Struct(_) => structure::extend_nulls,
332
3
        DataType::FixedSizeBinary(_) => fixed_binary::extend_nulls,
333
0
        DataType::Float16 => primitive::extend_nulls::<f16>,
334
0
        DataType::FixedSizeList(_, _) => fixed_size_list::extend_nulls,
335
0
        DataType::Union(_, mode) => match mode {
336
0
            UnionMode::Sparse => union::extend_nulls_sparse,
337
0
            UnionMode::Dense => union::extend_nulls_dense,
338
        },
339
0
        DataType::RunEndEncoded(_, _) => run::extend_nulls,
340
    })
341
18
}
342
343
0
fn preallocate_offset_and_binary_buffer<Offset: ArrowNativeType + Integer>(
344
0
    capacity: usize,
345
0
    binary_size: usize,
346
0
) -> [MutableBuffer; 2] {
347
    // offsets
348
0
    let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<Offset>());
349
    // safety: `unsafe` code assumes that this buffer is initialized with one element
350
0
    buffer.push(Offset::zero());
351
352
0
    [
353
0
        buffer,
354
0
        MutableBuffer::new(binary_size * mem::size_of::<u8>()),
355
0
    ]
356
0
}
357
358
/// Define capacities to pre-allocate for child data or data buffers.
359
#[derive(Debug, Clone)]
360
pub enum Capacities {
361
    /// Binary, Utf8 and LargeUtf8 data types
362
    ///
363
    /// Defines
364
    /// * the capacity of the array offsets
365
    /// * the capacity of the binary/ str buffer
366
    Binary(usize, Option<usize>),
367
    /// List and LargeList data types
368
    ///
369
    /// Defines
370
    /// * the capacity of the array offsets
371
    /// * the capacity of the child data
372
    List(usize, Option<Box<Capacities>>),
373
    /// Struct type
374
    ///
375
    /// Defines
376
    /// * the capacity of the array
377
    /// * the capacities of the fields
378
    Struct(usize, Option<Vec<Capacities>>),
379
    /// Dictionary type
380
    ///
381
    /// Defines
382
    /// * the capacity of the array/keys
383
    /// * the capacity of the values
384
    Dictionary(usize, Option<Box<Capacities>>),
385
    /// Don't preallocate inner buffers and rely on array growth strategy
386
    Array(usize),
387
}
388
389
impl<'a> MutableArrayData<'a> {
390
    /// Returns a new [MutableArrayData] with capacity for `capacity` slots and
391
    /// specialized to create an [ArrayData] from multiple `arrays`.
392
    ///
393
    /// # Arguments
394
    /// * `arrays` - the source arrays to copy from
395
    /// * `use_nulls` - a flag used to optimize insertions
396
    ///   - `false` if the only source of nulls are the arrays themselves
397
    ///   - `true` if the user plans to call [MutableArrayData::extend_nulls].
398
    /// * `capacity` - the preallocated capacity of the output array, in slots (elements), not bytes
399
    ///
400
    /// Thus, if `use_nulls` is `false`, calling
401
    /// [MutableArrayData::extend_nulls] should not be used.
402
8
    pub fn new(arrays: Vec<&'a ArrayData>, use_nulls: bool, capacity: usize) -> Self {
403
8
        Self::with_capacities(arrays, use_nulls, Capacities::Array(capacity))
404
8
    }
405
406
    /// Similar to [MutableArrayData::new], but lets users define the
407
    /// preallocated capacities of the array with more granularity.
408
    ///
409
    /// See [MutableArrayData::new] for more information on the arguments.
410
    ///
411
    /// # Panics
412
    ///
413
    /// This function panics if the given `capacities` don't match the data type
414
    /// of `arrays`. Or when a [Capacities] variant is not yet supported.
415
18
    pub fn with_capacities(
416
18
        arrays: Vec<&'a ArrayData>,
417
18
        use_nulls: bool,
418
18
        capacities: Capacities,
419
18
    ) -> Self {
420
18
        let data_type = arrays[0].data_type();
421
422
33
        for a in 
arrays.iter()18
.
skip18
(1) {
423
33
            assert_eq!(
424
                data_type,
425
33
                a.data_type(),
426
0
                "Arrays with inconsistent types passed to MutableArrayData"
427
            )
428
        }
429
430
        // if any of the arrays has nulls, insertions from any array require setting bits
431
        // as there is at least one array with nulls.
432
35
        let 
use_nulls18
=
use_nulls18
|
arrays.iter()18
.
any18
(|array| array.null_count() > 0);
433
434
        let mut array_capacity;
435
436
18
        let [buffer1, buffer2] = match (data_type, &capacities) {
437
            (
438
                DataType::LargeUtf8 | DataType::LargeBinary,
439
0
                Capacities::Binary(capacity, Some(value_cap)),
440
            ) => {
441
0
                array_capacity = *capacity;
442
0
                preallocate_offset_and_binary_buffer::<i64>(*capacity, *value_cap)
443
            }
444
0
            (DataType::Utf8 | DataType::Binary, Capacities::Binary(capacity, Some(value_cap))) => {
445
0
                array_capacity = *capacity;
446
0
                preallocate_offset_and_binary_buffer::<i32>(*capacity, *value_cap)
447
            }
448
18
            (_, Capacities::Array(capacity)) => {
449
18
                array_capacity = *capacity;
450
18
                new_buffers(data_type, *capacity)
451
            }
452
            (
453
                DataType::List(_) | DataType::LargeList(_) | DataType::FixedSizeList(_, _),
454
0
                Capacities::List(capacity, _),
455
            ) => {
456
0
                array_capacity = *capacity;
457
0
                new_buffers(data_type, *capacity)
458
            }
459
0
            _ => panic!("Capacities: {capacities:?} not yet supported"),
460
        };
461
462
18
        let child_data = match &data_type {
463
            DataType::Decimal32(_, _)
464
            | DataType::Decimal64(_, _)
465
            | DataType::Decimal128(_, _)
466
            | DataType::Decimal256(_, _)
467
            | DataType::Null
468
            | DataType::Boolean
469
            | DataType::UInt8
470
            | DataType::UInt16
471
            | DataType::UInt32
472
            | DataType::UInt64
473
            | DataType::Int8
474
            | DataType::Int16
475
            | DataType::Int32
476
            | DataType::Int64
477
            | DataType::Float16
478
            | DataType::Float32
479
            | DataType::Float64
480
            | DataType::Date32
481
            | DataType::Date64
482
            | DataType::Time32(_)
483
            | DataType::Time64(_)
484
            | DataType::Duration(_)
485
            | DataType::Timestamp(_, _)
486
            | DataType::Utf8
487
            | DataType::Binary
488
            | DataType::LargeUtf8
489
            | DataType::LargeBinary
490
            | DataType::BinaryView
491
            | DataType::Utf8View
492
            | DataType::Interval(_)
493
9
            | DataType::FixedSizeBinary(_) => vec![],
494
            DataType::ListView(_) | DataType::LargeListView(_) => {
495
0
                unimplemented!("ListView/LargeListView not implemented")
496
            }
497
            DataType::Map(_, _) | DataType::List(_) | DataType::LargeList(_) => {
498
4
                let children = arrays
499
4
                    .iter()
500
12
                    .
map4
(|array| &array.child_data()[0])
501
4
                    .collect::<Vec<_>>();
502
503
4
                let capacities =
504
4
                    if let Capacities::List(
capacity0
,
ref child_capacities0
) = capacities {
505
0
                        child_capacities
506
0
                            .clone()
507
0
                            .map(|c| *c)
508
0
                            .unwrap_or(Capacities::Array(capacity))
509
                    } else {
510
4
                        Capacities::Array(array_capacity)
511
                    };
512
513
4
                vec![MutableArrayData::with_capacities(
514
4
                    children, use_nulls, capacities,
515
                )]
516
            }
517
            // the dictionary type just appends keys and clones the values.
518
0
            DataType::Dictionary(_, _) => vec![],
519
5
            DataType::Struct(fields) => match 
capacities0
{
520
0
                Capacities::Struct(capacity, Some(ref child_capacities)) => {
521
0
                    array_capacity = capacity;
522
0
                    (0..fields.len())
523
0
                        .zip(child_capacities)
524
0
                        .map(|(i, child_cap)| {
525
0
                            let child_arrays = arrays
526
0
                                .iter()
527
0
                                .map(|array| &array.child_data()[i])
528
0
                                .collect::<Vec<_>>();
529
0
                            MutableArrayData::with_capacities(
530
0
                                child_arrays,
531
0
                                use_nulls,
532
0
                                child_cap.clone(),
533
                            )
534
0
                        })
535
0
                        .collect::<Vec<_>>()
536
                }
537
0
                Capacities::Struct(capacity, None) => {
538
0
                    array_capacity = capacity;
539
0
                    (0..fields.len())
540
0
                        .map(|i| {
541
0
                            let child_arrays = arrays
542
0
                                .iter()
543
0
                                .map(|array| &array.child_data()[i])
544
0
                                .collect::<Vec<_>>();
545
0
                            MutableArrayData::new(child_arrays, use_nulls, capacity)
546
0
                        })
547
0
                        .collect::<Vec<_>>()
548
                }
549
5
                _ => (0..fields.len())
550
8
                    .
map5
(|i| {
551
8
                        let child_arrays = arrays
552
8
                            .iter()
553
24
                            .
map8
(|array| &array.child_data()[i])
554
8
                            .collect::<Vec<_>>();
555
8
                        MutableArrayData::new(child_arrays, use_nulls, array_capacity)
556
8
                    })
557
5
                    .collect::<Vec<_>>(),
558
            },
559
            DataType::RunEndEncoded(_, _) => {
560
0
                let run_ends_child = arrays
561
0
                    .iter()
562
0
                    .map(|array| &array.child_data()[0])
563
0
                    .collect::<Vec<_>>();
564
0
                let value_child = arrays
565
0
                    .iter()
566
0
                    .map(|array| &array.child_data()[1])
567
0
                    .collect::<Vec<_>>();
568
0
                vec![
569
0
                    MutableArrayData::new(run_ends_child, false, array_capacity),
570
0
                    MutableArrayData::new(value_child, use_nulls, array_capacity),
571
                ]
572
            }
573
0
            DataType::FixedSizeList(_, size) => {
574
0
                let children = arrays
575
0
                    .iter()
576
0
                    .map(|array| &array.child_data()[0])
577
0
                    .collect::<Vec<_>>();
578
0
                let capacities =
579
0
                    if let Capacities::List(capacity, ref child_capacities) = capacities {
580
0
                        child_capacities
581
0
                            .clone()
582
0
                            .map(|c| *c)
583
0
                            .unwrap_or(Capacities::Array(capacity * *size as usize))
584
                    } else {
585
0
                        Capacities::Array(array_capacity * *size as usize)
586
                    };
587
0
                vec![MutableArrayData::with_capacities(
588
0
                    children, use_nulls, capacities,
589
                )]
590
            }
591
0
            DataType::Union(fields, _) => (0..fields.len())
592
0
                .map(|i| {
593
0
                    let child_arrays = arrays
594
0
                        .iter()
595
0
                        .map(|array| &array.child_data()[i])
596
0
                        .collect::<Vec<_>>();
597
0
                    MutableArrayData::new(child_arrays, use_nulls, array_capacity)
598
0
                })
599
0
                .collect::<Vec<_>>(),
600
        };
601
602
        // Get the dictionary, if any, and whether it is a concatenation of multiple dictionaries
603
18
        let (dictionary, dict_concat) = match &data_type {
604
            DataType::Dictionary(_, _) => {
605
                // If more than one dictionary, concatenate dictionaries together
606
0
                let dict_concat = !arrays
607
0
                    .windows(2)
608
0
                    .all(|a| a[0].child_data()[0].ptr_eq(&a[1].child_data()[0]));
609
610
0
                match dict_concat {
611
0
                    false => (Some(arrays[0].child_data()[0].clone()), false),
612
                    true => {
613
0
                        if let Capacities::Dictionary(_, _) = capacities {
614
0
                            panic!("dictionary capacity not yet supported")
615
0
                        }
616
0
                        let dictionaries: Vec<_> =
617
0
                            arrays.iter().map(|array| &array.child_data()[0]).collect();
618
0
                        let lengths: Vec<_> = dictionaries
619
0
                            .iter()
620
0
                            .map(|dictionary| dictionary.len())
621
0
                            .collect();
622
0
                        let capacity = lengths.iter().sum();
623
624
0
                        let mut mutable = MutableArrayData::new(dictionaries, false, capacity);
625
626
0
                        for (i, len) in lengths.iter().enumerate() {
627
0
                            mutable.extend(i, 0, *len)
628
                        }
629
630
0
                        (Some(mutable.freeze()), true)
631
                    }
632
                }
633
            }
634
18
            _ => (None, false),
635
        };
636
637
18
        let variadic_data_buffers = match &data_type {
638
0
            DataType::BinaryView | DataType::Utf8View => arrays
639
0
                .iter()
640
0
                .flat_map(|x| x.buffers().iter().skip(1))
641
0
                .map(Buffer::clone)
642
0
                .collect(),
643
18
            _ => vec![],
644
        };
645
646
18
        let extend_nulls = build_extend_nulls(data_type);
647
648
18
        let extend_null_bits = arrays
649
18
            .iter()
650
51
            .
map18
(|array| build_extend_null_bits(array, use_nulls))
651
18
            .collect();
652
653
18
        let null_buffer = use_nulls.then(|| 
{16
654
16
            let null_bytes = bit_util::ceil(array_capacity, 8);
655
16
            MutableBuffer::from_len_zeroed(null_bytes)
656
16
        });
657
658
18
        let extend_values = match &data_type {
659
            DataType::Dictionary(_, _) => {
660
0
                let mut next_offset = 0;
661
0
                let extend_values: Result<Vec<_>, _> = arrays
662
0
                    .iter()
663
0
                    .map(|array| {
664
0
                        let offset = next_offset;
665
0
                        let dict_len = array.child_data()[0].len();
666
667
0
                        if dict_concat {
668
0
                            next_offset += dict_len;
669
0
                        }
670
671
0
                        build_extend_dictionary(array, offset, offset + dict_len)
672
0
                            .ok_or(ArrowError::DictionaryKeyOverflowError)
673
0
                    })
674
0
                    .collect();
675
676
0
                extend_values.expect("MutableArrayData::new is infallible")
677
            }
678
            DataType::BinaryView | DataType::Utf8View => {
679
0
                let mut next_offset = 0u32;
680
0
                arrays
681
0
                    .iter()
682
0
                    .map(|arr| {
683
0
                        let num_data_buffers = (arr.buffers().len() - 1) as u32;
684
0
                        let offset = next_offset;
685
0
                        next_offset = next_offset
686
0
                            .checked_add(num_data_buffers)
687
0
                            .expect("view buffer index overflow");
688
0
                        build_extend_view(arr, offset)
689
0
                    })
690
0
                    .collect()
691
            }
692
51
            _ => 
arrays.iter()18
.
map18
(|array| build_extend(array)).
collect18
(),
693
        };
694
695
18
        let data = _MutableArrayData {
696
18
            data_type: data_type.clone(),
697
18
            len: 0,
698
18
            null_count: 0,
699
18
            null_buffer,
700
18
            buffer1,
701
18
            buffer2,
702
18
            child_data,
703
18
        };
704
18
        Self {
705
18
            arrays,
706
18
            data,
707
18
            dictionary,
708
18
            variadic_data_buffers,
709
18
            extend_values,
710
18
            extend_null_bits,
711
18
            extend_nulls,
712
18
        }
713
18
    }
714
715
    /// Extends the in progress array with a region of the input arrays
716
    ///
717
    /// # Arguments
718
    /// * `index` - the index of array that you what to copy values from
719
    /// * `start` - the start index of the chunk (inclusive)
720
    /// * `end` - the end index of the chunk (exclusive)
721
    ///
722
    /// # Panic
723
    /// This function panics if there is an invalid index,
724
    /// i.e. `index` >= the number of source arrays
725
    /// or `end` > the length of the `index`th array
726
51
    pub fn extend(&mut self, index: usize, start: usize, end: usize) {
727
51
        let len = end - start;
728
51
        (self.extend_null_bits[index])(&mut self.data, start, len);
729
51
        (self.extend_values[index])(&mut self.data, index, start, len);
730
51
        self.data.len += len;
731
51
    }
732
733
    /// Extends the in progress array with null elements, ignoring the input arrays.
734
    ///
735
    /// # Panics
736
    ///
737
    /// Panics if [`MutableArrayData`] not created with `use_nulls` or nullable source arrays
738
0
    pub fn extend_nulls(&mut self, len: usize) {
739
0
        self.data.len += len;
740
0
        let bit_len = bit_util::ceil(self.data.len, 8);
741
0
        let nulls = self.data.null_buffer();
742
0
        nulls.resize(bit_len, 0);
743
0
        self.data.null_count += len;
744
0
        (self.extend_nulls)(&mut self.data, len);
745
0
    }
746
747
    /// Returns the current length
748
    #[inline]
749
0
    pub fn len(&self) -> usize {
750
0
        self.data.len
751
0
    }
752
753
    /// Returns true if len is 0
754
    #[inline]
755
    pub fn is_empty(&self) -> bool {
756
        self.data.len == 0
757
    }
758
759
    /// Returns the current null count
760
    #[inline]
761
    pub fn null_count(&self) -> usize {
762
        self.data.null_count
763
    }
764
765
    /// Creates a [ArrayData] from the in progress array, consuming `self`.
766
18
    pub fn freeze(self) -> ArrayData {
767
18
        unsafe { self.into_builder().build_unchecked() }
768
18
    }
769
770
    /// Consume self and returns the in progress array as [`ArrayDataBuilder`].
771
    ///
772
    /// This is useful for extending the default behavior of MutableArrayData.
773
18
    pub fn into_builder(self) -> ArrayDataBuilder {
774
18
        let data = self.data;
775
776
18
        let buffers = match data.data_type {
777
            DataType::Null
778
            | DataType::Struct(_)
779
            | DataType::FixedSizeList(_, _)
780
            | DataType::RunEndEncoded(_, _) => {
781
5
                vec![]
782
            }
783
            DataType::BinaryView | DataType::Utf8View => {
784
0
                let mut b = self.variadic_data_buffers;
785
0
                b.insert(0, data.buffer1.into());
786
0
                b
787
            }
788
            DataType::Utf8 | DataType::Binary | DataType::LargeUtf8 | DataType::LargeBinary => {
789
3
                vec![data.buffer1.into(), data.buffer2.into()]
790
            }
791
0
            DataType::Union(_, mode) => {
792
0
                match mode {
793
                    // Based on Union's DataTypeLayout
794
0
                    UnionMode::Sparse => vec![data.buffer1.into()],
795
0
                    UnionMode::Dense => vec![data.buffer1.into(), data.buffer2.into()],
796
                }
797
            }
798
10
            _ => vec![data.buffer1.into()],
799
        };
800
801
18
        let child_data = match data.data_type {
802
0
            DataType::Dictionary(_, _) => vec![self.dictionary.unwrap()],
803
18
            _ => data.child_data.into_iter().map(|x| 
x12
.
freeze12
()).collect(),
804
        };
805
806
18
        let nulls = match data.data_type {
807
            // RunEndEncoded and Null arrays cannot have top-level null bitmasks
808
0
            DataType::RunEndEncoded(_, _) | DataType::Null => None,
809
18
            _ => data
810
18
                .null_buffer
811
18
                .map(|nulls| 
{16
812
16
                    let bools = BooleanBuffer::new(nulls.into(), 0, data.len);
813
16
                    unsafe { NullBuffer::new_unchecked(bools, data.null_count) }
814
16
                })
815
18
                .filter(|n| 
n16
.
null_count16
() > 0),
816
        };
817
818
18
        ArrayDataBuilder::new(data.data_type)
819
18
            .offset(0)
820
18
            .len(data.len)
821
18
            .nulls(nulls)
822
18
            .buffers(buffers)
823
18
            .child_data(child_data)
824
18
    }
825
}
826
827
// See arrow/tests/array_transform.rs for tests of transform functionality
828
829
#[cfg(test)]
mod test {
    use super::*;
    use arrow_schema::Field;
    use std::sync::Arc;

    /// Checks that the capacities passed to `with_capacities` are applied to
    /// both the list's own buffer and its child's buffer.
    #[test]
    fn test_list_append_with_capacities() {
        let element = Arc::new(Field::new("element", DataType::Int64, false));
        let array = ArrayData::new_empty(&DataType::List(element));

        let capacities = Capacities::List(6, Some(Box::new(Capacities::Array(17))));
        let mutable = MutableArrayData::with_capacities(vec![&array], false, capacities);

        let list_buffer = &mutable.data.buffer1;
        let child_buffer = &mutable.data.child_data[0].data.buffer1;

        // capacities are rounded up to multiples of 64 by MutableBuffer
        assert_eq!(list_buffer.capacity(), 64);
        assert_eq!(child_buffer.capacity(), 192);
    }
}