Coverage Report

Created: 2025-11-17 14:14

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/Users/andrewlamb/Software/arrow-rs/arrow-data/src/data.rs
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
//! Contains [`ArrayData`], a generic representation of Arrow array data which encapsulates
19
//! common attributes and operations for Arrow array.
20
21
use crate::bit_iterator::BitSliceIterator;
22
use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
23
use arrow_buffer::{
24
    ArrowNativeType, Buffer, IntervalDayTime, IntervalMonthDayNano, MutableBuffer, bit_util, i256,
25
};
26
use arrow_schema::{ArrowError, DataType, UnionMode};
27
use std::mem;
28
use std::ops::Range;
29
use std::sync::Arc;
30
31
use crate::{equal, validate_binary_view, validate_string_view};
32
33
#[inline]
34
161k
pub(crate) fn contains_nulls(
35
161k
    null_bit_buffer: Option<&NullBuffer>,
36
161k
    offset: usize,
37
161k
    len: usize,
38
161k
) -> bool {
39
161k
    match null_bit_buffer {
40
798
        Some(buffer) => {
41
798
            match BitSliceIterator::new(buffer.validity(), buffer.offset() + offset, len).next() {
42
767
                Some((start, end)) => start != 0 || 
end != len493
,
43
31
                None => len != 0, // No non-null values
44
            }
45
        }
46
160k
        None => false, // No null buffer
47
    }
48
161k
}
49
50
#[inline]
51
56
pub(crate) fn count_nulls(
52
56
    null_bit_buffer: Option<&NullBuffer>,
53
56
    offset: usize,
54
56
    len: usize,
55
56
) -> usize {
56
56
    if let Some(
buf42
) = null_bit_buffer {
57
42
        let buffer = buf.buffer();
58
42
        len - buffer.count_set_bits_offset(offset + buf.offset(), len)
59
    } else {
60
14
        0
61
    }
62
56
}
63
64
/// creates 2 [`MutableBuffer`]s with a given `capacity` (in slots).
65
#[inline]
66
112
pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuffer; 2] {
67
112
    let empty_buffer = MutableBuffer::new(0);
68
112
    match data_type {
69
0
        DataType::Null => [empty_buffer, MutableBuffer::new(0)],
70
        DataType::Boolean => {
71
0
            let bytes = bit_util::ceil(capacity, 8);
72
0
            let buffer = MutableBuffer::new(bytes);
73
0
            [buffer, empty_buffer]
74
        }
75
        DataType::UInt8
76
        | DataType::UInt16
77
        | DataType::UInt32
78
        | DataType::UInt64
79
        | DataType::Int8
80
        | DataType::Int16
81
        | DataType::Int32
82
        | DataType::Int64
83
        | DataType::Float16
84
        | DataType::Float32
85
        | DataType::Float64
86
        | DataType::Decimal32(_, _)
87
        | DataType::Decimal64(_, _)
88
        | DataType::Decimal128(_, _)
89
        | DataType::Decimal256(_, _)
90
        | DataType::Date32
91
        | DataType::Time32(_)
92
        | DataType::Date64
93
        | DataType::Time64(_)
94
        | DataType::Duration(_)
95
        | DataType::Timestamp(_, _)
96
66
        | DataType::Interval(_) => [
97
66
            MutableBuffer::new(capacity * data_type.primitive_width().unwrap()),
98
66
            empty_buffer,
99
66
        ],
100
        DataType::Utf8 | DataType::Binary => {
101
14
            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
102
            // safety: `unsafe` code assumes that this buffer is initialized with one element
103
14
            buffer.push(0i32);
104
14
            [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
105
        }
106
        DataType::LargeUtf8 | DataType::LargeBinary => {
107
0
            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
108
            // safety: `unsafe` code assumes that this buffer is initialized with one element
109
0
            buffer.push(0i64);
110
0
            [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
111
        }
112
2
        DataType::BinaryView | DataType::Utf8View => [
113
2
            MutableBuffer::new(capacity * mem::size_of::<u128>()),
114
2
            empty_buffer,
115
2
        ],
116
        DataType::List(_) | DataType::Map(_, _) => {
117
            // offset buffer always starts with a zero
118
3
            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
119
3
            buffer.push(0i32);
120
3
            [buffer, empty_buffer]
121
        }
122
0
        DataType::ListView(_) => [
123
0
            MutableBuffer::new(capacity * mem::size_of::<i32>()),
124
0
            MutableBuffer::new(capacity * mem::size_of::<i32>()),
125
0
        ],
126
        DataType::LargeList(_) => {
127
            // offset buffer always starts with a zero
128
1
            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
129
1
            buffer.push(0i64);
130
1
            [buffer, empty_buffer]
131
        }
132
0
        DataType::LargeListView(_) => [
133
0
            MutableBuffer::new(capacity * mem::size_of::<i64>()),
134
0
            MutableBuffer::new(capacity * mem::size_of::<i64>()),
135
0
        ],
136
0
        DataType::FixedSizeBinary(size) => {
137
0
            [MutableBuffer::new(capacity * *size as usize), empty_buffer]
138
        }
139
8
        DataType::Dictionary(k, _) => [
140
8
            MutableBuffer::new(capacity * k.primitive_width().unwrap()),
141
8
            empty_buffer,
142
8
        ],
143
        DataType::FixedSizeList(_, _) | DataType::Struct(_) | DataType::RunEndEncoded(_, _) => {
144
11
            [empty_buffer, MutableBuffer::new(0)]
145
        }
146
7
        DataType::Union(_, mode) => {
147
7
            let type_ids = MutableBuffer::new(capacity * mem::size_of::<i8>());
148
7
            match mode {
149
1
                UnionMode::Sparse => [type_ids, empty_buffer],
150
                UnionMode::Dense => {
151
6
                    let offsets = MutableBuffer::new(capacity * mem::size_of::<i32>());
152
6
                    [type_ids, offsets]
153
                }
154
            }
155
        }
156
    }
157
112
}
158
159
/// A generic representation of Arrow array data which encapsulates common attributes
160
/// and operations for Arrow array.
161
///
162
/// Specific operations for different array types (e.g., primitive, list, struct)
163
/// are implemented in `Array`.
164
///
165
/// # Memory Layout
166
///
167
/// `ArrayData` has references to one or more underlying data buffers
168
/// and optional child ArrayData, depending on type as illustrated
169
/// below. Bitmaps are not shown for simplicity but they are stored
170
/// similarly to the buffers.
171
///
172
/// ```text
173
///                        offset
174
///                       points to
175
/// ┌───────────────────┐ start of  ┌───────┐       Different
176
/// │                   │   data    │       │     ArrayData may
177
/// │ArrayData {        │           │....   │     also refer to
178
/// │  data_type: ...   │   ─ ─ ─ ─▶│1234   │  ┌ ─  the same
179
/// │  offset: ... ─ ─ ─│─ ┘        │4372   │      underlying
180
/// │  len: ...    ─ ─ ─│─ ┐        │4888   │  │     buffer with different offset/len
181
/// │  buffers: [       │           │5882   │◀─
182
/// │    ...            │  │        │4323   │
183
/// │  ]                │   ─ ─ ─ ─▶│4859   │
184
/// │  child_data: [    │           │....   │
185
/// │    ...            │           │       │
186
/// │  ]                │           └───────┘
187
/// │}                  │
188
/// │                   │            Shared Buffer uses
189
/// │               │   │            bytes::Bytes to hold
190
/// └───────────────────┘            actual data values
191
///           ┌ ─ ─ ┘
192
///
193
///           ▼
194
/// ┌───────────────────┐
195
/// │ArrayData {        │
196
/// │  ...              │
197
/// │}                  │
198
/// │                   │
199
/// └───────────────────┘
200
///
201
/// Child ArrayData may also have its own buffers and children
202
/// ```
203
204
#[derive(Debug, Clone)]
205
pub struct ArrayData {
206
    /// The data type
207
    data_type: DataType,
208
209
    /// The number of elements
210
    len: usize,
211
212
    /// The offset in number of items (not bytes).
213
    ///
214
    /// The offset applies to [`Self::child_data`] and [`Self::buffers`]. It
215
    /// does NOT apply to [`Self::nulls`].
216
    offset: usize,
217
218
    /// The buffers that store the actual data for this array, as defined
219
    /// in the [Arrow Spec].
220
    ///
221
    /// Depending on the array types, [`Self::buffers`] can hold different
222
    /// kinds of buffers (e.g., value buffer, value offset buffer) at different
223
    /// positions.
224
    ///
225
    /// The buffer may be larger than needed.  Some items at the beginning may be skipped if
226
    /// there is an `offset`.  Some items at the end may be skipped if the buffer is longer than
227
    /// we need to satisfy `len`.
228
    ///
229
    /// [Arrow Spec]: https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout
230
    buffers: Vec<Buffer>,
231
232
    /// The child(ren) of this array.
233
    ///
234
    /// Only non-empty for nested types, such as `ListArray` and
235
    /// `StructArray`.
236
    ///
237
    /// The first logical element in each child element begins at `offset`.
238
    ///
239
    /// If the child element also has an offset then these offsets are
240
    /// cumulative.
241
    child_data: Vec<ArrayData>,
242
243
    /// The null bitmap.
244
    ///
245
    /// `None` indicates all values are non-null in this array.
246
    ///
247
    /// [`Self::offset`] does not apply to the null bitmap. While the
248
    /// BooleanBuffer may be sliced (have its own offset) internally, this
249
    /// `NullBuffer` always represents exactly `len` elements.
250
    nulls: Option<NullBuffer>,
251
}
252
253
/// A thread-safe, shared reference to the Arrow array data.
254
pub type ArrayDataRef = Arc<ArrayData>;
255
256
impl ArrayData {
257
    /// Create a new ArrayData instance.
258
    ///
259
    /// If `null_count` is not specified, the number of nulls in
260
    /// null_bit_buffer is calculated.
261
    ///
262
    /// If the number of nulls is 0 then the null_bit_buffer
263
    /// is set to `None`.
264
    ///
265
    /// # Safety
266
    ///
267
    /// The input values *must* form a valid Arrow array for
268
    /// `data_type`, or undefined behavior can result.
269
    ///
270
    /// Note: This is a low level API and most users of the arrow
271
    /// crate should create arrays using the methods in the `array`
272
    /// module.
273
2.12k
    pub unsafe fn new_unchecked(
274
2.12k
        data_type: DataType,
275
2.12k
        len: usize,
276
2.12k
        null_count: Option<usize>,
277
2.12k
        null_bit_buffer: Option<Buffer>,
278
2.12k
        offset: usize,
279
2.12k
        buffers: Vec<Buffer>,
280
2.12k
        child_data: Vec<ArrayData>,
281
2.12k
    ) -> Self {
282
2.12k
        let mut skip_validation = UnsafeFlag::new();
283
        // SAFETY: caller responsible for ensuring data is valid
284
2.12k
        unsafe { skip_validation.set(true) };
285
286
2.12k
        ArrayDataBuilder {
287
2.12k
            data_type,
288
2.12k
            len,
289
2.12k
            null_count,
290
2.12k
            null_bit_buffer,
291
2.12k
            nulls: None,
292
2.12k
            offset,
293
2.12k
            buffers,
294
2.12k
            child_data,
295
2.12k
            align_buffers: false,
296
2.12k
            skip_validation,
297
2.12k
        }
298
2.12k
        .build()
299
2.12k
        .unwrap()
300
2.12k
    }
301
302
    /// Create a new ArrayData, validating that the provided buffers form a valid
303
    /// Arrow array of the specified data type.
304
    ///
305
    /// If the number of nulls in `null_bit_buffer` is 0 then the null_bit_buffer
306
    /// is set to `None`.
307
    ///
308
    /// Internally this calls through to [`Self::validate_data`]
309
    ///
310
    /// Note: This is a low level API and most users of the arrow crate should create
311
    /// arrays using the builders found in [arrow_array](https://docs.rs/arrow-array)
312
2
    pub fn try_new(
313
2
        data_type: DataType,
314
2
        len: usize,
315
2
        null_bit_buffer: Option<Buffer>,
316
2
        offset: usize,
317
2
        buffers: Vec<Buffer>,
318
2
        child_data: Vec<ArrayData>,
319
2
    ) -> Result<Self, ArrowError> {
320
        // we must check the length of `null_bit_buffer` first
321
        // because we use this buffer to calculate `null_count`
322
        // in `Self::new_unchecked`.
323
2
        if let Some(null_bit_buffer) = null_bit_buffer.as_ref() {
324
2
            let needed_len = bit_util::ceil(len + offset, 8);
325
2
            if null_bit_buffer.len() < needed_len {
326
0
                return Err(ArrowError::InvalidArgumentError(format!(
327
0
                    "null_bit_buffer size too small. got {} needed {}",
328
0
                    null_bit_buffer.len(),
329
0
                    needed_len
330
0
                )));
331
2
            }
332
0
        }
333
        // Safety justification: `validate_full` is called below
334
2
        let new_self = unsafe {
335
2
            Self::new_unchecked(
336
2
                data_type,
337
2
                len,
338
2
                None,
339
2
                null_bit_buffer,
340
2
                offset,
341
2
                buffers,
342
2
                child_data,
343
            )
344
        };
345
346
        // As the data is not trusted, do a full validation of its contents
347
        // We don't need to validate children as we can assume that the
348
        // [`ArrayData`] in `child_data` have already been validated through
349
        // a call to `ArrayData::try_new` or created using unsafe
350
2
        new_self.validate_data()
?0
;
351
2
        Ok(new_self)
352
2
    }
353
354
    /// Returns a builder to construct a [`ArrayData`] instance of the same [`DataType`]
355
    #[inline]
356
81.5k
    pub const fn builder(data_type: DataType) -> ArrayDataBuilder {
357
81.5k
        ArrayDataBuilder::new(data_type)
358
81.5k
    }
359
360
    /// Returns a reference to the [`DataType`] of this [`ArrayData`]
361
    #[inline]
362
1.05M
    pub const fn data_type(&self) -> &DataType {
363
1.05M
        &self.data_type
364
1.05M
    }
365
366
    /// Returns the [`Buffer`] storing data for this [`ArrayData`]
367
1.37M
    pub fn buffers(&self) -> &[Buffer] {
368
1.37M
        &self.buffers
369
1.37M
    }
370
371
    /// Returns a slice of children [`ArrayData`]. This will be non
372
    /// empty for types such as lists and structs.
373
321k
    pub fn child_data(&self) -> &[ArrayData] {
374
321k
        &self.child_data[..]
375
321k
    }
376
377
    /// Returns whether the element at index `i` is null
378
    #[inline]
379
29.5k
    pub fn is_null(&self, i: usize) -> bool {
380
29.5k
        match &self.nulls {
381
20.1k
            Some(v) => v.is_null(i),
382
9.43k
            None => false,
383
        }
384
29.5k
    }
385
386
    /// Returns a reference to the null buffer of this [`ArrayData`] if any
387
    ///
388
    /// Note: [`ArrayData::offset`] does NOT apply to the returned [`NullBuffer`]
389
    #[inline]
390
853k
    pub fn nulls(&self) -> Option<&NullBuffer> {
391
853k
        self.nulls.as_ref()
392
853k
    }
393
394
    /// Returns whether the element at index `i` is not null
395
    #[inline]
396
    pub fn is_valid(&self, i: usize) -> bool {
397
        !self.is_null(i)
398
    }
399
400
    /// Returns the length (i.e., number of elements) of this [`ArrayData`].
401
    #[inline]
402
653k
    pub const fn len(&self) -> usize {
403
653k
        self.len
404
653k
    }
405
406
    /// Returns whether this [`ArrayData`] is empty
407
    #[inline]
408
160k
    pub const fn is_empty(&self) -> bool {
409
160k
        self.len == 0
410
160k
    }
411
412
    /// Returns the offset of this [`ArrayData`]
413
    #[inline]
414
328k
    pub const fn offset(&self) -> usize {
415
328k
        self.offset
416
328k
    }
417
418
    /// Returns the total number of nulls in this array
419
    #[inline]
420
163k
    pub fn null_count(&self) -> usize {
421
163k
        self.nulls
422
163k
            .as_ref()
423
163k
            .map(|x| 
x2.27k
.
null_count2.27k
())
424
163k
            .unwrap_or_default()
425
163k
    }
426
427
    /// Returns the total number of bytes of memory occupied by the
428
    /// buffers owned by this [`ArrayData`] and all of its
429
    /// children. (See also diagram on [`ArrayData`]).
430
    ///
431
    /// Note that this [`ArrayData`] may only refer to a subset of the
432
    /// data in the underlying [`Buffer`]s (due to `offset` and
433
    /// `length`), but the size returned includes the entire size of
434
    /// the buffers.
435
    ///
436
    /// If multiple [`ArrayData`]s refer to the same underlying
437
    /// [`Buffer`]s they will both report the same size.
438
0
    pub fn get_buffer_memory_size(&self) -> usize {
439
0
        let mut size = 0;
440
0
        for buffer in &self.buffers {
441
0
            size += buffer.capacity();
442
0
        }
443
0
        if let Some(bitmap) = &self.nulls {
444
0
            size += bitmap.buffer().capacity()
445
0
        }
446
0
        for child in &self.child_data {
447
0
            size += child.get_buffer_memory_size();
448
0
        }
449
0
        size
450
0
    }
451
452
    /// Returns the total number of the bytes of memory occupied by
453
    /// the buffers by this slice of [`ArrayData`] (See also diagram on [`ArrayData`]).
454
    ///
455
    /// This is approximately the number of bytes if a new
456
    /// [`ArrayData`] was formed by creating new [`Buffer`]s with
457
    /// exactly the data needed.
458
    ///
459
    /// For example, a [`DataType::Int64`] with `100` elements,
460
    /// [`Self::get_slice_memory_size`] would return `100 * 8 = 800`. If
461
    /// the [`ArrayData`] was then [`Self::slice`]ed to refer to its
462
    /// first `20` elements, then [`Self::get_slice_memory_size`] on the
463
    /// sliced [`ArrayData`] would return `20 * 8 = 160`.
464
0
    pub fn get_slice_memory_size(&self) -> Result<usize, ArrowError> {
465
0
        let mut result: usize = 0;
466
0
        let layout = layout(&self.data_type);
467
468
0
        for spec in layout.buffers.iter() {
469
0
            match spec {
470
0
                BufferSpec::FixedWidth { byte_width, .. } => {
471
0
                    let buffer_size = self.len.checked_mul(*byte_width).ok_or_else(|| {
472
0
                        ArrowError::ComputeError(
473
0
                            "Integer overflow computing buffer size".to_string(),
474
0
                        )
475
0
                    })?;
476
0
                    result += buffer_size;
477
                }
478
                BufferSpec::VariableWidth => {
479
0
                    let buffer_len = match self.data_type {
480
                        DataType::Utf8 | DataType::Binary => {
481
0
                            let offsets = self.typed_offsets::<i32>()?;
482
0
                            (offsets[self.len] - offsets[0]) as usize
483
                        }
484
                        DataType::LargeUtf8 | DataType::LargeBinary => {
485
0
                            let offsets = self.typed_offsets::<i64>()?;
486
0
                            (offsets[self.len] - offsets[0]) as usize
487
                        }
488
                        _ => {
489
0
                            return Err(ArrowError::NotYetImplemented(format!(
490
0
                                "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}",
491
0
                                self.data_type
492
0
                            )));
493
                        }
494
                    };
495
0
                    result += buffer_len;
496
                }
497
0
                BufferSpec::BitMap => {
498
0
                    let buffer_size = bit_util::ceil(self.len, 8);
499
0
                    result += buffer_size;
500
0
                }
501
0
                BufferSpec::AlwaysNull => {
502
0
                    // Nothing to do
503
0
                }
504
            }
505
        }
506
507
0
        if self.nulls().is_some() {
508
0
            result += bit_util::ceil(self.len, 8);
509
0
        }
510
511
0
        for child in &self.child_data {
512
0
            result += child.get_slice_memory_size()?;
513
        }
514
0
        Ok(result)
515
0
    }
516
517
    /// Returns the total number of bytes of memory occupied
518
    /// physically by this [`ArrayData`] and all its [`Buffer`]s and
519
    /// children. (See also diagram on [`ArrayData`]).
520
    ///
521
    /// Equivalent to:
522
    ///  `size_of_val(self)` +
523
    ///  [`Self::get_buffer_memory_size`] +
524
    ///  `size_of_val(child)` for all children
525
0
    pub fn get_array_memory_size(&self) -> usize {
526
0
        let mut size = mem::size_of_val(self);
527
528
        // Calculate rest of the fields top down which contain actual data
529
0
        for buffer in &self.buffers {
530
0
            size += mem::size_of::<Buffer>();
531
0
            size += buffer.capacity();
532
0
        }
533
0
        if let Some(nulls) = &self.nulls {
534
0
            size += nulls.buffer().capacity();
535
0
        }
536
0
        for child in &self.child_data {
537
0
            size += child.get_array_memory_size();
538
0
        }
539
540
0
        size
541
0
    }
542
543
    /// Creates a zero-copy slice of itself. This creates a new
544
    /// [`ArrayData`] pointing at the same underlying [`Buffer`]s with a
545
    /// different offset and len
546
    ///
547
    /// # Panics
548
    ///
549
    /// Panics if `offset + length > self.len()`.
550
11
    pub fn slice(&self, offset: usize, length: usize) -> ArrayData {
551
11
        assert!((offset + length) <= self.len());
552
553
11
        if let DataType::Struct(_) = self.data_type() {
554
            // Slice into children
555
0
            let new_offset = self.offset + offset;
556
            ArrayData {
557
0
                data_type: self.data_type().clone(),
558
0
                len: length,
559
0
                offset: new_offset,
560
0
                buffers: self.buffers.clone(),
561
                // Slice child data, to propagate offsets down to them
562
0
                child_data: self
563
0
                    .child_data()
564
0
                    .iter()
565
0
                    .map(|data| data.slice(offset, length))
566
0
                    .collect(),
567
0
                nulls: self.nulls.as_ref().map(|x| x.slice(offset, length)),
568
            }
569
        } else {
570
11
            let mut new_data = self.clone();
571
572
11
            new_data.len = length;
573
11
            new_data.offset = offset + self.offset;
574
11
            new_data.nulls = self.nulls.as_ref().map(|x| 
x4
.
slice4
(
offset4
,
length4
));
575
576
11
            new_data
577
        }
578
11
    }
579
580
    /// Returns the `buffer` as a slice of type `T` starting at self.offset
581
    ///
582
    /// # Panics
583
    /// This function panics if:
584
    /// * the buffer is not byte-aligned with type T, or
585
    /// * the datatype is `Boolean` (it corresponds to a bit-packed buffer where the offset is not applicable)
586
320k
    pub fn buffer<T: ArrowNativeType>(&self, buffer: usize) -> &[T] {
587
320k
        &self.buffers()[buffer].typed_data()[self.offset..]
588
320k
    }
589
590
    /// Returns a new [`ArrayData`] valid for `data_type` containing `len` null values
591
81
    pub fn new_null(data_type: &DataType, len: usize) -> Self {
592
81
        let bit_len = bit_util::ceil(len, 8);
593
106
        let 
zeroed81
= |len: usize| Buffer::from(MutableBuffer::from_len_zeroed(len));
594
595
81
        let (buffers, child_data, has_nulls) = match data_type.primitive_width() {
596
27
            Some(width) => (vec![zeroed(width * len)], vec![], true),
597
54
            None => match data_type {
598
1
                DataType::Null => (vec![], vec![], false),
599
1
                DataType::Boolean => (vec![zeroed(bit_len)], vec![], true),
600
                DataType::Binary | DataType::Utf8 => {
601
33
                    (vec![zeroed((len + 1) * 4), zeroed(0)], vec![], true)
602
                }
603
0
                DataType::BinaryView | DataType::Utf8View => (vec![zeroed(len * 16)], vec![], true),
604
                DataType::LargeBinary | DataType::LargeUtf8 => {
605
0
                    (vec![zeroed((len + 1) * 8), zeroed(0)], vec![], true)
606
                }
607
1
                DataType::FixedSizeBinary(i) => (vec![zeroed(*i as usize * len)], vec![], true),
608
0
                DataType::List(f) | DataType::Map(f, _) => (
609
0
                    vec![zeroed((len + 1) * 4)],
610
0
                    vec![ArrayData::new_empty(f.data_type())],
611
0
                    true,
612
0
                ),
613
0
                DataType::LargeList(f) => (
614
0
                    vec![zeroed((len + 1) * 8)],
615
0
                    vec![ArrayData::new_empty(f.data_type())],
616
0
                    true,
617
0
                ),
618
0
                DataType::FixedSizeList(f, list_len) => (
619
0
                    vec![],
620
0
                    vec![ArrayData::new_null(f.data_type(), *list_len as usize * len)],
621
0
                    true,
622
0
                ),
623
1
                DataType::Struct(fields) => (
624
1
                    vec![],
625
1
                    fields
626
1
                        .iter()
627
2
                        .
map1
(|f| Self::new_null(f.data_type(), len))
628
1
                        .collect(),
629
                    true,
630
                ),
631
11
                DataType::Dictionary(k, v) => (
632
11
                    vec![zeroed(k.primitive_width().unwrap() * len)],
633
11
                    vec![ArrayData::new_empty(v.as_ref())],
634
11
                    true,
635
11
                ),
636
5
                DataType::Union(f, mode) => {
637
5
                    let (id, _) = f.iter().next().unwrap();
638
5
                    let ids = Buffer::from_iter(std::iter::repeat_n(id, len));
639
5
                    let buffers = match mode {
640
5
                        UnionMode::Sparse => vec![ids],
641
                        UnionMode::Dense => {
642
0
                            let end_offset = i32::from_usize(len).unwrap();
643
0
                            vec![ids, Buffer::from_iter(0_i32..end_offset)]
644
                        }
645
                    };
646
647
5
                    let children = f
648
5
                        .iter()
649
5
                        .enumerate()
650
5
                        .map(|(idx, (_, f))| {
651
5
                            if idx == 0 || 
*mode == UnionMode::Sparse0
{
652
5
                                Self::new_null(f.data_type(), len)
653
                            } else {
654
0
                                Self::new_empty(f.data_type())
655
                            }
656
5
                        })
657
5
                        .collect();
658
659
5
                    (buffers, children, false)
660
                }
661
1
                DataType::RunEndEncoded(r, v) => {
662
1
                    let runs = match r.data_type() {
663
                        DataType::Int16 => {
664
0
                            let i = i16::from_usize(len).expect("run overflow");
665
0
                            Buffer::from_slice_ref([i])
666
                        }
667
                        DataType::Int32 => {
668
0
                            let i = i32::from_usize(len).expect("run overflow");
669
0
                            Buffer::from_slice_ref([i])
670
                        }
671
                        DataType::Int64 => {
672
1
                            let i = i64::from_usize(len).expect("run overflow");
673
1
                            Buffer::from_slice_ref([i])
674
                        }
675
0
                        dt => unreachable!("Invalid run ends data type {dt}"),
676
                    };
677
678
1
                    let builder = ArrayData::builder(r.data_type().clone())
679
1
                        .len(1)
680
1
                        .buffers(vec![runs]);
681
682
                    // SAFETY:
683
                    // Valid by construction
684
1
                    let runs = unsafe { builder.build_unchecked() };
685
1
                    (
686
1
                        vec![],
687
1
                        vec![runs, ArrayData::new_null(v.data_type(), 1)],
688
1
                        false,
689
1
                    )
690
                }
691
0
                d => unreachable!("{d}"),
692
            },
693
        };
694
695
81
        let mut builder = ArrayDataBuilder::new(data_type.clone())
696
81
            .len(len)
697
81
            .buffers(buffers)
698
81
            .child_data(child_data);
699
700
81
        if has_nulls {
701
74
            builder = builder.nulls(Some(NullBuffer::new_null(len)))
702
7
        }
703
704
        // SAFETY:
705
        // Data valid by construction
706
81
        unsafe { builder.build_unchecked() }
707
81
    }
708
709
    /// Returns a new empty [ArrayData] valid for `data_type`.
710
51
    pub fn new_empty(data_type: &DataType) -> Self {
711
51
        Self::new_null(data_type, 0)
712
51
    }
713
714
    /// Verifies that the buffers meet the minimum alignment requirements for the data type
715
    ///
716
    /// Buffers that are not adequately aligned will be copied to a new aligned allocation
717
    ///
718
    /// This can be useful for when interacting with data sent over IPC or FFI, that may
719
    /// not meet the minimum alignment requirements
720
    ///
721
    /// This also aligns buffers of children data
722
0
    pub fn align_buffers(&mut self) {
723
0
        let layout = layout(&self.data_type);
724
0
        for (buffer, spec) in self.buffers.iter_mut().zip(&layout.buffers) {
725
0
            if let BufferSpec::FixedWidth { alignment, .. } = spec {
726
0
                if buffer.as_ptr().align_offset(*alignment) != 0 {
727
0
                    *buffer = Buffer::from_slice_ref(buffer.as_ref());
728
0
                }
729
0
            }
730
        }
731
        // align children data recursively
732
0
        for data in self.child_data.iter_mut() {
733
0
            data.align_buffers()
734
        }
735
0
    }
736
737
    /// "cheap" validation of an `ArrayData`. Ensures buffers are
738
    /// sufficiently sized to store `len` + `offset` total elements of
739
    /// `data_type` and performs other inexpensive consistency checks.
740
    ///
741
    /// This check is "cheap" in the sense that it does not validate the
742
    /// contents of the buffers (e.g. that all offsets for UTF8 arrays
743
    /// are within the bounds of the values buffer).
744
    ///
745
    /// See [ArrayData::validate_data] to validate fully the offset content
746
    /// and the validity of utf8 data
747
1.04k
    pub fn validate(&self) -> Result<(), ArrowError> {
748
        // Need at least this mich space in each buffer
749
1.04k
        let len_plus_offset = self.len + self.offset;
750
751
        // Check that the data layout conforms to the spec
752
1.04k
        let layout = layout(&self.data_type);
753
754
1.04k
        if !layout.can_contain_null_mask && 
self.nulls40
.
is_some40
() {
755
0
            return Err(ArrowError::InvalidArgumentError(format!(
756
0
                "Arrays of type {:?} cannot contain a null bitmask",
757
0
                self.data_type,
758
0
            )));
759
1.04k
        }
760
761
        // Check data buffers length for view types and other types
762
1.04k
        if self.buffers.len() < layout.buffers.len()
763
1.04k
            || (!layout.variadic && self.buffers.len() != layout.buffers.len())
764
        {
765
0
            return Err(ArrowError::InvalidArgumentError(format!(
766
0
                "Expected {} buffers in array of type {:?}, got {}",
767
0
                layout.buffers.len(),
768
0
                self.data_type,
769
0
                self.buffers.len(),
770
0
            )));
771
1.04k
        }
772
773
1.04k
        for (
i1.00k
, (
buffer1.00k
,
spec1.00k
)) in self.buffers.iter().zip(layout.buffers.iter()).enumerate() {
774
1.00k
            match spec {
775
                BufferSpec::FixedWidth {
776
892
                    byte_width,
777
892
                    alignment,
778
                } => {
779
892
                    let min_buffer_size = len_plus_offset.saturating_mul(*byte_width);
780
781
892
                    if buffer.len() < min_buffer_size {
782
0
                        return Err(ArrowError::InvalidArgumentError(format!(
783
0
                            "Need at least {} bytes in buffers[{}] in array of type {:?}, but got {}",
784
0
                            min_buffer_size,
785
0
                            i,
786
0
                            self.data_type,
787
0
                            buffer.len()
788
0
                        )));
789
892
                    }
790
791
892
                    let align_offset = buffer.as_ptr().align_offset(*alignment);
792
892
                    if align_offset != 0 {
793
0
                        return Err(ArrowError::InvalidArgumentError(format!(
794
0
                            "Misaligned buffers[{i}] in array of type {:?}, offset from expected alignment of {alignment} by {}",
795
0
                            self.data_type,
796
0
                            align_offset.min(alignment - align_offset)
797
0
                        )));
798
892
                    }
799
                }
800
6
                BufferSpec::VariableWidth => {
801
6
                    // not cheap to validate (need to look at the
802
6
                    // data). Partially checked in validate_offsets
803
6
                    // called below. Can check with `validate_full`
804
6
                }
805
                BufferSpec::BitMap => {
806
105
                    let min_buffer_size = bit_util::ceil(len_plus_offset, 8);
807
105
                    if buffer.len() < min_buffer_size {
808
0
                        return Err(ArrowError::InvalidArgumentError(format!(
809
0
                            "Need at least {} bytes for bitmap in buffers[{}] in array of type {:?}, but got {}",
810
0
                            min_buffer_size,
811
0
                            i,
812
0
                            self.data_type,
813
0
                            buffer.len()
814
0
                        )));
815
105
                    }
816
                }
817
0
                BufferSpec::AlwaysNull => {
818
0
                    // Nothing to validate
819
0
                }
820
            }
821
        }
822
823
        // check null bit buffer size
824
1.04k
        if let Some(
nulls538
) = self.nulls() {
825
538
            if nulls.null_count() > self.len {
826
0
                return Err(ArrowError::InvalidArgumentError(format!(
827
0
                    "null_count {} for an array exceeds length of {} elements",
828
0
                    nulls.null_count(),
829
0
                    self.len
830
0
                )));
831
538
            }
832
833
538
            let actual_len = nulls.validity().len();
834
538
            let needed_len = bit_util::ceil(len_plus_offset, 8);
835
538
            if actual_len < needed_len {
836
0
                return Err(ArrowError::InvalidArgumentError(format!(
837
0
                    "null_bit_buffer size too small. got {actual_len} needed {needed_len}",
838
0
                )));
839
538
            }
840
841
538
            if nulls.len() != self.len {
842
0
                return Err(ArrowError::InvalidArgumentError(format!(
843
0
                    "null buffer incorrect size. got {} expected {}",
844
0
                    nulls.len(),
845
0
                    self.len
846
0
                )));
847
538
            }
848
503
        }
849
850
1.04k
        self.validate_child_data()
?0
;
851
852
        // Additional Type specific checks
853
1.04k
        match &self.data_type {
854
            DataType::Utf8 | DataType::Binary => {
855
6
                self.validate_offsets::<i32>(self.buffers[1].len())
?0
;
856
            }
857
            DataType::LargeUtf8 | DataType::LargeBinary => {
858
0
                self.validate_offsets::<i64>(self.buffers[1].len())?;
859
            }
860
0
            DataType::Dictionary(key_type, _value_type) => {
861
                // At the moment, constructing a DictionaryArray will also check this
862
0
                if !DataType::is_dictionary_key_type(key_type) {
863
0
                    return Err(ArrowError::InvalidArgumentError(format!(
864
0
                        "Dictionary key type must be integer, but was {key_type}"
865
0
                    )));
866
0
                }
867
            }
868
40
            DataType::RunEndEncoded(run_ends_type, _) => {
869
40
                if run_ends_type.is_nullable() {
870
0
                    return Err(ArrowError::InvalidArgumentError(
871
0
                        "The nullable should be set to false for the field defining run_ends array.".to_string()
872
0
                    ));
873
40
                }
874
40
                if !DataType::is_run_ends_type(run_ends_type.data_type()) {
875
0
                    return Err(ArrowError::InvalidArgumentError(format!(
876
0
                        "RunArray run_ends types must be Int16, Int32 or Int64, but was {}",
877
0
                        run_ends_type.data_type()
878
0
                    )));
879
40
                }
880
            }
881
995
            _ => {}
882
        };
883
884
1.04k
        Ok(())
885
1.04k
    }
886
887
    /// Returns a reference to the data in `buffer` as a typed slice
888
    /// (typically `&[i32]` or `&[i64]`) after validating. The
889
    /// returned slice is guaranteed to have at least `self.len + 1`
890
    /// entries.
891
    ///
892
    /// For an empty array, the `buffer` can also be empty.
893
40
    fn typed_offsets<T: ArrowNativeType + num_traits::Num>(&self) -> Result<&[T], ArrowError> {
894
        // An empty list-like array can have 0 offsets
895
40
        if self.len == 0 && 
self.buffers[0]0
.
is_empty0
() {
896
0
            return Ok(&[]);
897
40
        }
898
899
40
        self.typed_buffer(0, self.len + 1)
900
40
    }
901
902
    /// Returns a reference to the data in `buffers[idx]` as a typed slice after validating
903
80
    fn typed_buffer<T: ArrowNativeType + num_traits::Num>(
904
80
        &self,
905
80
        idx: usize,
906
80
        len: usize,
907
80
    ) -> Result<&[T], ArrowError> {
908
80
        let buffer = &self.buffers[idx];
909
910
80
        let required_len = (len + self.offset) * mem::size_of::<T>();
911
912
80
        if buffer.len() < required_len {
913
0
            return Err(ArrowError::InvalidArgumentError(format!(
914
0
                "Buffer {} of {} isn't large enough. Expected {} bytes got {}",
915
0
                idx,
916
0
                self.data_type,
917
0
                required_len,
918
0
                buffer.len()
919
0
            )));
920
80
        }
921
922
80
        Ok(&buffer.typed_data::<T>()[self.offset..self.offset + len])
923
80
    }
924
925
    /// Does a cheap sanity check that the `self.len` values in `buffer` are valid
926
    /// offsets (of type T) into some other buffer of `values_length` bytes long
927
23
    fn validate_offsets<T: ArrowNativeType + num_traits::Num + std::fmt::Display>(
928
23
        &self,
929
23
        values_length: usize,
930
23
    ) -> Result<(), ArrowError> {
931
        // Justification: buffer size was validated above
932
23
        let offsets = self.typed_offsets::<T>()
?0
;
933
23
        if offsets.is_empty() {
934
0
            return Ok(());
935
23
        }
936
937
23
        let first_offset = offsets[0].to_usize().ok_or_else(|| 
{0
938
0
            ArrowError::InvalidArgumentError(format!(
939
0
                "Error converting offset[0] ({}) to usize for {}",
940
0
                offsets[0], self.data_type
941
0
            ))
942
0
        })?;
943
944
23
        let last_offset = offsets[self.len].to_usize().ok_or_else(|| 
{0
945
0
            ArrowError::InvalidArgumentError(format!(
946
0
                "Error converting offset[{}] ({}) to usize for {}",
947
0
                self.len, offsets[self.len], self.data_type
948
0
            ))
949
0
        })?;
950
951
23
        if first_offset > values_length {
952
0
            return Err(ArrowError::InvalidArgumentError(format!(
953
0
                "First offset {} of {} is larger than values length {}",
954
0
                first_offset, self.data_type, values_length,
955
0
            )));
956
23
        }
957
958
23
        if last_offset > values_length {
959
0
            return Err(ArrowError::InvalidArgumentError(format!(
960
0
                "Last offset {} of {} is larger than values length {}",
961
0
                last_offset, self.data_type, values_length,
962
0
            )));
963
23
        }
964
965
23
        if first_offset > last_offset {
966
0
            return Err(ArrowError::InvalidArgumentError(format!(
967
0
                "First offset {} in {} is smaller than last offset {}",
968
0
                first_offset, self.data_type, last_offset,
969
0
            )));
970
23
        }
971
972
23
        Ok(())
973
23
    }
974
975
    /// Does a cheap sanity check that the `self.len` values in `buffer` are valid
976
    /// offsets and sizes (of type T) into some other buffer of `values_length` bytes long
977
0
    fn validate_offsets_and_sizes<T: ArrowNativeType + num_traits::Num + std::fmt::Display>(
978
0
        &self,
979
0
        values_length: usize,
980
0
    ) -> Result<(), ArrowError> {
981
0
        let offsets: &[T] = self.typed_buffer(0, self.len)?;
982
0
        let sizes: &[T] = self.typed_buffer(1, self.len)?;
983
0
        if offsets.len() != sizes.len() {
984
0
            return Err(ArrowError::ComputeError(format!(
985
0
                "ListView offsets len {} does not match sizes len {}",
986
0
                offsets.len(),
987
0
                sizes.len()
988
0
            )));
989
0
        }
990
991
0
        for i in 0..sizes.len() {
992
0
            let size = sizes[i].to_usize().ok_or_else(|| {
993
0
                ArrowError::InvalidArgumentError(format!(
994
0
                    "Error converting size[{}] ({}) to usize for {}",
995
0
                    i, sizes[i], self.data_type
996
0
                ))
997
0
            })?;
998
0
            let offset = offsets[i].to_usize().ok_or_else(|| {
999
0
                ArrowError::InvalidArgumentError(format!(
1000
0
                    "Error converting offset[{}] ({}) to usize for {}",
1001
0
                    i, offsets[i], self.data_type
1002
0
                ))
1003
0
            })?;
1004
0
            if size
1005
0
                .checked_add(offset)
1006
0
                .expect("Offset and size have exceeded the usize boundary")
1007
0
                > values_length
1008
            {
1009
0
                return Err(ArrowError::InvalidArgumentError(format!(
1010
0
                    "Size {} at index {} is larger than the remaining values for {}",
1011
0
                    size, i, self.data_type
1012
0
                )));
1013
0
            }
1014
        }
1015
0
        Ok(())
1016
0
    }
1017
1018
    /// Validates the layout of `child_data` ArrayData structures
1019
1.04k
    fn validate_child_data(&self) -> Result<(), ArrowError> {
1020
1.04k
        match &self.data_type {
1021
7
            DataType::List(field) | DataType::Map(
field2
, _) => {
1022
9
                let values_data = self.get_single_valid_child_data(field.data_type())
?0
;
1023
9
                self.validate_offsets::<i32>(values_data.len)
?0
;
1024
9
                Ok(())
1025
            }
1026
8
            DataType::LargeList(field) => {
1027
8
                let values_data = self.get_single_valid_child_data(field.data_type())
?0
;
1028
8
                self.validate_offsets::<i64>(values_data.len)
?0
;
1029
8
                Ok(())
1030
            }
1031
0
            DataType::ListView(field) => {
1032
0
                let values_data = self.get_single_valid_child_data(field.data_type())?;
1033
0
                self.validate_offsets_and_sizes::<i32>(values_data.len)?;
1034
0
                Ok(())
1035
            }
1036
0
            DataType::LargeListView(field) => {
1037
0
                let values_data = self.get_single_valid_child_data(field.data_type())?;
1038
0
                self.validate_offsets_and_sizes::<i64>(values_data.len)?;
1039
0
                Ok(())
1040
            }
1041
2
            DataType::FixedSizeList(field, list_size) => {
1042
2
                let values_data = self.get_single_valid_child_data(field.data_type())
?0
;
1043
1044
2
                let list_size: usize = (*list_size).try_into().map_err(|_| 
{0
1045
0
                    ArrowError::InvalidArgumentError(format!(
1046
0
                        "{} has a negative list_size {}",
1047
0
                        self.data_type, list_size
1048
0
                    ))
1049
0
                })?;
1050
1051
2
                let expected_values_len = self.len
1052
2
                    .checked_mul(list_size)
1053
2
                    .expect("integer overflow computing expected number of expected values in FixedListSize");
1054
1055
2
                if values_data.len < expected_values_len {
1056
0
                    return Err(ArrowError::InvalidArgumentError(format!(
1057
0
                        "Values length {} is less than the length ({}) multiplied by the value size ({}) for {}",
1058
0
                        values_data.len, self.len, list_size, self.data_type
1059
0
                    )));
1060
2
                }
1061
1062
2
                Ok(())
1063
            }
1064
2
            DataType::Struct(fields) => {
1065
2
                self.validate_num_child_data(fields.len())
?0
;
1066
4
                for (i, field) in 
fields.iter()2
.
enumerate2
() {
1067
4
                    let field_data = self.get_valid_child_data(i, field.data_type())
?0
;
1068
1069
                    // Ensure child field has sufficient size
1070
4
                    if field_data.len < self.len {
1071
0
                        return Err(ArrowError::InvalidArgumentError(format!(
1072
0
                            "{} child array #{} for field {} has length smaller than expected for struct array ({} < {})",
1073
0
                            self.data_type,
1074
0
                            i,
1075
0
                            field.name(),
1076
0
                            field_data.len,
1077
0
                            self.len
1078
0
                        )));
1079
4
                    }
1080
                }
1081
2
                Ok(())
1082
            }
1083
40
            DataType::RunEndEncoded(run_ends_field, values_field) => {
1084
40
                self.validate_num_child_data(2)
?0
;
1085
40
                let run_ends_data = self.get_valid_child_data(0, run_ends_field.data_type())
?0
;
1086
40
                let values_data = self.get_valid_child_data(1, values_field.data_type())
?0
;
1087
40
                if run_ends_data.len != values_data.len {
1088
0
                    return Err(ArrowError::InvalidArgumentError(format!(
1089
0
                        "The run_ends array length should be the same as values array length. Run_ends array length is {}, values array length is {}",
1090
0
                        run_ends_data.len, values_data.len
1091
0
                    )));
1092
40
                }
1093
40
                if run_ends_data.nulls.is_some() {
1094
0
                    return Err(ArrowError::InvalidArgumentError(
1095
0
                        "Found null values in run_ends array. The run_ends array should not have null values.".to_string(),
1096
0
                    ));
1097
40
                }
1098
40
                Ok(())
1099
            }
1100
0
            DataType::Union(fields, mode) => {
1101
0
                self.validate_num_child_data(fields.len())?;
1102
1103
0
                for (i, (_, field)) in fields.iter().enumerate() {
1104
0
                    let field_data = self.get_valid_child_data(i, field.data_type())?;
1105
1106
0
                    if mode == &UnionMode::Sparse && field_data.len < (self.len + self.offset) {
1107
0
                        return Err(ArrowError::InvalidArgumentError(format!(
1108
0
                            "Sparse union child array #{} has length smaller than expected for union array ({} < {})",
1109
0
                            i,
1110
0
                            field_data.len,
1111
0
                            self.len + self.offset
1112
0
                        )));
1113
0
                    }
1114
                }
1115
0
                Ok(())
1116
            }
1117
0
            DataType::Dictionary(_key_type, value_type) => {
1118
0
                self.get_single_valid_child_data(value_type)?;
1119
0
                Ok(())
1120
            }
1121
            _ => {
1122
                // other types do not have child data
1123
980
                if !self.child_data.is_empty() {
1124
0
                    return Err(ArrowError::InvalidArgumentError(format!(
1125
0
                        "Expected no child arrays for type {} but got {}",
1126
0
                        self.data_type,
1127
0
                        self.child_data.len()
1128
0
                    )));
1129
980
                }
1130
980
                Ok(())
1131
            }
1132
        }
1133
1.04k
    }
1134
1135
    /// Ensures that this array data has a single child_data with the
1136
    /// expected type, and calls `validate()` on it. Returns a
1137
    /// reference to that child_data
1138
19
    fn get_single_valid_child_data(
1139
19
        &self,
1140
19
        expected_type: &DataType,
1141
19
    ) -> Result<&ArrayData, ArrowError> {
1142
19
        self.validate_num_child_data(1)
?0
;
1143
19
        self.get_valid_child_data(0, expected_type)
1144
19
    }
1145
1146
    /// Returns `Err` if self.child_data does not have exactly `expected_len` elements
1147
61
    fn validate_num_child_data(&self, expected_len: usize) -> Result<(), ArrowError> {
1148
61
        if self.child_data.len() != expected_len {
1149
0
            Err(ArrowError::InvalidArgumentError(format!(
1150
0
                "Value data for {} should contain {} child data array(s), had {}",
1151
0
                self.data_type,
1152
0
                expected_len,
1153
0
                self.child_data.len()
1154
0
            )))
1155
        } else {
1156
61
            Ok(())
1157
        }
1158
61
    }
1159
1160
    /// Ensures that `child_data[i]` has the expected type, calls
1161
    /// `validate()` on it, and returns a reference to that child_data
1162
103
    fn get_valid_child_data(
1163
103
        &self,
1164
103
        i: usize,
1165
103
        expected_type: &DataType,
1166
103
    ) -> Result<&ArrayData, ArrowError> {
1167
103
        let values_data = self.child_data.get(i).ok_or_else(|| 
{0
1168
0
            ArrowError::InvalidArgumentError(format!(
1169
0
                "{} did not have enough child arrays. Expected at least {} but had only {}",
1170
0
                self.data_type,
1171
0
                i + 1,
1172
0
                self.child_data.len()
1173
0
            ))
1174
0
        })?;
1175
1176
103
        if expected_type != &values_data.data_type {
1177
0
            return Err(ArrowError::InvalidArgumentError(format!(
1178
0
                "Child type mismatch for {}. Expected {} but child data had {}",
1179
0
                self.data_type, expected_type, values_data.data_type
1180
0
            )));
1181
103
        }
1182
1183
103
        values_data.validate()
?0
;
1184
103
        Ok(values_data)
1185
103
    }
1186
1187
    /// Validate that the data contained within this [`ArrayData`] is valid
1188
    ///
1189
    /// 1. Null count is correct
1190
    /// 2. All offsets are valid
1191
    /// 3. All String data is valid UTF-8
1192
    /// 4. All dictionary offsets are valid
1193
    ///
1194
    /// Internally this calls:
1195
    ///
1196
    /// * [`Self::validate`]
1197
    /// * [`Self::validate_nulls`]
1198
    /// * [`Self::validate_values`]
1199
    ///
1200
    /// Note: this does not recurse into children, for a recursive variant
1201
    /// see [`Self::validate_full`]
1202
170
    pub fn validate_data(&self) -> Result<(), ArrowError> {
1203
170
        self.validate()
?0
;
1204
1205
170
        self.validate_nulls()
?0
;
1206
170
        self.validate_values()
?0
;
1207
170
        Ok(())
1208
170
    }
1209
1210
    /// Performs a full recursive validation of this [`ArrayData`] and all its children
1211
    ///
1212
    /// This is equivalent to calling [`Self::validate_data`] on this [`ArrayData`]
1213
    /// and all its children recursively
1214
0
    pub fn validate_full(&self) -> Result<(), ArrowError> {
1215
0
        self.validate_data()?;
1216
        // validate all children recursively
1217
0
        self.child_data
1218
0
            .iter()
1219
0
            .enumerate()
1220
0
            .try_for_each(|(i, child_data)| {
1221
0
                child_data.validate_full().map_err(|e| {
1222
0
                    ArrowError::InvalidArgumentError(format!(
1223
0
                        "{} child #{} invalid: {}",
1224
0
                        self.data_type, i, e
1225
0
                    ))
1226
0
                })
1227
0
            })?;
1228
0
        Ok(())
1229
0
    }
1230
1231
    /// Validates the values stored within this [`ArrayData`] are valid
1232
    /// without recursing into child [`ArrayData`]
1233
    ///
1234
    /// Does not (yet) check
1235
    /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85)
1236
    /// 2. the the null count is correct and that any
1237
    /// 3. nullability requirements of its children are correct
1238
    ///
1239
    /// [#85]: https://github.com/apache/arrow-rs/issues/85
1240
170
    pub fn validate_nulls(&self) -> Result<(), ArrowError> {
1241
170
        if let Some(
nulls13
) = &self.nulls {
1242
13
            let actual = nulls.len() - nulls.inner().count_set_bits();
1243
13
            if actual != nulls.null_count() {
1244
0
                return Err(ArrowError::InvalidArgumentError(format!(
1245
0
                    "null_count value ({}) doesn't match actual number of nulls in array ({})",
1246
0
                    nulls.null_count(),
1247
0
                    actual
1248
0
                )));
1249
13
            }
1250
157
        }
1251
1252
        // In general non-nullable children should not contain nulls, however, for certain
1253
        // types, such as StructArray and FixedSizeList, nulls in the parent take up
1254
        // space in the child. As such we permit nulls in the children in the corresponding
1255
        // positions for such types
1256
170
        match &self.data_type {
1257
8
            DataType::List(
f7
) | DataType::LargeList(f) | DataType::Map(
f2
, _) => {
1258
17
                if !f.is_nullable() {
1259
9
                    self.validate_non_nullable(None, &self.child_data[0])
?0
1260
8
                }
1261
            }
1262
2
            DataType::FixedSizeList(field, len) => {
1263
2
                let child = &self.child_data[0];
1264
2
                if !field.is_nullable() {
1265
2
                    match &self.nulls {
1266
1
                        Some(nulls) => {
1267
1
                            let element_len = *len as usize;
1268
1
                            let expanded = nulls.expand(element_len);
1269
1
                            self.validate_non_nullable(Some(&expanded), child)
?0
;
1270
                        }
1271
1
                        None => self.validate_non_nullable(None, child)
?0
,
1272
                    }
1273
0
                }
1274
            }
1275
0
            DataType::Struct(fields) => {
1276
0
                for (field, child) in fields.iter().zip(&self.child_data) {
1277
0
                    if !field.is_nullable() {
1278
0
                        self.validate_non_nullable(self.nulls(), child)?
1279
0
                    }
1280
                }
1281
            }
1282
151
            _ => {}
1283
        }
1284
1285
170
        Ok(())
1286
170
    }
1287
1288
    /// Verifies that `child` contains no nulls not present in `mask`
1289
11
    fn validate_non_nullable(
1290
11
        &self,
1291
11
        mask: Option<&NullBuffer>,
1292
11
        child: &ArrayData,
1293
11
    ) -> Result<(), ArrowError> {
1294
11
        let 
mask1
= match mask {
1295
1
            Some(mask) => mask,
1296
            None => {
1297
10
                return match child.null_count() {
1298
10
                    0 => Ok(()),
1299
0
                    _ => Err(ArrowError::InvalidArgumentError(format!(
1300
0
                        "non-nullable child of type {} contains nulls not present in parent {}",
1301
0
                        child.data_type, self.data_type
1302
0
                    ))),
1303
                };
1304
            }
1305
        };
1306
1307
1
        match child.nulls() {
1308
0
            Some(nulls) if !mask.contains(nulls) => Err(ArrowError::InvalidArgumentError(format!(
1309
0
                "non-nullable child of type {} contains nulls not present in parent",
1310
0
                child.data_type
1311
0
            ))),
1312
1
            _ => Ok(()),
1313
        }
1314
11
    }
1315
1316
    /// Validates the values stored within this [`ArrayData`] are valid
1317
    /// without recursing into child [`ArrayData`]
1318
    ///
1319
    /// Does not (yet) check
1320
    /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85)
1321
170
    pub fn validate_values(&self) -> Result<(), ArrowError> {
1322
170
        match &self.data_type {
1323
0
            DataType::Utf8 => self.validate_utf8::<i32>(),
1324
0
            DataType::LargeUtf8 => self.validate_utf8::<i64>(),
1325
0
            DataType::Binary => self.validate_offsets_full::<i32>(self.buffers[1].len()),
1326
0
            DataType::LargeBinary => self.validate_offsets_full::<i64>(self.buffers[1].len()),
1327
            DataType::BinaryView => {
1328
0
                let views = self.typed_buffer::<u128>(0, self.len)?;
1329
0
                validate_binary_view(views, &self.buffers[1..])
1330
            }
1331
            DataType::Utf8View => {
1332
0
                let views = self.typed_buffer::<u128>(0, self.len)?;
1333
0
                validate_string_view(views, &self.buffers[1..])
1334
            }
1335
            DataType::List(_) | DataType::Map(_, _) => {
1336
9
                let child = &self.child_data[0];
1337
9
                self.validate_offsets_full::<i32>(child.len)
1338
            }
1339
            DataType::LargeList(_) => {
1340
8
                let child = &self.child_data[0];
1341
8
                self.validate_offsets_full::<i64>(child.len)
1342
            }
1343
            DataType::Union(_, _) => {
1344
                // Validate Union Array as part of implementing new Union semantics
1345
                // See comments in `ArrayData::validate()`
1346
                // https://github.com/apache/arrow-rs/issues/85
1347
                //
1348
                // TODO file follow on ticket for full union validation
1349
0
                Ok(())
1350
            }
1351
0
            DataType::Dictionary(key_type, _value_type) => {
1352
0
                let dictionary_length: i64 = self.child_data[0].len.try_into().unwrap();
1353
0
                let max_value = dictionary_length - 1;
1354
0
                match key_type.as_ref() {
1355
0
                    DataType::UInt8 => self.check_bounds::<u8>(max_value),
1356
0
                    DataType::UInt16 => self.check_bounds::<u16>(max_value),
1357
0
                    DataType::UInt32 => self.check_bounds::<u32>(max_value),
1358
0
                    DataType::UInt64 => self.check_bounds::<u64>(max_value),
1359
0
                    DataType::Int8 => self.check_bounds::<i8>(max_value),
1360
0
                    DataType::Int16 => self.check_bounds::<i16>(max_value),
1361
0
                    DataType::Int32 => self.check_bounds::<i32>(max_value),
1362
0
                    DataType::Int64 => self.check_bounds::<i64>(max_value),
1363
0
                    _ => unreachable!(),
1364
                }
1365
            }
1366
40
            DataType::RunEndEncoded(run_ends, _values) => {
1367
40
                let run_ends_data = self.child_data()[0].clone();
1368
40
                match run_ends.data_type() {
1369
5
                    DataType::Int16 => run_ends_data.check_run_ends::<i16>(),
1370
26
                    DataType::Int32 => run_ends_data.check_run_ends::<i32>(),
1371
9
                    DataType::Int64 => run_ends_data.check_run_ends::<i64>(),
1372
0
                    _ => unreachable!(),
1373
                }
1374
            }
1375
            _ => {
1376
                // No extra validation check required for other types
1377
113
                Ok(())
1378
            }
1379
        }
1380
170
    }
1381
1382
    /// Calls the `validate(item_index, range)` function for each of
1383
    /// the ranges specified in the arrow offsets buffer of type
1384
    /// `T`. Also validates that each offset is smaller than
1385
    /// `offset_limit`
1386
    ///
1387
    /// For an empty array, the offsets buffer can either be empty
1388
    /// or contain a single `0`.
1389
    ///
1390
    /// For example, the offsets buffer contained `[1, 2, 4]`, this
1391
    /// function would call `validate([1,2])`, and `validate([2,4])`
1392
17
    fn validate_each_offset<T, V>(&self, offset_limit: usize, validate: V) -> Result<(), ArrowError>
1393
17
    where
1394
17
        T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1395
17
        V: Fn(usize, Range<usize>) -> Result<(), ArrowError>,
1396
    {
1397
17
        self.typed_offsets::<T>()
?0
1398
17
            .iter()
1399
17
            .enumerate()
1400
83
            .
map17
(|(i, x)| {
1401
                // check if the offset can be converted to usize
1402
83
                let r = x.to_usize().ok_or_else(|| 
{0
1403
0
                    ArrowError::InvalidArgumentError(format!(
1404
0
                        "Offset invariant failure: Could not convert offset {x} to usize at position {i}"))}
1405
                    );
1406
                // check if the offset exceeds the limit
1407
83
                match r {
1408
83
                    Ok(n) if n <= offset_limit => Ok((i, n)),
1409
0
                    Ok(_) => Err(ArrowError::InvalidArgumentError(format!(
1410
0
                        "Offset invariant failure: offset at position {i} out of bounds: {x} > {offset_limit}"))
1411
0
                    ),
1412
0
                    Err(e) => Err(e),
1413
                }
1414
83
            })
1415
83
            .
scan17
(0_usize, |start, end| {
1416
                // check offsets are monotonically increasing
1417
83
                match end {
1418
83
                    Ok((i, end)) if *start <= end => {
1419
83
                        let range = Some(Ok((i, *start..end)));
1420
83
                        *start = end;
1421
83
                        range
1422
                    }
1423
0
                    Ok((i, end)) => Some(Err(ArrowError::InvalidArgumentError(format!(
1424
0
                        "Offset invariant failure: non-monotonic offset at slot {}: {} > {}",
1425
0
                        i - 1, start, end))
1426
0
                    )),
1427
0
                    Err(err) => Some(Err(err)),
1428
                }
1429
83
            })
1430
17
            .skip(1) // the first element is meaningless
1431
66
            .
try_for_each17
(|res: Result<(usize, Range<usize>), ArrowError>| {
1432
66
                let (item_index, range) = res
?0
;
1433
66
                validate(item_index-1, range)
1434
66
            })
1435
17
    }
1436
1437
    /// Ensures that all strings formed by the offsets in `buffers[0]`
1438
    /// into `buffers[1]` are valid utf8 sequences
1439
0
    fn validate_utf8<T>(&self) -> Result<(), ArrowError>
1440
0
    where
1441
0
        T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1442
    {
1443
0
        let values_buffer = &self.buffers[1].as_slice();
1444
0
        if let Ok(values_str) = std::str::from_utf8(values_buffer) {
1445
            // Validate Offsets are correct
1446
0
            self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1447
0
                if !values_str.is_char_boundary(range.start)
1448
0
                    || !values_str.is_char_boundary(range.end)
1449
                {
1450
0
                    return Err(ArrowError::InvalidArgumentError(format!(
1451
0
                        "incomplete utf-8 byte sequence from index {string_index}"
1452
0
                    )));
1453
0
                }
1454
0
                Ok(())
1455
0
            })
1456
        } else {
1457
            // find specific offset that failed utf8 validation
1458
0
            self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1459
0
                std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
1460
0
                    ArrowError::InvalidArgumentError(format!(
1461
0
                        "Invalid UTF8 sequence at string index {string_index} ({range:?}): {e}"
1462
0
                    ))
1463
0
                })?;
1464
0
                Ok(())
1465
0
            })
1466
        }
1467
0
    }
1468
1469
    /// Ensures that all offsets in `buffers[0]` into `buffers[1]` are
1470
    /// between `0` and `offset_limit`
1471
17
    fn validate_offsets_full<T>(&self, offset_limit: usize) -> Result<(), ArrowError>
1472
17
    where
1473
17
        T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1474
    {
1475
66
        
self17
.
validate_each_offset17
::<T, _>(
offset_limit17
, |_string_index, _range| {
1476
            // No validation applied to each value, but the iteration
1477
            // itself applies bounds checking to each range
1478
66
            Ok(())
1479
66
        })
1480
17
    }
1481
1482
    /// Validates that each value in self.buffers (typed as T)
1483
    /// is within the range [0, max_value], inclusive
1484
0
    fn check_bounds<T>(&self, max_value: i64) -> Result<(), ArrowError>
1485
0
    where
1486
0
        T: ArrowNativeType + TryInto<i64> + num_traits::Num + std::fmt::Display,
1487
    {
1488
0
        let required_len = self.len + self.offset;
1489
0
        let buffer = &self.buffers[0];
1490
1491
        // This should have been checked as part of `validate()` prior
1492
        // to calling `validate_full()` but double check to be sure
1493
0
        assert!(buffer.len() / mem::size_of::<T>() >= required_len);
1494
1495
        // Justification: buffer size was validated above
1496
0
        let indexes: &[T] = &buffer.typed_data::<T>()[self.offset..self.offset + self.len];
1497
1498
0
        indexes.iter().enumerate().try_for_each(|(i, &dict_index)| {
1499
            // Do not check the value is null (value can be arbitrary)
1500
0
            if self.is_null(i) {
1501
0
                return Ok(());
1502
0
            }
1503
0
            let dict_index: i64 = dict_index.try_into().map_err(|_| {
1504
0
                ArrowError::InvalidArgumentError(format!(
1505
0
                    "Value at position {i} out of bounds: {dict_index} (can not convert to i64)"
1506
0
                ))
1507
0
            })?;
1508
1509
0
            if dict_index < 0 || dict_index > max_value {
1510
0
                return Err(ArrowError::InvalidArgumentError(format!(
1511
0
                    "Value at position {i} out of bounds: {dict_index} (should be in [0, {max_value}])"
1512
0
                )));
1513
0
            }
1514
0
            Ok(())
1515
0
        })
1516
0
    }
1517
1518
    /// Validates that each value in run_ends array is positive and strictly increasing.
1519
40
    fn check_run_ends<T>(&self) -> Result<(), ArrowError>
1520
40
    where
1521
40
        T: ArrowNativeType + TryInto<i64> + num_traits::Num + std::fmt::Display,
1522
    {
1523
40
        let values = self.typed_buffer::<T>(0, self.len)
?0
;
1524
40
        let mut prev_value: i64 = 0_i64;
1525
117
        
values40
.iter().enumerate().
try_for_each40
(|(ix, &inp_value)| {
1526
117
            let value: i64 = inp_value.try_into().map_err(|_| 
{0
1527
0
                ArrowError::InvalidArgumentError(format!(
1528
0
                    "Value at position {ix} out of bounds: {inp_value} (can not convert to i64)"
1529
0
                ))
1530
0
            })?;
1531
117
            if value <= 0_i64 {
1532
0
                return Err(ArrowError::InvalidArgumentError(format!(
1533
0
                    "The values in run_ends array should be strictly positive. Found value {value} at index {ix} that does not match the criteria."
1534
0
                )));
1535
117
            }
1536
117
            if ix > 0 && 
value <= prev_value77
{
1537
0
                return Err(ArrowError::InvalidArgumentError(format!(
1538
0
                    "The values in run_ends array should be strictly increasing. Found value {value} at index {ix} with previous value {prev_value} that does not match the criteria."
1539
0
                )));
1540
117
            }
1541
1542
117
            prev_value = value;
1543
117
            Ok(())
1544
117
        })
?0
;
1545
1546
40
        if prev_value.as_usize() < (self.offset + self.len) {
1547
0
            return Err(ArrowError::InvalidArgumentError(format!(
1548
0
                "The offset + length of array should be less or equal to last value in the run_ends array. The last value of run_ends array is {prev_value} and offset + length of array is {}.",
1549
0
                self.offset + self.len
1550
0
            )));
1551
40
        }
1552
40
        Ok(())
1553
40
    }
1554
1555
    /// Returns true if this `ArrayData` is equal to `other`, using pointer comparisons
1556
    /// to determine buffer equality. This is cheaper than `PartialEq::eq` but may
1557
    /// return false when the arrays are logically equal
1558
17
    pub fn ptr_eq(&self, other: &Self) -> bool {
1559
17
        if self.offset != other.offset
1560
17
            || self.len != other.len
1561
8
            || self.data_type != other.data_type
1562
8
            || self.buffers.len() != other.buffers.len()
1563
8
            || self.child_data.len() != other.child_data.len()
1564
        {
1565
9
            return false;
1566
8
        }
1567
1568
8
        match (&self.nulls, &other.nulls) {
1569
0
            (Some(a), Some(b)) if !a.inner().ptr_eq(b.inner()) => return false,
1570
0
            (Some(_), None) | (None, Some(_)) => return false,
1571
8
            _ => {}
1572
        };
1573
1574
8
        if !self
1575
8
            .buffers
1576
8
            .iter()
1577
8
            .zip(other.buffers.iter())
1578
12
            .
all8
(|(a, b)| a.as_ptr() == b.as_ptr())
1579
        {
1580
1
            return false;
1581
7
        }
1582
1583
7
        self.child_data
1584
7
            .iter()
1585
7
            .zip(other.child_data.iter())
1586
7
            .all(|(a, b)| 
a0
.
ptr_eq0
(
b0
))
1587
17
    }
1588
1589
    /// Converts this [`ArrayData`] into an [`ArrayDataBuilder`]
1590
321k
    pub fn into_builder(self) -> ArrayDataBuilder {
1591
321k
        self.into()
1592
321k
    }
1593
}
1594
1595
/// Return the expected [`DataTypeLayout`] Arrays of this data
1596
/// type are expected to have
1597
1.05k
pub fn layout(data_type: &DataType) -> DataTypeLayout {
1598
    // based on C/C++ implementation in
1599
    // https://github.com/apache/arrow/blob/661c7d749150905a63dd3b52e0a04dac39030d95/cpp/src/arrow/type.h (and .cc)
1600
    use arrow_schema::IntervalUnit::*;
1601
1602
0
    match data_type {
1603
0
        DataType::Null => DataTypeLayout {
1604
0
            buffers: vec![],
1605
0
            can_contain_null_mask: false,
1606
0
            variadic: false,
1607
0
        },
1608
105
        DataType::Boolean => DataTypeLayout {
1609
105
            buffers: vec![BufferSpec::BitMap],
1610
105
            can_contain_null_mask: true,
1611
105
            variadic: false,
1612
105
        },
1613
0
        DataType::Int8 => DataTypeLayout::new_fixed_width::<i8>(),
1614
8
        DataType::Int16 => DataTypeLayout::new_fixed_width::<i16>(),
1615
846
        DataType::Int32 => DataTypeLayout::new_fixed_width::<i32>(),
1616
16
        DataType::Int64 => DataTypeLayout::new_fixed_width::<i64>(),
1617
0
        DataType::UInt8 => DataTypeLayout::new_fixed_width::<u8>(),
1618
0
        DataType::UInt16 => DataTypeLayout::new_fixed_width::<u16>(),
1619
2
        DataType::UInt32 => DataTypeLayout::new_fixed_width::<u32>(),
1620
0
        DataType::UInt64 => DataTypeLayout::new_fixed_width::<u64>(),
1621
0
        DataType::Float16 => DataTypeLayout::new_fixed_width::<half::f16>(),
1622
0
        DataType::Float32 => DataTypeLayout::new_fixed_width::<f32>(),
1623
0
        DataType::Float64 => DataTypeLayout::new_fixed_width::<f64>(),
1624
0
        DataType::Timestamp(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1625
0
        DataType::Date32 => DataTypeLayout::new_fixed_width::<i32>(),
1626
0
        DataType::Date64 => DataTypeLayout::new_fixed_width::<i64>(),
1627
0
        DataType::Time32(_) => DataTypeLayout::new_fixed_width::<i32>(),
1628
0
        DataType::Time64(_) => DataTypeLayout::new_fixed_width::<i64>(),
1629
0
        DataType::Interval(YearMonth) => DataTypeLayout::new_fixed_width::<i32>(),
1630
0
        DataType::Interval(DayTime) => DataTypeLayout::new_fixed_width::<IntervalDayTime>(),
1631
        DataType::Interval(MonthDayNano) => {
1632
0
            DataTypeLayout::new_fixed_width::<IntervalMonthDayNano>()
1633
        }
1634
0
        DataType::Duration(_) => DataTypeLayout::new_fixed_width::<i64>(),
1635
0
        DataType::Decimal32(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1636
0
        DataType::Decimal64(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1637
0
        DataType::Decimal128(_, _) => DataTypeLayout::new_fixed_width::<i128>(),
1638
0
        DataType::Decimal256(_, _) => DataTypeLayout::new_fixed_width::<i256>(),
1639
0
        DataType::FixedSizeBinary(size) => {
1640
0
            let spec = BufferSpec::FixedWidth {
1641
0
                byte_width: (*size).try_into().unwrap(),
1642
0
                alignment: mem::align_of::<u8>(),
1643
0
            };
1644
0
            DataTypeLayout {
1645
0
                buffers: vec![spec],
1646
0
                can_contain_null_mask: true,
1647
0
                variadic: false,
1648
0
            }
1649
        }
1650
0
        DataType::Binary => DataTypeLayout::new_binary::<i32>(),
1651
0
        DataType::LargeBinary => DataTypeLayout::new_binary::<i64>(),
1652
9
        DataType::Utf8 => DataTypeLayout::new_binary::<i32>(),
1653
0
        DataType::LargeUtf8 => DataTypeLayout::new_binary::<i64>(),
1654
0
        DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(),
1655
2
        DataType::FixedSizeList(_, _) => DataTypeLayout::new_nullable_empty(), // all in child data
1656
7
        DataType::List(_) => DataTypeLayout::new_fixed_width::<i32>(),
1657
0
        DataType::ListView(_) => DataTypeLayout::new_list_view::<i32>(),
1658
0
        DataType::LargeListView(_) => DataTypeLayout::new_list_view::<i64>(),
1659
8
        DataType::LargeList(_) => DataTypeLayout::new_fixed_width::<i64>(),
1660
2
        DataType::Map(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1661
2
        DataType::Struct(_) => DataTypeLayout::new_nullable_empty(), // all in child data,
1662
40
        DataType::RunEndEncoded(_, _) => DataTypeLayout::new_empty(), // all in child data,
1663
3
        DataType::Union(_, mode) => {
1664
3
            let type_ids = BufferSpec::FixedWidth {
1665
3
                byte_width: mem::size_of::<i8>(),
1666
3
                alignment: mem::align_of::<i8>(),
1667
3
            };
1668
1669
            DataTypeLayout {
1670
3
                buffers: match mode {
1671
                    UnionMode::Sparse => {
1672
3
                        vec![type_ids]
1673
                    }
1674
                    UnionMode::Dense => {
1675
0
                        vec![
1676
0
                            type_ids,
1677
0
                            BufferSpec::FixedWidth {
1678
0
                                byte_width: mem::size_of::<i32>(),
1679
0
                                alignment: mem::align_of::<i32>(),
1680
0
                            },
1681
                        ]
1682
                    }
1683
                },
1684
                can_contain_null_mask: false,
1685
                variadic: false,
1686
            }
1687
        }
1688
0
        DataType::Dictionary(key_type, _value_type) => layout(key_type),
1689
    }
1690
1.05k
}
1691
1692
/// Layout specification for a data type
#[derive(Debug, PartialEq, Eq)]
// Note: Follows structure from C++: https://github.com/apache/arrow/blob/master/cpp/src/arrow/type.h#L91
pub struct DataTypeLayout {
    /// A vector of buffer layout specifications, one for each expected buffer
    pub buffers: Vec<BufferSpec>,

    /// Can contain a null bitmask
    pub can_contain_null_mask: bool,

    /// This field only applies to the view type [`DataType::BinaryView`] and [`DataType::Utf8View`]
    /// If `variadic` is true, the number of buffers expected is only lower-bounded by
    /// buffers.len(). Buffers that exceed the lower bound are legal.
    pub variadic: bool,
}

impl DataTypeLayout {
    /// Describes a basic numeric array where each element has type `T`
    pub fn new_fixed_width<T>() -> Self {
        Self {
            buffers: vec![BufferSpec::FixedWidth {
                byte_width: mem::size_of::<T>(),
                alignment: mem::align_of::<T>(),
            }],
            can_contain_null_mask: true,
            variadic: false,
        }
    }

    /// Describes arrays which have no data of their own
    /// but may still have a Null Bitmap (e.g. FixedSizeList)
    pub fn new_nullable_empty() -> Self {
        Self {
            buffers: vec![],
            can_contain_null_mask: true,
            variadic: false,
        }
    }

    /// Describes arrays which have no data of their own
    /// (e.g. RunEndEncoded).
    pub fn new_empty() -> Self {
        Self {
            buffers: vec![],
            can_contain_null_mask: false,
            variadic: false,
        }
    }

    /// Describes a variable width array: a fixed width offsets buffer
    /// with elements of type `T`, followed by a variable width data buffer
    pub fn new_binary<T>() -> Self {
        Self {
            buffers: vec![
                // offsets
                BufferSpec::FixedWidth {
                    byte_width: mem::size_of::<T>(),
                    alignment: mem::align_of::<T>(),
                },
                // values
                BufferSpec::VariableWidth,
            ],
            can_contain_null_mask: true,
            variadic: false,
        }
    }

    /// Describes a view type: one fixed width buffer of `u128`-sized
    /// elements, optionally followed by any number of additional buffers
    /// (`variadic` is true)
    pub fn new_view() -> Self {
        Self {
            buffers: vec![BufferSpec::FixedWidth {
                byte_width: mem::size_of::<u128>(),
                alignment: mem::align_of::<u128>(),
            }],
            can_contain_null_mask: true,
            variadic: true,
        }
    }

    /// Describes a list view type: exactly two fixed width buffers whose
    /// elements have type `T`
    pub fn new_list_view<T>() -> Self {
        Self {
            buffers: vec![
                BufferSpec::FixedWidth {
                    byte_width: mem::size_of::<T>(),
                    alignment: mem::align_of::<T>(),
                },
                BufferSpec::FixedWidth {
                    byte_width: mem::size_of::<T>(),
                    alignment: mem::align_of::<T>(),
                },
            ],
            can_contain_null_mask: true,
            // A list view has a fixed number of buffers; `variadic` only
            // applies to the BinaryView / Utf8View types.
            variadic: false,
        }
    }
}

/// Layout specification for a single data type buffer
#[derive(Debug, PartialEq, Eq)]
pub enum BufferSpec {
    /// Each element is a fixed width primitive, with the given `byte_width` and `alignment`
    ///
    /// `alignment` is the alignment required by Rust for an array of the corresponding primitive,
    /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`].
    ///
    /// Arrow-rs requires that all buffers have at least this alignment, to allow for
    /// [slice](std::slice) based APIs. Alignment in excess of this is not required to allow
    /// for array slicing and interoperability with `Vec`, which cannot be over-aligned.
    ///
    /// Note that these alignment requirements will vary between architectures
    FixedWidth {
        /// The width of each element in bytes
        byte_width: usize,
        /// The alignment required by Rust for an array of the corresponding primitive
        alignment: usize,
    },
    /// Variable width, such as string data for utf8 data
    VariableWidth,
    /// Buffer holds a bitmap.
    ///
    /// Note: Unlike the C++ implementation, the null/validity buffer
    /// is handled specially rather than as another of the buffers in
    /// the spec, so this variant is only used for the Boolean type.
    BitMap,
    /// Buffer is always null. Unused currently in Rust implementation,
    /// (used in C++ for Union type)
    #[allow(dead_code)]
    AlwaysNull,
}
1823
1824
impl PartialEq for ArrayData {
1825
81.1k
    fn eq(&self, other: &Self) -> bool {
1826
81.1k
        equal::equal(self, other)
1827
81.1k
    }
1828
}
1829
1830
/// A boolean flag that cannot be mutated outside of unsafe code.
///
/// Defaults to a value of false.
///
/// This structure is used to enforce safety in the [`ArrayDataBuilder`]
///
/// [`ArrayDataBuilder`]: super::ArrayDataBuilder
///
/// # Example
/// ```rust
/// use arrow_data::UnsafeFlag;
/// assert!(!UnsafeFlag::default().get()); // default is false
/// let mut flag = UnsafeFlag::new();
/// assert!(!flag.get()); // defaults to false
/// // can only set it to true in unsafe code
/// unsafe { flag.set(true) };
/// assert!(flag.get()); // now true
/// ```
#[derive(Debug, Clone)]
#[doc(hidden)]
pub struct UnsafeFlag(bool);

impl UnsafeFlag {
    /// Creates a new `UnsafeFlag` with the value set to `false`.
    ///
    /// See the example on [`UnsafeFlag`]
    #[inline]
    pub const fn new() -> Self {
        UnsafeFlag(false)
    }

    /// Overwrites the flag with `val`
    ///
    /// Note this can purposely only be done in `unsafe` code
    ///
    /// # Safety
    ///
    /// If set, the flag will be set to the given value. There is nothing
    /// immediately unsafe about doing so, however, the flag can be used to
    /// subsequently bypass safety checks in the [`ArrayDataBuilder`].
    #[inline]
    pub unsafe fn set(&mut self, val: bool) {
        let Self(slot) = self;
        *slot = val;
    }

    /// Returns the current value of the flag
    #[inline]
    pub fn get(&self) -> bool {
        let Self(value) = self;
        *value
    }
}

// Manual impl to make it clear you can not construct unsafe with true
impl Default for UnsafeFlag {
    fn default() -> Self {
        UnsafeFlag::new()
    }
}
1888
1889
/// Builder for [`ArrayData`] type
///
/// Fields are filled in through the chained setter methods and turned into
/// an [`ArrayData`] by `build()`.
#[derive(Debug)]
pub struct ArrayDataBuilder {
    /// Data type of the array being built
    data_type: DataType,
    /// Length of the array being built
    len: usize,
    /// Null count supplied by the caller, if any. `build()` only uses it
    /// together with `null_bit_buffer`.
    null_count: Option<usize>,
    /// Validity bitmap as a plain buffer. `build()` only consults it when
    /// `nulls` is `None`.
    null_bit_buffer: Option<Buffer>,
    /// Null buffer; takes precedence over `null_bit_buffer` in `build()`
    nulls: Option<NullBuffer>,
    /// Offset of the array being built
    offset: usize,
    /// Buffers of the array being built
    buffers: Vec<Buffer>,
    /// Child data of the array being built
    child_data: Vec<ArrayData>,
    /// Should buffers be realigned (copying if necessary)?
    ///
    /// Defaults to false.
    align_buffers: bool,
    /// Should data validation be skipped for this [`ArrayData`]?
    ///
    /// Defaults to false.
    ///
    /// # Safety
    ///
    /// This flag can only be set to true using `unsafe` APIs. However, once true
    /// subsequent calls to `build()` may result in undefined behavior if the data
    /// is not valid.
    skip_validation: UnsafeFlag,
}
1915
1916
impl ArrayDataBuilder {
1917
    #[inline]
1918
    /// Creates a new array data builder
1919
647k
    pub const fn new(data_type: DataType) -> Self {
1920
647k
        Self {
1921
647k
            data_type,
1922
647k
            len: 0,
1923
647k
            null_count: None,
1924
647k
            null_bit_buffer: None,
1925
647k
            nulls: None,
1926
647k
            offset: 0,
1927
647k
            buffers: vec![],
1928
647k
            child_data: vec![],
1929
647k
            align_buffers: false,
1930
647k
            skip_validation: UnsafeFlag::new(),
1931
647k
        }
1932
647k
    }
1933
1934
    /// Creates a new array data builder from an existing one, changing the data type
1935
320k
    pub fn data_type(self, data_type: DataType) -> Self {
1936
320k
        Self { data_type, ..self }
1937
320k
    }
1938
1939
    #[inline]
1940
    #[allow(clippy::len_without_is_empty)]
1941
    /// Sets the length of the [ArrayData]
1942
647k
    pub const fn len(mut self, n: usize) -> Self {
1943
647k
        self.len = n;
1944
647k
        self
1945
647k
    }
1946
1947
    /// Sets the null buffer of the [ArrayData]
1948
647k
    pub fn nulls(mut self, nulls: Option<NullBuffer>) -> Self {
1949
647k
        self.nulls = nulls;
1950
647k
        self.null_count = None;
1951
647k
        self.null_bit_buffer = None;
1952
647k
        self
1953
647k
    }
1954
1955
    /// Sets the null count of the [ArrayData]
1956
547
    pub fn null_count(mut self, null_count: usize) -> Self {
1957
547
        self.null_count = Some(null_count);
1958
547
        self
1959
547
    }
1960
1961
    /// Sets the `null_bit_buffer` of the [ArrayData]
1962
565
    pub fn null_bit_buffer(mut self, buf: Option<Buffer>) -> Self {
1963
565
        self.nulls = None;
1964
565
        self.null_bit_buffer = buf;
1965
565
        self
1966
565
    }
1967
1968
    /// Sets the offset of the [ArrayData]
1969
    #[inline]
1970
262
    pub const fn offset(mut self, n: usize) -> Self {
1971
262
        self.offset = n;
1972
262
        self
1973
262
    }
1974
1975
    /// Sets the buffers of the [ArrayData]
1976
485k
    pub fn buffers(mut self, v: Vec<Buffer>) -> Self {
1977
485k
        self.buffers = v;
1978
485k
        self
1979
485k
    }
1980
1981
    /// Adds a single buffer to the [ArrayData]'s buffers
1982
243k
    pub fn add_buffer(mut self, b: Buffer) -> Self {
1983
243k
        self.buffers.push(b);
1984
243k
        self
1985
243k
    }
1986
1987
    /// Adds multiple buffers to the [ArrayData]'s buffers
1988
84
    pub fn add_buffers<I: IntoIterator<Item = Buffer>>(mut self, bs: I) -> Self {
1989
84
        self.buffers.extend(bs);
1990
84
        self
1991
84
    }
1992
1993
    /// Sets the child data of the [ArrayData]
1994
320k
    pub fn child_data(mut self, v: Vec<ArrayData>) -> Self {
1995
320k
        self.child_data = v;
1996
320k
        self
1997
320k
    }
1998
1999
    /// Adds a single child data to the [ArrayData]'s child data
2000
106
    pub fn add_child_data(mut self, r: ArrayData) -> Self {
2001
106
        self.child_data.push(r);
2002
106
        self
2003
106
    }
2004
2005
    /// Creates an array data, without any validation
2006
    ///
2007
    /// Note: This is shorthand for
2008
    /// ```rust
2009
    /// # #[expect(unsafe_op_in_unsafe_fn)]
2010
    /// # let mut builder = arrow_data::ArrayDataBuilder::new(arrow_schema::DataType::Null);
2011
    /// # let _ = unsafe {
2012
    /// builder.skip_validation(true).build().unwrap()
2013
    /// # };
2014
    /// ```
2015
    ///
2016
    /// # Safety
2017
    ///
2018
    /// The same caveats as [`ArrayData::new_unchecked`]
2019
    /// apply.
2020
968k
    pub unsafe fn build_unchecked(self) -> ArrayData {
2021
968k
        unsafe { self.skip_validation(true) }.build().unwrap()
2022
968k
    }
2023
2024
    /// Creates an `ArrayData`, consuming `self`
2025
    ///
2026
    /// # Safety
2027
    ///
2028
    /// By default the underlying buffers are checked to ensure they are valid
2029
    /// Arrow data. However, if the [`Self::skip_validation`] flag has been set
2030
    /// to true (by the `unsafe` API) this validation is skipped. If the data is
2031
    /// not valid, undefined behavior will result.
2032
970k
    pub fn build(self) -> Result<ArrayData, ArrowError> {
2033
        let Self {
2034
970k
            data_type,
2035
970k
            len,
2036
970k
            null_count,
2037
970k
            null_bit_buffer,
2038
970k
            nulls,
2039
970k
            offset,
2040
970k
            buffers,
2041
970k
            child_data,
2042
970k
            align_buffers,
2043
970k
            skip_validation,
2044
970k
        } = self;
2045
2046
970k
        let nulls = nulls
2047
970k
            .or_else(|| 
{965k
2048
965k
                let 
buffer2.69k
= null_bit_buffer
?963k
;
2049
2.69k
                let buffer = BooleanBuffer::new(buffer, offset, len);
2050
2.69k
                Some(match null_count {
2051
545
                    Some(n) => {
2052
                        // SAFETY: call to `data.validate_data()` below validates the null buffer is valid
2053
545
                        unsafe { NullBuffer::new_unchecked(buffer, n) }
2054
                    }
2055
2.14k
                    None => NullBuffer::new(buffer),
2056
                })
2057
965k
            })
2058
970k
            .filter(|b| 
b7.78k
.
null_count7.78k
() != 0);
2059
2060
970k
        let mut data = ArrayData {
2061
970k
            data_type,
2062
970k
            len,
2063
970k
            offset,
2064
970k
            buffers,
2065
970k
            child_data,
2066
970k
            nulls,
2067
970k
        };
2068
2069
970k
        if align_buffers {
2070
0
            data.align_buffers();
2071
970k
        }
2072
2073
        // SAFETY: `skip_validation` is only set to true using `unsafe` APIs
2074
970k
        if !skip_validation.get() || 
cfg!970k
(feature = "force_validate") {
2075
128
            data.validate_data()
?0
;
2076
970k
        }
2077
970k
        Ok(data)
2078
970k
    }
2079
2080
    /// Creates an array data, validating all inputs, and aligning any buffers
2081
    #[deprecated(since = "54.1.0", note = "Use ArrayData::align_buffers instead")]
2082
0
    pub fn build_aligned(self) -> Result<ArrayData, ArrowError> {
2083
0
        self.align_buffers(true).build()
2084
0
    }
2085
2086
    /// Ensure that all buffers are aligned, copying data if necessary
2087
    ///
2088
    /// Rust requires that arrays are aligned to their corresponding primitive,
2089
    /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`].
2090
    ///
2091
    /// [`ArrayData`] therefore requires that all buffers have at least this alignment,
2092
    /// to allow for [slice](std::slice) based APIs. See [`BufferSpec::FixedWidth`].
2093
    ///
2094
    /// As this alignment is architecture specific, and not guaranteed by all arrow implementations,
2095
    /// this flag is provided to automatically copy buffers to a new correctly aligned allocation
2096
    /// when necessary, making it useful when interacting with buffers produced by other systems,
2097
    /// e.g. IPC or FFI.
2098
    ///
2099
    /// If this flag is not enabled, `[Self::build`] return an error on encountering
2100
    /// insufficiently aligned buffers.
2101
0
    pub fn align_buffers(mut self, align_buffers: bool) -> Self {
2102
0
        self.align_buffers = align_buffers;
2103
0
        self
2104
0
    }
2105
2106
    /// Skips validation of the data.
2107
    ///
2108
    /// If this flag is enabled, `[Self::build`] will skip validation of the
2109
    /// data
2110
    ///
2111
    /// If this flag is not enabled, `[Self::build`] will validate that all
2112
    /// buffers are valid and will return an error if any data is invalid.
2113
    /// Validation can be expensive.
2114
    ///
2115
    /// # Safety
2116
    ///
2117
    /// If validation is skipped, the buffers must form a valid Arrow array,
2118
    /// otherwise undefined behavior will result
2119
968k
    pub unsafe fn skip_validation(mut self, skip_validation: bool) -> Self {
2120
968k
        unsafe {
2121
968k
            self.skip_validation.set(skip_validation);
2122
968k
        }
2123
968k
        self
2124
968k
    }
2125
}
2126
2127
impl From<ArrayData> for ArrayDataBuilder {
2128
321k
    fn from(d: ArrayData) -> Self {
2129
321k
        Self {
2130
321k
            data_type: d.data_type,
2131
321k
            len: d.len,
2132
321k
            offset: d.offset,
2133
321k
            buffers: d.buffers,
2134
321k
            child_data: d.child_data,
2135
321k
            nulls: d.nulls,
2136
321k
            null_bit_buffer: None,
2137
321k
            null_count: None,
2138
321k
            align_buffers: false,
2139
321k
            skip_validation: UnsafeFlag::new(),
2140
321k
        }
2141
321k
    }
2142
}
2143
2144
#[cfg(test)]
2145
mod tests {
2146
    use super::*;
2147
    use arrow_schema::{Field, Fields};
2148
2149
    // See arrow/tests/array_data_validation.rs for test of array validation
2150
2151
    /// returns a buffer initialized with some constant value for tests
2152
    fn make_i32_buffer(n: usize) -> Buffer {
2153
        Buffer::from_slice_ref(vec![42i32; n])
2154
    }
2155
2156
    /// returns a buffer initialized with some constant value for tests
2157
    fn make_f32_buffer(n: usize) -> Buffer {
2158
        Buffer::from_slice_ref(vec![42f32; n])
2159
    }
2160
2161
    #[test]
2162
    fn test_builder() {
2163
        // Buffer needs to be at least 25 long
2164
        let v = (0..25).collect::<Vec<i32>>();
2165
        let b1 = Buffer::from_slice_ref(&v);
2166
        let arr_data = ArrayData::builder(DataType::Int32)
2167
            .len(20)
2168
            .offset(5)
2169
            .add_buffer(b1)
2170
            .null_bit_buffer(Some(Buffer::from([
2171
                0b01011111, 0b10110101, 0b01100011, 0b00011110,
2172
            ])))
2173
            .build()
2174
            .unwrap();
2175
2176
        assert_eq!(20, arr_data.len());
2177
        assert_eq!(10, arr_data.null_count());
2178
        assert_eq!(5, arr_data.offset());
2179
        assert_eq!(1, arr_data.buffers().len());
2180
        assert_eq!(
2181
            Buffer::from_slice_ref(&v).as_slice(),
2182
            arr_data.buffers()[0].as_slice()
2183
        );
2184
    }
2185
2186
    #[test]
2187
    fn test_builder_with_child_data() {
2188
        let child_arr_data = ArrayData::try_new(
2189
            DataType::Int32,
2190
            5,
2191
            None,
2192
            0,
2193
            vec![Buffer::from_slice_ref([1i32, 2, 3, 4, 5])],
2194
            vec![],
2195
        )
2196
        .unwrap();
2197
2198
        let field = Arc::new(Field::new("x", DataType::Int32, true));
2199
        let data_type = DataType::Struct(vec![field].into());
2200
2201
        let arr_data = ArrayData::builder(data_type)
2202
            .len(5)
2203
            .offset(0)
2204
            .add_child_data(child_arr_data.clone())
2205
            .build()
2206
            .unwrap();
2207
2208
        assert_eq!(5, arr_data.len());
2209
        assert_eq!(1, arr_data.child_data().len());
2210
        assert_eq!(child_arr_data, arr_data.child_data()[0]);
2211
    }
2212
2213
    #[test]
2214
    fn test_null_count() {
2215
        let mut bit_v: [u8; 2] = [0; 2];
2216
        bit_util::set_bit(&mut bit_v, 0);
2217
        bit_util::set_bit(&mut bit_v, 3);
2218
        bit_util::set_bit(&mut bit_v, 10);
2219
        let arr_data = ArrayData::builder(DataType::Int32)
2220
            .len(16)
2221
            .add_buffer(make_i32_buffer(16))
2222
            .null_bit_buffer(Some(Buffer::from(bit_v)))
2223
            .build()
2224
            .unwrap();
2225
        assert_eq!(13, arr_data.null_count());
2226
2227
        // Test with offset
2228
        let mut bit_v: [u8; 2] = [0; 2];
2229
        bit_util::set_bit(&mut bit_v, 0);
2230
        bit_util::set_bit(&mut bit_v, 3);
2231
        bit_util::set_bit(&mut bit_v, 10);
2232
        let arr_data = ArrayData::builder(DataType::Int32)
2233
            .len(12)
2234
            .offset(2)
2235
            .add_buffer(make_i32_buffer(14)) // requires at least 14 elements (offset 2 + len 12)
2236
            .null_bit_buffer(Some(Buffer::from(bit_v)))
2237
            .build()
2238
            .unwrap();
2239
        assert_eq!(10, arr_data.null_count());
2240
    }
2241
2242
    #[test]
2243
    fn test_null_buffer_ref() {
2244
        let mut bit_v: [u8; 2] = [0; 2];
2245
        bit_util::set_bit(&mut bit_v, 0);
2246
        bit_util::set_bit(&mut bit_v, 3);
2247
        bit_util::set_bit(&mut bit_v, 10);
2248
        let arr_data = ArrayData::builder(DataType::Int32)
2249
            .len(16)
2250
            .add_buffer(make_i32_buffer(16))
2251
            .null_bit_buffer(Some(Buffer::from(bit_v)))
2252
            .build()
2253
            .unwrap();
2254
        assert!(arr_data.nulls().is_some());
2255
        assert_eq!(&bit_v, arr_data.nulls().unwrap().validity());
2256
    }
2257
2258
    #[test]
2259
    fn test_slice() {
2260
        let mut bit_v: [u8; 2] = [0; 2];
2261
        bit_util::set_bit(&mut bit_v, 0);
2262
        bit_util::set_bit(&mut bit_v, 3);
2263
        bit_util::set_bit(&mut bit_v, 10);
2264
        let data = ArrayData::builder(DataType::Int32)
2265
            .len(16)
2266
            .add_buffer(make_i32_buffer(16))
2267
            .null_bit_buffer(Some(Buffer::from(bit_v)))
2268
            .build()
2269
            .unwrap();
2270
        let new_data = data.slice(1, 15);
2271
        assert_eq!(data.len() - 1, new_data.len());
2272
        assert_eq!(1, new_data.offset());
2273
        assert_eq!(data.null_count(), new_data.null_count());
2274
2275
        // slice of a slice (removes one null)
2276
        let new_data = new_data.slice(1, 14);
2277
        assert_eq!(data.len() - 2, new_data.len());
2278
        assert_eq!(2, new_data.offset());
2279
        assert_eq!(data.null_count() - 1, new_data.null_count());
2280
    }
2281
2282
    #[test]
2283
    fn test_equality() {
2284
        let int_data = ArrayData::builder(DataType::Int32)
2285
            .len(1)
2286
            .add_buffer(make_i32_buffer(1))
2287
            .build()
2288
            .unwrap();
2289
2290
        let float_data = ArrayData::builder(DataType::Float32)
2291
            .len(1)
2292
            .add_buffer(make_f32_buffer(1))
2293
            .build()
2294
            .unwrap();
2295
        assert_ne!(int_data, float_data);
2296
        assert!(!int_data.ptr_eq(&float_data));
2297
        assert!(int_data.ptr_eq(&int_data));
2298
2299
        #[allow(clippy::redundant_clone)]
2300
        let int_data_clone = int_data.clone();
2301
        assert_eq!(int_data, int_data_clone);
2302
        assert!(int_data.ptr_eq(&int_data_clone));
2303
        assert!(int_data_clone.ptr_eq(&int_data));
2304
2305
        let int_data_slice = int_data_clone.slice(1, 0);
2306
        assert!(int_data_slice.ptr_eq(&int_data_slice));
2307
        assert!(!int_data.ptr_eq(&int_data_slice));
2308
        assert!(!int_data_slice.ptr_eq(&int_data));
2309
2310
        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2311
        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2312
        let string_data = ArrayData::try_new(
2313
            DataType::Utf8,
2314
            3,
2315
            Some(Buffer::from_iter(vec![true, false, true])),
2316
            0,
2317
            vec![offsets_buffer, data_buffer],
2318
            vec![],
2319
        )
2320
        .unwrap();
2321
2322
        assert_ne!(float_data, string_data);
2323
        assert!(!float_data.ptr_eq(&string_data));
2324
2325
        assert!(string_data.ptr_eq(&string_data));
2326
2327
        #[allow(clippy::redundant_clone)]
2328
        let string_data_cloned = string_data.clone();
2329
        assert!(string_data_cloned.ptr_eq(&string_data));
2330
        assert!(string_data.ptr_eq(&string_data_cloned));
2331
2332
        let string_data_slice = string_data.slice(1, 2);
2333
        assert!(string_data_slice.ptr_eq(&string_data_slice));
2334
        assert!(!string_data_slice.ptr_eq(&string_data))
2335
    }
2336
2337
    #[test]
2338
    fn test_slice_memory_size() {
2339
        let mut bit_v: [u8; 2] = [0; 2];
2340
        bit_util::set_bit(&mut bit_v, 0);
2341
        bit_util::set_bit(&mut bit_v, 3);
2342
        bit_util::set_bit(&mut bit_v, 10);
2343
        let data = ArrayData::builder(DataType::Int32)
2344
            .len(16)
2345
            .add_buffer(make_i32_buffer(16))
2346
            .null_bit_buffer(Some(Buffer::from(bit_v)))
2347
            .build()
2348
            .unwrap();
2349
        let new_data = data.slice(1, 14);
2350
        assert_eq!(
2351
            data.get_slice_memory_size().unwrap() - 8,
2352
            new_data.get_slice_memory_size().unwrap()
2353
        );
2354
        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2355
        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2356
        let string_data = ArrayData::try_new(
2357
            DataType::Utf8,
2358
            3,
2359
            Some(Buffer::from_iter(vec![true, false, true])),
2360
            0,
2361
            vec![offsets_buffer, data_buffer],
2362
            vec![],
2363
        )
2364
        .unwrap();
2365
        let string_data_slice = string_data.slice(1, 2);
2366
        //4 bytes of offset and 2 bytes of data reduced by slicing.
2367
        assert_eq!(
2368
            string_data.get_slice_memory_size().unwrap() - 6,
2369
            string_data_slice.get_slice_memory_size().unwrap()
2370
        );
2371
    }
2372
2373
    #[test]
2374
    fn test_count_nulls() {
2375
        let buffer = Buffer::from([0b00010110, 0b10011111]);
2376
        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 16));
2377
        let count = count_nulls(Some(&buffer), 0, 16);
2378
        assert_eq!(count, 7);
2379
2380
        let count = count_nulls(Some(&buffer), 4, 8);
2381
        assert_eq!(count, 3);
2382
    }
2383
2384
    #[test]
2385
    fn test_contains_nulls() {
2386
        let buffer: Buffer =
2387
            MutableBuffer::from_iter([false, false, false, true, true, false]).into();
2388
        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 6));
2389
        assert!(contains_nulls(Some(&buffer), 0, 6));
2390
        assert!(contains_nulls(Some(&buffer), 0, 3));
2391
        assert!(!contains_nulls(Some(&buffer), 3, 2));
2392
        assert!(!contains_nulls(Some(&buffer), 0, 0));
2393
    }
2394
2395
    #[test]
2396
    fn test_alignment() {
2397
        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2398
        let sliced = buffer.slice(1);
2399
2400
        let mut data = ArrayData {
2401
            data_type: DataType::Int32,
2402
            len: 0,
2403
            offset: 0,
2404
            buffers: vec![buffer],
2405
            child_data: vec![],
2406
            nulls: None,
2407
        };
2408
        data.validate_full().unwrap();
2409
2410
        // break alignment in data
2411
        data.buffers[0] = sliced;
2412
        let err = data.validate().unwrap_err();
2413
2414
        assert_eq!(
2415
            err.to_string(),
2416
            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2417
        );
2418
2419
        data.align_buffers();
2420
        data.validate_full().unwrap();
2421
    }
2422
2423
    #[test]
2424
    fn test_alignment_struct() {
2425
        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2426
        let sliced = buffer.slice(1);
2427
2428
        let child_data = ArrayData {
2429
            data_type: DataType::Int32,
2430
            len: 0,
2431
            offset: 0,
2432
            buffers: vec![buffer],
2433
            child_data: vec![],
2434
            nulls: None,
2435
        };
2436
2437
        let schema = DataType::Struct(Fields::from(vec![Field::new("a", DataType::Int32, false)]));
2438
        let mut data = ArrayData {
2439
            data_type: schema,
2440
            len: 0,
2441
            offset: 0,
2442
            buffers: vec![],
2443
            child_data: vec![child_data],
2444
            nulls: None,
2445
        };
2446
        data.validate_full().unwrap();
2447
2448
        // break alignment in child data
2449
        data.child_data[0].buffers[0] = sliced;
2450
        let err = data.validate().unwrap_err();
2451
2452
        assert_eq!(
2453
            err.to_string(),
2454
            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2455
        );
2456
2457
        data.align_buffers();
2458
        data.validate_full().unwrap();
2459
    }
2460
2461
    #[test]
2462
    fn test_null_view_types() {
2463
        let array_len = 32;
2464
        let array = ArrayData::new_null(&DataType::BinaryView, array_len);
2465
        assert_eq!(array.len(), array_len);
2466
        for i in 0..array.len() {
2467
            assert!(array.is_null(i));
2468
        }
2469
2470
        let array = ArrayData::new_null(&DataType::Utf8View, array_len);
2471
        assert_eq!(array.len(), array_len);
2472
        for i in 0..array.len() {
2473
            assert!(array.is_null(i));
2474
        }
2475
    }
2476
}