Coverage Report

Created: 2025-08-26 07:03

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/Users/andrewlamb/Software/arrow-rs/arrow-data/src/data.rs
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
//! Contains [`ArrayData`], a generic representation of Arrow array data which encapsulates
19
//! common attributes and operations for Arrow array.
20
21
use crate::bit_iterator::BitSliceIterator;
22
use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
23
use arrow_buffer::{
24
    bit_util, i256, ArrowNativeType, Buffer, IntervalDayTime, IntervalMonthDayNano, MutableBuffer,
25
};
26
use arrow_schema::{ArrowError, DataType, UnionMode};
27
use std::mem;
28
use std::ops::Range;
29
use std::sync::Arc;
30
31
use crate::{equal, validate_binary_view, validate_string_view};
32
33
#[inline]
34
624
pub(crate) fn contains_nulls(
35
624
    null_bit_buffer: Option<&NullBuffer>,
36
624
    offset: usize,
37
624
    len: usize,
38
624
) -> bool {
39
624
    match null_bit_buffer {
40
105
        Some(buffer) => {
41
105
            match BitSliceIterator::new(buffer.validity(), buffer.offset() + offset, len).next() {
42
66
                Some((start, end)) => start != 0 || 
end != len59
,
43
39
                None => len != 0, // No non-null values
44
            }
45
        }
46
519
        None => false, // No null buffer
47
    }
48
624
}
49
50
#[inline]
51
122
pub(crate) fn count_nulls(
52
122
    null_bit_buffer: Option<&NullBuffer>,
53
122
    offset: usize,
54
122
    len: usize,
55
122
) -> usize {
56
122
    if let Some(
buf68
) = null_bit_buffer {
57
68
        let buffer = buf.buffer();
58
68
        len - buffer.count_set_bits_offset(offset + buf.offset(), len)
59
    } else {
60
54
        0
61
    }
62
122
}
63
64
/// creates 2 [`MutableBuffer`]s with a given `capacity` (in slots).
65
#[inline]
66
18
pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuffer; 2] {
67
18
    let empty_buffer = MutableBuffer::new(0);
68
18
    match data_type {
69
0
        DataType::Null => [empty_buffer, MutableBuffer::new(0)],
70
        DataType::Boolean => {
71
0
            let bytes = bit_util::ceil(capacity, 8);
72
0
            let buffer = MutableBuffer::new(bytes);
73
0
            [buffer, empty_buffer]
74
        }
75
        DataType::UInt8
76
        | DataType::UInt16
77
        | DataType::UInt32
78
        | DataType::UInt64
79
        | DataType::Int8
80
        | DataType::Int16
81
        | DataType::Int32
82
        | DataType::Int64
83
        | DataType::Float16
84
        | DataType::Float32
85
        | DataType::Float64
86
        | DataType::Decimal32(_, _)
87
        | DataType::Decimal64(_, _)
88
        | DataType::Decimal128(_, _)
89
        | DataType::Decimal256(_, _)
90
        | DataType::Date32
91
        | DataType::Time32(_)
92
        | DataType::Date64
93
        | DataType::Time64(_)
94
        | DataType::Duration(_)
95
        | DataType::Timestamp(_, _)
96
3
        | DataType::Interval(_) => [
97
3
            MutableBuffer::new(capacity * data_type.primitive_width().unwrap()),
98
3
            empty_buffer,
99
3
        ],
100
        DataType::Utf8 | DataType::Binary => {
101
3
            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
102
            // safety: `unsafe` code assumes that this buffer is initialized with one element
103
3
            buffer.push(0i32);
104
3
            [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
105
        }
106
        DataType::LargeUtf8 | DataType::LargeBinary => {
107
0
            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
108
            // safety: `unsafe` code assumes that this buffer is initialized with one element
109
0
            buffer.push(0i64);
110
0
            [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
111
        }
112
0
        DataType::BinaryView | DataType::Utf8View => [
113
0
            MutableBuffer::new(capacity * mem::size_of::<u128>()),
114
0
            empty_buffer,
115
0
        ],
116
        DataType::List(_) | DataType::Map(_, _) => {
117
            // offset buffer always starts with a zero
118
4
            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
119
4
            buffer.push(0i32);
120
4
            [buffer, empty_buffer]
121
        }
122
0
        DataType::ListView(_) => [
123
0
            MutableBuffer::new(capacity * mem::size_of::<i32>()),
124
0
            MutableBuffer::new(capacity * mem::size_of::<i32>()),
125
0
        ],
126
        DataType::LargeList(_) => {
127
            // offset buffer always starts with a zero
128
0
            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
129
0
            buffer.push(0i64);
130
0
            [buffer, empty_buffer]
131
        }
132
0
        DataType::LargeListView(_) => [
133
0
            MutableBuffer::new(capacity * mem::size_of::<i64>()),
134
0
            MutableBuffer::new(capacity * mem::size_of::<i64>()),
135
0
        ],
136
3
        DataType::FixedSizeBinary(size) => {
137
3
            [MutableBuffer::new(capacity * *size as usize), empty_buffer]
138
        }
139
0
        DataType::Dictionary(k, _) => [
140
0
            MutableBuffer::new(capacity * k.primitive_width().unwrap()),
141
0
            empty_buffer,
142
0
        ],
143
        DataType::FixedSizeList(_, _) | DataType::Struct(_) | DataType::RunEndEncoded(_, _) => {
144
5
            [empty_buffer, MutableBuffer::new(0)]
145
        }
146
0
        DataType::Union(_, mode) => {
147
0
            let type_ids = MutableBuffer::new(capacity * mem::size_of::<i8>());
148
0
            match mode {
149
0
                UnionMode::Sparse => [type_ids, empty_buffer],
150
                UnionMode::Dense => {
151
0
                    let offsets = MutableBuffer::new(capacity * mem::size_of::<i32>());
152
0
                    [type_ids, offsets]
153
                }
154
            }
155
        }
156
    }
157
18
}
158
159
/// A generic representation of Arrow array data which encapsulates common attributes
160
/// and operations for Arrow array.
161
///
162
/// Specific operations for different arrays types (e.g., primitive, list, struct)
163
/// are implemented in `Array`.
164
///
165
/// # Memory Layout
166
///
167
/// `ArrayData` has references to one or more underlying data buffers
168
/// and optional child ArrayData, depending on type as illustrated
169
/// below. Bitmaps are not shown for simplicity but they are stored
170
/// similarly to the buffers.
171
///
172
/// ```text
173
///                        offset
174
///                       points to
175
/// ┌───────────────────┐ start of  ┌───────┐       Different
176
/// │                   │   data    │       │     ArrayData may
177
/// │ArrayData {        │           │....   │     also refers to
178
/// │  data_type: ...   │   ─ ─ ─ ─▶│1234   │  ┌ ─  the same
179
/// │  offset: ... ─ ─ ─│─ ┘        │4372   │      underlying
180
/// │  len: ...    ─ ─ ─│─ ┐        │4888   │  │     buffer with different offset/len
181
/// │  buffers: [       │           │5882   │◀─
182
/// │    ...            │  │        │4323   │
183
/// │  ]                │   ─ ─ ─ ─▶│4859   │
184
/// │  child_data: [    │           │....   │
185
/// │    ...            │           │       │
186
/// │  ]                │           └───────┘
187
/// │}                  │
188
/// │                   │            Shared Buffer uses
189
/// │               │   │            bytes::Bytes to hold
190
/// └───────────────────┘            actual data values
191
///           ┌ ─ ─ ┘
192
///
193
///           ▼
194
/// ┌───────────────────┐
195
/// │ArrayData {        │
196
/// │  ...              │
197
/// │}                  │
198
/// │                   │
199
/// └───────────────────┘
200
///
201
/// Child ArrayData may also have its own buffers and children
202
/// ```
203
204
#[derive(Debug, Clone)]
205
pub struct ArrayData {
206
    /// The data type
207
    data_type: DataType,
208
209
    /// The number of elements
210
    len: usize,
211
212
    /// The offset in number of items (not bytes).
213
    ///
214
    /// The offset applies to [`Self::child_data`] and [`Self::buffers`]. It
215
    /// does NOT apply to [`Self::nulls`].
216
    offset: usize,
217
218
    /// The buffers that store the actual data for this array, as defined
219
    /// in the [Arrow Spec].
220
    ///
221
    /// Depending on the array types, [`Self::buffers`] can hold different
222
    /// kinds of buffers (e.g., value buffer, value offset buffer) at different
223
    /// positions.
224
    ///
225
    /// The buffer may be larger than needed.  Some items at the beginning may be skipped if
226
    /// there is an `offset`.  Some items at the end may be skipped if the buffer is longer than
227
    /// we need to satisfy `len`.
228
    ///
229
    /// [Arrow Spec](https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout)
230
    buffers: Vec<Buffer>,
231
232
    /// The child(ren) of this array.
233
    ///
234
    /// Only non-empty for nested types, such as `ListArray` and
235
    /// `StructArray`.
236
    ///
237
    /// The first logical element in each child element begins at `offset`.
238
    ///
239
    /// If the child element also has an offset then these offsets are
240
    /// cumulative.
241
    child_data: Vec<ArrayData>,
242
243
    /// The null bitmap.
244
    ///
245
    /// `None` indicates all values are non-null in this array.
246
    ///
247
    /// [`Self::offset]` does not apply to the null bitmap. While the
248
    /// BooleanBuffer may be sliced (have its own offset) internally, this
249
    /// `NullBuffer` always represents exactly `len` elements.
250
    nulls: Option<NullBuffer>,
251
}
252
253
/// A thread-safe, shared reference to the Arrow array data.
254
pub type ArrayDataRef = Arc<ArrayData>;
255
256
impl ArrayData {
257
    /// Create a new ArrayData instance;
258
    ///
259
    /// If `null_count` is not specified, the number of nulls in
260
    /// null_bit_buffer is calculated.
261
    ///
262
    /// If the number of nulls is 0 then the null_bit_buffer
263
    /// is set to `None`.
264
    ///
265
    /// # Safety
266
    ///
267
    /// The input values *must* form a valid Arrow array for
268
    /// `data_type`, or undefined behavior can result.
269
    ///
270
    /// Note: This is a low level API and most users of the arrow
271
    /// crate should create arrays using the methods in the `array`
272
    /// module.
273
34
    pub unsafe fn new_unchecked(
274
34
        data_type: DataType,
275
34
        len: usize,
276
34
        null_count: Option<usize>,
277
34
        null_bit_buffer: Option<Buffer>,
278
34
        offset: usize,
279
34
        buffers: Vec<Buffer>,
280
34
        child_data: Vec<ArrayData>,
281
34
    ) -> Self {
282
34
        let mut skip_validation = UnsafeFlag::new();
283
        // SAFETY: caller responsible for ensuring data is valid
284
34
        skip_validation.set(true);
285
286
34
        ArrayDataBuilder {
287
34
            data_type,
288
34
            len,
289
34
            null_count,
290
34
            null_bit_buffer,
291
34
            nulls: None,
292
34
            offset,
293
34
            buffers,
294
34
            child_data,
295
34
            align_buffers: false,
296
34
            skip_validation,
297
34
        }
298
34
        .build()
299
34
        .unwrap()
300
34
    }
301
302
    /// Create a new ArrayData, validating that the provided buffers form a valid
303
    /// Arrow array of the specified data type.
304
    ///
305
    /// If the number of nulls in `null_bit_buffer` is 0 then the null_bit_buffer
306
    /// is set to `None`.
307
    ///
308
    /// Internally this calls through to [`Self::validate_data`]
309
    ///
310
    /// Note: This is a low level API and most users of the arrow crate should create
311
    /// arrays using the builders found in [arrow_array](https://docs.rs/arrow-array)
312
0
    pub fn try_new(
313
0
        data_type: DataType,
314
0
        len: usize,
315
0
        null_bit_buffer: Option<Buffer>,
316
0
        offset: usize,
317
0
        buffers: Vec<Buffer>,
318
0
        child_data: Vec<ArrayData>,
319
0
    ) -> Result<Self, ArrowError> {
320
        // we must check the length of `null_bit_buffer` first
321
        // because we use this buffer to calculate `null_count`
322
        // in `Self::new_unchecked`.
323
0
        if let Some(null_bit_buffer) = null_bit_buffer.as_ref() {
324
0
            let needed_len = bit_util::ceil(len + offset, 8);
325
0
            if null_bit_buffer.len() < needed_len {
326
0
                return Err(ArrowError::InvalidArgumentError(format!(
327
0
                    "null_bit_buffer size too small. got {} needed {}",
328
0
                    null_bit_buffer.len(),
329
0
                    needed_len
330
0
                )));
331
0
            }
332
0
        }
333
        // Safety justification: `validate_full` is called below
334
0
        let new_self = unsafe {
335
0
            Self::new_unchecked(
336
0
                data_type,
337
0
                len,
338
0
                None,
339
0
                null_bit_buffer,
340
0
                offset,
341
0
                buffers,
342
0
                child_data,
343
            )
344
        };
345
346
        // As the data is not trusted, do a full validation of its contents
347
        // We don't need to validate children as we can assume that the
348
        // [`ArrayData`] in `child_data` have already been validated through
349
        // a call to `ArrayData::try_new` or created using unsafe
350
0
        new_self.validate_data()?;
351
0
        Ok(new_self)
352
0
    }
353
354
    /// Returns a builder to construct a [`ArrayData`] instance of the same [`DataType`]
355
    #[inline]
356
162
    pub const fn builder(data_type: DataType) -> ArrayDataBuilder {
357
162
        ArrayDataBuilder::new(data_type)
358
162
    }
359
360
    /// Returns a reference to the [`DataType`] of this [`ArrayData`]
361
    #[inline]
362
2.22k
    pub const fn data_type(&self) -> &DataType {
363
2.22k
        &self.data_type
364
2.22k
    }
365
366
    /// Returns the [`Buffer`] storing data for this [`ArrayData`]
367
2.17k
    pub fn buffers(&self) -> &[Buffer] {
368
2.17k
        &self.buffers
369
2.17k
    }
370
371
    /// Returns a slice of children [`ArrayData`]. This will be non
372
    /// empty for type such as lists and structs.
373
369
    pub fn child_data(&self) -> &[ArrayData] {
374
369
        &self.child_data[..]
375
369
    }
376
377
    /// Returns whether the element at index `i` is null
378
    #[inline]
379
0
    pub fn is_null(&self, i: usize) -> bool {
380
0
        match &self.nulls {
381
0
            Some(v) => v.is_null(i),
382
0
            None => false,
383
        }
384
0
    }
385
386
    /// Returns a reference to the null buffer of this [`ArrayData`] if any
387
    ///
388
    /// Note: [`ArrayData::offset`] does NOT apply to the returned [`NullBuffer`]
389
    #[inline]
390
2.65k
    pub fn nulls(&self) -> Option<&NullBuffer> {
391
2.65k
        self.nulls.as_ref()
392
2.65k
    }
393
394
    /// Returns whether the element at index `i` is not null
395
    #[inline]
396
    pub fn is_valid(&self, i: usize) -> bool {
397
        !self.is_null(i)
398
    }
399
400
    /// Returns the length (i.e., number of elements) of this [`ArrayData`].
401
    #[inline]
402
2.10k
    pub const fn len(&self) -> usize {
403
2.10k
        self.len
404
2.10k
    }
405
406
    /// Returns whether this [`ArrayData`] is empty
407
    #[inline]
408
53
    pub const fn is_empty(&self) -> bool {
409
53
        self.len == 0
410
53
    }
411
412
    /// Returns the offset of this [`ArrayData`]
413
    #[inline]
414
1.15k
    pub const fn offset(&self) -> usize {
415
1.15k
        self.offset
416
1.15k
    }
417
418
    /// Returns the total number of nulls in this array
419
    #[inline]
420
943
    pub fn null_count(&self) -> usize {
421
943
        self.nulls
422
943
            .as_ref()
423
943
            .map(|x| 
x107
.
null_count107
())
424
943
            .unwrap_or_default()
425
943
    }
426
427
    /// Returns the total number of bytes of memory occupied by the
428
    /// buffers owned by this [`ArrayData`] and all of its
429
    /// children. (See also diagram on [`ArrayData`]).
430
    ///
431
    /// Note that this [`ArrayData`] may only refer to a subset of the
432
    /// data in the underlying [`Buffer`]s (due to `offset` and
433
    /// `length`), but the size returned includes the entire size of
434
    /// the buffers.
435
    ///
436
    /// If multiple [`ArrayData`]s refer to the same underlying
437
    /// [`Buffer`]s they will both report the same size.
438
0
    pub fn get_buffer_memory_size(&self) -> usize {
439
0
        let mut size = 0;
440
0
        for buffer in &self.buffers {
441
0
            size += buffer.capacity();
442
0
        }
443
0
        if let Some(bitmap) = &self.nulls {
444
0
            size += bitmap.buffer().capacity()
445
0
        }
446
0
        for child in &self.child_data {
447
0
            size += child.get_buffer_memory_size();
448
0
        }
449
0
        size
450
0
    }
451
452
    /// Returns the total number of the bytes of memory occupied by
453
    /// the buffers by this slice of [`ArrayData`] (See also diagram on [`ArrayData`]).
454
    ///
455
    /// This is approximately the number of bytes if a new
456
    /// [`ArrayData`] was formed by creating new [`Buffer`]s with
457
    /// exactly the data needed.
458
    ///
459
    /// For example, a [`DataType::Int64`] with `100` elements,
460
    /// [`Self::get_slice_memory_size`] would return `100 * 8 = 800`. If
461
    /// the [`ArrayData`] was then [`Self::slice`]ed to refer to its
462
    /// first `20` elements, then [`Self::get_slice_memory_size`] on the
463
    /// sliced [`ArrayData`] would return `20 * 8 = 160`.
464
0
    pub fn get_slice_memory_size(&self) -> Result<usize, ArrowError> {
465
0
        let mut result: usize = 0;
466
0
        let layout = layout(&self.data_type);
467
468
0
        for spec in layout.buffers.iter() {
469
0
            match spec {
470
0
                BufferSpec::FixedWidth { byte_width, .. } => {
471
0
                    let buffer_size = self.len.checked_mul(*byte_width).ok_or_else(|| {
472
0
                        ArrowError::ComputeError(
473
0
                            "Integer overflow computing buffer size".to_string(),
474
0
                        )
475
0
                    })?;
476
0
                    result += buffer_size;
477
                }
478
                BufferSpec::VariableWidth => {
479
                    let buffer_len: usize;
480
0
                    match self.data_type {
481
                        DataType::Utf8 | DataType::Binary => {
482
0
                            let offsets = self.typed_offsets::<i32>()?;
483
0
                            buffer_len = (offsets[self.len] - offsets[0] ) as usize;
484
                        }
485
                        DataType::LargeUtf8 | DataType::LargeBinary => {
486
0
                            let offsets = self.typed_offsets::<i64>()?;
487
0
                            buffer_len = (offsets[self.len] - offsets[0]) as usize;
488
                        }
489
                        _ => {
490
0
                            return Err(ArrowError::NotYetImplemented(format!(
491
0
                            "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}",
492
0
                            self.data_type
493
0
                            )))
494
                        }
495
                    };
496
0
                    result += buffer_len;
497
                }
498
0
                BufferSpec::BitMap => {
499
0
                    let buffer_size = bit_util::ceil(self.len, 8);
500
0
                    result += buffer_size;
501
0
                }
502
0
                BufferSpec::AlwaysNull => {
503
0
                    // Nothing to do
504
0
                }
505
            }
506
        }
507
508
0
        if self.nulls().is_some() {
509
0
            result += bit_util::ceil(self.len, 8);
510
0
        }
511
512
0
        for child in &self.child_data {
513
0
            result += child.get_slice_memory_size()?;
514
        }
515
0
        Ok(result)
516
0
    }
517
518
    /// Returns the total number of bytes of memory occupied
519
    /// physically by this [`ArrayData`] and all its [`Buffer`]s and
520
    /// children. (See also diagram on [`ArrayData`]).
521
    ///
522
    /// Equivalent to:
523
    ///  `size_of_val(self)` +
524
    ///  [`Self::get_buffer_memory_size`] +
525
    ///  `size_of_val(child)` for all children
526
0
    pub fn get_array_memory_size(&self) -> usize {
527
0
        let mut size = mem::size_of_val(self);
528
529
        // Calculate rest of the fields top down which contain actual data
530
0
        for buffer in &self.buffers {
531
0
            size += mem::size_of::<Buffer>();
532
0
            size += buffer.capacity();
533
0
        }
534
0
        if let Some(nulls) = &self.nulls {
535
0
            size += nulls.buffer().capacity();
536
0
        }
537
0
        for child in &self.child_data {
538
0
            size += child.get_array_memory_size();
539
0
        }
540
541
0
        size
542
0
    }
543
544
    /// Creates a zero-copy slice of itself. This creates a new
545
    /// [`ArrayData`] pointing at the same underlying [`Buffer`]s with a
546
    /// different offset and len
547
    ///
548
    /// # Panics
549
    ///
550
    /// Panics if `offset + length > self.len()`.
551
0
    pub fn slice(&self, offset: usize, length: usize) -> ArrayData {
552
0
        assert!((offset + length) <= self.len());
553
554
0
        if let DataType::Struct(_) = self.data_type() {
555
            // Slice into children
556
0
            let new_offset = self.offset + offset;
557
0
            let new_data = ArrayData {
558
0
                data_type: self.data_type().clone(),
559
0
                len: length,
560
0
                offset: new_offset,
561
0
                buffers: self.buffers.clone(),
562
                // Slice child data, to propagate offsets down to them
563
0
                child_data: self
564
0
                    .child_data()
565
0
                    .iter()
566
0
                    .map(|data| data.slice(offset, length))
567
0
                    .collect(),
568
0
                nulls: self.nulls.as_ref().map(|x| x.slice(offset, length)),
569
            };
570
571
0
            new_data
572
        } else {
573
0
            let mut new_data = self.clone();
574
575
0
            new_data.len = length;
576
0
            new_data.offset = offset + self.offset;
577
0
            new_data.nulls = self.nulls.as_ref().map(|x| x.slice(offset, length));
578
579
0
            new_data
580
        }
581
0
    }
582
583
    /// Returns the `buffer` as a slice of type `T` starting at self.offset
584
    ///
585
    /// # Panics
586
    /// This function panics if:
587
    /// * the buffer is not byte-aligned with type T, or
588
    /// * the datatype is `Boolean` (it corresponds to a bit-packed buffer where the offset is not applicable)
589
496
    pub fn buffer<T: ArrowNativeType>(&self, buffer: usize) -> &[T] {
590
496
        &self.buffers()[buffer].typed_data()[self.offset..]
591
496
    }
592
593
    /// Returns a new [`ArrayData`] valid for `data_type` containing `len` null values
594
0
    pub fn new_null(data_type: &DataType, len: usize) -> Self {
595
0
        let bit_len = bit_util::ceil(len, 8);
596
0
        let zeroed = |len: usize| Buffer::from(MutableBuffer::from_len_zeroed(len));
597
598
0
        let (buffers, child_data, has_nulls) = match data_type.primitive_width() {
599
0
            Some(width) => (vec![zeroed(width * len)], vec![], true),
600
0
            None => match data_type {
601
0
                DataType::Null => (vec![], vec![], false),
602
0
                DataType::Boolean => (vec![zeroed(bit_len)], vec![], true),
603
                DataType::Binary | DataType::Utf8 => {
604
0
                    (vec![zeroed((len + 1) * 4), zeroed(0)], vec![], true)
605
                }
606
0
                DataType::BinaryView | DataType::Utf8View => (vec![zeroed(len * 16)], vec![], true),
607
                DataType::LargeBinary | DataType::LargeUtf8 => {
608
0
                    (vec![zeroed((len + 1) * 8), zeroed(0)], vec![], true)
609
                }
610
0
                DataType::FixedSizeBinary(i) => (vec![zeroed(*i as usize * len)], vec![], true),
611
0
                DataType::List(f) | DataType::Map(f, _) => (
612
0
                    vec![zeroed((len + 1) * 4)],
613
0
                    vec![ArrayData::new_empty(f.data_type())],
614
0
                    true,
615
0
                ),
616
0
                DataType::LargeList(f) => (
617
0
                    vec![zeroed((len + 1) * 8)],
618
0
                    vec![ArrayData::new_empty(f.data_type())],
619
0
                    true,
620
0
                ),
621
0
                DataType::FixedSizeList(f, list_len) => (
622
0
                    vec![],
623
0
                    vec![ArrayData::new_null(f.data_type(), *list_len as usize * len)],
624
0
                    true,
625
0
                ),
626
0
                DataType::Struct(fields) => (
627
0
                    vec![],
628
0
                    fields
629
0
                        .iter()
630
0
                        .map(|f| Self::new_null(f.data_type(), len))
631
0
                        .collect(),
632
                    true,
633
                ),
634
0
                DataType::Dictionary(k, v) => (
635
0
                    vec![zeroed(k.primitive_width().unwrap() * len)],
636
0
                    vec![ArrayData::new_empty(v.as_ref())],
637
0
                    true,
638
0
                ),
639
0
                DataType::Union(f, mode) => {
640
0
                    let (id, _) = f.iter().next().unwrap();
641
0
                    let ids = Buffer::from_iter(std::iter::repeat_n(id, len));
642
0
                    let buffers = match mode {
643
0
                        UnionMode::Sparse => vec![ids],
644
                        UnionMode::Dense => {
645
0
                            let end_offset = i32::from_usize(len).unwrap();
646
0
                            vec![ids, Buffer::from_iter(0_i32..end_offset)]
647
                        }
648
                    };
649
650
0
                    let children = f
651
0
                        .iter()
652
0
                        .enumerate()
653
0
                        .map(|(idx, (_, f))| {
654
0
                            if idx == 0 || *mode == UnionMode::Sparse {
655
0
                                Self::new_null(f.data_type(), len)
656
                            } else {
657
0
                                Self::new_empty(f.data_type())
658
                            }
659
0
                        })
660
0
                        .collect();
661
662
0
                    (buffers, children, false)
663
                }
664
0
                DataType::RunEndEncoded(r, v) => {
665
0
                    let runs = match r.data_type() {
666
                        DataType::Int16 => {
667
0
                            let i = i16::from_usize(len).expect("run overflow");
668
0
                            Buffer::from_slice_ref([i])
669
                        }
670
                        DataType::Int32 => {
671
0
                            let i = i32::from_usize(len).expect("run overflow");
672
0
                            Buffer::from_slice_ref([i])
673
                        }
674
                        DataType::Int64 => {
675
0
                            let i = i64::from_usize(len).expect("run overflow");
676
0
                            Buffer::from_slice_ref([i])
677
                        }
678
0
                        dt => unreachable!("Invalid run ends data type {dt}"),
679
                    };
680
681
0
                    let builder = ArrayData::builder(r.data_type().clone())
682
0
                        .len(1)
683
0
                        .buffers(vec![runs]);
684
685
                    // SAFETY:
686
                    // Valid by construction
687
0
                    let runs = unsafe { builder.build_unchecked() };
688
0
                    (
689
0
                        vec![],
690
0
                        vec![runs, ArrayData::new_null(v.data_type(), 1)],
691
0
                        false,
692
0
                    )
693
                }
694
0
                d => unreachable!("{d}"),
695
            },
696
        };
697
698
0
        let mut builder = ArrayDataBuilder::new(data_type.clone())
699
0
            .len(len)
700
0
            .buffers(buffers)
701
0
            .child_data(child_data);
702
703
0
        if has_nulls {
704
0
            builder = builder.nulls(Some(NullBuffer::new_null(len)))
705
0
        }
706
707
        // SAFETY:
708
        // Data valid by construction
709
0
        unsafe { builder.build_unchecked() }
710
0
    }
711
712
    /// Returns a new empty [ArrayData] valid for `data_type`.
713
0
    pub fn new_empty(data_type: &DataType) -> Self {
714
0
        Self::new_null(data_type, 0)
715
0
    }
716
717
    /// Verifies that the buffers meet the minimum alignment requirements for the data type
718
    ///
719
    /// Buffers that are not adequately aligned will be copied to a new aligned allocation
720
    ///
721
    /// This can be useful for when interacting with data sent over IPC or FFI, that may
722
    /// not meet the minimum alignment requirements
723
    ///
724
    /// This also aligns buffers of children data
725
0
    pub fn align_buffers(&mut self) {
726
0
        let layout = layout(&self.data_type);
727
0
        for (buffer, spec) in self.buffers.iter_mut().zip(&layout.buffers) {
728
0
            if let BufferSpec::FixedWidth { alignment, .. } = spec {
729
0
                if buffer.as_ptr().align_offset(*alignment) != 0 {
730
0
                    *buffer = Buffer::from_slice_ref(buffer.as_ref());
731
0
                }
732
0
            }
733
        }
734
        // align children data recursively
735
0
        for data in self.child_data.iter_mut() {
736
0
            data.align_buffers()
737
        }
738
0
    }
739
740
    /// "cheap" validation of an `ArrayData`. Ensures buffers are
741
    /// sufficiently sized to store `len` + `offset` total elements of
742
    /// `data_type` and performs other inexpensive consistency checks.
743
    ///
744
    /// This check is "cheap" in the sense that it does not validate the
745
    /// contents of the buffers (e.g. that all offsets for UTF8 arrays
746
    /// are within the bounds of the values buffer).
747
    ///
748
    /// See [ArrayData::validate_data] to validate fully the offset content
749
    /// and the validity of utf8 data
750
25
    pub fn validate(&self) -> Result<(), ArrowError> {
751
        // Need at least this mich space in each buffer
752
25
        let len_plus_offset = self.len + self.offset;
753
754
        // Check that the data layout conforms to the spec
755
25
        let layout = layout(&self.data_type);
756
757
25
        if !layout.can_contain_null_mask && 
self.nulls0
.
is_some0
() {
758
0
            return Err(ArrowError::InvalidArgumentError(format!(
759
0
                "Arrays of type {:?} cannot contain a null bitmask",
760
0
                self.data_type,
761
0
            )));
762
25
        }
763
764
        // Check data buffers length for view types and other types
765
25
        if self.buffers.len() < layout.buffers.len()
766
25
            || (!layout.variadic && self.buffers.len() != layout.buffers.len())
767
        {
768
0
            return Err(ArrowError::InvalidArgumentError(format!(
769
0
                "Expected {} buffers in array of type {:?}, got {}",
770
0
                layout.buffers.len(),
771
0
                self.data_type,
772
0
                self.buffers.len(),
773
0
            )));
774
25
        }
775
776
26
        for (i, (buffer, spec)) in 
self.buffers.iter()25
.
zip25
(
layout.buffers.iter()25
).
enumerate25
() {
777
26
            match spec {
778
                BufferSpec::FixedWidth {
779
19
                    byte_width,
780
19
                    alignment,
781
                } => {
782
19
                    let min_buffer_size = len_plus_offset.saturating_mul(*byte_width);
783
784
19
                    if buffer.len() < min_buffer_size {
785
0
                        return Err(ArrowError::InvalidArgumentError(format!(
786
0
                            "Need at least {} bytes in buffers[{}] in array of type {:?}, but got {}",
787
0
                            min_buffer_size, i, self.data_type, buffer.len()
788
0
                        )));
789
19
                    }
790
791
19
                    let align_offset = buffer.as_ptr().align_offset(*alignment);
792
19
                    if align_offset != 0 {
793
0
                        return Err(ArrowError::InvalidArgumentError(format!(
794
0
                            "Misaligned buffers[{i}] in array of type {:?}, offset from expected alignment of {alignment} by {}",
795
0
                            self.data_type, align_offset.min(alignment - align_offset)
796
0
                        )));
797
19
                    }
798
                }
799
6
                BufferSpec::VariableWidth => {
800
6
                    // not cheap to validate (need to look at the
801
6
                    // data). Partially checked in validate_offsets
802
6
                    // called below. Can check with `validate_full`
803
6
                }
804
                BufferSpec::BitMap => {
805
1
                    let min_buffer_size = bit_util::ceil(len_plus_offset, 8);
806
1
                    if buffer.len() < min_buffer_size {
807
0
                        return Err(ArrowError::InvalidArgumentError(format!(
808
0
                            "Need at least {} bytes for bitmap in buffers[{}] in array of type {:?}, but got {}",
809
0
                            min_buffer_size, i, self.data_type, buffer.len()
810
0
                        )));
811
1
                    }
812
                }
813
0
                BufferSpec::AlwaysNull => {
814
0
                    // Nothing to validate
815
0
                }
816
            }
817
        }
818
819
        // check null bit buffer size
820
25
        if let Some(
nulls9
) = self.nulls() {
821
9
            if nulls.null_count() > self.len {
822
0
                return Err(ArrowError::InvalidArgumentError(format!(
823
0
                    "null_count {} for an array exceeds length of {} elements",
824
0
                    nulls.null_count(),
825
0
                    self.len
826
0
                )));
827
9
            }
828
829
9
            let actual_len = nulls.validity().len();
830
9
            let needed_len = bit_util::ceil(len_plus_offset, 8);
831
9
            if actual_len < needed_len {
832
0
                return Err(ArrowError::InvalidArgumentError(format!(
833
0
                    "null_bit_buffer size too small. got {actual_len} needed {needed_len}",
834
0
                )));
835
9
            }
836
837
9
            if nulls.len() != self.len {
838
0
                return Err(ArrowError::InvalidArgumentError(format!(
839
0
                    "null buffer incorrect size. got {} expected {}",
840
0
                    nulls.len(),
841
0
                    self.len
842
0
                )));
843
9
            }
844
16
        }
845
846
25
        self.validate_child_data()
?0
;
847
848
        // Additional Type specific checks
849
25
        match &self.data_type {
850
            DataType::Utf8 | DataType::Binary => {
851
6
                self.validate_offsets::<i32>(self.buffers[1].len())
?0
;
852
            }
853
            DataType::LargeUtf8 | DataType::LargeBinary => {
854
0
                self.validate_offsets::<i64>(self.buffers[1].len())?;
855
            }
856
0
            DataType::Dictionary(key_type, _value_type) => {
857
                // At the moment, constructing a DictionaryArray will also check this
858
0
                if !DataType::is_dictionary_key_type(key_type) {
859
0
                    return Err(ArrowError::InvalidArgumentError(format!(
860
0
                        "Dictionary key type must be integer, but was {key_type}"
861
0
                    )));
862
0
                }
863
            }
864
0
            DataType::RunEndEncoded(run_ends_type, _) => {
865
0
                if run_ends_type.is_nullable() {
866
0
                    return Err(ArrowError::InvalidArgumentError(
867
0
                        "The nullable should be set to false for the field defining run_ends array.".to_string()
868
0
                    ));
869
0
                }
870
0
                if !DataType::is_run_ends_type(run_ends_type.data_type()) {
871
0
                    return Err(ArrowError::InvalidArgumentError(format!(
872
0
                        "RunArray run_ends types must be Int16, Int32 or Int64, but was {}",
873
0
                        run_ends_type.data_type()
874
0
                    )));
875
0
                }
876
            }
877
19
            _ => {}
878
        };
879
880
25
        Ok(())
881
25
    }
882
883
    /// Returns a reference to the data in `buffer` as a typed slice
884
    /// (typically `&[i32]` or `&[i64]`) after validating. The
885
    /// returned slice is guaranteed to have at least `self.len + 1`
886
    /// entries.
887
    ///
888
    /// For an empty array, the `buffer` can also be empty.
889
20
    fn typed_offsets<T: ArrowNativeType + num::Num>(&self) -> Result<&[T], ArrowError> {
890
        // An empty list-like array can have 0 offsets
891
20
        if self.len == 0 && 
self.buffers[0]0
.
is_empty0
() {
892
0
            return Ok(&[]);
893
20
        }
894
895
20
        self.typed_buffer(0, self.len + 1)
896
20
    }
897
898
    /// Returns a reference to the data in `buffers[idx]` as a typed slice after validating
899
20
    fn typed_buffer<T: ArrowNativeType + num::Num>(
900
20
        &self,
901
20
        idx: usize,
902
20
        len: usize,
903
20
    ) -> Result<&[T], ArrowError> {
904
20
        let buffer = &self.buffers[idx];
905
906
20
        let required_len = (len + self.offset) * mem::size_of::<T>();
907
908
20
        if buffer.len() < required_len {
909
0
            return Err(ArrowError::InvalidArgumentError(format!(
910
0
                "Buffer {} of {} isn't large enough. Expected {} bytes got {}",
911
0
                idx,
912
0
                self.data_type,
913
0
                required_len,
914
0
                buffer.len()
915
0
            )));
916
20
        }
917
918
20
        Ok(&buffer.typed_data::<T>()[self.offset..self.offset + len])
919
20
    }
920
921
    /// Does a cheap sanity check that the `self.len` values in `buffer` are valid
922
    /// offsets (of type T) into some other buffer of `values_length` bytes long
923
15
    fn validate_offsets<T: ArrowNativeType + num::Num + std::fmt::Display>(
924
15
        &self,
925
15
        values_length: usize,
926
15
    ) -> Result<(), ArrowError> {
927
        // Justification: buffer size was validated above
928
15
        let offsets = self.typed_offsets::<T>()
?0
;
929
15
        if offsets.is_empty() {
930
0
            return Ok(());
931
15
        }
932
933
15
        let first_offset = offsets[0].to_usize().ok_or_else(|| 
{0
934
0
            ArrowError::InvalidArgumentError(format!(
935
0
                "Error converting offset[0] ({}) to usize for {}",
936
0
                offsets[0], self.data_type
937
0
            ))
938
0
        })?;
939
940
15
        let last_offset = offsets[self.len].to_usize().ok_or_else(|| 
{0
941
0
            ArrowError::InvalidArgumentError(format!(
942
0
                "Error converting offset[{}] ({}) to usize for {}",
943
0
                self.len, offsets[self.len], self.data_type
944
0
            ))
945
0
        })?;
946
947
15
        if first_offset > values_length {
948
0
            return Err(ArrowError::InvalidArgumentError(format!(
949
0
                "First offset {} of {} is larger than values length {}",
950
0
                first_offset, self.data_type, values_length,
951
0
            )));
952
15
        }
953
954
15
        if last_offset > values_length {
955
0
            return Err(ArrowError::InvalidArgumentError(format!(
956
0
                "Last offset {} of {} is larger than values length {}",
957
0
                last_offset, self.data_type, values_length,
958
0
            )));
959
15
        }
960
961
15
        if first_offset > last_offset {
962
0
            return Err(ArrowError::InvalidArgumentError(format!(
963
0
                "First offset {} in {} is smaller than last offset {}",
964
0
                first_offset, self.data_type, last_offset,
965
0
            )));
966
15
        }
967
968
15
        Ok(())
969
15
    }
970
971
    /// Does a cheap sanity check that the `self.len` values in `buffer` are valid
972
    /// offsets and sizes (of type T) into some other buffer of `values_length` bytes long
973
0
    fn validate_offsets_and_sizes<T: ArrowNativeType + num::Num + std::fmt::Display>(
974
0
        &self,
975
0
        values_length: usize,
976
0
    ) -> Result<(), ArrowError> {
977
0
        let offsets: &[T] = self.typed_buffer(0, self.len)?;
978
0
        let sizes: &[T] = self.typed_buffer(1, self.len)?;
979
0
        for i in 0..values_length {
980
0
            let size = sizes[i].to_usize().ok_or_else(|| {
981
0
                ArrowError::InvalidArgumentError(format!(
982
0
                    "Error converting size[{}] ({}) to usize for {}",
983
0
                    i, sizes[i], self.data_type
984
0
                ))
985
0
            })?;
986
0
            let offset = offsets[i].to_usize().ok_or_else(|| {
987
0
                ArrowError::InvalidArgumentError(format!(
988
0
                    "Error converting offset[{}] ({}) to usize for {}",
989
0
                    i, offsets[i], self.data_type
990
0
                ))
991
0
            })?;
992
0
            if size
993
0
                .checked_add(offset)
994
0
                .expect("Offset and size have exceeded the usize boundary")
995
0
                > values_length
996
            {
997
0
                return Err(ArrowError::InvalidArgumentError(format!(
998
0
                    "Size {} at index {} is larger than the remaining values for {}",
999
0
                    size, i, self.data_type
1000
0
                )));
1001
0
            }
1002
        }
1003
0
        Ok(())
1004
0
    }
1005
1006
    /// Validates the layout of `child_data` ArrayData structures
1007
25
    fn validate_child_data(&self) -> Result<(), ArrowError> {
1008
25
        match &self.data_type {
1009
9
            DataType::List(field) | DataType::Map(
field0
, _) => {
1010
9
                let values_data = self.get_single_valid_child_data(field.data_type())
?0
;
1011
9
                self.validate_offsets::<i32>(values_data.len)
?0
;
1012
9
                Ok(())
1013
            }
1014
0
            DataType::LargeList(field) => {
1015
0
                let values_data = self.get_single_valid_child_data(field.data_type())?;
1016
0
                self.validate_offsets::<i64>(values_data.len)?;
1017
0
                Ok(())
1018
            }
1019
0
            DataType::ListView(field) => {
1020
0
                let values_data = self.get_single_valid_child_data(field.data_type())?;
1021
0
                self.validate_offsets_and_sizes::<i32>(values_data.len)?;
1022
0
                Ok(())
1023
            }
1024
0
            DataType::LargeListView(field) => {
1025
0
                let values_data = self.get_single_valid_child_data(field.data_type())?;
1026
0
                self.validate_offsets_and_sizes::<i64>(values_data.len)?;
1027
0
                Ok(())
1028
            }
1029
0
            DataType::FixedSizeList(field, list_size) => {
1030
0
                let values_data = self.get_single_valid_child_data(field.data_type())?;
1031
1032
0
                let list_size: usize = (*list_size).try_into().map_err(|_| {
1033
0
                    ArrowError::InvalidArgumentError(format!(
1034
0
                        "{} has a negative list_size {}",
1035
0
                        self.data_type, list_size
1036
0
                    ))
1037
0
                })?;
1038
1039
0
                let expected_values_len = self.len
1040
0
                    .checked_mul(list_size)
1041
0
                    .expect("integer overflow computing expected number of expected values in FixedListSize");
1042
1043
0
                if values_data.len < expected_values_len {
1044
0
                    return Err(ArrowError::InvalidArgumentError(format!(
1045
0
                        "Values length {} is less than the length ({}) multiplied by the value size ({}) for {}",
1046
0
                        values_data.len, self.len, list_size, self.data_type
1047
0
                    )));
1048
0
                }
1049
1050
0
                Ok(())
1051
            }
1052
5
            DataType::Struct(fields) => {
1053
5
                self.validate_num_child_data(fields.len())
?0
;
1054
9
                for (i, field) in 
fields.iter()5
.
enumerate5
() {
1055
9
                    let field_data = self.get_valid_child_data(i, field.data_type())
?0
;
1056
1057
                    // Ensure child field has sufficient size
1058
9
                    if field_data.len < self.len {
1059
0
                        return Err(ArrowError::InvalidArgumentError(format!(
1060
0
                            "{} child array #{} for field {} has length smaller than expected for struct array ({} < {})",
1061
0
                            self.data_type, i, field.name(), field_data.len, self.len
1062
0
                        )));
1063
9
                    }
1064
                }
1065
5
                Ok(())
1066
            }
1067
0
            DataType::RunEndEncoded(run_ends_field, values_field) => {
1068
0
                self.validate_num_child_data(2)?;
1069
0
                let run_ends_data = self.get_valid_child_data(0, run_ends_field.data_type())?;
1070
0
                let values_data = self.get_valid_child_data(1, values_field.data_type())?;
1071
0
                if run_ends_data.len != values_data.len {
1072
0
                    return Err(ArrowError::InvalidArgumentError(format!(
1073
0
                        "The run_ends array length should be the same as values array length. Run_ends array length is {}, values array length is {}",
1074
0
                        run_ends_data.len, values_data.len
1075
0
                    )));
1076
0
                }
1077
0
                if run_ends_data.nulls.is_some() {
1078
0
                    return Err(ArrowError::InvalidArgumentError(
1079
0
                        "Found null values in run_ends array. The run_ends array should not have null values.".to_string(),
1080
0
                    ));
1081
0
                }
1082
0
                Ok(())
1083
            }
1084
0
            DataType::Union(fields, mode) => {
1085
0
                self.validate_num_child_data(fields.len())?;
1086
1087
0
                for (i, (_, field)) in fields.iter().enumerate() {
1088
0
                    let field_data = self.get_valid_child_data(i, field.data_type())?;
1089
1090
0
                    if mode == &UnionMode::Sparse && field_data.len < (self.len + self.offset) {
1091
0
                        return Err(ArrowError::InvalidArgumentError(format!(
1092
0
                            "Sparse union child array #{} has length smaller than expected for union array ({} < {})",
1093
0
                            i, field_data.len, self.len + self.offset
1094
0
                        )));
1095
0
                    }
1096
                }
1097
0
                Ok(())
1098
            }
1099
0
            DataType::Dictionary(_key_type, value_type) => {
1100
0
                self.get_single_valid_child_data(value_type)?;
1101
0
                Ok(())
1102
            }
1103
            _ => {
1104
                // other types do not have child data
1105
11
                if !self.child_data.is_empty() {
1106
0
                    return Err(ArrowError::InvalidArgumentError(format!(
1107
0
                        "Expected no child arrays for type {} but got {}",
1108
0
                        self.data_type,
1109
0
                        self.child_data.len()
1110
0
                    )));
1111
11
                }
1112
11
                Ok(())
1113
            }
1114
        }
1115
25
    }
1116
1117
    /// Ensures that this array data has a single child_data with the
1118
    /// expected type, and calls `validate()` on it. Returns a
1119
    /// reference to that child_data
1120
9
    fn get_single_valid_child_data(
1121
9
        &self,
1122
9
        expected_type: &DataType,
1123
9
    ) -> Result<&ArrayData, ArrowError> {
1124
9
        self.validate_num_child_data(1)
?0
;
1125
9
        self.get_valid_child_data(0, expected_type)
1126
9
    }
1127
1128
    /// Returns `Err` if self.child_data does not have exactly `expected_len` elements
1129
14
    fn validate_num_child_data(&self, expected_len: usize) -> Result<(), ArrowError> {
1130
14
        if self.child_data.len() != expected_len {
1131
0
            Err(ArrowError::InvalidArgumentError(format!(
1132
0
                "Value data for {} should contain {} child data array(s), had {}",
1133
0
                self.data_type,
1134
0
                expected_len,
1135
0
                self.child_data.len()
1136
0
            )))
1137
        } else {
1138
14
            Ok(())
1139
        }
1140
14
    }
1141
1142
    /// Ensures that `child_data[i]` has the expected type, calls
1143
    /// `validate()` on it, and returns a reference to that child_data
1144
18
    fn get_valid_child_data(
1145
18
        &self,
1146
18
        i: usize,
1147
18
        expected_type: &DataType,
1148
18
    ) -> Result<&ArrayData, ArrowError> {
1149
18
        let values_data = self.child_data.get(i).ok_or_else(|| 
{0
1150
0
            ArrowError::InvalidArgumentError(format!(
1151
0
                "{} did not have enough child arrays. Expected at least {} but had only {}",
1152
0
                self.data_type,
1153
0
                i + 1,
1154
0
                self.child_data.len()
1155
0
            ))
1156
0
        })?;
1157
1158
18
        if expected_type != &values_data.data_type {
1159
0
            return Err(ArrowError::InvalidArgumentError(format!(
1160
0
                "Child type mismatch for {}. Expected {} but child data had {}",
1161
0
                self.data_type, expected_type, values_data.data_type
1162
0
            )));
1163
18
        }
1164
1165
18
        values_data.validate()
?0
;
1166
18
        Ok(values_data)
1167
18
    }
1168
1169
    /// Validate that the data contained within this [`ArrayData`] is valid
1170
    ///
1171
    /// 1. Null count is correct
1172
    /// 2. All offsets are valid
1173
    /// 3. All String data is valid UTF-8
1174
    /// 4. All dictionary offsets are valid
1175
    ///
1176
    /// Internally this calls:
1177
    ///
1178
    /// * [`Self::validate`]
1179
    /// * [`Self::validate_nulls`]
1180
    /// * [`Self::validate_values`]
1181
    ///
1182
    /// Note: this does not recurse into children, for a recursive variant
1183
    /// see [`Self::validate_full`]
1184
7
    pub fn validate_data(&self) -> Result<(), ArrowError> {
1185
7
        self.validate()
?0
;
1186
1187
7
        self.validate_nulls()
?0
;
1188
7
        self.validate_values()
?0
;
1189
7
        Ok(())
1190
7
    }
1191
1192
    /// Performs a full recursive validation of this [`ArrayData`] and all its children
1193
    ///
1194
    /// This is equivalent to calling [`Self::validate_data`] on this [`ArrayData`]
1195
    /// and all its children recursively
1196
0
    pub fn validate_full(&self) -> Result<(), ArrowError> {
1197
0
        self.validate_data()?;
1198
        // validate all children recursively
1199
0
        self.child_data
1200
0
            .iter()
1201
0
            .enumerate()
1202
0
            .try_for_each(|(i, child_data)| {
1203
0
                child_data.validate_full().map_err(|e| {
1204
0
                    ArrowError::InvalidArgumentError(format!(
1205
0
                        "{} child #{} invalid: {}",
1206
0
                        self.data_type, i, e
1207
0
                    ))
1208
0
                })
1209
0
            })?;
1210
0
        Ok(())
1211
0
    }
1212
1213
    /// Validates the values stored within this [`ArrayData`] are valid
1214
    /// without recursing into child [`ArrayData`]
1215
    ///
1216
    /// Does not (yet) check
1217
    /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85)
1218
    /// 2. the the null count is correct and that any
1219
    /// 3. nullability requirements of its children are correct
1220
    ///
1221
    /// [#85]: https://github.com/apache/arrow-rs/issues/85
1222
7
    pub fn validate_nulls(&self) -> Result<(), ArrowError> {
1223
7
        if let Some(
nulls3
) = &self.nulls {
1224
3
            let actual = nulls.len() - nulls.inner().count_set_bits();
1225
3
            if actual != nulls.null_count() {
1226
0
                return Err(ArrowError::InvalidArgumentError(format!(
1227
0
                    "null_count value ({}) doesn't match actual number of nulls in array ({})",
1228
0
                    nulls.null_count(),
1229
0
                    actual
1230
0
                )));
1231
3
            }
1232
4
        }
1233
1234
        // In general non-nullable children should not contain nulls, however, for certain
1235
        // types, such as StructArray and FixedSizeList, nulls in the parent take up
1236
        // space in the child. As such we permit nulls in the children in the corresponding
1237
        // positions for such types
1238
7
        match &self.data_type {
1239
5
            DataType::List(f) | DataType::LargeList(
f0
) | DataType::Map(
f0
, _) => {
1240
5
                if !f.is_nullable() {
1241
1
                    self.validate_non_nullable(None, &self.child_data[0])
?0
1242
4
                }
1243
            }
1244
0
            DataType::FixedSizeList(field, len) => {
1245
0
                let child = &self.child_data[0];
1246
0
                if !field.is_nullable() {
1247
0
                    match &self.nulls {
1248
0
                        Some(nulls) => {
1249
0
                            let element_len = *len as usize;
1250
0
                            let expanded = nulls.expand(element_len);
1251
0
                            self.validate_non_nullable(Some(&expanded), child)?;
1252
                        }
1253
0
                        None => self.validate_non_nullable(None, child)?,
1254
                    }
1255
0
                }
1256
            }
1257
2
            DataType::Struct(fields) => {
1258
3
                for (field, child) in 
fields.iter()2
.
zip2
(
&self.child_data2
) {
1259
3
                    if !field.is_nullable() {
1260
0
                        self.validate_non_nullable(self.nulls(), child)?
1261
3
                    }
1262
                }
1263
            }
1264
0
            _ => {}
1265
        }
1266
1267
7
        Ok(())
1268
7
    }
1269
1270
    /// Verifies that `child` contains no nulls not present in `mask`
1271
1
    fn validate_non_nullable(
1272
1
        &self,
1273
1
        mask: Option<&NullBuffer>,
1274
1
        child: &ArrayData,
1275
1
    ) -> Result<(), ArrowError> {
1276
1
        let 
mask0
= match mask {
1277
0
            Some(mask) => mask,
1278
            None => {
1279
1
                return match child.null_count() {
1280
1
                    0 => Ok(()),
1281
0
                    _ => Err(ArrowError::InvalidArgumentError(format!(
1282
0
                        "non-nullable child of type {} contains nulls not present in parent {}",
1283
0
                        child.data_type, self.data_type
1284
0
                    ))),
1285
                }
1286
            }
1287
        };
1288
1289
0
        match child.nulls() {
1290
0
            Some(nulls) if !mask.contains(nulls) => Err(ArrowError::InvalidArgumentError(format!(
1291
0
                "non-nullable child of type {} contains nulls not present in parent",
1292
0
                child.data_type
1293
0
            ))),
1294
0
            _ => Ok(()),
1295
        }
1296
1
    }
1297
1298
    /// Validates the values stored within this [`ArrayData`] are valid
1299
    /// without recursing into child [`ArrayData`]
1300
    ///
1301
    /// Does not (yet) check
1302
    /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85)
1303
7
    pub fn validate_values(&self) -> Result<(), ArrowError> {
1304
7
        match &self.data_type {
1305
0
            DataType::Utf8 => self.validate_utf8::<i32>(),
1306
0
            DataType::LargeUtf8 => self.validate_utf8::<i64>(),
1307
0
            DataType::Binary => self.validate_offsets_full::<i32>(self.buffers[1].len()),
1308
0
            DataType::LargeBinary => self.validate_offsets_full::<i64>(self.buffers[1].len()),
1309
            DataType::BinaryView => {
1310
0
                let views = self.typed_buffer::<u128>(0, self.len)?;
1311
0
                validate_binary_view(views, &self.buffers[1..])
1312
            }
1313
            DataType::Utf8View => {
1314
0
                let views = self.typed_buffer::<u128>(0, self.len)?;
1315
0
                validate_string_view(views, &self.buffers[1..])
1316
            }
1317
            DataType::List(_) | DataType::Map(_, _) => {
1318
5
                let child = &self.child_data[0];
1319
5
                self.validate_offsets_full::<i32>(child.len)
1320
            }
1321
            DataType::LargeList(_) => {
1322
0
                let child = &self.child_data[0];
1323
0
                self.validate_offsets_full::<i64>(child.len)
1324
            }
1325
            DataType::Union(_, _) => {
1326
                // Validate Union Array as part of implementing new Union semantics
1327
                // See comments in `ArrayData::validate()`
1328
                // https://github.com/apache/arrow-rs/issues/85
1329
                //
1330
                // TODO file follow on ticket for full union validation
1331
0
                Ok(())
1332
            }
1333
0
            DataType::Dictionary(key_type, _value_type) => {
1334
0
                let dictionary_length: i64 = self.child_data[0].len.try_into().unwrap();
1335
0
                let max_value = dictionary_length - 1;
1336
0
                match key_type.as_ref() {
1337
0
                    DataType::UInt8 => self.check_bounds::<u8>(max_value),
1338
0
                    DataType::UInt16 => self.check_bounds::<u16>(max_value),
1339
0
                    DataType::UInt32 => self.check_bounds::<u32>(max_value),
1340
0
                    DataType::UInt64 => self.check_bounds::<u64>(max_value),
1341
0
                    DataType::Int8 => self.check_bounds::<i8>(max_value),
1342
0
                    DataType::Int16 => self.check_bounds::<i16>(max_value),
1343
0
                    DataType::Int32 => self.check_bounds::<i32>(max_value),
1344
0
                    DataType::Int64 => self.check_bounds::<i64>(max_value),
1345
0
                    _ => unreachable!(),
1346
                }
1347
            }
1348
0
            DataType::RunEndEncoded(run_ends, _values) => {
1349
0
                let run_ends_data = self.child_data()[0].clone();
1350
0
                match run_ends.data_type() {
1351
0
                    DataType::Int16 => run_ends_data.check_run_ends::<i16>(),
1352
0
                    DataType::Int32 => run_ends_data.check_run_ends::<i32>(),
1353
0
                    DataType::Int64 => run_ends_data.check_run_ends::<i64>(),
1354
0
                    _ => unreachable!(),
1355
                }
1356
            }
1357
            _ => {
1358
                // No extra validation check required for other types
1359
2
                Ok(())
1360
            }
1361
        }
1362
7
    }
1363
1364
    /// Calls the `validate(item_index, range)` function for each of
1365
    /// the ranges specified in the arrow offsets buffer of type
1366
    /// `T`. Also validates that each offset is smaller than
1367
    /// `offset_limit`
1368
    ///
1369
    /// For an empty array, the offsets buffer can either be empty
1370
    /// or contain a single `0`.
1371
    ///
1372
    /// For example, the offsets buffer contained `[1, 2, 4]`, this
1373
    /// function would call `validate([1,2])`, and `validate([2,4])`
1374
5
    fn validate_each_offset<T, V>(&self, offset_limit: usize, validate: V) -> Result<(), ArrowError>
1375
5
    where
1376
5
        T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
1377
5
        V: Fn(usize, Range<usize>) -> Result<(), ArrowError>,
1378
    {
1379
5
        self.typed_offsets::<T>()
?0
1380
5
            .iter()
1381
5
            .enumerate()
1382
35
            .
map5
(|(i, x)| {
1383
                // check if the offset can be converted to usize
1384
35
                let r = x.to_usize().ok_or_else(|| 
{0
1385
0
                    ArrowError::InvalidArgumentError(format!(
1386
0
                        "Offset invariant failure: Could not convert offset {x} to usize at position {i}"))}
1387
                    );
1388
                // check if the offset exceeds the limit
1389
35
                match r {
1390
35
                    Ok(n) if n <= offset_limit => Ok((i, n)),
1391
0
                    Ok(_) => Err(ArrowError::InvalidArgumentError(format!(
1392
0
                        "Offset invariant failure: offset at position {i} out of bounds: {x} > {offset_limit}"))
1393
0
                    ),
1394
0
                    Err(e) => Err(e),
1395
                }
1396
35
            })
1397
35
            .
scan5
(0_usize, |start, end| {
1398
                // check offsets are monotonically increasing
1399
35
                match end {
1400
35
                    Ok((i, end)) if *start <= end => {
1401
35
                        let range = Some(Ok((i, *start..end)));
1402
35
                        *start = end;
1403
35
                        range
1404
                    }
1405
0
                    Ok((i, end)) => Some(Err(ArrowError::InvalidArgumentError(format!(
1406
0
                        "Offset invariant failure: non-monotonic offset at slot {}: {} > {}",
1407
0
                        i - 1, start, end))
1408
0
                    )),
1409
0
                    Err(err) => Some(Err(err)),
1410
                }
1411
35
            })
1412
5
            .skip(1) // the first element is meaningless
1413
30
            .
try_for_each5
(|res: Result<(usize, Range<usize>), ArrowError>| {
1414
30
                let (item_index, range) = res
?0
;
1415
30
                validate(item_index-1, range)
1416
30
            })
1417
5
    }
1418
1419
    /// Ensures that all strings formed by the offsets in `buffers[0]`
1420
    /// into `buffers[1]` are valid utf8 sequences
1421
0
    fn validate_utf8<T>(&self) -> Result<(), ArrowError>
1422
0
    where
1423
0
        T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
1424
    {
1425
0
        let values_buffer = &self.buffers[1].as_slice();
1426
0
        if let Ok(values_str) = std::str::from_utf8(values_buffer) {
1427
            // Validate Offsets are correct
1428
0
            self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1429
0
                if !values_str.is_char_boundary(range.start)
1430
0
                    || !values_str.is_char_boundary(range.end)
1431
                {
1432
0
                    return Err(ArrowError::InvalidArgumentError(format!(
1433
0
                        "incomplete utf-8 byte sequence from index {string_index}"
1434
0
                    )));
1435
0
                }
1436
0
                Ok(())
1437
0
            })
1438
        } else {
1439
            // find specific offset that failed utf8 validation
1440
0
            self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1441
0
                std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
1442
0
                    ArrowError::InvalidArgumentError(format!(
1443
0
                        "Invalid UTF8 sequence at string index {string_index} ({range:?}): {e}"
1444
0
                    ))
1445
0
                })?;
1446
0
                Ok(())
1447
0
            })
1448
        }
1449
0
    }
1450
1451
    /// Ensures that all offsets in `buffers[0]` into `buffers[1]` are
1452
    /// between `0` and `offset_limit`
1453
5
    fn validate_offsets_full<T>(&self, offset_limit: usize) -> Result<(), ArrowError>
1454
5
    where
1455
5
        T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
1456
    {
1457
30
        
self5
.
validate_each_offset5
::<T, _>(
offset_limit5
, |_string_index, _range| {
1458
            // No validation applied to each value, but the iteration
1459
            // itself applies bounds checking to each range
1460
30
            Ok(())
1461
30
        })
1462
5
    }
1463
1464
    /// Validates that each non-null value in `self.buffers[0]` (typed as `T`)
1465
    /// is within the range [0, max_value], inclusive
1466
0
    fn check_bounds<T>(&self, max_value: i64) -> Result<(), ArrowError>
1467
0
    where
1468
0
        T: ArrowNativeType + TryInto<i64> + num::Num + std::fmt::Display,
1469
    {
1470
0
        let required_len = self.len + self.offset;
1471
0
        let buffer = &self.buffers[0];
1472
1473
        // This should have been checked as part of `validate()` prior
1474
        // to calling `validate_full()` but double check to be sure
1475
0
        assert!(buffer.len() / mem::size_of::<T>() >= required_len);
1476
1477
        // Justification: buffer size was validated above
1478
0
        let indexes: &[T] = &buffer.typed_data::<T>()[self.offset..self.offset + self.len];
1479
1480
0
        indexes.iter().enumerate().try_for_each(|(i, &dict_index)| {
1481
            // Do not check the value is null (value can be arbitrary)
1482
0
            if self.is_null(i) {
1483
0
                return Ok(());
1484
0
            }
1485
0
            let dict_index: i64 = dict_index.try_into().map_err(|_| {
1486
0
                ArrowError::InvalidArgumentError(format!(
1487
0
                    "Value at position {i} out of bounds: {dict_index} (can not convert to i64)"
1488
0
                ))
1489
0
            })?;
1490
1491
0
            if dict_index < 0 || dict_index > max_value {
1492
0
                return Err(ArrowError::InvalidArgumentError(format!(
1493
0
                    "Value at position {i} out of bounds: {dict_index} (should be in [0, {max_value}])"
1494
0
                )));
1495
0
            }
1496
0
            Ok(())
1497
0
        })
1498
0
    }
1499
1500
    /// Validates that each value in run_ends array is positive and strictly increasing.
1501
0
    fn check_run_ends<T>(&self) -> Result<(), ArrowError>
1502
0
    where
1503
0
        T: ArrowNativeType + TryInto<i64> + num::Num + std::fmt::Display,
1504
    {
1505
0
        let values = self.typed_buffer::<T>(0, self.len)?;
1506
0
        let mut prev_value: i64 = 0_i64;
1507
0
        values.iter().enumerate().try_for_each(|(ix, &inp_value)| {
1508
0
            let value: i64 = inp_value.try_into().map_err(|_| {
1509
0
                ArrowError::InvalidArgumentError(format!(
1510
0
                    "Value at position {ix} out of bounds: {inp_value} (can not convert to i64)"
1511
0
                ))
1512
0
            })?;
1513
0
            if value <= 0_i64 {
1514
0
                return Err(ArrowError::InvalidArgumentError(format!(
1515
0
                    "The values in run_ends array should be strictly positive. Found value {value} at index {ix} that does not match the criteria."
1516
0
                )));
1517
0
            }
1518
0
            if ix > 0 && value <= prev_value {
1519
0
                return Err(ArrowError::InvalidArgumentError(format!(
1520
0
                    "The values in run_ends array should be strictly increasing. Found value {value} at index {ix} with previous value {prev_value} that does not match the criteria."
1521
0
                )));
1522
0
            }
1523
1524
0
            prev_value = value;
1525
0
            Ok(())
1526
0
        })?;
1527
1528
0
        if prev_value.as_usize() < (self.offset + self.len) {
1529
0
            return Err(ArrowError::InvalidArgumentError(format!(
1530
0
                "The offset + length of array should be less or equal to last value in the run_ends array. The last value of run_ends array is {prev_value} and offset + length of array is {}.",
1531
0
                self.offset + self.len
1532
0
            )));
1533
0
        }
1534
0
        Ok(())
1535
0
    }
1536
1537
    /// Returns true if this `ArrayData` is equal to `other`, using pointer comparisons
1538
    /// to determine buffer equality. This is cheaper than `PartialEq::eq` but may
1539
    /// return false when the arrays are logically equal
1540
0
    pub fn ptr_eq(&self, other: &Self) -> bool {
1541
0
        if self.offset != other.offset
1542
0
            || self.len != other.len
1543
0
            || self.data_type != other.data_type
1544
0
            || self.buffers.len() != other.buffers.len()
1545
0
            || self.child_data.len() != other.child_data.len()
1546
        {
1547
0
            return false;
1548
0
        }
1549
1550
0
        match (&self.nulls, &other.nulls) {
1551
0
            (Some(a), Some(b)) if !a.inner().ptr_eq(b.inner()) => return false,
1552
0
            (Some(_), None) | (None, Some(_)) => return false,
1553
0
            _ => {}
1554
        };
1555
1556
0
        if !self
1557
0
            .buffers
1558
0
            .iter()
1559
0
            .zip(other.buffers.iter())
1560
0
            .all(|(a, b)| a.as_ptr() == b.as_ptr())
1561
        {
1562
0
            return false;
1563
0
        }
1564
1565
0
        self.child_data
1566
0
            .iter()
1567
0
            .zip(other.child_data.iter())
1568
0
            .all(|(a, b)| a.ptr_eq(b))
1569
0
    }
1570
1571
    /// Converts this [`ArrayData`] into an [`ArrayDataBuilder`]
1572
13
    pub fn into_builder(self) -> ArrayDataBuilder {
1573
13
        self.into()
1574
13
    }
1575
}
1576
1577
/// Return the [`DataTypeLayout`] that arrays of this data
1578
/// type are expected to have
1579
25
pub fn layout(data_type: &DataType) -> DataTypeLayout {
1580
    // based on C/C++ implementation in
1581
    // https://github.com/apache/arrow/blob/661c7d749150905a63dd3b52e0a04dac39030d95/cpp/src/arrow/type.h (and .cc)
1582
    use arrow_schema::IntervalUnit::*;
1583
1584
0
    match data_type {
1585
0
        DataType::Null => DataTypeLayout {
1586
0
            buffers: vec![],
1587
0
            can_contain_null_mask: false,
1588
0
            variadic: false,
1589
0
        },
1590
1
        DataType::Boolean => DataTypeLayout {
1591
1
            buffers: vec![BufferSpec::BitMap],
1592
1
            can_contain_null_mask: true,
1593
1
            variadic: false,
1594
1
        },
1595
0
        DataType::Int8 => DataTypeLayout::new_fixed_width::<i8>(),
1596
0
        DataType::Int16 => DataTypeLayout::new_fixed_width::<i16>(),
1597
0
        DataType::Int32 => DataTypeLayout::new_fixed_width::<i32>(),
1598
3
        DataType::Int64 => DataTypeLayout::new_fixed_width::<i64>(),
1599
0
        DataType::UInt8 => DataTypeLayout::new_fixed_width::<u8>(),
1600
0
        DataType::UInt16 => DataTypeLayout::new_fixed_width::<u16>(),
1601
0
        DataType::UInt32 => DataTypeLayout::new_fixed_width::<u32>(),
1602
0
        DataType::UInt64 => DataTypeLayout::new_fixed_width::<u64>(),
1603
0
        DataType::Float16 => DataTypeLayout::new_fixed_width::<half::f16>(),
1604
1
        DataType::Float32 => DataTypeLayout::new_fixed_width::<f32>(),
1605
0
        DataType::Float64 => DataTypeLayout::new_fixed_width::<f64>(),
1606
0
        DataType::Timestamp(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1607
0
        DataType::Date32 => DataTypeLayout::new_fixed_width::<i32>(),
1608
0
        DataType::Date64 => DataTypeLayout::new_fixed_width::<i64>(),
1609
0
        DataType::Time32(_) => DataTypeLayout::new_fixed_width::<i32>(),
1610
0
        DataType::Time64(_) => DataTypeLayout::new_fixed_width::<i64>(),
1611
0
        DataType::Interval(YearMonth) => DataTypeLayout::new_fixed_width::<i32>(),
1612
0
        DataType::Interval(DayTime) => DataTypeLayout::new_fixed_width::<IntervalDayTime>(),
1613
        DataType::Interval(MonthDayNano) => {
1614
0
            DataTypeLayout::new_fixed_width::<IntervalMonthDayNano>()
1615
        }
1616
0
        DataType::Duration(_) => DataTypeLayout::new_fixed_width::<i64>(),
1617
0
        DataType::Decimal32(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1618
0
        DataType::Decimal64(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1619
0
        DataType::Decimal128(_, _) => DataTypeLayout::new_fixed_width::<i128>(),
1620
0
        DataType::Decimal256(_, _) => DataTypeLayout::new_fixed_width::<i256>(),
1621
0
        DataType::FixedSizeBinary(size) => {
1622
0
            let spec = BufferSpec::FixedWidth {
1623
0
                byte_width: (*size).try_into().unwrap(),
1624
0
                alignment: mem::align_of::<u8>(),
1625
0
            };
1626
0
            DataTypeLayout {
1627
0
                buffers: vec![spec],
1628
0
                can_contain_null_mask: true,
1629
0
                variadic: false,
1630
0
            }
1631
        }
1632
0
        DataType::Binary => DataTypeLayout::new_binary::<i32>(),
1633
0
        DataType::LargeBinary => DataTypeLayout::new_binary::<i64>(),
1634
6
        DataType::Utf8 => DataTypeLayout::new_binary::<i32>(),
1635
0
        DataType::LargeUtf8 => DataTypeLayout::new_binary::<i64>(),
1636
0
        DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(),
1637
0
        DataType::FixedSizeList(_, _) => DataTypeLayout::new_nullable_empty(), // all in child data
1638
9
        DataType::List(_) => DataTypeLayout::new_fixed_width::<i32>(),
1639
0
        DataType::ListView(_) => DataTypeLayout::new_list_view::<i32>(),
1640
0
        DataType::LargeListView(_) => DataTypeLayout::new_list_view::<i64>(),
1641
0
        DataType::LargeList(_) => DataTypeLayout::new_fixed_width::<i64>(),
1642
0
        DataType::Map(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1643
5
        DataType::Struct(_) => DataTypeLayout::new_nullable_empty(), // all in child data,
1644
0
        DataType::RunEndEncoded(_, _) => DataTypeLayout::new_empty(), // all in child data,
1645
0
        DataType::Union(_, mode) => {
1646
0
            let type_ids = BufferSpec::FixedWidth {
1647
0
                byte_width: mem::size_of::<i8>(),
1648
0
                alignment: mem::align_of::<i8>(),
1649
0
            };
1650
1651
            DataTypeLayout {
1652
0
                buffers: match mode {
1653
                    UnionMode::Sparse => {
1654
0
                        vec![type_ids]
1655
                    }
1656
                    UnionMode::Dense => {
1657
0
                        vec![
1658
0
                            type_ids,
1659
0
                            BufferSpec::FixedWidth {
1660
0
                                byte_width: mem::size_of::<i32>(),
1661
0
                                alignment: mem::align_of::<i32>(),
1662
0
                            },
1663
                        ]
1664
                    }
1665
                },
1666
                can_contain_null_mask: false,
1667
                variadic: false,
1668
            }
1669
        }
1670
0
        DataType::Dictionary(key_type, _value_type) => layout(key_type),
1671
    }
1672
25
}
1673
1674
/// Layout specification for a data type
1675
#[derive(Debug, PartialEq, Eq)]
1676
// Note: Follows structure from C++: https://github.com/apache/arrow/blob/master/cpp/src/arrow/type.h#L91
1677
pub struct DataTypeLayout {
1678
    /// A vector of buffer layout specifications, one for each expected buffer
1679
    pub buffers: Vec<BufferSpec>,
1680
1681
    /// Can contain a null bitmask
1682
    pub can_contain_null_mask: bool,
1683
1684
    /// This field only applies to the view types [`DataType::BinaryView`] and [`DataType::Utf8View`]
1685
    /// If `variadic` is true, the number of buffers expected is only lower-bounded by
1686
    /// buffers.len(). Buffers that exceed the lower bound are legal.
1687
    pub variadic: bool,
1688
}
1689
1690
impl DataTypeLayout {
1691
    /// Describes a basic numeric array where each element has type `T`
1692
13
    pub fn new_fixed_width<T>() -> Self {
1693
13
        Self {
1694
13
            buffers: vec![BufferSpec::FixedWidth {
1695
13
                byte_width: mem::size_of::<T>(),
1696
13
                alignment: mem::align_of::<T>(),
1697
13
            }],
1698
13
            can_contain_null_mask: true,
1699
13
            variadic: false,
1700
13
        }
1701
13
    }
1702
1703
    /// Describes arrays which have no data of their own
1704
    /// but may still have a Null Bitmap (e.g. FixedSizeList)
1705
5
    pub fn new_nullable_empty() -> Self {
1706
5
        Self {
1707
5
            buffers: vec![],
1708
5
            can_contain_null_mask: true,
1709
5
            variadic: false,
1710
5
        }
1711
5
    }
1712
1713
    /// Describes arrays which have no data of their own
1714
    /// (e.g. RunEndEncoded).
1715
0
    pub fn new_empty() -> Self {
1716
0
        Self {
1717
0
            buffers: vec![],
1718
0
            can_contain_null_mask: false,
1719
0
            variadic: false,
1720
0
        }
1721
0
    }
1722
1723
    /// Describes a variable-width array (e.g. binary or string data)
1724
    /// with a fixed-width offset buffer of type `T`, followed by a
1725
    /// variable width data buffer
1726
6
    pub fn new_binary<T>() -> Self {
1727
6
        Self {
1728
6
            buffers: vec![
1729
6
                // offsets
1730
6
                BufferSpec::FixedWidth {
1731
6
                    byte_width: mem::size_of::<T>(),
1732
6
                    alignment: mem::align_of::<T>(),
1733
6
                },
1734
6
                // values
1735
6
                BufferSpec::VariableWidth,
1736
6
            ],
1737
6
            can_contain_null_mask: true,
1738
6
            variadic: false,
1739
6
        }
1740
6
    }
1741
1742
    /// Describes a view type
1743
0
    pub fn new_view() -> Self {
1744
0
        Self {
1745
0
            buffers: vec![BufferSpec::FixedWidth {
1746
0
                byte_width: mem::size_of::<u128>(),
1747
0
                alignment: mem::align_of::<u128>(),
1748
0
            }],
1749
0
            can_contain_null_mask: true,
1750
0
            variadic: true,
1751
0
        }
1752
0
    }
1753
1754
    /// Describes a list view type
1755
0
    pub fn new_list_view<T>() -> Self {
1756
0
        Self {
1757
0
            buffers: vec![
1758
0
                BufferSpec::FixedWidth {
1759
0
                    byte_width: mem::size_of::<T>(),
1760
0
                    alignment: mem::align_of::<T>(),
1761
0
                },
1762
0
                BufferSpec::FixedWidth {
1763
0
                    byte_width: mem::size_of::<T>(),
1764
0
                    alignment: mem::align_of::<T>(),
1765
0
                },
1766
0
            ],
1767
0
            can_contain_null_mask: true,
1768
0
            variadic: true,
1769
0
        }
1770
0
    }
1771
}
1772
1773
/// Layout specification for a single data type buffer
1774
#[derive(Debug, PartialEq, Eq)]
1775
pub enum BufferSpec {
1776
    /// Each element is a fixed width primitive, with the given `byte_width` and `alignment`
1777
    ///
1778
    /// `alignment` is the alignment required by Rust for an array of the corresponding primitive,
1779
    /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`].
1780
    ///
1781
    /// Arrow-rs requires that all buffers have at least this alignment, to allow for
1782
    /// [slice](std::slice) based APIs. Alignment in excess of this is not required to allow
1783
    /// for array slicing and interoperability with `Vec`, which cannot be over-aligned.
1784
    ///
1785
    /// Note that these alignment requirements will vary between architectures
1786
    FixedWidth {
1787
        /// The width of each element in bytes
1788
        byte_width: usize,
1789
        /// The alignment required by Rust for an array of the corresponding primitive
1790
        alignment: usize,
1791
    },
1792
    /// Variable width, such as string data for utf8 data
1793
    VariableWidth,
1794
    /// Buffer holds a bitmap.
1795
    ///
1796
    /// Note: Unlike the C++ implementation, the null/validity buffer
1797
    /// is handled specially rather than as another of the buffers in
1798
    /// the spec, so this variant is only used for the Boolean type.
1799
    BitMap,
1800
    /// Buffer is always null. Unused currently in Rust implementation,
1801
    /// (used in C++ for Union type)
1802
    #[allow(dead_code)]
1803
    AlwaysNull,
1804
}
1805
1806
impl PartialEq for ArrayData {
1807
439
    fn eq(&self, other: &Self) -> bool {
1808
439
        equal::equal(self, other)
1809
439
    }
1810
}
1811
1812
/// A boolean flag that cannot be mutated outside of unsafe code.
1813
///
1814
/// Defaults to a value of false.
1815
///
1816
/// This structure is used to enforce safety in the [`ArrayDataBuilder`]
1817
///
1818
/// [`ArrayDataBuilder`]: super::ArrayDataBuilder
1819
///
1820
/// # Example
1821
/// ```rust
1822
/// use arrow_data::UnsafeFlag;
1823
/// assert!(!UnsafeFlag::default().get()); // default is false
1824
/// let mut flag = UnsafeFlag::new();
1825
/// assert!(!flag.get()); // defaults to false
1826
/// // can only set it to true in unsafe code
1827
/// unsafe { flag.set(true) };
1828
/// assert!(flag.get()); // now true
1829
/// ```
1830
#[derive(Debug, Clone)]
1831
#[doc(hidden)]
1832
pub struct UnsafeFlag(bool);
1833
1834
impl UnsafeFlag {
1835
    /// Creates a new `UnsafeFlag` with the value set to `false`.
1836
    ///
1837
    /// See examples on [`UnsafeFlag`]
1838
    #[inline]
1839
1.45k
    pub const fn new() -> Self {
1840
1.45k
        Self(false)
1841
1.45k
    }
1842
1843
    /// Sets the value of the flag to the given value
1844
    ///
1845
    /// Note this can purposely only be done in `unsafe` code
1846
    ///
1847
    /// # Safety
1848
    ///
1849
    /// If set, the flag will be set to the given value. There is nothing
1850
    /// immediately unsafe about doing so, however, the flag can be used to
1851
    /// subsequently bypass safety checks in the [`ArrayDataBuilder`].
1852
    #[inline]
1853
1.45k
    pub unsafe fn set(&mut self, val: bool) {
1854
1.45k
        self.0 = val;
1855
1.45k
    }
1856
1857
    /// Returns the value of the flag
1858
    #[inline]
1859
1.45k
    pub fn get(&self) -> bool {
1860
1.45k
        self.0
1861
1.45k
    }
1862
}
1863
1864
// Manual impl to make it clear you can not construct unsafe with true
1865
impl Default for UnsafeFlag {
1866
0
    fn default() -> Self {
1867
0
        Self::new()
1868
0
    }
1869
}
1870
1871
/// Builder for [`ArrayData`] type
1872
#[derive(Debug)]
1873
pub struct ArrayDataBuilder {
1874
    data_type: DataType,
1875
    len: usize,
1876
    null_count: Option<usize>,
1877
    null_bit_buffer: Option<Buffer>,
1878
    nulls: Option<NullBuffer>,
1879
    offset: usize,
1880
    buffers: Vec<Buffer>,
1881
    child_data: Vec<ArrayData>,
1882
    /// Should buffers be realigned (copying if necessary)?
1883
    ///
1884
    /// Defaults to false.
1885
    align_buffers: bool,
1886
    /// Should data validation be skipped for this [`ArrayData`]?
1887
    ///
1888
    /// Defaults to false.
1889
    ///
1890
    /// # Safety
1891
    ///
1892
    /// This flag can only be set to true using `unsafe` APIs. However, once true
1893
    /// subsequent calls to `build()` may result in undefined behavior if the data
1894
    /// is not valid.
1895
    skip_validation: UnsafeFlag,
1896
}
1897
1898
impl ArrayDataBuilder {
1899
    #[inline]
1900
    /// Creates a new array data builder
1901
1.41k
    pub const fn new(data_type: DataType) -> Self {
1902
1.41k
        Self {
1903
1.41k
            data_type,
1904
1.41k
            len: 0,
1905
1.41k
            null_count: None,
1906
1.41k
            null_bit_buffer: None,
1907
1.41k
            nulls: None,
1908
1.41k
            offset: 0,
1909
1.41k
            buffers: vec![],
1910
1.41k
            child_data: vec![],
1911
1.41k
            align_buffers: false,
1912
1.41k
            skip_validation: UnsafeFlag::new(),
1913
1.41k
        }
1914
1.41k
    }
1915
1916
    /// Creates a new array data builder from an existing one, changing the data type
1917
13
    pub fn data_type(self, data_type: DataType) -> Self {
1918
13
        Self { data_type, ..self }
1919
13
    }
1920
1921
    #[inline]
1922
    #[allow(clippy::len_without_is_empty)]
1923
    /// Sets the length of the [ArrayData]
1924
1.41k
    pub const fn len(mut self, n: usize) -> Self {
1925
1.41k
        self.len = n;
1926
1.41k
        self
1927
1.41k
    }
1928
1929
    /// Sets the null buffer of the [ArrayData]
1930
1.37k
    pub fn nulls(mut self, nulls: Option<NullBuffer>) -> Self {
1931
1.37k
        self.nulls = nulls;
1932
1.37k
        self.null_count = None;
1933
1.37k
        self.null_bit_buffer = None;
1934
1.37k
        self
1935
1.37k
    }
1936
1937
    /// Sets the null count of the [ArrayData]
1938
0
    pub fn null_count(mut self, null_count: usize) -> Self {
1939
0
        self.null_count = Some(null_count);
1940
0
        self
1941
0
    }
1942
1943
    /// Sets the `null_bit_buffer` of the [ArrayData]
1944
5
    pub fn null_bit_buffer(mut self, buf: Option<Buffer>) -> Self {
1945
5
        self.nulls = None;
1946
5
        self.null_bit_buffer = buf;
1947
5
        self
1948
5
    }
1949
1950
    /// Sets the offset of the [ArrayData]
1951
    #[inline]
1952
93
    pub const fn offset(mut self, n: usize) -> Self {
1953
93
        self.offset = n;
1954
93
        self
1955
93
    }
1956
1957
    /// Sets the buffers of the [ArrayData]
1958
1.12k
    pub fn buffers(mut self, v: Vec<Buffer>) -> Self {
1959
1.12k
        self.buffers = v;
1960
1.12k
        self
1961
1.12k
    }
1962
1963
    /// Adds a single buffer to the [ArrayData]'s buffers
1964
214
    pub fn add_buffer(mut self, b: Buffer) -> Self {
1965
214
        self.buffers.push(b);
1966
214
        self
1967
214
    }
1968
1969
    /// Adds multiple buffers to the [ArrayData]'s buffers
1970
0
    pub fn add_buffers<I: IntoIterator<Item = Buffer>>(mut self, bs: I) -> Self {
1971
0
        self.buffers.extend(bs);
1972
0
        self
1973
0
    }
1974
1975
    /// Sets the child data of the [ArrayData]
1976
246
    pub fn child_data(mut self, v: Vec<ArrayData>) -> Self {
1977
246
        self.child_data = v;
1978
246
        self
1979
246
    }
1980
1981
    /// Adds a single child data to the [ArrayData]'s child data
1982
6
    pub fn add_child_data(mut self, r: ArrayData) -> Self {
1983
6
        self.child_data.push(r);
1984
6
        self
1985
6
    }
1986
1987
    /// Creates an array data, without any validation
1988
    ///
1989
    /// Note: This is shorthand for
1990
    /// ```rust
1991
    /// # let mut builder = arrow_data::ArrayDataBuilder::new(arrow_schema::DataType::Null);
1992
    /// # let _ = unsafe {
1993
    /// builder.skip_validation(true).build().unwrap()
1994
    /// # };
1995
    /// ```
1996
    ///
1997
    /// # Safety
1998
    ///
1999
    /// The same caveats as [`ArrayData::new_unchecked`]
2000
    /// apply.
2001
1.41k
    pub unsafe fn build_unchecked(self) -> ArrayData {
2002
1.41k
        self.skip_validation(true).build().unwrap()
2003
1.41k
    }
2004
2005
    /// Creates an `ArrayData`, consuming `self`
2006
    ///
2007
    /// # Safety
2008
    ///
2009
    /// By default the underlying buffers are checked to ensure they are valid
2010
    /// Arrow data. However, if the [`Self::skip_validation`] flag has been set
2011
    /// to true (by the `unsafe` API) this validation is skipped. If the data is
2012
    /// not valid, undefined behavior will result.
2013
1.45k
    pub fn build(self) -> Result<ArrayData, ArrowError> {
2014
        let Self {
2015
1.45k
            data_type,
2016
1.45k
            len,
2017
1.45k
            null_count,
2018
1.45k
            null_bit_buffer,
2019
1.45k
            nulls,
2020
1.45k
            offset,
2021
1.45k
            buffers,
2022
1.45k
            child_data,
2023
1.45k
            align_buffers,
2024
1.45k
            skip_validation,
2025
1.45k
        } = self;
2026
2027
1.45k
        let nulls = nulls
2028
1.45k
            .or_else(|| 
{1.27k
2029
1.27k
                let 
buffer39
= null_bit_buffer
?1.24k
;
2030
39
                let buffer = BooleanBuffer::new(buffer, offset, len);
2031
39
                Some(match null_count {
2032
0
                    Some(n) => {
2033
                        // SAFETY: call to `data.validate_data()` below validates the null buffer is valid
2034
0
                        unsafe { NullBuffer::new_unchecked(buffer, n) }
2035
                    }
2036
39
                    None => NullBuffer::new(buffer),
2037
                })
2038
1.27k
            })
2039
1.45k
            .filter(|b| 
b219
.
null_count219
() != 0);
2040
2041
1.45k
        let mut data = ArrayData {
2042
1.45k
            data_type,
2043
1.45k
            len,
2044
1.45k
            offset,
2045
1.45k
            buffers,
2046
1.45k
            child_data,
2047
1.45k
            nulls,
2048
1.45k
        };
2049
2050
1.45k
        if align_buffers {
2051
0
            data.align_buffers();
2052
1.45k
        }
2053
2054
        // SAFETY: `skip_validation` is only set to true using `unsafe` APIs
2055
1.45k
        if !skip_validation.get() || 
cfg!1.45k
(feature = "force_validate") {
2056
7
            data.validate_data()
?0
;
2057
1.45k
        }
2058
1.45k
        Ok(data)
2059
1.45k
    }
2060
2061
    /// Creates an array data, validating all inputs, and aligning any buffers
2062
    #[deprecated(since = "54.1.0", note = "Use ArrayData::align_buffers instead")]
2063
0
    pub fn build_aligned(self) -> Result<ArrayData, ArrowError> {
2064
0
        self.align_buffers(true).build()
2065
0
    }
2066
2067
    /// Ensure that all buffers are aligned, copying data if necessary
2068
    ///
2069
    /// Rust requires that arrays are aligned to their corresponding primitive,
2070
    /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`].
2071
    ///
2072
    /// [`ArrayData`] therefore requires that all buffers have at least this alignment,
2073
    /// to allow for [slice](std::slice) based APIs. See [`BufferSpec::FixedWidth`].
2074
    ///
2075
    /// As this alignment is architecture specific, and not guaranteed by all arrow implementations,
2076
    /// this flag is provided to automatically copy buffers to a new correctly aligned allocation
2077
    /// when necessary, making it useful when interacting with buffers produced by other systems,
2078
    /// e.g. IPC or FFI.
2079
    ///
2080
    /// If this flag is not enabled, [`Self::build`] returns an error on encountering
2081
    /// insufficiently aligned buffers.
2082
0
    pub fn align_buffers(mut self, align_buffers: bool) -> Self {
2083
0
        self.align_buffers = align_buffers;
2084
0
        self
2085
0
    }
2086
2087
    /// Skips validation of the data.
2088
    ///
2089
    /// If this flag is enabled, [`Self::build`] will skip validation of the
2090
    /// data
2091
    ///
2092
    /// If this flag is not enabled, [`Self::build`] will validate that all
2093
    /// buffers are valid and will return an error if any data is invalid.
2094
    /// Validation can be expensive.
2095
    ///
2096
    /// # Safety
2097
    ///
2098
    /// If validation is skipped, the buffers must form a valid Arrow array,
2099
    /// otherwise undefined behavior will result
2100
1.41k
    pub unsafe fn skip_validation(mut self, skip_validation: bool) -> Self {
2101
1.41k
        self.skip_validation.set(skip_validation);
2102
1.41k
        self
2103
1.41k
    }
2104
}
2105
2106
impl From<ArrayData> for ArrayDataBuilder {
2107
13
    /// Creates a builder pre-populated with the contents of an existing [`ArrayData`].
    ///
    /// The data type, length, offset, buffers, child data and null buffer are moved
    /// out of `d`; every other builder option is reset as noted on the fields below.
    fn from(d: ArrayData) -> Self {
2108
13
        Self {
2109
13
            data_type: d.data_type,
2110
13
            len: d.len,
2111
13
            offset: d.offset,
2112
13
            buffers: d.buffers,
2113
13
            child_data: d.child_data,
2114
13
            nulls: d.nulls,
2115
13
            null_bit_buffer: None, // validity is carried by `nulls` above
2116
13
            null_count: None,
2117
13
            align_buffers: false, // alignment copying must be re-enabled explicitly
2118
13
            skip_validation: UnsafeFlag::new(), // fresh flag; presumably unset, i.e. validation enabled (confirm against `UnsafeFlag`)
2119
13
        }
2120
13
    }
2121
}
2122
2123
#[cfg(test)]
2124
mod tests {
2125
    use super::*;
2126
    use arrow_schema::{Field, Fields};
2127
2128
    // See arrow/tests/array_data_validation.rs for tests of array validation
2129
2130
    /// Returns a [`Buffer`] holding `n` copies of the constant `42i32`, for use in tests.
2131
    fn make_i32_buffer(n: usize) -> Buffer {
2132
        Buffer::from_slice_ref(vec![42i32; n])
2133
    }
2134
2135
    /// Returns a [`Buffer`] holding `n` copies of the constant `42f32`, for use in tests.
2136
    fn make_f32_buffer(n: usize) -> Buffer {
2137
        Buffer::from_slice_ref(vec![42f32; n])
2138
    }
2139
2140
    /// Builds an `Int32` array with an offset and a validity bitmap, then checks the
    /// length, null count, offset and buffers reported by the resulting `ArrayData`.
    #[test]
2141
    fn test_builder() {
2142
        // offset (5) + len (20) means the values buffer needs at least 25 elements
2143
        let v = (0..25).collect::<Vec<i32>>();
2144
        let b1 = Buffer::from_slice_ref(&v);
2145
        let arr_data = ArrayData::builder(DataType::Int32)
2146
            .len(20)
2147
            .offset(5)
2148
            .add_buffer(b1)
2149
            .null_bit_buffer(Some(Buffer::from([
2150
                0b01011111, 0b10110101, 0b01100011, 0b00011110,
2151
            ])))
2152
            .build()
2153
            .unwrap();
2154
2155
        assert_eq!(20, arr_data.len());
2156
        assert_eq!(10, arr_data.null_count()); // bits 5..25 of the bitmap contain 10 set bits
2157
        assert_eq!(5, arr_data.offset());
2158
        assert_eq!(1, arr_data.buffers().len());
2159
        // the values buffer is stored whole; the offset is not applied to its bytes
        assert_eq!(
2160
            Buffer::from_slice_ref(&v).as_slice(),
2161
            arr_data.buffers()[0].as_slice()
2162
        );
2163
    }
2164
2165
    /// Builds a `Struct` array with a single `Int32` child and checks that the child
    /// is exposed unchanged through `child_data()`.
    #[test]
2166
    fn test_builder_with_child_data() {
2167
        // child: five non-null Int32 values, no offset
        let child_arr_data = ArrayData::try_new(
2168
            DataType::Int32,
2169
            5,
2170
            None,
2171
            0,
2172
            vec![Buffer::from_slice_ref([1i32, 2, 3, 4, 5])],
2173
            vec![],
2174
        )
2175
        .unwrap();
2176
2177
        let field = Arc::new(Field::new("x", DataType::Int32, true));
2178
        let data_type = DataType::Struct(vec![field].into());
2179
2180
        let arr_data = ArrayData::builder(data_type)
2181
            .len(5)
2182
            .offset(0)
2183
            .add_child_data(child_arr_data.clone())
2184
            .build()
2185
            .unwrap();
2186
2187
        assert_eq!(5, arr_data.len());
2188
        assert_eq!(1, arr_data.child_data().len());
2189
        assert_eq!(child_arr_data, arr_data.child_data()[0]);
2190
    }
2191
2192
    /// Checks `null_count()` against a hand-built validity bitmap, first without and
    /// then with an array offset.
    #[test]
2193
    fn test_null_count() {
2194
        // 16-bit validity bitmap with only bits 0, 3 and 10 set
        let mut bit_v: [u8; 2] = [0; 2];
2195
        bit_util::set_bit(&mut bit_v, 0);
2196
        bit_util::set_bit(&mut bit_v, 3);
2197
        bit_util::set_bit(&mut bit_v, 10);
2198
        let arr_data = ArrayData::builder(DataType::Int32)
2199
            .len(16)
2200
            .add_buffer(make_i32_buffer(16))
2201
            .null_bit_buffer(Some(Buffer::from(bit_v)))
2202
            .build()
2203
            .unwrap();
2204
        assert_eq!(13, arr_data.null_count()); // 16 slots - 3 set bits
2205
2206
        // Test with offset
2207
        let mut bit_v: [u8; 2] = [0; 2];
2208
        bit_util::set_bit(&mut bit_v, 0);
2209
        bit_util::set_bit(&mut bit_v, 3);
2210
        bit_util::set_bit(&mut bit_v, 10);
2211
        let arr_data = ArrayData::builder(DataType::Int32)
2212
            .len(12)
2213
            .offset(2)
2214
            .add_buffer(make_i32_buffer(14)) // offset (2) + len (12) = 14 i32 values
2215
            .null_bit_buffer(Some(Buffer::from(bit_v)))
2216
            .build()
2217
            .unwrap();
2218
        assert_eq!(10, arr_data.null_count()); // bits 2..14 contain set bits 3 and 10: 12 - 2
2219
    }
2220
2221
    /// Checks that the validity bitmap handed to the builder is exposed, byte for
    /// byte, through `nulls()`.
    #[test]
2222
    fn test_null_buffer_ref() {
2223
        // 16-bit validity bitmap with only bits 0, 3 and 10 set
        let mut bit_v: [u8; 2] = [0; 2];
2224
        bit_util::set_bit(&mut bit_v, 0);
2225
        bit_util::set_bit(&mut bit_v, 3);
2226
        bit_util::set_bit(&mut bit_v, 10);
2227
        let arr_data = ArrayData::builder(DataType::Int32)
2228
            .len(16)
2229
            .add_buffer(make_i32_buffer(16))
2230
            .null_bit_buffer(Some(Buffer::from(bit_v)))
2231
            .build()
2232
            .unwrap();
2233
        assert!(arr_data.nulls().is_some());
2234
        assert_eq!(&bit_v, arr_data.nulls().unwrap().validity());
2235
    }
2236
2237
    /// Checks the length, offset and null count of an `ArrayData` after `slice`,
    /// including taking a slice of a slice.
    #[test]
2238
    fn test_slice() {
2239
        // 16-bit validity bitmap with only bits 0, 3 and 10 set
        let mut bit_v: [u8; 2] = [0; 2];
2240
        bit_util::set_bit(&mut bit_v, 0);
2241
        bit_util::set_bit(&mut bit_v, 3);
2242
        bit_util::set_bit(&mut bit_v, 10);
2243
        let data = ArrayData::builder(DataType::Int32)
2244
            .len(16)
2245
            .add_buffer(make_i32_buffer(16))
2246
            .null_bit_buffer(Some(Buffer::from(bit_v)))
2247
            .build()
2248
            .unwrap();
2249
        // drops slot 0, whose validity bit is set, so the null count is unchanged
        let new_data = data.slice(1, 15);
2250
        assert_eq!(data.len() - 1, new_data.len());
2251
        assert_eq!(1, new_data.offset());
2252
        assert_eq!(data.null_count(), new_data.null_count());
2253
2254
        // slice of a slice (removes one null)
2255
        let new_data = new_data.slice(1, 14);
2256
        assert_eq!(data.len() - 2, new_data.len());
2257
        assert_eq!(2, new_data.offset()); // offsets accumulate across nested slices
2258
        assert_eq!(data.null_count() - 1, new_data.null_count());
2259
    }
2260
2261
    /// Exercises `==` / `!=` and `ptr_eq` on `ArrayData` across different data types,
    /// clones and slices.
    #[test]
2262
    fn test_equality() {
2263
        let int_data = ArrayData::builder(DataType::Int32)
2264
            .len(1)
2265
            .add_buffer(make_i32_buffer(1))
2266
            .build()
2267
            .unwrap();
2268
2269
        let float_data = ArrayData::builder(DataType::Float32)
2270
            .len(1)
2271
            .add_buffer(make_f32_buffer(1))
2272
            .build()
2273
            .unwrap();
2274
        assert_ne!(int_data, float_data);
2275
        assert!(!int_data.ptr_eq(&float_data));
2276
        assert!(int_data.ptr_eq(&int_data));
2277
2278
        #[allow(clippy::redundant_clone)] // the clone itself is what is being tested
2279
        let int_data_clone = int_data.clone();
2280
        assert_eq!(int_data, int_data_clone);
2281
        assert!(int_data.ptr_eq(&int_data_clone));
2282
        assert!(int_data_clone.ptr_eq(&int_data));
2283
2284
        // empty slice starting at offset 1: expected to be pointer-unequal to the unsliced array
        let int_data_slice = int_data_clone.slice(1, 0);
2285
        assert!(int_data_slice.ptr_eq(&int_data_slice));
2286
        assert!(!int_data.ptr_eq(&int_data_slice));
2287
        assert!(!int_data_slice.ptr_eq(&int_data));
2288
2289
        // Utf8 array of three slots ("ab", null, "cde") built from offsets + data buffers
        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2290
        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2291
        let string_data = ArrayData::try_new(
2292
            DataType::Utf8,
2293
            3,
2294
            Some(Buffer::from_iter(vec![true, false, true])),
2295
            0,
2296
            vec![offsets_buffer, data_buffer],
2297
            vec![],
2298
        )
2299
        .unwrap();
2300
2301
        assert_ne!(float_data, string_data);
2302
        assert!(!float_data.ptr_eq(&string_data));
2303
2304
        assert!(string_data.ptr_eq(&string_data));
2305
2306
        #[allow(clippy::redundant_clone)] // the clone itself is what is being tested
2307
        let string_data_cloned = string_data.clone();
2308
        assert!(string_data_cloned.ptr_eq(&string_data));
2309
        assert!(string_data.ptr_eq(&string_data_cloned));
2310
2311
        let string_data_slice = string_data.slice(1, 2);
2312
        assert!(string_data_slice.ptr_eq(&string_data_slice));
2313
        assert!(!string_data_slice.ptr_eq(&string_data))
2314
    }
2315
2316
    /// Checks how `get_slice_memory_size` changes when an array is sliced, for both
    /// an `Int32` array and a `Utf8` array.
    #[test]
2317
    fn test_slice_memory_size() {
2318
        // 16-bit validity bitmap with only bits 0, 3 and 10 set
        let mut bit_v: [u8; 2] = [0; 2];
2319
        bit_util::set_bit(&mut bit_v, 0);
2320
        bit_util::set_bit(&mut bit_v, 3);
2321
        bit_util::set_bit(&mut bit_v, 10);
2322
        let data = ArrayData::builder(DataType::Int32)
2323
            .len(16)
2324
            .add_buffer(make_i32_buffer(16))
2325
            .null_bit_buffer(Some(Buffer::from(bit_v)))
2326
            .build()
2327
            .unwrap();
2328
        let new_data = data.slice(1, 14);
2329
        // 16 -> 14 Int32 values: two fewer 4-byte values, i.e. 8 bytes
        assert_eq!(
2330
            data.get_slice_memory_size().unwrap() - 8,
2331
            new_data.get_slice_memory_size().unwrap()
2332
        );
2333
        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2334
        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2335
        let string_data = ArrayData::try_new(
2336
            DataType::Utf8,
2337
            3,
2338
            Some(Buffer::from_iter(vec![true, false, true])),
2339
            0,
2340
            vec![offsets_buffer, data_buffer],
2341
            vec![],
2342
        )
2343
        .unwrap();
2344
        let string_data_slice = string_data.slice(1, 2);
2345
        // 4 bytes of offset and 2 bytes of data reduced by slicing.
2346
        assert_eq!(
2347
            string_data.get_slice_memory_size().unwrap() - 6,
2348
            string_data_slice.get_slice_memory_size().unwrap()
2349
        );
2350
    }
2351
2352
    /// Checks `count_nulls` over a whole 16-bit validity bitmap and over a sub-range of it.
    #[test]
2353
    fn test_count_nulls() {
2354
        let buffer = Buffer::from([0b00010110, 0b10011111]);
2355
        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 16));
2356
        let count = count_nulls(Some(&buffer), 0, 16);
2357
        assert_eq!(count, 7); // 9 of the 16 bits are set
2358
2359
        let count = count_nulls(Some(&buffer), 4, 8);
2360
        assert_eq!(count, 3); // 5 of the 8 bits starting at bit 4 are set
2361
    }
2362
2363
    /// Checks `contains_nulls` on ranges that do and do not include unset validity
    /// bits, and on an empty range.
    #[test]
2364
    fn test_contains_nulls() {
2365
        // validity: slots 3 and 4 are valid, every other slot is null
        let buffer: Buffer =
2366
            MutableBuffer::from_iter([false, false, false, true, true, false]).into();
2367
        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 6));
2368
        assert!(contains_nulls(Some(&buffer), 0, 6)); // whole range
2369
        assert!(contains_nulls(Some(&buffer), 0, 3)); // only null slots
2370
        assert!(!contains_nulls(Some(&buffer), 3, 2)); // only the two valid slots
2371
        assert!(!contains_nulls(Some(&buffer), 0, 0)); // empty range
2372
    }
2373
2374
    /// Checks that `validate` rejects a misaligned `Int32` buffer and that
    /// `ArrayData::align_buffers` makes the data valid again.
    #[test]
2375
    fn test_alignment() {
2376
        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2377
        let sliced = buffer.slice(1); // off by 1 from 4-byte alignment, per the error asserted below
2378
2379
        let mut data = ArrayData {
2380
            data_type: DataType::Int32,
2381
            len: 0,
2382
            offset: 0,
2383
            buffers: vec![buffer],
2384
            child_data: vec![],
2385
            nulls: None,
2386
        };
2387
        data.validate_full().unwrap();
2388
2389
        // break alignment in data
2390
        data.buffers[0] = sliced;
2391
        let err = data.validate().unwrap_err();
2392
2393
        assert_eq!(
2394
            err.to_string(),
2395
            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2396
        );
2397
2398
        // after re-aligning, full validation passes again
        data.align_buffers();
2399
        data.validate_full().unwrap();
2400
    }
2401
2402
    /// Same as `test_alignment`, but the misaligned buffer belongs to the child of a
    /// `Struct` array, so both validation and alignment must reach into child data.
    #[test]
2403
    fn test_alignment_struct() {
2404
        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2405
        let sliced = buffer.slice(1); // off by 1 from 4-byte alignment, per the error asserted below
2406
2407
        let child_data = ArrayData {
2408
            data_type: DataType::Int32,
2409
            len: 0,
2410
            offset: 0,
2411
            buffers: vec![buffer],
2412
            child_data: vec![],
2413
            nulls: None,
2414
        };
2415
2416
        let schema = DataType::Struct(Fields::from(vec![Field::new("a", DataType::Int32, false)]));
2417
        let mut data = ArrayData {
2418
            data_type: schema,
2419
            len: 0,
2420
            offset: 0,
2421
            buffers: vec![],
2422
            child_data: vec![child_data],
2423
            nulls: None,
2424
        };
2425
        data.validate_full().unwrap();
2426
2427
        // break alignment in child data
2428
        data.child_data[0].buffers[0] = sliced;
2429
        let err = data.validate().unwrap_err();
2430
2431
        assert_eq!(
2432
            err.to_string(),
2433
            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2434
        );
2435
2436
        // aligning the parent is expected to fix the child buffer as well
        data.align_buffers();
2437
        data.validate_full().unwrap();
2438
    }
2439
2440
    /// Checks that `ArrayData::new_null` for `BinaryView` and `Utf8View` yields an
    /// array of the requested length in which every slot is null.
    #[test]
2441
    fn test_null_view_types() {
2442
        let array_len = 32;
2443
        let array = ArrayData::new_null(&DataType::BinaryView, array_len);
2444
        assert_eq!(array.len(), array_len);
2445
        for i in 0..array.len() {
2446
            assert!(array.is_null(i));
2447
        }
2448
2449
        let array = ArrayData::new_null(&DataType::Utf8View, array_len);
2450
        assert_eq!(array.len(), array_len);
2451
        for i in 0..array.len() {
2452
            assert!(array.is_null(i));
2453
        }
2454
    }
2455
}