/Users/andrewlamb/Software/arrow-rs/arrow-data/src/data.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! Contains [`ArrayData`], a generic representation of Arrow array data which encapsulates |
19 | | //! common attributes and operations for Arrow arrays. |
20 | | |
21 | | use crate::bit_iterator::BitSliceIterator; |
22 | | use arrow_buffer::buffer::{BooleanBuffer, NullBuffer}; |
23 | | use arrow_buffer::{ |
24 | | ArrowNativeType, Buffer, IntervalDayTime, IntervalMonthDayNano, MutableBuffer, bit_util, i256, |
25 | | }; |
26 | | use arrow_schema::{ArrowError, DataType, UnionMode}; |
27 | | use std::mem; |
28 | | use std::ops::Range; |
29 | | use std::sync::Arc; |
30 | | |
31 | | use crate::{equal, validate_binary_view, validate_string_view}; |
32 | | |
33 | | #[inline] |
34 | 161k | pub(crate) fn contains_nulls( |
35 | 161k | null_bit_buffer: Option<&NullBuffer>, |
36 | 161k | offset: usize, |
37 | 161k | len: usize, |
38 | 161k | ) -> bool { |
39 | 161k | match null_bit_buffer { |
40 | 798 | Some(buffer) => { |
41 | 798 | match BitSliceIterator::new(buffer.validity(), buffer.offset() + offset, len).next() { |
42 | 767 | Some((start, end)) => start != 0 || end != len493 , |
43 | 31 | None => len != 0, // No non-null values |
44 | | } |
45 | | } |
46 | 160k | None => false, // No null buffer |
47 | | } |
48 | 161k | } |
49 | | |
50 | | #[inline] |
51 | 56 | pub(crate) fn count_nulls( |
52 | 56 | null_bit_buffer: Option<&NullBuffer>, |
53 | 56 | offset: usize, |
54 | 56 | len: usize, |
55 | 56 | ) -> usize { |
56 | 56 | if let Some(buf42 ) = null_bit_buffer { |
57 | 42 | let buffer = buf.buffer(); |
58 | 42 | len - buffer.count_set_bits_offset(offset + buf.offset(), len) |
59 | | } else { |
60 | 14 | 0 |
61 | | } |
62 | 56 | } |
63 | | |
64 | | /// Creates 2 [`MutableBuffer`]s with a given `capacity` (in slots). |
65 | | #[inline] |
66 | 112 | pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuffer; 2] { |
67 | 112 | let empty_buffer = MutableBuffer::new(0); |
68 | 112 | match data_type { |
69 | 0 | DataType::Null => [empty_buffer, MutableBuffer::new(0)], |
70 | | DataType::Boolean => { |
71 | 0 | let bytes = bit_util::ceil(capacity, 8); |
72 | 0 | let buffer = MutableBuffer::new(bytes); |
73 | 0 | [buffer, empty_buffer] |
74 | | } |
75 | | DataType::UInt8 |
76 | | | DataType::UInt16 |
77 | | | DataType::UInt32 |
78 | | | DataType::UInt64 |
79 | | | DataType::Int8 |
80 | | | DataType::Int16 |
81 | | | DataType::Int32 |
82 | | | DataType::Int64 |
83 | | | DataType::Float16 |
84 | | | DataType::Float32 |
85 | | | DataType::Float64 |
86 | | | DataType::Decimal32(_, _) |
87 | | | DataType::Decimal64(_, _) |
88 | | | DataType::Decimal128(_, _) |
89 | | | DataType::Decimal256(_, _) |
90 | | | DataType::Date32 |
91 | | | DataType::Time32(_) |
92 | | | DataType::Date64 |
93 | | | DataType::Time64(_) |
94 | | | DataType::Duration(_) |
95 | | | DataType::Timestamp(_, _) |
96 | 66 | | DataType::Interval(_) => [ |
97 | 66 | MutableBuffer::new(capacity * data_type.primitive_width().unwrap()), |
98 | 66 | empty_buffer, |
99 | 66 | ], |
100 | | DataType::Utf8 | DataType::Binary => { |
101 | 14 | let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>()); |
102 | | // safety: `unsafe` code assumes that this buffer is initialized with one element |
103 | 14 | buffer.push(0i32); |
104 | 14 | [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())] |
105 | | } |
106 | | DataType::LargeUtf8 | DataType::LargeBinary => { |
107 | 0 | let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>()); |
108 | | // safety: `unsafe` code assumes that this buffer is initialized with one element |
109 | 0 | buffer.push(0i64); |
110 | 0 | [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())] |
111 | | } |
112 | 2 | DataType::BinaryView | DataType::Utf8View => [ |
113 | 2 | MutableBuffer::new(capacity * mem::size_of::<u128>()), |
114 | 2 | empty_buffer, |
115 | 2 | ], |
116 | | DataType::List(_) | DataType::Map(_, _) => { |
117 | | // offset buffer always starts with a zero |
118 | 3 | let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>()); |
119 | 3 | buffer.push(0i32); |
120 | 3 | [buffer, empty_buffer] |
121 | | } |
122 | 0 | DataType::ListView(_) => [ |
123 | 0 | MutableBuffer::new(capacity * mem::size_of::<i32>()), |
124 | 0 | MutableBuffer::new(capacity * mem::size_of::<i32>()), |
125 | 0 | ], |
126 | | DataType::LargeList(_) => { |
127 | | // offset buffer always starts with a zero |
128 | 1 | let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>()); |
129 | 1 | buffer.push(0i64); |
130 | 1 | [buffer, empty_buffer] |
131 | | } |
132 | 0 | DataType::LargeListView(_) => [ |
133 | 0 | MutableBuffer::new(capacity * mem::size_of::<i64>()), |
134 | 0 | MutableBuffer::new(capacity * mem::size_of::<i64>()), |
135 | 0 | ], |
136 | 0 | DataType::FixedSizeBinary(size) => { |
137 | 0 | [MutableBuffer::new(capacity * *size as usize), empty_buffer] |
138 | | } |
139 | 8 | DataType::Dictionary(k, _) => [ |
140 | 8 | MutableBuffer::new(capacity * k.primitive_width().unwrap()), |
141 | 8 | empty_buffer, |
142 | 8 | ], |
143 | | DataType::FixedSizeList(_, _) | DataType::Struct(_) | DataType::RunEndEncoded(_, _) => { |
144 | 11 | [empty_buffer, MutableBuffer::new(0)] |
145 | | } |
146 | 7 | DataType::Union(_, mode) => { |
147 | 7 | let type_ids = MutableBuffer::new(capacity * mem::size_of::<i8>()); |
148 | 7 | match mode { |
149 | 1 | UnionMode::Sparse => [type_ids, empty_buffer], |
150 | | UnionMode::Dense => { |
151 | 6 | let offsets = MutableBuffer::new(capacity * mem::size_of::<i32>()); |
152 | 6 | [type_ids, offsets] |
153 | | } |
154 | | } |
155 | | } |
156 | | } |
157 | 112 | } |
158 | | |
159 | | /// A generic representation of Arrow array data which encapsulates common attributes |
160 | | /// and operations for Arrow array. |
161 | | /// |
162 | | /// Specific operations for different array types (e.g., primitive, list, struct) |
163 | | /// are implemented in `Array`. |
164 | | /// |
165 | | /// # Memory Layout |
166 | | /// |
167 | | /// `ArrayData` has references to one or more underlying data buffers |
168 | | /// and optional child ArrayData, depending on type as illustrated |
169 | | /// below. Bitmaps are not shown for simplicity but they are stored |
170 | | /// similarly to the buffers. |
171 | | /// |
172 | | /// ```text |
173 | | /// offset |
174 | | /// points to |
175 | | /// ┌───────────────────┐ start of ┌───────┐ Different |
176 | | /// │ │ data │ │ ArrayData may |
177 | | /// │ArrayData { │ │.... │ also refer to |
178 | | /// │ data_type: ... │ ─ ─ ─ ─▶│1234 │ ┌ ─ the same |
179 | | /// │ offset: ... ─ ─ ─│─ ┘ │4372 │ underlying |
180 | | /// │ len: ... ─ ─ ─│─ ┐ │4888 │ │ buffer with different offset/len |
181 | | /// │ buffers: [ │ │5882 │◀─ |
182 | | /// │ ... │ │ │4323 │ |
183 | | /// │ ] │ ─ ─ ─ ─▶│4859 │ |
184 | | /// │ child_data: [ │ │.... │ |
185 | | /// │ ... │ │ │ |
186 | | /// │ ] │ └───────┘ |
187 | | /// │} │ |
188 | | /// │ │ Shared Buffer uses |
189 | | /// │ │ │ bytes::Bytes to hold |
190 | | /// └───────────────────┘ actual data values |
191 | | /// ┌ ─ ─ ┘ |
192 | | /// |
193 | | /// ▼ |
194 | | /// ┌───────────────────┐ |
195 | | /// │ArrayData { │ |
196 | | /// │ ... │ |
197 | | /// │} │ |
198 | | /// │ │ |
199 | | /// └───────────────────┘ |
200 | | /// |
201 | | /// Child ArrayData may also have its own buffers and children |
202 | | /// ``` |
203 | | |
204 | | #[derive(Debug, Clone)] |
205 | | pub struct ArrayData { |
206 | | /// The data type |
207 | | data_type: DataType, |
208 | | |
209 | | /// The number of elements |
210 | | len: usize, |
211 | | |
212 | | /// The offset in number of items (not bytes). |
213 | | /// |
214 | | /// The offset applies to [`Self::child_data`] and [`Self::buffers`]. It |
215 | | /// does NOT apply to [`Self::nulls`]. |
216 | | offset: usize, |
217 | | |
218 | | /// The buffers that store the actual data for this array, as defined |
219 | | /// in the [Arrow Spec]. |
220 | | /// |
221 | | /// Depending on the array types, [`Self::buffers`] can hold different |
222 | | /// kinds of buffers (e.g., value buffer, value offset buffer) at different |
223 | | /// positions. |
224 | | /// |
225 | | /// The buffer may be larger than needed. Some items at the beginning may be skipped if |
226 | | /// there is an `offset`. Some items at the end may be skipped if the buffer is longer than |
227 | | /// we need to satisfy `len`. |
228 | | /// |
229 | | /// [Arrow Spec](https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout) |
230 | | buffers: Vec<Buffer>, |
231 | | |
232 | | /// The child(ren) of this array. |
233 | | /// |
234 | | /// Only non-empty for nested types, such as `ListArray` and |
235 | | /// `StructArray`. |
236 | | /// |
237 | | /// The first logical element in each child element begins at `offset`. |
238 | | /// |
239 | | /// If the child element also has an offset then these offsets are |
240 | | /// cumulative. |
241 | | child_data: Vec<ArrayData>, |
242 | | |
243 | | /// The null bitmap. |
244 | | /// |
245 | | /// `None` indicates all values are non-null in this array. |
246 | | /// |
247 | | /// [`Self::offset`] does not apply to the null bitmap. While the |
248 | | /// BooleanBuffer may be sliced (have its own offset) internally, this |
249 | | /// `NullBuffer` always represents exactly `len` elements. |
250 | | nulls: Option<NullBuffer>, |
251 | | } |
252 | | |
253 | | /// A thread-safe, shared reference to the Arrow array data. |
254 | | pub type ArrayDataRef = Arc<ArrayData>; |
255 | | |
256 | | impl ArrayData { |
257 | | /// Create a new ArrayData instance. |
258 | | /// |
259 | | /// If `null_count` is not specified, the number of nulls in |
260 | | /// null_bit_buffer is calculated. |
261 | | /// |
262 | | /// If the number of nulls is 0 then the null_bit_buffer |
263 | | /// is set to `None`. |
264 | | /// |
265 | | /// # Safety |
266 | | /// |
267 | | /// The input values *must* form a valid Arrow array for |
268 | | /// `data_type`, or undefined behavior can result. |
269 | | /// |
270 | | /// Note: This is a low level API and most users of the arrow |
271 | | /// crate should create arrays using the methods in the `array` |
272 | | /// module. |
273 | 2.12k | pub unsafe fn new_unchecked( |
274 | 2.12k | data_type: DataType, |
275 | 2.12k | len: usize, |
276 | 2.12k | null_count: Option<usize>, |
277 | 2.12k | null_bit_buffer: Option<Buffer>, |
278 | 2.12k | offset: usize, |
279 | 2.12k | buffers: Vec<Buffer>, |
280 | 2.12k | child_data: Vec<ArrayData>, |
281 | 2.12k | ) -> Self { |
282 | 2.12k | let mut skip_validation = UnsafeFlag::new(); |
283 | | // SAFETY: caller responsible for ensuring data is valid |
284 | 2.12k | unsafe { skip_validation.set(true) }; |
285 | | |
286 | 2.12k | ArrayDataBuilder { |
287 | 2.12k | data_type, |
288 | 2.12k | len, |
289 | 2.12k | null_count, |
290 | 2.12k | null_bit_buffer, |
291 | 2.12k | nulls: None, |
292 | 2.12k | offset, |
293 | 2.12k | buffers, |
294 | 2.12k | child_data, |
295 | 2.12k | align_buffers: false, |
296 | 2.12k | skip_validation, |
297 | 2.12k | } |
298 | 2.12k | .build() |
299 | 2.12k | .unwrap() |
300 | 2.12k | } |
301 | | |
302 | | /// Create a new ArrayData, validating that the provided buffers form a valid |
303 | | /// Arrow array of the specified data type. |
304 | | /// |
305 | | /// If the number of nulls in `null_bit_buffer` is 0 then the null_bit_buffer |
306 | | /// is set to `None`. |
307 | | /// |
308 | | /// Internally this calls through to [`Self::validate_data`] |
309 | | /// |
310 | | /// Note: This is a low level API and most users of the arrow crate should create |
311 | | /// arrays using the builders found in [arrow_array](https://docs.rs/arrow-array) |
312 | 2 | pub fn try_new( |
313 | 2 | data_type: DataType, |
314 | 2 | len: usize, |
315 | 2 | null_bit_buffer: Option<Buffer>, |
316 | 2 | offset: usize, |
317 | 2 | buffers: Vec<Buffer>, |
318 | 2 | child_data: Vec<ArrayData>, |
319 | 2 | ) -> Result<Self, ArrowError> { |
320 | | // we must check the length of `null_bit_buffer` first |
321 | | // because we use this buffer to calculate `null_count` |
322 | | // in `Self::new_unchecked`. |
323 | 2 | if let Some(null_bit_buffer) = null_bit_buffer.as_ref() { |
324 | 2 | let needed_len = bit_util::ceil(len + offset, 8); |
325 | 2 | if null_bit_buffer.len() < needed_len { |
326 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
327 | 0 | "null_bit_buffer size too small. got {} needed {}", |
328 | 0 | null_bit_buffer.len(), |
329 | 0 | needed_len |
330 | 0 | ))); |
331 | 2 | } |
332 | 0 | } |
333 | | // Safety justification: `validate_data` is called below |
334 | 2 | let new_self = unsafe { |
335 | 2 | Self::new_unchecked( |
336 | 2 | data_type, |
337 | 2 | len, |
338 | 2 | None, |
339 | 2 | null_bit_buffer, |
340 | 2 | offset, |
341 | 2 | buffers, |
342 | 2 | child_data, |
343 | | ) |
344 | | }; |
345 | | |
346 | | // As the data is not trusted, do a full validation of its contents |
347 | | // We don't need to validate children as we can assume that the |
348 | | // [`ArrayData`] in `child_data` have already been validated through |
349 | | // a call to `ArrayData::try_new` or created using unsafe |
350 | 2 | new_self.validate_data()?0 ; |
351 | 2 | Ok(new_self) |
352 | 2 | } |
353 | | |
354 | | /// Returns a builder to construct an [`ArrayData`] instance of the same [`DataType`] |
355 | | #[inline] |
356 | 81.5k | pub const fn builder(data_type: DataType) -> ArrayDataBuilder { |
357 | 81.5k | ArrayDataBuilder::new(data_type) |
358 | 81.5k | } |
359 | | |
360 | | /// Returns a reference to the [`DataType`] of this [`ArrayData`] |
361 | | #[inline] |
362 | 1.05M | pub const fn data_type(&self) -> &DataType { |
363 | 1.05M | &self.data_type |
364 | 1.05M | } |
365 | | |
366 | | /// Returns the [`Buffer`] storing data for this [`ArrayData`] |
367 | 1.37M | pub fn buffers(&self) -> &[Buffer] { |
368 | 1.37M | &self.buffers |
369 | 1.37M | } |
370 | | |
371 | | /// Returns a slice of children [`ArrayData`]. This will be |
372 | | /// non-empty for types such as lists and structs. |
373 | 321k | pub fn child_data(&self) -> &[ArrayData] { |
374 | 321k | &self.child_data[..] |
375 | 321k | } |
376 | | |
377 | | /// Returns whether the element at index `i` is null |
378 | | #[inline] |
379 | 29.5k | pub fn is_null(&self, i: usize) -> bool { |
380 | 29.5k | match &self.nulls { |
381 | 20.1k | Some(v) => v.is_null(i), |
382 | 9.43k | None => false, |
383 | | } |
384 | 29.5k | } |
385 | | |
386 | | /// Returns a reference to the null buffer of this [`ArrayData`] if any |
387 | | /// |
388 | | /// Note: [`ArrayData::offset`] does NOT apply to the returned [`NullBuffer`] |
389 | | #[inline] |
390 | 853k | pub fn nulls(&self) -> Option<&NullBuffer> { |
391 | 853k | self.nulls.as_ref() |
392 | 853k | } |
393 | | |
394 | | /// Returns whether the element at index `i` is not null |
395 | | #[inline] |
396 | | pub fn is_valid(&self, i: usize) -> bool { |
397 | | !self.is_null(i) |
398 | | } |
399 | | |
400 | | /// Returns the length (i.e., number of elements) of this [`ArrayData`]. |
401 | | #[inline] |
402 | 653k | pub const fn len(&self) -> usize { |
403 | 653k | self.len |
404 | 653k | } |
405 | | |
406 | | /// Returns whether this [`ArrayData`] is empty |
407 | | #[inline] |
408 | 160k | pub const fn is_empty(&self) -> bool { |
409 | 160k | self.len == 0 |
410 | 160k | } |
411 | | |
412 | | /// Returns the offset of this [`ArrayData`] |
413 | | #[inline] |
414 | 328k | pub const fn offset(&self) -> usize { |
415 | 328k | self.offset |
416 | 328k | } |
417 | | |
418 | | /// Returns the total number of nulls in this array |
419 | | #[inline] |
420 | 163k | pub fn null_count(&self) -> usize { |
421 | 163k | self.nulls |
422 | 163k | .as_ref() |
423 | 163k | .map(|x| x2.27k .null_count2.27k ()) |
424 | 163k | .unwrap_or_default() |
425 | 163k | } |
426 | | |
427 | | /// Returns the total number of bytes of memory occupied by the |
428 | | /// buffers owned by this [`ArrayData`] and all of its |
429 | | /// children. (See also diagram on [`ArrayData`]). |
430 | | /// |
431 | | /// Note that this [`ArrayData`] may only refer to a subset of the |
432 | | /// data in the underlying [`Buffer`]s (due to `offset` and |
433 | | /// `length`), but the size returned includes the entire size of |
434 | | /// the buffers. |
435 | | /// |
436 | | /// If multiple [`ArrayData`]s refer to the same underlying |
437 | | /// [`Buffer`]s they will both report the same size. |
438 | 0 | pub fn get_buffer_memory_size(&self) -> usize { |
439 | 0 | let mut size = 0; |
440 | 0 | for buffer in &self.buffers { |
441 | 0 | size += buffer.capacity(); |
442 | 0 | } |
443 | 0 | if let Some(bitmap) = &self.nulls { |
444 | 0 | size += bitmap.buffer().capacity() |
445 | 0 | } |
446 | 0 | for child in &self.child_data { |
447 | 0 | size += child.get_buffer_memory_size(); |
448 | 0 | } |
449 | 0 | size |
450 | 0 | } |
451 | | |
452 | | /// Returns the total number of bytes of memory occupied by |
453 | | /// the buffers of this slice of [`ArrayData`] (See also diagram on [`ArrayData`]). |
454 | | /// |
455 | | /// This is approximately the number of bytes if a new |
456 | | /// [`ArrayData`] was formed by creating new [`Buffer`]s with |
457 | | /// exactly the data needed. |
458 | | /// |
459 | | /// For example, for a [`DataType::Int64`] array with `100` elements, |
460 | | /// [`Self::get_slice_memory_size`] would return `100 * 8 = 800`. If |
461 | | /// the [`ArrayData`] was then [`Self::slice`]ed to refer to its |
462 | | /// first `20` elements, then [`Self::get_slice_memory_size`] on the |
463 | | /// sliced [`ArrayData`] would return `20 * 8 = 160`. |
464 | 0 | pub fn get_slice_memory_size(&self) -> Result<usize, ArrowError> { |
465 | 0 | let mut result: usize = 0; |
466 | 0 | let layout = layout(&self.data_type); |
467 | | |
468 | 0 | for spec in layout.buffers.iter() { |
469 | 0 | match spec { |
470 | 0 | BufferSpec::FixedWidth { byte_width, .. } => { |
471 | 0 | let buffer_size = self.len.checked_mul(*byte_width).ok_or_else(|| { |
472 | 0 | ArrowError::ComputeError( |
473 | 0 | "Integer overflow computing buffer size".to_string(), |
474 | 0 | ) |
475 | 0 | })?; |
476 | 0 | result += buffer_size; |
477 | | } |
478 | | BufferSpec::VariableWidth => { |
479 | 0 | let buffer_len = match self.data_type { |
480 | | DataType::Utf8 | DataType::Binary => { |
481 | 0 | let offsets = self.typed_offsets::<i32>()?; |
482 | 0 | (offsets[self.len] - offsets[0]) as usize |
483 | | } |
484 | | DataType::LargeUtf8 | DataType::LargeBinary => { |
485 | 0 | let offsets = self.typed_offsets::<i64>()?; |
486 | 0 | (offsets[self.len] - offsets[0]) as usize |
487 | | } |
488 | | _ => { |
489 | 0 | return Err(ArrowError::NotYetImplemented(format!( |
490 | 0 | "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}", |
491 | 0 | self.data_type |
492 | 0 | ))); |
493 | | } |
494 | | }; |
495 | 0 | result += buffer_len; |
496 | | } |
497 | 0 | BufferSpec::BitMap => { |
498 | 0 | let buffer_size = bit_util::ceil(self.len, 8); |
499 | 0 | result += buffer_size; |
500 | 0 | } |
501 | 0 | BufferSpec::AlwaysNull => { |
502 | 0 | // Nothing to do |
503 | 0 | } |
504 | | } |
505 | | } |
506 | | |
507 | 0 | if self.nulls().is_some() { |
508 | 0 | result += bit_util::ceil(self.len, 8); |
509 | 0 | } |
510 | | |
511 | 0 | for child in &self.child_data { |
512 | 0 | result += child.get_slice_memory_size()?; |
513 | | } |
514 | 0 | Ok(result) |
515 | 0 | } |
516 | | |
517 | | /// Returns the total number of bytes of memory occupied |
518 | | /// physically by this [`ArrayData`] and all its [`Buffer`]s and |
519 | | /// children. (See also diagram on [`ArrayData`]). |
520 | | /// |
521 | | /// Equivalent to: |
522 | | /// `size_of_val(self)` + |
523 | | /// [`Self::get_buffer_memory_size`] + |
524 | | /// `size_of_val(child)` for all children |
525 | 0 | pub fn get_array_memory_size(&self) -> usize { |
526 | 0 | let mut size = mem::size_of_val(self); |
527 | | |
528 | | // Calculate rest of the fields top down which contain actual data |
529 | 0 | for buffer in &self.buffers { |
530 | 0 | size += mem::size_of::<Buffer>(); |
531 | 0 | size += buffer.capacity(); |
532 | 0 | } |
533 | 0 | if let Some(nulls) = &self.nulls { |
534 | 0 | size += nulls.buffer().capacity(); |
535 | 0 | } |
536 | 0 | for child in &self.child_data { |
537 | 0 | size += child.get_array_memory_size(); |
538 | 0 | } |
539 | | |
540 | 0 | size |
541 | 0 | } |
542 | | |
543 | | /// Creates a zero-copy slice of itself. This creates a new |
544 | | /// [`ArrayData`] pointing at the same underlying [`Buffer`]s with a |
545 | | /// different offset and len |
546 | | /// |
547 | | /// # Panics |
548 | | /// |
549 | | /// Panics if `offset + length > self.len()`. |
550 | 11 | pub fn slice(&self, offset: usize, length: usize) -> ArrayData { |
551 | 11 | assert!((offset + length) <= self.len()); |
552 | | |
553 | 11 | if let DataType::Struct(_) = self.data_type() { |
554 | | // Slice into children |
555 | 0 | let new_offset = self.offset + offset; |
556 | | ArrayData { |
557 | 0 | data_type: self.data_type().clone(), |
558 | 0 | len: length, |
559 | 0 | offset: new_offset, |
560 | 0 | buffers: self.buffers.clone(), |
561 | | // Slice child data, to propagate offsets down to them |
562 | 0 | child_data: self |
563 | 0 | .child_data() |
564 | 0 | .iter() |
565 | 0 | .map(|data| data.slice(offset, length)) |
566 | 0 | .collect(), |
567 | 0 | nulls: self.nulls.as_ref().map(|x| x.slice(offset, length)), |
568 | | } |
569 | | } else { |
570 | 11 | let mut new_data = self.clone(); |
571 | | |
572 | 11 | new_data.len = length; |
573 | 11 | new_data.offset = offset + self.offset; |
574 | 11 | new_data.nulls = self.nulls.as_ref().map(|x| x4 .slice4 (offset4 , length4 )); |
575 | | |
576 | 11 | new_data |
577 | | } |
578 | 11 | } |
579 | | |
580 | | /// Returns the `buffer` as a slice of type `T` starting at self.offset |
581 | | /// |
582 | | /// # Panics |
583 | | /// This function panics if: |
584 | | /// * the buffer is not byte-aligned with type T, or |
585 | | /// * the datatype is `Boolean` (it corresponds to a bit-packed buffer where the offset is not applicable) |
586 | 320k | pub fn buffer<T: ArrowNativeType>(&self, buffer: usize) -> &[T] { |
587 | 320k | &self.buffers()[buffer].typed_data()[self.offset..] |
588 | 320k | } |
589 | | |
590 | | /// Returns a new [`ArrayData`] valid for `data_type` containing `len` null values |
591 | 81 | pub fn new_null(data_type: &DataType, len: usize) -> Self { |
592 | 81 | let bit_len = bit_util::ceil(len, 8); |
593 | 106 | let zeroed81 = |len: usize| Buffer::from(MutableBuffer::from_len_zeroed(len)); |
594 | | |
595 | 81 | let (buffers, child_data, has_nulls) = match data_type.primitive_width() { |
596 | 27 | Some(width) => (vec![zeroed(width * len)], vec![], true), |
597 | 54 | None => match data_type { |
598 | 1 | DataType::Null => (vec![], vec![], false), |
599 | 1 | DataType::Boolean => (vec![zeroed(bit_len)], vec![], true), |
600 | | DataType::Binary | DataType::Utf8 => { |
601 | 33 | (vec![zeroed((len + 1) * 4), zeroed(0)], vec![], true) |
602 | | } |
603 | 0 | DataType::BinaryView | DataType::Utf8View => (vec![zeroed(len * 16)], vec![], true), |
604 | | DataType::LargeBinary | DataType::LargeUtf8 => { |
605 | 0 | (vec![zeroed((len + 1) * 8), zeroed(0)], vec![], true) |
606 | | } |
607 | 1 | DataType::FixedSizeBinary(i) => (vec![zeroed(*i as usize * len)], vec![], true), |
608 | 0 | DataType::List(f) | DataType::Map(f, _) => ( |
609 | 0 | vec![zeroed((len + 1) * 4)], |
610 | 0 | vec![ArrayData::new_empty(f.data_type())], |
611 | 0 | true, |
612 | 0 | ), |
613 | 0 | DataType::LargeList(f) => ( |
614 | 0 | vec![zeroed((len + 1) * 8)], |
615 | 0 | vec![ArrayData::new_empty(f.data_type())], |
616 | 0 | true, |
617 | 0 | ), |
618 | 0 | DataType::FixedSizeList(f, list_len) => ( |
619 | 0 | vec![], |
620 | 0 | vec![ArrayData::new_null(f.data_type(), *list_len as usize * len)], |
621 | 0 | true, |
622 | 0 | ), |
623 | 1 | DataType::Struct(fields) => ( |
624 | 1 | vec![], |
625 | 1 | fields |
626 | 1 | .iter() |
627 | 2 | .map1 (|f| Self::new_null(f.data_type(), len)) |
628 | 1 | .collect(), |
629 | | true, |
630 | | ), |
631 | 11 | DataType::Dictionary(k, v) => ( |
632 | 11 | vec![zeroed(k.primitive_width().unwrap() * len)], |
633 | 11 | vec![ArrayData::new_empty(v.as_ref())], |
634 | 11 | true, |
635 | 11 | ), |
636 | 5 | DataType::Union(f, mode) => { |
637 | 5 | let (id, _) = f.iter().next().unwrap(); |
638 | 5 | let ids = Buffer::from_iter(std::iter::repeat_n(id, len)); |
639 | 5 | let buffers = match mode { |
640 | 5 | UnionMode::Sparse => vec![ids], |
641 | | UnionMode::Dense => { |
642 | 0 | let end_offset = i32::from_usize(len).unwrap(); |
643 | 0 | vec![ids, Buffer::from_iter(0_i32..end_offset)] |
644 | | } |
645 | | }; |
646 | | |
647 | 5 | let children = f |
648 | 5 | .iter() |
649 | 5 | .enumerate() |
650 | 5 | .map(|(idx, (_, f))| { |
651 | 5 | if idx == 0 || *mode == UnionMode::Sparse0 { |
652 | 5 | Self::new_null(f.data_type(), len) |
653 | | } else { |
654 | 0 | Self::new_empty(f.data_type()) |
655 | | } |
656 | 5 | }) |
657 | 5 | .collect(); |
658 | | |
659 | 5 | (buffers, children, false) |
660 | | } |
661 | 1 | DataType::RunEndEncoded(r, v) => { |
662 | 1 | let runs = match r.data_type() { |
663 | | DataType::Int16 => { |
664 | 0 | let i = i16::from_usize(len).expect("run overflow"); |
665 | 0 | Buffer::from_slice_ref([i]) |
666 | | } |
667 | | DataType::Int32 => { |
668 | 0 | let i = i32::from_usize(len).expect("run overflow"); |
669 | 0 | Buffer::from_slice_ref([i]) |
670 | | } |
671 | | DataType::Int64 => { |
672 | 1 | let i = i64::from_usize(len).expect("run overflow"); |
673 | 1 | Buffer::from_slice_ref([i]) |
674 | | } |
675 | 0 | dt => unreachable!("Invalid run ends data type {dt}"), |
676 | | }; |
677 | | |
678 | 1 | let builder = ArrayData::builder(r.data_type().clone()) |
679 | 1 | .len(1) |
680 | 1 | .buffers(vec![runs]); |
681 | | |
682 | | // SAFETY: |
683 | | // Valid by construction |
684 | 1 | let runs = unsafe { builder.build_unchecked() }; |
685 | 1 | ( |
686 | 1 | vec![], |
687 | 1 | vec![runs, ArrayData::new_null(v.data_type(), 1)], |
688 | 1 | false, |
689 | 1 | ) |
690 | | } |
691 | 0 | d => unreachable!("{d}"), |
692 | | }, |
693 | | }; |
694 | | |
695 | 81 | let mut builder = ArrayDataBuilder::new(data_type.clone()) |
696 | 81 | .len(len) |
697 | 81 | .buffers(buffers) |
698 | 81 | .child_data(child_data); |
699 | | |
700 | 81 | if has_nulls { |
701 | 74 | builder = builder.nulls(Some(NullBuffer::new_null(len))) |
702 | 7 | } |
703 | | |
704 | | // SAFETY: |
705 | | // Data valid by construction |
706 | 81 | unsafe { builder.build_unchecked() } |
707 | 81 | } |
708 | | |
709 | | /// Returns a new empty [ArrayData] valid for `data_type`. |
710 | 51 | pub fn new_empty(data_type: &DataType) -> Self { |
711 | 51 | Self::new_null(data_type, 0) |
712 | 51 | } |
713 | | |
714 | | /// Verifies that the buffers meet the minimum alignment requirements for the data type |
715 | | /// |
716 | | /// Buffers that are not adequately aligned will be copied to a new aligned allocation |
717 | | /// |
718 | | /// This can be useful for when interacting with data sent over IPC or FFI, that may |
719 | | /// not meet the minimum alignment requirements |
720 | | /// |
721 | | /// This also aligns buffers of children data |
722 | 0 | pub fn align_buffers(&mut self) { |
723 | 0 | let layout = layout(&self.data_type); |
724 | 0 | for (buffer, spec) in self.buffers.iter_mut().zip(&layout.buffers) { |
725 | 0 | if let BufferSpec::FixedWidth { alignment, .. } = spec { |
726 | 0 | if buffer.as_ptr().align_offset(*alignment) != 0 { |
727 | 0 | *buffer = Buffer::from_slice_ref(buffer.as_ref()); |
728 | 0 | } |
729 | 0 | } |
730 | | } |
731 | | // align children data recursively |
732 | 0 | for data in self.child_data.iter_mut() { |
733 | 0 | data.align_buffers() |
734 | | } |
735 | 0 | } |
736 | | |
737 | | /// "cheap" validation of an `ArrayData`. Ensures buffers are |
738 | | /// sufficiently sized to store `len` + `offset` total elements of |
739 | | /// `data_type` and performs other inexpensive consistency checks. |
740 | | /// |
741 | | /// This check is "cheap" in the sense that it does not validate the |
742 | | /// contents of the buffers (e.g. that all offsets for UTF8 arrays |
743 | | /// are within the bounds of the values buffer). |
744 | | /// |
745 | | /// See [ArrayData::validate_data] to validate fully the offset content |
746 | | /// and the validity of utf8 data |
747 | 1.04k | pub fn validate(&self) -> Result<(), ArrowError> { |
748 | | // Need at least this much space in each buffer |
749 | 1.04k | let len_plus_offset = self.len + self.offset; |
750 | | |
751 | | // Check that the data layout conforms to the spec |
752 | 1.04k | let layout = layout(&self.data_type); |
753 | | |
754 | 1.04k | if !layout.can_contain_null_mask && self.nulls40 .is_some40 () { |
755 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
756 | 0 | "Arrays of type {:?} cannot contain a null bitmask", |
757 | 0 | self.data_type, |
758 | 0 | ))); |
759 | 1.04k | } |
760 | | |
761 | | // Check data buffers length for view types and other types |
762 | 1.04k | if self.buffers.len() < layout.buffers.len() |
763 | 1.04k | || (!layout.variadic && self.buffers.len() != layout.buffers.len()) |
764 | | { |
765 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
766 | 0 | "Expected {} buffers in array of type {:?}, got {}", |
767 | 0 | layout.buffers.len(), |
768 | 0 | self.data_type, |
769 | 0 | self.buffers.len(), |
770 | 0 | ))); |
771 | 1.04k | } |
772 | | |
773 | 1.04k | for (i1.00k , (buffer1.00k , spec1.00k )) in self.buffers.iter().zip(layout.buffers.iter()).enumerate() { |
774 | 1.00k | match spec { |
775 | | BufferSpec::FixedWidth { |
776 | 892 | byte_width, |
777 | 892 | alignment, |
778 | | } => { |
779 | 892 | let min_buffer_size = len_plus_offset.saturating_mul(*byte_width); |
780 | | |
781 | 892 | if buffer.len() < min_buffer_size { |
782 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
783 | 0 | "Need at least {} bytes in buffers[{}] in array of type {:?}, but got {}", |
784 | 0 | min_buffer_size, |
785 | 0 | i, |
786 | 0 | self.data_type, |
787 | 0 | buffer.len() |
788 | 0 | ))); |
789 | 892 | } |
790 | | |
791 | 892 | let align_offset = buffer.as_ptr().align_offset(*alignment); |
792 | 892 | if align_offset != 0 { |
793 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
794 | 0 | "Misaligned buffers[{i}] in array of type {:?}, offset from expected alignment of {alignment} by {}", |
795 | 0 | self.data_type, |
796 | 0 | align_offset.min(alignment - align_offset) |
797 | 0 | ))); |
798 | 892 | } |
799 | | } |
800 | 6 | BufferSpec::VariableWidth => { |
801 | 6 | // not cheap to validate (need to look at the |
802 | 6 | // data). Partially checked in validate_offsets |
803 | 6 | // called below. Can check with `validate_full` |
804 | 6 | } |
805 | | BufferSpec::BitMap => { |
806 | 105 | let min_buffer_size = bit_util::ceil(len_plus_offset, 8); |
807 | 105 | if buffer.len() < min_buffer_size { |
808 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
809 | 0 | "Need at least {} bytes for bitmap in buffers[{}] in array of type {:?}, but got {}", |
810 | 0 | min_buffer_size, |
811 | 0 | i, |
812 | 0 | self.data_type, |
813 | 0 | buffer.len() |
814 | 0 | ))); |
815 | 105 | } |
816 | | } |
817 | 0 | BufferSpec::AlwaysNull => { |
818 | 0 | // Nothing to validate |
819 | 0 | } |
820 | | } |
821 | | } |
822 | | |
823 | | // check null bit buffer size |
824 | 1.04k | if let Some(nulls538 ) = self.nulls() { |
825 | 538 | if nulls.null_count() > self.len { |
826 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
827 | 0 | "null_count {} for an array exceeds length of {} elements", |
828 | 0 | nulls.null_count(), |
829 | 0 | self.len |
830 | 0 | ))); |
831 | 538 | } |
832 | | |
833 | 538 | let actual_len = nulls.validity().len(); |
834 | 538 | let needed_len = bit_util::ceil(len_plus_offset, 8); |
835 | 538 | if actual_len < needed_len { |
836 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
837 | 0 | "null_bit_buffer size too small. got {actual_len} needed {needed_len}", |
838 | 0 | ))); |
839 | 538 | } |
840 | | |
841 | 538 | if nulls.len() != self.len { |
842 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
843 | 0 | "null buffer incorrect size. got {} expected {}", |
844 | 0 | nulls.len(), |
845 | 0 | self.len |
846 | 0 | ))); |
847 | 538 | } |
848 | 503 | } |
849 | | |
850 | 1.04k | self.validate_child_data()?0 ; |
851 | | |
852 | | // Additional Type specific checks |
853 | 1.04k | match &self.data_type { |
854 | | DataType::Utf8 | DataType::Binary => { |
855 | 6 | self.validate_offsets::<i32>(self.buffers[1].len())?0 ; |
856 | | } |
857 | | DataType::LargeUtf8 | DataType::LargeBinary => { |
858 | 0 | self.validate_offsets::<i64>(self.buffers[1].len())?; |
859 | | } |
860 | 0 | DataType::Dictionary(key_type, _value_type) => { |
861 | | // At the moment, constructing a DictionaryArray will also check this |
862 | 0 | if !DataType::is_dictionary_key_type(key_type) { |
863 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
864 | 0 | "Dictionary key type must be integer, but was {key_type}" |
865 | 0 | ))); |
866 | 0 | } |
867 | | } |
868 | 40 | DataType::RunEndEncoded(run_ends_type, _) => { |
869 | 40 | if run_ends_type.is_nullable() { |
870 | 0 | return Err(ArrowError::InvalidArgumentError( |
871 | 0 | "The nullable should be set to false for the field defining run_ends array.".to_string() |
872 | 0 | )); |
873 | 40 | } |
874 | 40 | if !DataType::is_run_ends_type(run_ends_type.data_type()) { |
875 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
876 | 0 | "RunArray run_ends types must be Int16, Int32 or Int64, but was {}", |
877 | 0 | run_ends_type.data_type() |
878 | 0 | ))); |
879 | 40 | } |
880 | | } |
881 | 995 | _ => {} |
882 | | }; |
883 | | |
884 | 1.04k | Ok(()) |
885 | 1.04k | } |
886 | | |
887 | | /// Returns a reference to the data in `buffer` as a typed slice |
888 | | /// (typically `&[i32]` or `&[i64]`) after validating. The |
889 | | /// returned slice is guaranteed to have at least `self.len + 1` |
890 | | /// entries. |
891 | | /// |
892 | | /// For an empty array, the `buffer` can also be empty. |
893 | 40 | fn typed_offsets<T: ArrowNativeType + num_traits::Num>(&self) -> Result<&[T], ArrowError> { |
894 | | // An empty list-like array can have 0 offsets |
895 | 40 | if self.len == 0 && self.buffers[0]0 .is_empty0 () { |
896 | 0 | return Ok(&[]); |
897 | 40 | } |
898 | | |
899 | 40 | self.typed_buffer(0, self.len + 1) |
900 | 40 | } |
901 | | |
902 | | /// Returns a reference to the data in `buffers[idx]` as a typed slice after validating |
903 | 80 | fn typed_buffer<T: ArrowNativeType + num_traits::Num>( |
904 | 80 | &self, |
905 | 80 | idx: usize, |
906 | 80 | len: usize, |
907 | 80 | ) -> Result<&[T], ArrowError> { |
908 | 80 | let buffer = &self.buffers[idx]; |
909 | | |
910 | 80 | let required_len = (len + self.offset) * mem::size_of::<T>(); |
911 | | |
912 | 80 | if buffer.len() < required_len { |
913 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
914 | 0 | "Buffer {} of {} isn't large enough. Expected {} bytes got {}", |
915 | 0 | idx, |
916 | 0 | self.data_type, |
917 | 0 | required_len, |
918 | 0 | buffer.len() |
919 | 0 | ))); |
920 | 80 | } |
921 | | |
922 | 80 | Ok(&buffer.typed_data::<T>()[self.offset..self.offset + len]) |
923 | 80 | } |
924 | | |
925 | | /// Does a cheap sanity check that the `self.len` values in `buffer` are valid |
926 | | /// offsets (of type T) into some other buffer of `values_length` bytes long |
927 | 23 | fn validate_offsets<T: ArrowNativeType + num_traits::Num + std::fmt::Display>( |
928 | 23 | &self, |
929 | 23 | values_length: usize, |
930 | 23 | ) -> Result<(), ArrowError> { |
931 | | // Justification: buffer size was validated above |
932 | 23 | let offsets = self.typed_offsets::<T>()?0 ; |
933 | 23 | if offsets.is_empty() { |
934 | 0 | return Ok(()); |
935 | 23 | } |
936 | | |
937 | 23 | let first_offset = offsets[0].to_usize().ok_or_else(|| {0 |
938 | 0 | ArrowError::InvalidArgumentError(format!( |
939 | 0 | "Error converting offset[0] ({}) to usize for {}", |
940 | 0 | offsets[0], self.data_type |
941 | 0 | )) |
942 | 0 | })?; |
943 | | |
944 | 23 | let last_offset = offsets[self.len].to_usize().ok_or_else(|| {0 |
945 | 0 | ArrowError::InvalidArgumentError(format!( |
946 | 0 | "Error converting offset[{}] ({}) to usize for {}", |
947 | 0 | self.len, offsets[self.len], self.data_type |
948 | 0 | )) |
949 | 0 | })?; |
950 | | |
951 | 23 | if first_offset > values_length { |
952 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
953 | 0 | "First offset {} of {} is larger than values length {}", |
954 | 0 | first_offset, self.data_type, values_length, |
955 | 0 | ))); |
956 | 23 | } |
957 | | |
958 | 23 | if last_offset > values_length { |
959 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
960 | 0 | "Last offset {} of {} is larger than values length {}", |
961 | 0 | last_offset, self.data_type, values_length, |
962 | 0 | ))); |
963 | 23 | } |
964 | | |
965 | 23 | if first_offset > last_offset { |
966 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
967 | 0 | "First offset {} in {} is smaller than last offset {}", |
968 | 0 | first_offset, self.data_type, last_offset, |
969 | 0 | ))); |
970 | 23 | } |
971 | | |
972 | 23 | Ok(()) |
973 | 23 | } |
974 | | |
975 | | /// Does a cheap sanity check that the `self.len` values in `buffer` are valid |
976 | | /// offsets and sizes (of type T) into some other buffer of `values_length` bytes long |
977 | 0 | fn validate_offsets_and_sizes<T: ArrowNativeType + num_traits::Num + std::fmt::Display>( |
978 | 0 | &self, |
979 | 0 | values_length: usize, |
980 | 0 | ) -> Result<(), ArrowError> { |
981 | 0 | let offsets: &[T] = self.typed_buffer(0, self.len)?; |
982 | 0 | let sizes: &[T] = self.typed_buffer(1, self.len)?; |
983 | 0 | if offsets.len() != sizes.len() { |
984 | 0 | return Err(ArrowError::ComputeError(format!( |
985 | 0 | "ListView offsets len {} does not match sizes len {}", |
986 | 0 | offsets.len(), |
987 | 0 | sizes.len() |
988 | 0 | ))); |
989 | 0 | } |
990 | | |
991 | 0 | for i in 0..sizes.len() { |
992 | 0 | let size = sizes[i].to_usize().ok_or_else(|| { |
993 | 0 | ArrowError::InvalidArgumentError(format!( |
994 | 0 | "Error converting size[{}] ({}) to usize for {}", |
995 | 0 | i, sizes[i], self.data_type |
996 | 0 | )) |
997 | 0 | })?; |
998 | 0 | let offset = offsets[i].to_usize().ok_or_else(|| { |
999 | 0 | ArrowError::InvalidArgumentError(format!( |
1000 | 0 | "Error converting offset[{}] ({}) to usize for {}", |
1001 | 0 | i, offsets[i], self.data_type |
1002 | 0 | )) |
1003 | 0 | })?; |
1004 | 0 | if size |
1005 | 0 | .checked_add(offset) |
1006 | 0 | .expect("Offset and size have exceeded the usize boundary") |
1007 | 0 | > values_length |
1008 | | { |
1009 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
1010 | 0 | "Size {} at index {} is larger than the remaining values for {}", |
1011 | 0 | size, i, self.data_type |
1012 | 0 | ))); |
1013 | 0 | } |
1014 | | } |
1015 | 0 | Ok(()) |
1016 | 0 | } |
1017 | | |
1018 | | /// Validates the layout of `child_data` ArrayData structures |
1019 | 1.04k | fn validate_child_data(&self) -> Result<(), ArrowError> { |
1020 | 1.04k | match &self.data_type { |
1021 | 7 | DataType::List(field) | DataType::Map(field2 , _) => { |
1022 | 9 | let values_data = self.get_single_valid_child_data(field.data_type())?0 ; |
1023 | 9 | self.validate_offsets::<i32>(values_data.len)?0 ; |
1024 | 9 | Ok(()) |
1025 | | } |
1026 | 8 | DataType::LargeList(field) => { |
1027 | 8 | let values_data = self.get_single_valid_child_data(field.data_type())?0 ; |
1028 | 8 | self.validate_offsets::<i64>(values_data.len)?0 ; |
1029 | 8 | Ok(()) |
1030 | | } |
1031 | 0 | DataType::ListView(field) => { |
1032 | 0 | let values_data = self.get_single_valid_child_data(field.data_type())?; |
1033 | 0 | self.validate_offsets_and_sizes::<i32>(values_data.len)?; |
1034 | 0 | Ok(()) |
1035 | | } |
1036 | 0 | DataType::LargeListView(field) => { |
1037 | 0 | let values_data = self.get_single_valid_child_data(field.data_type())?; |
1038 | 0 | self.validate_offsets_and_sizes::<i64>(values_data.len)?; |
1039 | 0 | Ok(()) |
1040 | | } |
1041 | 2 | DataType::FixedSizeList(field, list_size) => { |
1042 | 2 | let values_data = self.get_single_valid_child_data(field.data_type())?0 ; |
1043 | | |
1044 | 2 | let list_size: usize = (*list_size).try_into().map_err(|_| {0 |
1045 | 0 | ArrowError::InvalidArgumentError(format!( |
1046 | 0 | "{} has a negative list_size {}", |
1047 | 0 | self.data_type, list_size |
1048 | 0 | )) |
1049 | 0 | })?; |
1050 | | |
1051 | 2 | let expected_values_len = self.len |
1052 | 2 | .checked_mul(list_size) |
1053 | 2 | .expect("integer overflow computing expected number of expected values in FixedListSize"); |
1054 | | |
1055 | 2 | if values_data.len < expected_values_len { |
1056 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
1057 | 0 | "Values length {} is less than the length ({}) multiplied by the value size ({}) for {}", |
1058 | 0 | values_data.len, self.len, list_size, self.data_type |
1059 | 0 | ))); |
1060 | 2 | } |
1061 | | |
1062 | 2 | Ok(()) |
1063 | | } |
1064 | 2 | DataType::Struct(fields) => { |
1065 | 2 | self.validate_num_child_data(fields.len())?0 ; |
1066 | 4 | for (i, field) in fields.iter()2 .enumerate2 () { |
1067 | 4 | let field_data = self.get_valid_child_data(i, field.data_type())?0 ; |
1068 | | |
1069 | | // Ensure child field has sufficient size |
1070 | 4 | if field_data.len < self.len { |
1071 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
1072 | 0 | "{} child array #{} for field {} has length smaller than expected for struct array ({} < {})", |
1073 | 0 | self.data_type, |
1074 | 0 | i, |
1075 | 0 | field.name(), |
1076 | 0 | field_data.len, |
1077 | 0 | self.len |
1078 | 0 | ))); |
1079 | 4 | } |
1080 | | } |
1081 | 2 | Ok(()) |
1082 | | } |
1083 | 40 | DataType::RunEndEncoded(run_ends_field, values_field) => { |
1084 | 40 | self.validate_num_child_data(2)?0 ; |
1085 | 40 | let run_ends_data = self.get_valid_child_data(0, run_ends_field.data_type())?0 ; |
1086 | 40 | let values_data = self.get_valid_child_data(1, values_field.data_type())?0 ; |
1087 | 40 | if run_ends_data.len != values_data.len { |
1088 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
1089 | 0 | "The run_ends array length should be the same as values array length. Run_ends array length is {}, values array length is {}", |
1090 | 0 | run_ends_data.len, values_data.len |
1091 | 0 | ))); |
1092 | 40 | } |
1093 | 40 | if run_ends_data.nulls.is_some() { |
1094 | 0 | return Err(ArrowError::InvalidArgumentError( |
1095 | 0 | "Found null values in run_ends array. The run_ends array should not have null values.".to_string(), |
1096 | 0 | )); |
1097 | 40 | } |
1098 | 40 | Ok(()) |
1099 | | } |
1100 | 0 | DataType::Union(fields, mode) => { |
1101 | 0 | self.validate_num_child_data(fields.len())?; |
1102 | | |
1103 | 0 | for (i, (_, field)) in fields.iter().enumerate() { |
1104 | 0 | let field_data = self.get_valid_child_data(i, field.data_type())?; |
1105 | | |
1106 | 0 | if mode == &UnionMode::Sparse && field_data.len < (self.len + self.offset) { |
1107 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
1108 | 0 | "Sparse union child array #{} has length smaller than expected for union array ({} < {})", |
1109 | 0 | i, |
1110 | 0 | field_data.len, |
1111 | 0 | self.len + self.offset |
1112 | 0 | ))); |
1113 | 0 | } |
1114 | | } |
1115 | 0 | Ok(()) |
1116 | | } |
1117 | 0 | DataType::Dictionary(_key_type, value_type) => { |
1118 | 0 | self.get_single_valid_child_data(value_type)?; |
1119 | 0 | Ok(()) |
1120 | | } |
1121 | | _ => { |
1122 | | // other types do not have child data |
1123 | 980 | if !self.child_data.is_empty() { |
1124 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
1125 | 0 | "Expected no child arrays for type {} but got {}", |
1126 | 0 | self.data_type, |
1127 | 0 | self.child_data.len() |
1128 | 0 | ))); |
1129 | 980 | } |
1130 | 980 | Ok(()) |
1131 | | } |
1132 | | } |
1133 | 1.04k | } |
1134 | | |
1135 | | /// Ensures that this array data has a single child_data with the |
1136 | | /// expected type, and calls `validate()` on it. Returns a |
1137 | | /// reference to that child_data |
1138 | 19 | fn get_single_valid_child_data( |
1139 | 19 | &self, |
1140 | 19 | expected_type: &DataType, |
1141 | 19 | ) -> Result<&ArrayData, ArrowError> { |
1142 | 19 | self.validate_num_child_data(1)?0 ; |
1143 | 19 | self.get_valid_child_data(0, expected_type) |
1144 | 19 | } |
1145 | | |
1146 | | /// Returns `Err` if self.child_data does not have exactly `expected_len` elements |
1147 | 61 | fn validate_num_child_data(&self, expected_len: usize) -> Result<(), ArrowError> { |
1148 | 61 | if self.child_data.len() != expected_len { |
1149 | 0 | Err(ArrowError::InvalidArgumentError(format!( |
1150 | 0 | "Value data for {} should contain {} child data array(s), had {}", |
1151 | 0 | self.data_type, |
1152 | 0 | expected_len, |
1153 | 0 | self.child_data.len() |
1154 | 0 | ))) |
1155 | | } else { |
1156 | 61 | Ok(()) |
1157 | | } |
1158 | 61 | } |
1159 | | |
1160 | | /// Ensures that `child_data[i]` has the expected type, calls |
1161 | | /// `validate()` on it, and returns a reference to that child_data |
1162 | 103 | fn get_valid_child_data( |
1163 | 103 | &self, |
1164 | 103 | i: usize, |
1165 | 103 | expected_type: &DataType, |
1166 | 103 | ) -> Result<&ArrayData, ArrowError> { |
1167 | 103 | let values_data = self.child_data.get(i).ok_or_else(|| {0 |
1168 | 0 | ArrowError::InvalidArgumentError(format!( |
1169 | 0 | "{} did not have enough child arrays. Expected at least {} but had only {}", |
1170 | 0 | self.data_type, |
1171 | 0 | i + 1, |
1172 | 0 | self.child_data.len() |
1173 | 0 | )) |
1174 | 0 | })?; |
1175 | | |
1176 | 103 | if expected_type != &values_data.data_type { |
1177 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
1178 | 0 | "Child type mismatch for {}. Expected {} but child data had {}", |
1179 | 0 | self.data_type, expected_type, values_data.data_type |
1180 | 0 | ))); |
1181 | 103 | } |
1182 | | |
1183 | 103 | values_data.validate()?0 ; |
1184 | 103 | Ok(values_data) |
1185 | 103 | } |
1186 | | |
1187 | | /// Validate that the data contained within this [`ArrayData`] is valid |
1188 | | /// |
1189 | | /// 1. Null count is correct |
1190 | | /// 2. All offsets are valid |
1191 | | /// 3. All String data is valid UTF-8 |
1192 | | /// 4. All dictionary offsets are valid |
1193 | | /// |
1194 | | /// Internally this calls: |
1195 | | /// |
1196 | | /// * [`Self::validate`] |
1197 | | /// * [`Self::validate_nulls`] |
1198 | | /// * [`Self::validate_values`] |
1199 | | /// |
1200 | | /// Note: this does not recurse into children, for a recursive variant |
1201 | | /// see [`Self::validate_full`] |
1202 | 170 | pub fn validate_data(&self) -> Result<(), ArrowError> { |
1203 | 170 | self.validate()?0 ; |
1204 | | |
1205 | 170 | self.validate_nulls()?0 ; |
1206 | 170 | self.validate_values()?0 ; |
1207 | 170 | Ok(()) |
1208 | 170 | } |
1209 | | |
1210 | | /// Performs a full recursive validation of this [`ArrayData`] and all its children |
1211 | | /// |
1212 | | /// This is equivalent to calling [`Self::validate_data`] on this [`ArrayData`] |
1213 | | /// and all its children recursively |
1214 | 0 | pub fn validate_full(&self) -> Result<(), ArrowError> { |
1215 | 0 | self.validate_data()?; |
1216 | | // validate all children recursively |
1217 | 0 | self.child_data |
1218 | 0 | .iter() |
1219 | 0 | .enumerate() |
1220 | 0 | .try_for_each(|(i, child_data)| { |
1221 | 0 | child_data.validate_full().map_err(|e| { |
1222 | 0 | ArrowError::InvalidArgumentError(format!( |
1223 | 0 | "{} child #{} invalid: {}", |
1224 | 0 | self.data_type, i, e |
1225 | 0 | )) |
1226 | 0 | }) |
1227 | 0 | })?; |
1228 | 0 | Ok(()) |
1229 | 0 | } |
1230 | | |
1231 | | /// Validates the values stored within this [`ArrayData`] are valid |
1232 | | /// without recursing into child [`ArrayData`] |
1233 | | /// |
1234 | | /// Does not (yet) check |
1235 | | /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85) |
1236 | | /// 2. the the null count is correct and that any |
1237 | | /// 3. nullability requirements of its children are correct |
1238 | | /// |
1239 | | /// [#85]: https://github.com/apache/arrow-rs/issues/85 |
1240 | 170 | pub fn validate_nulls(&self) -> Result<(), ArrowError> { |
1241 | 170 | if let Some(nulls13 ) = &self.nulls { |
1242 | 13 | let actual = nulls.len() - nulls.inner().count_set_bits(); |
1243 | 13 | if actual != nulls.null_count() { |
1244 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
1245 | 0 | "null_count value ({}) doesn't match actual number of nulls in array ({})", |
1246 | 0 | nulls.null_count(), |
1247 | 0 | actual |
1248 | 0 | ))); |
1249 | 13 | } |
1250 | 157 | } |
1251 | | |
1252 | | // In general non-nullable children should not contain nulls, however, for certain |
1253 | | // types, such as StructArray and FixedSizeList, nulls in the parent take up |
1254 | | // space in the child. As such we permit nulls in the children in the corresponding |
1255 | | // positions for such types |
1256 | 170 | match &self.data_type { |
1257 | 8 | DataType::List(f7 ) | DataType::LargeList(f) | DataType::Map(f2 , _) => { |
1258 | 17 | if !f.is_nullable() { |
1259 | 9 | self.validate_non_nullable(None, &self.child_data[0])?0 |
1260 | 8 | } |
1261 | | } |
1262 | 2 | DataType::FixedSizeList(field, len) => { |
1263 | 2 | let child = &self.child_data[0]; |
1264 | 2 | if !field.is_nullable() { |
1265 | 2 | match &self.nulls { |
1266 | 1 | Some(nulls) => { |
1267 | 1 | let element_len = *len as usize; |
1268 | 1 | let expanded = nulls.expand(element_len); |
1269 | 1 | self.validate_non_nullable(Some(&expanded), child)?0 ; |
1270 | | } |
1271 | 1 | None => self.validate_non_nullable(None, child)?0 , |
1272 | | } |
1273 | 0 | } |
1274 | | } |
1275 | 0 | DataType::Struct(fields) => { |
1276 | 0 | for (field, child) in fields.iter().zip(&self.child_data) { |
1277 | 0 | if !field.is_nullable() { |
1278 | 0 | self.validate_non_nullable(self.nulls(), child)? |
1279 | 0 | } |
1280 | | } |
1281 | | } |
1282 | 151 | _ => {} |
1283 | | } |
1284 | | |
1285 | 170 | Ok(()) |
1286 | 170 | } |
1287 | | |
1288 | | /// Verifies that `child` contains no nulls not present in `mask` |
1289 | 11 | fn validate_non_nullable( |
1290 | 11 | &self, |
1291 | 11 | mask: Option<&NullBuffer>, |
1292 | 11 | child: &ArrayData, |
1293 | 11 | ) -> Result<(), ArrowError> { |
1294 | 11 | let mask1 = match mask { |
1295 | 1 | Some(mask) => mask, |
1296 | | None => { |
1297 | 10 | return match child.null_count() { |
1298 | 10 | 0 => Ok(()), |
1299 | 0 | _ => Err(ArrowError::InvalidArgumentError(format!( |
1300 | 0 | "non-nullable child of type {} contains nulls not present in parent {}", |
1301 | 0 | child.data_type, self.data_type |
1302 | 0 | ))), |
1303 | | }; |
1304 | | } |
1305 | | }; |
1306 | | |
1307 | 1 | match child.nulls() { |
1308 | 0 | Some(nulls) if !mask.contains(nulls) => Err(ArrowError::InvalidArgumentError(format!( |
1309 | 0 | "non-nullable child of type {} contains nulls not present in parent", |
1310 | 0 | child.data_type |
1311 | 0 | ))), |
1312 | 1 | _ => Ok(()), |
1313 | | } |
1314 | 11 | } |
1315 | | |
1316 | | /// Validates the values stored within this [`ArrayData`] are valid |
1317 | | /// without recursing into child [`ArrayData`] |
1318 | | /// |
1319 | | /// Does not (yet) check |
1320 | | /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85) |
1321 | 170 | pub fn validate_values(&self) -> Result<(), ArrowError> { |
1322 | 170 | match &self.data_type { |
1323 | 0 | DataType::Utf8 => self.validate_utf8::<i32>(), |
1324 | 0 | DataType::LargeUtf8 => self.validate_utf8::<i64>(), |
1325 | 0 | DataType::Binary => self.validate_offsets_full::<i32>(self.buffers[1].len()), |
1326 | 0 | DataType::LargeBinary => self.validate_offsets_full::<i64>(self.buffers[1].len()), |
1327 | | DataType::BinaryView => { |
1328 | 0 | let views = self.typed_buffer::<u128>(0, self.len)?; |
1329 | 0 | validate_binary_view(views, &self.buffers[1..]) |
1330 | | } |
1331 | | DataType::Utf8View => { |
1332 | 0 | let views = self.typed_buffer::<u128>(0, self.len)?; |
1333 | 0 | validate_string_view(views, &self.buffers[1..]) |
1334 | | } |
1335 | | DataType::List(_) | DataType::Map(_, _) => { |
1336 | 9 | let child = &self.child_data[0]; |
1337 | 9 | self.validate_offsets_full::<i32>(child.len) |
1338 | | } |
1339 | | DataType::LargeList(_) => { |
1340 | 8 | let child = &self.child_data[0]; |
1341 | 8 | self.validate_offsets_full::<i64>(child.len) |
1342 | | } |
1343 | | DataType::Union(_, _) => { |
1344 | | // Validate Union Array as part of implementing new Union semantics |
1345 | | // See comments in `ArrayData::validate()` |
1346 | | // https://github.com/apache/arrow-rs/issues/85 |
1347 | | // |
1348 | | // TODO file follow on ticket for full union validation |
1349 | 0 | Ok(()) |
1350 | | } |
1351 | 0 | DataType::Dictionary(key_type, _value_type) => { |
1352 | 0 | let dictionary_length: i64 = self.child_data[0].len.try_into().unwrap(); |
1353 | 0 | let max_value = dictionary_length - 1; |
1354 | 0 | match key_type.as_ref() { |
1355 | 0 | DataType::UInt8 => self.check_bounds::<u8>(max_value), |
1356 | 0 | DataType::UInt16 => self.check_bounds::<u16>(max_value), |
1357 | 0 | DataType::UInt32 => self.check_bounds::<u32>(max_value), |
1358 | 0 | DataType::UInt64 => self.check_bounds::<u64>(max_value), |
1359 | 0 | DataType::Int8 => self.check_bounds::<i8>(max_value), |
1360 | 0 | DataType::Int16 => self.check_bounds::<i16>(max_value), |
1361 | 0 | DataType::Int32 => self.check_bounds::<i32>(max_value), |
1362 | 0 | DataType::Int64 => self.check_bounds::<i64>(max_value), |
1363 | 0 | _ => unreachable!(), |
1364 | | } |
1365 | | } |
1366 | 40 | DataType::RunEndEncoded(run_ends, _values) => { |
1367 | 40 | let run_ends_data = self.child_data()[0].clone(); |
1368 | 40 | match run_ends.data_type() { |
1369 | 5 | DataType::Int16 => run_ends_data.check_run_ends::<i16>(), |
1370 | 26 | DataType::Int32 => run_ends_data.check_run_ends::<i32>(), |
1371 | 9 | DataType::Int64 => run_ends_data.check_run_ends::<i64>(), |
1372 | 0 | _ => unreachable!(), |
1373 | | } |
1374 | | } |
1375 | | _ => { |
1376 | | // No extra validation check required for other types |
1377 | 113 | Ok(()) |
1378 | | } |
1379 | | } |
1380 | 170 | } |
1381 | | |
1382 | | /// Calls the `validate(item_index, range)` function for each of |
1383 | | /// the ranges specified in the arrow offsets buffer of type |
1384 | | /// `T`. Also validates that each offset is smaller than |
1385 | | /// `offset_limit` |
1386 | | /// |
1387 | | /// For an empty array, the offsets buffer can either be empty |
1388 | | /// or contain a single `0`. |
1389 | | /// |
1390 | | /// For example, the offsets buffer contained `[1, 2, 4]`, this |
1391 | | /// function would call `validate([1,2])`, and `validate([2,4])` |
1392 | 17 | fn validate_each_offset<T, V>(&self, offset_limit: usize, validate: V) -> Result<(), ArrowError> |
1393 | 17 | where |
1394 | 17 | T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display, |
1395 | 17 | V: Fn(usize, Range<usize>) -> Result<(), ArrowError>, |
1396 | | { |
1397 | 17 | self.typed_offsets::<T>()?0 |
1398 | 17 | .iter() |
1399 | 17 | .enumerate() |
1400 | 83 | .map17 (|(i, x)| { |
1401 | | // check if the offset can be converted to usize |
1402 | 83 | let r = x.to_usize().ok_or_else(|| {0 |
1403 | 0 | ArrowError::InvalidArgumentError(format!( |
1404 | 0 | "Offset invariant failure: Could not convert offset {x} to usize at position {i}"))} |
1405 | | ); |
1406 | | // check if the offset exceeds the limit |
1407 | 83 | match r { |
1408 | 83 | Ok(n) if n <= offset_limit => Ok((i, n)), |
1409 | 0 | Ok(_) => Err(ArrowError::InvalidArgumentError(format!( |
1410 | 0 | "Offset invariant failure: offset at position {i} out of bounds: {x} > {offset_limit}")) |
1411 | 0 | ), |
1412 | 0 | Err(e) => Err(e), |
1413 | | } |
1414 | 83 | }) |
1415 | 83 | .scan17 (0_usize, |start, end| { |
1416 | | // check offsets are monotonically increasing |
1417 | 83 | match end { |
1418 | 83 | Ok((i, end)) if *start <= end => { |
1419 | 83 | let range = Some(Ok((i, *start..end))); |
1420 | 83 | *start = end; |
1421 | 83 | range |
1422 | | } |
1423 | 0 | Ok((i, end)) => Some(Err(ArrowError::InvalidArgumentError(format!( |
1424 | 0 | "Offset invariant failure: non-monotonic offset at slot {}: {} > {}", |
1425 | 0 | i - 1, start, end)) |
1426 | 0 | )), |
1427 | 0 | Err(err) => Some(Err(err)), |
1428 | | } |
1429 | 83 | }) |
1430 | 17 | .skip(1) // the first element is meaningless |
1431 | 66 | .try_for_each17 (|res: Result<(usize, Range<usize>), ArrowError>| { |
1432 | 66 | let (item_index, range) = res?0 ; |
1433 | 66 | validate(item_index-1, range) |
1434 | 66 | }) |
1435 | 17 | } |
1436 | | |
1437 | | /// Ensures that all strings formed by the offsets in `buffers[0]` |
1438 | | /// into `buffers[1]` are valid utf8 sequences |
1439 | 0 | fn validate_utf8<T>(&self) -> Result<(), ArrowError> |
1440 | 0 | where |
1441 | 0 | T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display, |
1442 | | { |
1443 | 0 | let values_buffer = &self.buffers[1].as_slice(); |
1444 | 0 | if let Ok(values_str) = std::str::from_utf8(values_buffer) { |
1445 | | // Validate Offsets are correct |
1446 | 0 | self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| { |
1447 | 0 | if !values_str.is_char_boundary(range.start) |
1448 | 0 | || !values_str.is_char_boundary(range.end) |
1449 | | { |
1450 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
1451 | 0 | "incomplete utf-8 byte sequence from index {string_index}" |
1452 | 0 | ))); |
1453 | 0 | } |
1454 | 0 | Ok(()) |
1455 | 0 | }) |
1456 | | } else { |
1457 | | // find specific offset that failed utf8 validation |
1458 | 0 | self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| { |
1459 | 0 | std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| { |
1460 | 0 | ArrowError::InvalidArgumentError(format!( |
1461 | 0 | "Invalid UTF8 sequence at string index {string_index} ({range:?}): {e}" |
1462 | 0 | )) |
1463 | 0 | })?; |
1464 | 0 | Ok(()) |
1465 | 0 | }) |
1466 | | } |
1467 | 0 | } |
1468 | | |
1469 | | /// Ensures that all offsets in `buffers[0]` into `buffers[1]` are |
1470 | | /// between `0` and `offset_limit` |
1471 | 17 | fn validate_offsets_full<T>(&self, offset_limit: usize) -> Result<(), ArrowError> |
1472 | 17 | where |
1473 | 17 | T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display, |
1474 | | { |
1475 | 66 | self17 .validate_each_offset17 ::<T, _>(offset_limit17 , |_string_index, _range| { |
1476 | | // No validation applied to each value, but the iteration |
1477 | | // itself applies bounds checking to each range |
1478 | 66 | Ok(()) |
1479 | 66 | }) |
1480 | 17 | } |
1481 | | |
1482 | | /// Validates that each value in self.buffers (typed as T) |
1483 | | /// is within the range [0, max_value], inclusive |
1484 | 0 | fn check_bounds<T>(&self, max_value: i64) -> Result<(), ArrowError> |
1485 | 0 | where |
1486 | 0 | T: ArrowNativeType + TryInto<i64> + num_traits::Num + std::fmt::Display, |
1487 | | { |
1488 | 0 | let required_len = self.len + self.offset; |
1489 | 0 | let buffer = &self.buffers[0]; |
1490 | | |
1491 | | // This should have been checked as part of `validate()` prior |
1492 | | // to calling `validate_full()` but double check to be sure |
1493 | 0 | assert!(buffer.len() / mem::size_of::<T>() >= required_len); |
1494 | | |
1495 | | // Justification: buffer size was validated above |
1496 | 0 | let indexes: &[T] = &buffer.typed_data::<T>()[self.offset..self.offset + self.len]; |
1497 | | |
1498 | 0 | indexes.iter().enumerate().try_for_each(|(i, &dict_index)| { |
1499 | | // Do not check the value is null (value can be arbitrary) |
1500 | 0 | if self.is_null(i) { |
1501 | 0 | return Ok(()); |
1502 | 0 | } |
1503 | 0 | let dict_index: i64 = dict_index.try_into().map_err(|_| { |
1504 | 0 | ArrowError::InvalidArgumentError(format!( |
1505 | 0 | "Value at position {i} out of bounds: {dict_index} (can not convert to i64)" |
1506 | 0 | )) |
1507 | 0 | })?; |
1508 | | |
1509 | 0 | if dict_index < 0 || dict_index > max_value { |
1510 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
1511 | 0 | "Value at position {i} out of bounds: {dict_index} (should be in [0, {max_value}])" |
1512 | 0 | ))); |
1513 | 0 | } |
1514 | 0 | Ok(()) |
1515 | 0 | }) |
1516 | 0 | } |
1517 | | |
1518 | | /// Validates that each value in run_ends array is positive and strictly increasing. |
1519 | 40 | fn check_run_ends<T>(&self) -> Result<(), ArrowError> |
1520 | 40 | where |
1521 | 40 | T: ArrowNativeType + TryInto<i64> + num_traits::Num + std::fmt::Display, |
1522 | | { |
1523 | 40 | let values = self.typed_buffer::<T>(0, self.len)?0 ; |
1524 | 40 | let mut prev_value: i64 = 0_i64; |
1525 | 117 | values40 .iter().enumerate().try_for_each40 (|(ix, &inp_value)| { |
1526 | 117 | let value: i64 = inp_value.try_into().map_err(|_| {0 |
1527 | 0 | ArrowError::InvalidArgumentError(format!( |
1528 | 0 | "Value at position {ix} out of bounds: {inp_value} (can not convert to i64)" |
1529 | 0 | )) |
1530 | 0 | })?; |
1531 | 117 | if value <= 0_i64 { |
1532 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
1533 | 0 | "The values in run_ends array should be strictly positive. Found value {value} at index {ix} that does not match the criteria." |
1534 | 0 | ))); |
1535 | 117 | } |
1536 | 117 | if ix > 0 && value <= prev_value77 { |
1537 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
1538 | 0 | "The values in run_ends array should be strictly increasing. Found value {value} at index {ix} with previous value {prev_value} that does not match the criteria." |
1539 | 0 | ))); |
1540 | 117 | } |
1541 | | |
1542 | 117 | prev_value = value; |
1543 | 117 | Ok(()) |
1544 | 117 | })?0 ; |
1545 | | |
1546 | 40 | if prev_value.as_usize() < (self.offset + self.len) { |
1547 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
1548 | 0 | "The offset + length of array should be less or equal to last value in the run_ends array. The last value of run_ends array is {prev_value} and offset + length of array is {}.", |
1549 | 0 | self.offset + self.len |
1550 | 0 | ))); |
1551 | 40 | } |
1552 | 40 | Ok(()) |
1553 | 40 | } |
1554 | | |
1555 | | /// Returns true if this `ArrayData` is equal to `other`, using pointer comparisons |
1556 | | /// to determine buffer equality. This is cheaper than `PartialEq::eq` but may |
1557 | | /// return false when the arrays are logically equal |
1558 | 17 | pub fn ptr_eq(&self, other: &Self) -> bool { |
1559 | 17 | if self.offset != other.offset |
1560 | 17 | || self.len != other.len |
1561 | 8 | || self.data_type != other.data_type |
1562 | 8 | || self.buffers.len() != other.buffers.len() |
1563 | 8 | || self.child_data.len() != other.child_data.len() |
1564 | | { |
1565 | 9 | return false; |
1566 | 8 | } |
1567 | | |
1568 | 8 | match (&self.nulls, &other.nulls) { |
1569 | 0 | (Some(a), Some(b)) if !a.inner().ptr_eq(b.inner()) => return false, |
1570 | 0 | (Some(_), None) | (None, Some(_)) => return false, |
1571 | 8 | _ => {} |
1572 | | }; |
1573 | | |
1574 | 8 | if !self |
1575 | 8 | .buffers |
1576 | 8 | .iter() |
1577 | 8 | .zip(other.buffers.iter()) |
1578 | 12 | .all8 (|(a, b)| a.as_ptr() == b.as_ptr()) |
1579 | | { |
1580 | 1 | return false; |
1581 | 7 | } |
1582 | | |
1583 | 7 | self.child_data |
1584 | 7 | .iter() |
1585 | 7 | .zip(other.child_data.iter()) |
1586 | 7 | .all(|(a, b)| a0 .ptr_eq0 (b0 )) |
1587 | 17 | } |
1588 | | |
1589 | | /// Converts this [`ArrayData`] into an [`ArrayDataBuilder`] |
1590 | 321k | pub fn into_builder(self) -> ArrayDataBuilder { |
1591 | 321k | self.into() |
1592 | 321k | } |
1593 | | } |
1594 | | |
1595 | | /// Returns the [`DataTypeLayout`] that arrays of this data |
1596 | | /// type are expected to have |
1597 | 1.05k | pub fn layout(data_type: &DataType) -> DataTypeLayout { |
1598 | | // based on C/C++ implementation in |
1599 | | // https://github.com/apache/arrow/blob/661c7d749150905a63dd3b52e0a04dac39030d95/cpp/src/arrow/type.h (and .cc) |
1600 | | use arrow_schema::IntervalUnit::*; |
1601 | | |
1602 | 0 | match data_type { |
1603 | 0 | DataType::Null => DataTypeLayout { |
1604 | 0 | buffers: vec![], |
1605 | 0 | can_contain_null_mask: false, |
1606 | 0 | variadic: false, |
1607 | 0 | }, |
1608 | 105 | DataType::Boolean => DataTypeLayout { |
1609 | 105 | buffers: vec![BufferSpec::BitMap], |
1610 | 105 | can_contain_null_mask: true, |
1611 | 105 | variadic: false, |
1612 | 105 | }, |
1613 | 0 | DataType::Int8 => DataTypeLayout::new_fixed_width::<i8>(), |
1614 | 8 | DataType::Int16 => DataTypeLayout::new_fixed_width::<i16>(), |
1615 | 846 | DataType::Int32 => DataTypeLayout::new_fixed_width::<i32>(), |
1616 | 16 | DataType::Int64 => DataTypeLayout::new_fixed_width::<i64>(), |
1617 | 0 | DataType::UInt8 => DataTypeLayout::new_fixed_width::<u8>(), |
1618 | 0 | DataType::UInt16 => DataTypeLayout::new_fixed_width::<u16>(), |
1619 | 2 | DataType::UInt32 => DataTypeLayout::new_fixed_width::<u32>(), |
1620 | 0 | DataType::UInt64 => DataTypeLayout::new_fixed_width::<u64>(), |
1621 | 0 | DataType::Float16 => DataTypeLayout::new_fixed_width::<half::f16>(), |
1622 | 0 | DataType::Float32 => DataTypeLayout::new_fixed_width::<f32>(), |
1623 | 0 | DataType::Float64 => DataTypeLayout::new_fixed_width::<f64>(), |
1624 | 0 | DataType::Timestamp(_, _) => DataTypeLayout::new_fixed_width::<i64>(), |
1625 | 0 | DataType::Date32 => DataTypeLayout::new_fixed_width::<i32>(), |
1626 | 0 | DataType::Date64 => DataTypeLayout::new_fixed_width::<i64>(), |
1627 | 0 | DataType::Time32(_) => DataTypeLayout::new_fixed_width::<i32>(), |
1628 | 0 | DataType::Time64(_) => DataTypeLayout::new_fixed_width::<i64>(), |
1629 | 0 | DataType::Interval(YearMonth) => DataTypeLayout::new_fixed_width::<i32>(), |
1630 | 0 | DataType::Interval(DayTime) => DataTypeLayout::new_fixed_width::<IntervalDayTime>(), |
1631 | | DataType::Interval(MonthDayNano) => { |
1632 | 0 | DataTypeLayout::new_fixed_width::<IntervalMonthDayNano>() |
1633 | | } |
1634 | 0 | DataType::Duration(_) => DataTypeLayout::new_fixed_width::<i64>(), |
1635 | 0 | DataType::Decimal32(_, _) => DataTypeLayout::new_fixed_width::<i32>(), |
1636 | 0 | DataType::Decimal64(_, _) => DataTypeLayout::new_fixed_width::<i64>(), |
1637 | 0 | DataType::Decimal128(_, _) => DataTypeLayout::new_fixed_width::<i128>(), |
1638 | 0 | DataType::Decimal256(_, _) => DataTypeLayout::new_fixed_width::<i256>(), |
1639 | 0 | DataType::FixedSizeBinary(size) => { |
1640 | 0 | let spec = BufferSpec::FixedWidth { |
1641 | 0 | byte_width: (*size).try_into().unwrap(), |
1642 | 0 | alignment: mem::align_of::<u8>(), |
1643 | 0 | }; |
1644 | 0 | DataTypeLayout { |
1645 | 0 | buffers: vec![spec], |
1646 | 0 | can_contain_null_mask: true, |
1647 | 0 | variadic: false, |
1648 | 0 | } |
1649 | | } |
1650 | 0 | DataType::Binary => DataTypeLayout::new_binary::<i32>(), |
1651 | 0 | DataType::LargeBinary => DataTypeLayout::new_binary::<i64>(), |
1652 | 9 | DataType::Utf8 => DataTypeLayout::new_binary::<i32>(), |
1653 | 0 | DataType::LargeUtf8 => DataTypeLayout::new_binary::<i64>(), |
1654 | 0 | DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(), |
1655 | 2 | DataType::FixedSizeList(_, _) => DataTypeLayout::new_nullable_empty(), // all in child data |
1656 | 7 | DataType::List(_) => DataTypeLayout::new_fixed_width::<i32>(), |
1657 | 0 | DataType::ListView(_) => DataTypeLayout::new_list_view::<i32>(), |
1658 | 0 | DataType::LargeListView(_) => DataTypeLayout::new_list_view::<i64>(), |
1659 | 8 | DataType::LargeList(_) => DataTypeLayout::new_fixed_width::<i64>(), |
1660 | 2 | DataType::Map(_, _) => DataTypeLayout::new_fixed_width::<i32>(), |
1661 | 2 | DataType::Struct(_) => DataTypeLayout::new_nullable_empty(), // all in child data, |
1662 | 40 | DataType::RunEndEncoded(_, _) => DataTypeLayout::new_empty(), // all in child data, |
1663 | 3 | DataType::Union(_, mode) => { |
1664 | 3 | let type_ids = BufferSpec::FixedWidth { |
1665 | 3 | byte_width: mem::size_of::<i8>(), |
1666 | 3 | alignment: mem::align_of::<i8>(), |
1667 | 3 | }; |
1668 | | |
1669 | | DataTypeLayout { |
1670 | 3 | buffers: match mode { |
1671 | | UnionMode::Sparse => { |
1672 | 3 | vec![type_ids] |
1673 | | } |
1674 | | UnionMode::Dense => { |
1675 | 0 | vec![ |
1676 | 0 | type_ids, |
1677 | 0 | BufferSpec::FixedWidth { |
1678 | 0 | byte_width: mem::size_of::<i32>(), |
1679 | 0 | alignment: mem::align_of::<i32>(), |
1680 | 0 | }, |
1681 | | ] |
1682 | | } |
1683 | | }, |
1684 | | can_contain_null_mask: false, |
1685 | | variadic: false, |
1686 | | } |
1687 | | } |
1688 | 0 | DataType::Dictionary(key_type, _value_type) => layout(key_type), |
1689 | | } |
1690 | 1.05k | } |
1691 | | |
1692 | | /// Layout specification for a data type |
1693 | | #[derive(Debug, PartialEq, Eq)] |
1694 | | // Note: Follows structure from C++: https://github.com/apache/arrow/blob/master/cpp/src/arrow/type.h#L91 |
1695 | | pub struct DataTypeLayout { |
1696 | | /// A vector of buffer layout specifications, one for each expected buffer |
1697 | | pub buffers: Vec<BufferSpec>, |
1698 | | |
1699 | | /// Can contain a null bitmask |
1700 | | pub can_contain_null_mask: bool, |
1701 | | |
1702 | | /// This field only applies to the view type [`DataType::BinaryView`] and [`DataType::Utf8View`] |
1703 | | /// If `variadic` is true, the number of buffers expected is only lower-bounded by |
1704 | | /// buffers.len(). Buffers that exceed the lower bound are legal. |
1705 | | pub variadic: bool, |
1706 | | } |
1707 | | |
1708 | | impl DataTypeLayout { |
1709 | | /// Describes a basic numeric array where each element has type `T` |
1710 | 889 | pub fn new_fixed_width<T>() -> Self { |
1711 | 889 | Self { |
1712 | 889 | buffers: vec![BufferSpec::FixedWidth { |
1713 | 889 | byte_width: mem::size_of::<T>(), |
1714 | 889 | alignment: mem::align_of::<T>(), |
1715 | 889 | }], |
1716 | 889 | can_contain_null_mask: true, |
1717 | 889 | variadic: false, |
1718 | 889 | } |
1719 | 889 | } |
1720 | | |
1721 | | /// Describes arrays which have no data of their own |
1722 | | /// but may still have a Null Bitmap (e.g. FixedSizeList) |
1723 | 4 | pub fn new_nullable_empty() -> Self { |
1724 | 4 | Self { |
1725 | 4 | buffers: vec![], |
1726 | 4 | can_contain_null_mask: true, |
1727 | 4 | variadic: false, |
1728 | 4 | } |
1729 | 4 | } |
1730 | | |
1731 | | /// Describes arrays which have no data of their own |
1732 | | /// (e.g. RunEndEncoded). |
1733 | 40 | pub fn new_empty() -> Self { |
1734 | 40 | Self { |
1735 | 40 | buffers: vec![], |
1736 | 40 | can_contain_null_mask: false, |
1737 | 40 | variadic: false, |
1738 | 40 | } |
1739 | 40 | } |
1740 | | |
1741 | | /// Describes a variable-width array (e.g. binary or string data) with a |
1742 | | /// fixed-width offset buffer whose elements have type `T`, followed by a |
1743 | | /// variable width data buffer |
1744 | 9 | pub fn new_binary<T>() -> Self { |
1745 | 9 | Self { |
1746 | 9 | buffers: vec![ |
1747 | 9 | // offsets |
1748 | 9 | BufferSpec::FixedWidth { |
1749 | 9 | byte_width: mem::size_of::<T>(), |
1750 | 9 | alignment: mem::align_of::<T>(), |
1751 | 9 | }, |
1752 | 9 | // values |
1753 | 9 | BufferSpec::VariableWidth, |
1754 | 9 | ], |
1755 | 9 | can_contain_null_mask: true, |
1756 | 9 | variadic: false, |
1757 | 9 | } |
1758 | 9 | } |
1759 | | |
1760 | | /// Describes a view type |
1761 | 0 | pub fn new_view() -> Self { |
1762 | 0 | Self { |
1763 | 0 | buffers: vec![BufferSpec::FixedWidth { |
1764 | 0 | byte_width: mem::size_of::<u128>(), |
1765 | 0 | alignment: mem::align_of::<u128>(), |
1766 | 0 | }], |
1767 | 0 | can_contain_null_mask: true, |
1768 | 0 | variadic: true, |
1769 | 0 | } |
1770 | 0 | } |
1771 | | |
1772 | | /// Describes a list view type |
1773 | 0 | pub fn new_list_view<T>() -> Self { |
1774 | 0 | Self { |
1775 | 0 | buffers: vec![ |
1776 | 0 | BufferSpec::FixedWidth { |
1777 | 0 | byte_width: mem::size_of::<T>(), |
1778 | 0 | alignment: mem::align_of::<T>(), |
1779 | 0 | }, |
1780 | 0 | BufferSpec::FixedWidth { |
1781 | 0 | byte_width: mem::size_of::<T>(), |
1782 | 0 | alignment: mem::align_of::<T>(), |
1783 | 0 | }, |
1784 | 0 | ], |
1785 | 0 | can_contain_null_mask: true, |
1786 | 0 | variadic: true, |
1787 | 0 | } |
1788 | 0 | } |
1789 | | } |
1790 | | |
1791 | | /// Layout specification for a single data type buffer |
1792 | | #[derive(Debug, PartialEq, Eq)] |
1793 | | pub enum BufferSpec { |
1794 | | /// Each element is a fixed width primitive, with the given `byte_width` and `alignment` |
1795 | | /// |
1796 | | /// `alignment` is the alignment required by Rust for an array of the corresponding primitive, |
1797 | | /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`]. |
1798 | | /// |
1799 | | /// Arrow-rs requires that all buffers have at least this alignment, to allow for |
1800 | | /// [slice](std::slice) based APIs. Alignment in excess of this is not required to allow |
1801 | | /// for array slicing and interoperability with `Vec`, which cannot be over-aligned. |
1802 | | /// |
1803 | | /// Note that these alignment requirements will vary between architectures |
1804 | | FixedWidth { |
1805 | | /// The width of each element in bytes |
1806 | | byte_width: usize, |
1807 | | /// The alignment required by Rust for an array of the corresponding primitive |
1808 | | alignment: usize, |
1809 | | }, |
1810 | | /// Variable width, such as string data for utf8 data |
1811 | | VariableWidth, |
1812 | | /// Buffer holds a bitmap. |
1813 | | /// |
1814 | | /// Note: Unlike the C++ implementation, the null/validity buffer |
1815 | | /// is handled specially rather than as another of the buffers in |
1816 | | /// the spec, so this variant is only used for the Boolean type. |
1817 | | BitMap, |
1818 | | /// Buffer is always null. Unused currently in Rust implementation, |
1819 | | /// (used in C++ for Union type) |
1820 | | #[allow(dead_code)] |
1821 | | AlwaysNull, |
1822 | | } |
1823 | | |
1824 | | impl PartialEq for ArrayData { |
1825 | 81.1k | fn eq(&self, other: &Self) -> bool { |
1826 | 81.1k | equal::equal(self, other) |
1827 | 81.1k | } |
1828 | | } |
1829 | | |
1830 | | /// A boolean flag that cannot be mutated outside of unsafe code. |
1831 | | /// |
1832 | | /// Defaults to a value of false. |
1833 | | /// |
1834 | | /// This structure is used to enforce safety in the [`ArrayDataBuilder`] |
1835 | | /// |
1836 | | /// [`ArrayDataBuilder`]: super::ArrayDataBuilder |
1837 | | /// |
1838 | | /// # Example |
1839 | | /// ```rust |
1840 | | /// use arrow_data::UnsafeFlag; |
1841 | | /// assert!(!UnsafeFlag::default().get()); // default is false |
1842 | | /// let mut flag = UnsafeFlag::new(); |
1843 | | /// assert!(!flag.get()); // defaults to false |
1844 | | /// // can only set it to true in unsafe code |
1845 | | /// unsafe { flag.set(true) }; |
1846 | | /// assert!(flag.get()); // now true |
1847 | | /// ``` |
1848 | | #[derive(Debug, Clone)] |
1849 | | #[doc(hidden)] |
1850 | | pub struct UnsafeFlag(bool); |
1851 | | |
1852 | | impl UnsafeFlag { |
1853 | | /// Creates a new `UnsafeFlag` with the value set to `false`. |
1854 | | /// |
1855 | | /// See examples on [`UnsafeFlag`] |
1856 | | #[inline] |
1857 | 970k | pub const fn new() -> Self { |
1858 | 970k | Self(false) |
1859 | 970k | } |
1860 | | |
1861 | | /// Sets the value of the flag to the given value |
1862 | | /// |
1863 | | /// Note this can purposely only be done in `unsafe` code |
1864 | | /// |
1865 | | /// # Safety |
1866 | | /// |
1867 | | /// If set, the flag will be set to the given value. There is nothing |
1868 | | /// immediately unsafe about doing so, however, the flag can be used to |
1869 | | /// subsequently bypass safety checks in the [`ArrayDataBuilder`]. |
1870 | | #[inline] |
1871 | 970k | pub unsafe fn set(&mut self, val: bool) { |
1872 | 970k | self.0 = val; |
1873 | 970k | } |
1874 | | |
1875 | | /// Returns the value of the flag |
1876 | | #[inline] |
1877 | 970k | pub fn get(&self) -> bool { |
1878 | 970k | self.0 |
1879 | 970k | } |
1880 | | } |
1881 | | |
1882 | | // Manual impl to make it clear you can not construct unsafe with true |
1883 | | impl Default for UnsafeFlag { |
1884 | 0 | fn default() -> Self { |
1885 | 0 | Self::new() |
1886 | 0 | } |
1887 | | } |
1888 | | |
1889 | | /// Builder for [`ArrayData`] type |
1890 | | #[derive(Debug)] |
1891 | | pub struct ArrayDataBuilder { |
1892 | | data_type: DataType, |
1893 | | len: usize, |
1894 | | null_count: Option<usize>, |
1895 | | null_bit_buffer: Option<Buffer>, |
1896 | | nulls: Option<NullBuffer>, |
1897 | | offset: usize, |
1898 | | buffers: Vec<Buffer>, |
1899 | | child_data: Vec<ArrayData>, |
1900 | | /// Should buffers be realigned (copying if necessary)? |
1901 | | /// |
1902 | | /// Defaults to false. |
1903 | | align_buffers: bool, |
1904 | | /// Should data validation be skipped for this [`ArrayData`]? |
1905 | | /// |
1906 | | /// Defaults to false. |
1907 | | /// |
1908 | | /// # Safety |
1909 | | /// |
1910 | | /// This flag can only be set to true using `unsafe` APIs. However, once true |
1911 | | /// subsequent calls to `build()` may result in undefined behavior if the data |
1912 | | /// is not valid. |
1913 | | skip_validation: UnsafeFlag, |
1914 | | } |
1915 | | |
1916 | | impl ArrayDataBuilder { |
1917 | | #[inline] |
1918 | | /// Creates a new array data builder |
1919 | 647k | pub const fn new(data_type: DataType) -> Self { |
1920 | 647k | Self { |
1921 | 647k | data_type, |
1922 | 647k | len: 0, |
1923 | 647k | null_count: None, |
1924 | 647k | null_bit_buffer: None, |
1925 | 647k | nulls: None, |
1926 | 647k | offset: 0, |
1927 | 647k | buffers: vec![], |
1928 | 647k | child_data: vec![], |
1929 | 647k | align_buffers: false, |
1930 | 647k | skip_validation: UnsafeFlag::new(), |
1931 | 647k | } |
1932 | 647k | } |
1933 | | |
1934 | | /// Creates a new array data builder from an existing one, changing the data type |
1935 | 320k | pub fn data_type(self, data_type: DataType) -> Self { |
1936 | 320k | Self { data_type, ..self } |
1937 | 320k | } |
1938 | | |
1939 | | #[inline] |
1940 | | #[allow(clippy::len_without_is_empty)] |
1941 | | /// Sets the length of the [ArrayData] |
1942 | 647k | pub const fn len(mut self, n: usize) -> Self { |
1943 | 647k | self.len = n; |
1944 | 647k | self |
1945 | 647k | } |
1946 | | |
1947 | | /// Sets the null buffer of the [ArrayData] |
1948 | 647k | pub fn nulls(mut self, nulls: Option<NullBuffer>) -> Self { |
1949 | 647k | self.nulls = nulls; |
1950 | 647k | self.null_count = None; |
1951 | 647k | self.null_bit_buffer = None; |
1952 | 647k | self |
1953 | 647k | } |
1954 | | |
1955 | | /// Sets the null count of the [ArrayData] |
1956 | 547 | pub fn null_count(mut self, null_count: usize) -> Self { |
1957 | 547 | self.null_count = Some(null_count); |
1958 | 547 | self |
1959 | 547 | } |
1960 | | |
1961 | | /// Sets the `null_bit_buffer` of the [ArrayData] |
1962 | 565 | pub fn null_bit_buffer(mut self, buf: Option<Buffer>) -> Self { |
1963 | 565 | self.nulls = None; |
1964 | 565 | self.null_bit_buffer = buf; |
1965 | 565 | self |
1966 | 565 | } |
1967 | | |
1968 | | /// Sets the offset of the [ArrayData] |
1969 | | #[inline] |
1970 | 262 | pub const fn offset(mut self, n: usize) -> Self { |
1971 | 262 | self.offset = n; |
1972 | 262 | self |
1973 | 262 | } |
1974 | | |
1975 | | /// Sets the buffers of the [ArrayData] |
1976 | 485k | pub fn buffers(mut self, v: Vec<Buffer>) -> Self { |
1977 | 485k | self.buffers = v; |
1978 | 485k | self |
1979 | 485k | } |
1980 | | |
1981 | | /// Adds a single buffer to the [ArrayData]'s buffers |
1982 | 243k | pub fn add_buffer(mut self, b: Buffer) -> Self { |
1983 | 243k | self.buffers.push(b); |
1984 | 243k | self |
1985 | 243k | } |
1986 | | |
1987 | | /// Adds multiple buffers to the [ArrayData]'s buffers |
1988 | 84 | pub fn add_buffers<I: IntoIterator<Item = Buffer>>(mut self, bs: I) -> Self { |
1989 | 84 | self.buffers.extend(bs); |
1990 | 84 | self |
1991 | 84 | } |
1992 | | |
1993 | | /// Sets the child data of the [ArrayData] |
1994 | 320k | pub fn child_data(mut self, v: Vec<ArrayData>) -> Self { |
1995 | 320k | self.child_data = v; |
1996 | 320k | self |
1997 | 320k | } |
1998 | | |
1999 | | /// Adds a single child data to the [ArrayData]'s child data |
2000 | 106 | pub fn add_child_data(mut self, r: ArrayData) -> Self { |
2001 | 106 | self.child_data.push(r); |
2002 | 106 | self |
2003 | 106 | } |
2004 | | |
2005 | | /// Creates an array data, without any validation |
2006 | | /// |
2007 | | /// Note: This is shorthand for |
2008 | | /// ```rust |
2009 | | /// # #[expect(unsafe_op_in_unsafe_fn)] |
2010 | | /// # let mut builder = arrow_data::ArrayDataBuilder::new(arrow_schema::DataType::Null); |
2011 | | /// # let _ = unsafe { |
2012 | | /// builder.skip_validation(true).build().unwrap() |
2013 | | /// # }; |
2014 | | /// ``` |
2015 | | /// |
2016 | | /// # Safety |
2017 | | /// |
2018 | | /// The same caveats as [`ArrayData::new_unchecked`] |
2019 | | /// apply. |
2020 | 968k | pub unsafe fn build_unchecked(self) -> ArrayData { |
2021 | 968k | unsafe { self.skip_validation(true) }.build().unwrap() |
2022 | 968k | } |
2023 | | |
2024 | | /// Creates an `ArrayData`, consuming `self` |
2025 | | /// |
2026 | | /// # Safety |
2027 | | /// |
2028 | | /// By default the underlying buffers are checked to ensure they are valid |
2029 | | /// Arrow data. However, if the [`Self::skip_validation`] flag has been set |
2030 | | /// to true (by the `unsafe` API) this validation is skipped. If the data is |
2031 | | /// not valid, undefined behavior will result. |
2032 | 970k | pub fn build(self) -> Result<ArrayData, ArrowError> { |
2033 | | let Self { |
2034 | 970k | data_type, |
2035 | 970k | len, |
2036 | 970k | null_count, |
2037 | 970k | null_bit_buffer, |
2038 | 970k | nulls, |
2039 | 970k | offset, |
2040 | 970k | buffers, |
2041 | 970k | child_data, |
2042 | 970k | align_buffers, |
2043 | 970k | skip_validation, |
2044 | 970k | } = self; |
2045 | | |
2046 | 970k | let nulls = nulls |
2047 | 970k | .or_else(|| {965k |
2048 | 965k | let buffer2.69k = null_bit_buffer?963k ; |
2049 | 2.69k | let buffer = BooleanBuffer::new(buffer, offset, len); |
2050 | 2.69k | Some(match null_count { |
2051 | 545 | Some(n) => { |
2052 | | // SAFETY: call to `data.validate_data()` below validates the null buffer is valid |
2053 | 545 | unsafe { NullBuffer::new_unchecked(buffer, n) } |
2054 | | } |
2055 | 2.14k | None => NullBuffer::new(buffer), |
2056 | | }) |
2057 | 965k | }) |
2058 | 970k | .filter(|b| b7.78k .null_count7.78k () != 0); |
2059 | | |
2060 | 970k | let mut data = ArrayData { |
2061 | 970k | data_type, |
2062 | 970k | len, |
2063 | 970k | offset, |
2064 | 970k | buffers, |
2065 | 970k | child_data, |
2066 | 970k | nulls, |
2067 | 970k | }; |
2068 | | |
2069 | 970k | if align_buffers { |
2070 | 0 | data.align_buffers(); |
2071 | 970k | } |
2072 | | |
2073 | | // SAFETY: `skip_validation` is only set to true using `unsafe` APIs |
2074 | 970k | if !skip_validation.get() || cfg!970k (feature = "force_validate") { |
2075 | 128 | data.validate_data()?0 ; |
2076 | 970k | } |
2077 | 970k | Ok(data) |
2078 | 970k | } |
2079 | | |
2080 | | /// Creates an array data, validating all inputs, and aligning any buffers |
2081 | | #[deprecated(since = "54.1.0", note = "Use ArrayData::align_buffers instead")] |
2082 | 0 | pub fn build_aligned(self) -> Result<ArrayData, ArrowError> { |
2083 | 0 | self.align_buffers(true).build() |
2084 | 0 | } |
2085 | | |
2086 | | /// Ensure that all buffers are aligned, copying data if necessary |
2087 | | /// |
2088 | | /// Rust requires that arrays are aligned to their corresponding primitive, |
2089 | | /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`]. |
2090 | | /// |
2091 | | /// [`ArrayData`] therefore requires that all buffers have at least this alignment, |
2092 | | /// to allow for [slice](std::slice) based APIs. See [`BufferSpec::FixedWidth`]. |
2093 | | /// |
2094 | | /// As this alignment is architecture specific, and not guaranteed by all arrow implementations, |
2095 | | /// this flag is provided to automatically copy buffers to a new correctly aligned allocation |
2096 | | /// when necessary, making it useful when interacting with buffers produced by other systems, |
2097 | | /// e.g. IPC or FFI. |
2098 | | /// |
2099 | | /// If this flag is not enabled, [`Self::build`] returns an error on encountering |
2100 | | /// insufficiently aligned buffers. |
2101 | 0 | pub fn align_buffers(mut self, align_buffers: bool) -> Self { |
2102 | 0 | self.align_buffers = align_buffers; |
2103 | 0 | self |
2104 | 0 | } |
2105 | | |
2106 | | /// Skips validation of the data. |
2107 | | /// |
2108 | | /// If this flag is enabled, [`Self::build`] will skip validation of the |
2109 | | /// data. |
2110 | | /// |
2111 | | /// If this flag is not enabled, [`Self::build`] will validate that all |
2112 | | /// buffers are valid and will return an error if any data is invalid. |
2113 | | /// Validation can be expensive. |
2114 | | /// |
2115 | | /// # Safety |
2116 | | /// |
2117 | | /// If validation is skipped, the buffers must form a valid Arrow array, |
2118 | | /// otherwise undefined behavior will result |
2119 | 968k | pub unsafe fn skip_validation(mut self, skip_validation: bool) -> Self { |
2120 | 968k | unsafe { |
2121 | 968k | self.skip_validation.set(skip_validation); |
2122 | 968k | } |
2123 | 968k | self |
2124 | 968k | } |
2125 | | } |
2126 | | |
2127 | | impl From<ArrayData> for ArrayDataBuilder { |
2128 | 321k | fn from(d: ArrayData) -> Self { |
2129 | 321k | Self { |
2130 | 321k | data_type: d.data_type, |
2131 | 321k | len: d.len, |
2132 | 321k | offset: d.offset, |
2133 | 321k | buffers: d.buffers, |
2134 | 321k | child_data: d.child_data, |
2135 | 321k | nulls: d.nulls, |
2136 | 321k | null_bit_buffer: None, |
2137 | 321k | null_count: None, |
2138 | 321k | align_buffers: false, |
2139 | 321k | skip_validation: UnsafeFlag::new(), |
2140 | 321k | } |
2141 | 321k | } |
2142 | | } |
2143 | | |
2144 | | #[cfg(test)] |
2145 | | mod tests { |
2146 | | use super::*; |
2147 | | use arrow_schema::{Field, Fields}; |
2148 | | |
2149 | | // See arrow/tests/array_data_validation.rs for test of array validation |
2150 | | |
2151 | | /// returns a buffer initialized with some constant value for tests |
2152 | | fn make_i32_buffer(n: usize) -> Buffer { |
2153 | | Buffer::from_slice_ref(vec![42i32; n]) |
2154 | | } |
2155 | | |
2156 | | /// returns a buffer initialized with some constant value for tests |
2157 | | fn make_f32_buffer(n: usize) -> Buffer { |
2158 | | Buffer::from_slice_ref(vec![42f32; n]) |
2159 | | } |
2160 | | |
2161 | | #[test] |
2162 | | fn test_builder() { |
2163 | | // Buffer needs to be at least 25 long |
2164 | | let v = (0..25).collect::<Vec<i32>>(); |
2165 | | let b1 = Buffer::from_slice_ref(&v); |
2166 | | let arr_data = ArrayData::builder(DataType::Int32) |
2167 | | .len(20) |
2168 | | .offset(5) |
2169 | | .add_buffer(b1) |
2170 | | .null_bit_buffer(Some(Buffer::from([ |
2171 | | 0b01011111, 0b10110101, 0b01100011, 0b00011110, |
2172 | | ]))) |
2173 | | .build() |
2174 | | .unwrap(); |
2175 | | |
2176 | | assert_eq!(20, arr_data.len()); |
2177 | | assert_eq!(10, arr_data.null_count()); |
2178 | | assert_eq!(5, arr_data.offset()); |
2179 | | assert_eq!(1, arr_data.buffers().len()); |
2180 | | assert_eq!( |
2181 | | Buffer::from_slice_ref(&v).as_slice(), |
2182 | | arr_data.buffers()[0].as_slice() |
2183 | | ); |
2184 | | } |
2185 | | |
2186 | | #[test] |
2187 | | fn test_builder_with_child_data() { |
2188 | | let child_arr_data = ArrayData::try_new( |
2189 | | DataType::Int32, |
2190 | | 5, |
2191 | | None, |
2192 | | 0, |
2193 | | vec![Buffer::from_slice_ref([1i32, 2, 3, 4, 5])], |
2194 | | vec![], |
2195 | | ) |
2196 | | .unwrap(); |
2197 | | |
2198 | | let field = Arc::new(Field::new("x", DataType::Int32, true)); |
2199 | | let data_type = DataType::Struct(vec![field].into()); |
2200 | | |
2201 | | let arr_data = ArrayData::builder(data_type) |
2202 | | .len(5) |
2203 | | .offset(0) |
2204 | | .add_child_data(child_arr_data.clone()) |
2205 | | .build() |
2206 | | .unwrap(); |
2207 | | |
2208 | | assert_eq!(5, arr_data.len()); |
2209 | | assert_eq!(1, arr_data.child_data().len()); |
2210 | | assert_eq!(child_arr_data, arr_data.child_data()[0]); |
2211 | | } |
2212 | | |
2213 | | #[test] |
2214 | | fn test_null_count() { |
2215 | | let mut bit_v: [u8; 2] = [0; 2]; |
2216 | | bit_util::set_bit(&mut bit_v, 0); |
2217 | | bit_util::set_bit(&mut bit_v, 3); |
2218 | | bit_util::set_bit(&mut bit_v, 10); |
2219 | | let arr_data = ArrayData::builder(DataType::Int32) |
2220 | | .len(16) |
2221 | | .add_buffer(make_i32_buffer(16)) |
2222 | | .null_bit_buffer(Some(Buffer::from(bit_v))) |
2223 | | .build() |
2224 | | .unwrap(); |
2225 | | assert_eq!(13, arr_data.null_count()); |
2226 | | |
2227 | | // Test with offset |
2228 | | let mut bit_v: [u8; 2] = [0; 2]; |
2229 | | bit_util::set_bit(&mut bit_v, 0); |
2230 | | bit_util::set_bit(&mut bit_v, 3); |
2231 | | bit_util::set_bit(&mut bit_v, 10); |
2232 | | let arr_data = ArrayData::builder(DataType::Int32) |
2233 | | .len(12) |
2234 | | .offset(2) |
2235 | | .add_buffer(make_i32_buffer(14)) // requires at least 14 bytes of space, |
2236 | | .null_bit_buffer(Some(Buffer::from(bit_v))) |
2237 | | .build() |
2238 | | .unwrap(); |
2239 | | assert_eq!(10, arr_data.null_count()); |
2240 | | } |
2241 | | |
2242 | | #[test] |
2243 | | fn test_null_buffer_ref() { |
2244 | | let mut bit_v: [u8; 2] = [0; 2]; |
2245 | | bit_util::set_bit(&mut bit_v, 0); |
2246 | | bit_util::set_bit(&mut bit_v, 3); |
2247 | | bit_util::set_bit(&mut bit_v, 10); |
2248 | | let arr_data = ArrayData::builder(DataType::Int32) |
2249 | | .len(16) |
2250 | | .add_buffer(make_i32_buffer(16)) |
2251 | | .null_bit_buffer(Some(Buffer::from(bit_v))) |
2252 | | .build() |
2253 | | .unwrap(); |
2254 | | assert!(arr_data.nulls().is_some()); |
2255 | | assert_eq!(&bit_v, arr_data.nulls().unwrap().validity()); |
2256 | | } |
2257 | | |
2258 | | #[test] |
2259 | | fn test_slice() { |
2260 | | let mut bit_v: [u8; 2] = [0; 2]; |
2261 | | bit_util::set_bit(&mut bit_v, 0); |
2262 | | bit_util::set_bit(&mut bit_v, 3); |
2263 | | bit_util::set_bit(&mut bit_v, 10); |
2264 | | let data = ArrayData::builder(DataType::Int32) |
2265 | | .len(16) |
2266 | | .add_buffer(make_i32_buffer(16)) |
2267 | | .null_bit_buffer(Some(Buffer::from(bit_v))) |
2268 | | .build() |
2269 | | .unwrap(); |
2270 | | let new_data = data.slice(1, 15); |
2271 | | assert_eq!(data.len() - 1, new_data.len()); |
2272 | | assert_eq!(1, new_data.offset()); |
2273 | | assert_eq!(data.null_count(), new_data.null_count()); |
2274 | | |
2275 | | // slice of a slice (removes one null) |
2276 | | let new_data = new_data.slice(1, 14); |
2277 | | assert_eq!(data.len() - 2, new_data.len()); |
2278 | | assert_eq!(2, new_data.offset()); |
2279 | | assert_eq!(data.null_count() - 1, new_data.null_count()); |
2280 | | } |
2281 | | |
2282 | | #[test] |
2283 | | fn test_equality() { |
2284 | | let int_data = ArrayData::builder(DataType::Int32) |
2285 | | .len(1) |
2286 | | .add_buffer(make_i32_buffer(1)) |
2287 | | .build() |
2288 | | .unwrap(); |
2289 | | |
2290 | | let float_data = ArrayData::builder(DataType::Float32) |
2291 | | .len(1) |
2292 | | .add_buffer(make_f32_buffer(1)) |
2293 | | .build() |
2294 | | .unwrap(); |
2295 | | assert_ne!(int_data, float_data); |
2296 | | assert!(!int_data.ptr_eq(&float_data)); |
2297 | | assert!(int_data.ptr_eq(&int_data)); |
2298 | | |
2299 | | #[allow(clippy::redundant_clone)] |
2300 | | let int_data_clone = int_data.clone(); |
2301 | | assert_eq!(int_data, int_data_clone); |
2302 | | assert!(int_data.ptr_eq(&int_data_clone)); |
2303 | | assert!(int_data_clone.ptr_eq(&int_data)); |
2304 | | |
2305 | | let int_data_slice = int_data_clone.slice(1, 0); |
2306 | | assert!(int_data_slice.ptr_eq(&int_data_slice)); |
2307 | | assert!(!int_data.ptr_eq(&int_data_slice)); |
2308 | | assert!(!int_data_slice.ptr_eq(&int_data)); |
2309 | | |
2310 | | let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes()); |
2311 | | let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]); |
2312 | | let string_data = ArrayData::try_new( |
2313 | | DataType::Utf8, |
2314 | | 3, |
2315 | | Some(Buffer::from_iter(vec![true, false, true])), |
2316 | | 0, |
2317 | | vec![offsets_buffer, data_buffer], |
2318 | | vec![], |
2319 | | ) |
2320 | | .unwrap(); |
2321 | | |
2322 | | assert_ne!(float_data, string_data); |
2323 | | assert!(!float_data.ptr_eq(&string_data)); |
2324 | | |
2325 | | assert!(string_data.ptr_eq(&string_data)); |
2326 | | |
2327 | | #[allow(clippy::redundant_clone)] |
2328 | | let string_data_cloned = string_data.clone(); |
2329 | | assert!(string_data_cloned.ptr_eq(&string_data)); |
2330 | | assert!(string_data.ptr_eq(&string_data_cloned)); |
2331 | | |
2332 | | let string_data_slice = string_data.slice(1, 2); |
2333 | | assert!(string_data_slice.ptr_eq(&string_data_slice)); |
2334 | | assert!(!string_data_slice.ptr_eq(&string_data)) |
2335 | | } |
2336 | | |
/// Slicing an array must shrink the reported slice memory size by exactly the
/// bytes that fall outside the slice.
#[test]
fn test_slice_memory_size() {
    // Validity bitmap covering 16 slots with only bits 0, 3 and 10 set.
    let mut validity_bits = [0_u8; 2];
    for bit in [0, 3, 10] {
        bit_util::set_bit(&mut validity_bits, bit);
    }

    let int_data = ArrayData::builder(DataType::Int32)
        .len(16)
        .add_buffer(make_i32_buffer(16))
        .null_bit_buffer(Some(Buffer::from(validity_bits)))
        .build()
        .unwrap();
    let int_slice = int_data.slice(1, 14);

    // Two of the sixteen i32 values are dropped by the slice: 8 bytes less.
    let full_size = int_data.get_slice_memory_size().unwrap();
    let sliced_size = int_slice.get_slice_memory_size().unwrap();
    assert_eq!(full_size - 8, sliced_size);

    let offsets = Buffer::from_slice_ref([0_i32, 2, 2, 5]);
    let values = Buffer::from_slice_ref("abcdef".as_bytes());
    let validity = Buffer::from_iter(vec![true, false, true]);
    let string_data = ArrayData::try_new(
        DataType::Utf8,
        3,
        Some(validity),
        0,
        vec![offsets, values],
        vec![],
    )
    .unwrap();
    let string_slice = string_data.slice(1, 2);

    // Slicing removes 4 bytes of offsets and 2 bytes of string data.
    let full_size = string_data.get_slice_memory_size().unwrap();
    let sliced_size = string_slice.get_slice_memory_size().unwrap();
    assert_eq!(full_size - 6, sliced_size);
}
2372 | | |
/// `count_nulls` must report the number of unset validity bits in the
/// requested `(offset, len)` window.
#[test]
fn test_count_nulls() {
    let bitmap = Buffer::from([0b00010110, 0b10011111]);
    let nulls = NullBuffer::new(BooleanBuffer::new(bitmap, 0, 16));

    // Whole buffer: 9 of the 16 bits are set, leaving 7 nulls.
    assert_eq!(count_nulls(Some(&nulls), 0, 16), 7);

    // Window of 8 bits starting at bit 4: 5 bits set, leaving 3 nulls.
    assert_eq!(count_nulls(Some(&nulls), 4, 8), 3);
}
2383 | | |
/// `contains_nulls` must be true exactly when the requested window holds at
/// least one unset validity bit.
#[test]
fn test_contains_nulls() {
    let validity: Buffer =
        MutableBuffer::from_iter([false, false, false, true, true, false]).into();
    let nulls = NullBuffer::new(BooleanBuffer::new(validity, 0, 6));

    // (offset, len, expected result)
    let cases = [
        (0, 6, true),  // whole buffer
        (0, 3, true),  // leading run of nulls
        (3, 2, false), // only the two valid slots
        (0, 0, false), // empty window
    ];
    for (offset, len, expected) in cases {
        assert_eq!(contains_nulls(Some(&nulls), offset, len), expected);
    }
}
2394 | | |
/// A misaligned values buffer must be rejected by `validate` and repaired by
/// `align_buffers`.
#[test]
fn test_alignment() {
    let aligned = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
    // Skipping a single byte leaves the buffer off the 4-byte i32 boundary.
    let misaligned = aligned.slice(1);

    let mut array = ArrayData {
        data_type: DataType::Int32,
        len: 0,
        offset: 0,
        buffers: vec![aligned],
        child_data: vec![],
        nulls: None,
    };
    array.validate_full().unwrap();

    // Swap in the misaligned buffer; validation must now fail.
    array.buffers[0] = misaligned;
    let expected = "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1";
    let actual = array.validate().unwrap_err().to_string();
    assert_eq!(actual, expected);

    // Realigning makes the array valid again.
    array.align_buffers();
    array.validate_full().unwrap();
}
2422 | | |
/// A misaligned buffer inside a struct's child must be rejected by `validate`
/// on the parent and repaired by `align_buffers` on the parent.
#[test]
fn test_alignment_struct() {
    let aligned = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
    // Skipping a single byte leaves the buffer off the 4-byte i32 boundary.
    let misaligned = aligned.slice(1);

    let int_child = ArrayData {
        data_type: DataType::Int32,
        len: 0,
        offset: 0,
        buffers: vec![aligned],
        child_data: vec![],
        nulls: None,
    };

    let struct_fields = Fields::from(vec![Field::new("a", DataType::Int32, false)]);
    let mut parent = ArrayData {
        data_type: DataType::Struct(struct_fields),
        len: 0,
        offset: 0,
        buffers: vec![],
        child_data: vec![int_child],
        nulls: None,
    };
    parent.validate_full().unwrap();

    // Swap the misaligned buffer into the child; validating the parent must fail.
    parent.child_data[0].buffers[0] = misaligned;
    let expected = "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1";
    let actual = parent.validate().unwrap_err().to_string();
    assert_eq!(actual, expected);

    // Realigning from the parent makes the whole array valid again.
    parent.align_buffers();
    parent.validate_full().unwrap();
}
2460 | | |
/// `ArrayData::new_null` on the view types must yield an array of the
/// requested length in which every slot is null.
#[test]
fn test_null_view_types() {
    let array_len = 32;

    for view_type in [DataType::BinaryView, DataType::Utf8View] {
        let array = ArrayData::new_null(&view_type, array_len);
        assert_eq!(array.len(), array_len);
        assert!((0..array.len()).all(|i| array.is_null(i)));
    }
}
2476 | | } |