/Users/andrewlamb/Software/arrow-rs/arrow-data/src/data.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! Contains [`ArrayData`], a generic representation of Arrow array data which encapsulates |
19 | | //! common attributes and operations for Arrow array. |
20 | | |
21 | | use crate::bit_iterator::BitSliceIterator; |
22 | | use arrow_buffer::buffer::{BooleanBuffer, NullBuffer}; |
23 | | use arrow_buffer::{ |
24 | | bit_util, i256, ArrowNativeType, Buffer, IntervalDayTime, IntervalMonthDayNano, MutableBuffer, |
25 | | }; |
26 | | use arrow_schema::{ArrowError, DataType, UnionMode}; |
27 | | use std::mem; |
28 | | use std::ops::Range; |
29 | | use std::sync::Arc; |
30 | | |
31 | | use crate::{equal, validate_binary_view, validate_string_view}; |
32 | | |
33 | | #[inline] |
34 | 624 | pub(crate) fn contains_nulls( |
35 | 624 | null_bit_buffer: Option<&NullBuffer>, |
36 | 624 | offset: usize, |
37 | 624 | len: usize, |
38 | 624 | ) -> bool { |
39 | 624 | match null_bit_buffer { |
40 | 105 | Some(buffer) => { |
41 | 105 | match BitSliceIterator::new(buffer.validity(), buffer.offset() + offset, len).next() { |
42 | 66 | Some((start, end)) => start != 0 || end != len59 , |
43 | 39 | None => len != 0, // No non-null values |
44 | | } |
45 | | } |
46 | 519 | None => false, // No null buffer |
47 | | } |
48 | 624 | } |
49 | | |
50 | | #[inline] |
51 | 122 | pub(crate) fn count_nulls( |
52 | 122 | null_bit_buffer: Option<&NullBuffer>, |
53 | 122 | offset: usize, |
54 | 122 | len: usize, |
55 | 122 | ) -> usize { |
56 | 122 | if let Some(buf68 ) = null_bit_buffer { |
57 | 68 | let buffer = buf.buffer(); |
58 | 68 | len - buffer.count_set_bits_offset(offset + buf.offset(), len) |
59 | | } else { |
60 | 54 | 0 |
61 | | } |
62 | 122 | } |
63 | | |
64 | | /// creates 2 [`MutableBuffer`]s with a given `capacity` (in slots). |
65 | | #[inline] |
66 | 18 | pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuffer; 2] { |
67 | 18 | let empty_buffer = MutableBuffer::new(0); |
68 | 18 | match data_type { |
69 | 0 | DataType::Null => [empty_buffer, MutableBuffer::new(0)], |
70 | | DataType::Boolean => { |
71 | 0 | let bytes = bit_util::ceil(capacity, 8); |
72 | 0 | let buffer = MutableBuffer::new(bytes); |
73 | 0 | [buffer, empty_buffer] |
74 | | } |
75 | | DataType::UInt8 |
76 | | | DataType::UInt16 |
77 | | | DataType::UInt32 |
78 | | | DataType::UInt64 |
79 | | | DataType::Int8 |
80 | | | DataType::Int16 |
81 | | | DataType::Int32 |
82 | | | DataType::Int64 |
83 | | | DataType::Float16 |
84 | | | DataType::Float32 |
85 | | | DataType::Float64 |
86 | | | DataType::Decimal32(_, _) |
87 | | | DataType::Decimal64(_, _) |
88 | | | DataType::Decimal128(_, _) |
89 | | | DataType::Decimal256(_, _) |
90 | | | DataType::Date32 |
91 | | | DataType::Time32(_) |
92 | | | DataType::Date64 |
93 | | | DataType::Time64(_) |
94 | | | DataType::Duration(_) |
95 | | | DataType::Timestamp(_, _) |
96 | 3 | | DataType::Interval(_) => [ |
97 | 3 | MutableBuffer::new(capacity * data_type.primitive_width().unwrap()), |
98 | 3 | empty_buffer, |
99 | 3 | ], |
100 | | DataType::Utf8 | DataType::Binary => { |
101 | 3 | let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>()); |
102 | | // safety: `unsafe` code assumes that this buffer is initialized with one element |
103 | 3 | buffer.push(0i32); |
104 | 3 | [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())] |
105 | | } |
106 | | DataType::LargeUtf8 | DataType::LargeBinary => { |
107 | 0 | let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>()); |
108 | | // safety: `unsafe` code assumes that this buffer is initialized with one element |
109 | 0 | buffer.push(0i64); |
110 | 0 | [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())] |
111 | | } |
112 | 0 | DataType::BinaryView | DataType::Utf8View => [ |
113 | 0 | MutableBuffer::new(capacity * mem::size_of::<u128>()), |
114 | 0 | empty_buffer, |
115 | 0 | ], |
116 | | DataType::List(_) | DataType::Map(_, _) => { |
117 | | // offset buffer always starts with a zero |
118 | 4 | let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>()); |
119 | 4 | buffer.push(0i32); |
120 | 4 | [buffer, empty_buffer] |
121 | | } |
122 | 0 | DataType::ListView(_) => [ |
123 | 0 | MutableBuffer::new(capacity * mem::size_of::<i32>()), |
124 | 0 | MutableBuffer::new(capacity * mem::size_of::<i32>()), |
125 | 0 | ], |
126 | | DataType::LargeList(_) => { |
127 | | // offset buffer always starts with a zero |
128 | 0 | let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>()); |
129 | 0 | buffer.push(0i64); |
130 | 0 | [buffer, empty_buffer] |
131 | | } |
132 | 0 | DataType::LargeListView(_) => [ |
133 | 0 | MutableBuffer::new(capacity * mem::size_of::<i64>()), |
134 | 0 | MutableBuffer::new(capacity * mem::size_of::<i64>()), |
135 | 0 | ], |
136 | 3 | DataType::FixedSizeBinary(size) => { |
137 | 3 | [MutableBuffer::new(capacity * *size as usize), empty_buffer] |
138 | | } |
139 | 0 | DataType::Dictionary(k, _) => [ |
140 | 0 | MutableBuffer::new(capacity * k.primitive_width().unwrap()), |
141 | 0 | empty_buffer, |
142 | 0 | ], |
143 | | DataType::FixedSizeList(_, _) | DataType::Struct(_) | DataType::RunEndEncoded(_, _) => { |
144 | 5 | [empty_buffer, MutableBuffer::new(0)] |
145 | | } |
146 | 0 | DataType::Union(_, mode) => { |
147 | 0 | let type_ids = MutableBuffer::new(capacity * mem::size_of::<i8>()); |
148 | 0 | match mode { |
149 | 0 | UnionMode::Sparse => [type_ids, empty_buffer], |
150 | | UnionMode::Dense => { |
151 | 0 | let offsets = MutableBuffer::new(capacity * mem::size_of::<i32>()); |
152 | 0 | [type_ids, offsets] |
153 | | } |
154 | | } |
155 | | } |
156 | | } |
157 | 18 | } |
158 | | |
159 | | /// A generic representation of Arrow array data which encapsulates common attributes |
160 | | /// and operations for Arrow array. |
161 | | /// |
162 | | /// Specific operations for different arrays types (e.g., primitive, list, struct) |
163 | | /// are implemented in `Array`. |
164 | | /// |
165 | | /// # Memory Layout |
166 | | /// |
167 | | /// `ArrayData` has references to one or more underlying data buffers |
168 | | /// and optional child ArrayData, depending on type as illustrated |
169 | | /// below. Bitmaps are not shown for simplicity but they are stored |
170 | | /// similarly to the buffers. |
171 | | /// |
172 | | /// ```text |
173 | | /// offset |
174 | | /// points to |
175 | | /// ┌───────────────────┐ start of ┌───────┐ Different |
176 | | /// │ │ data │ │ ArrayData may |
177 | | /// │ArrayData { │ │.... │ also refers to |
178 | | /// │ data_type: ... │ ─ ─ ─ ─▶│1234 │ ┌ ─ the same |
179 | | /// │ offset: ... ─ ─ ─│─ ┘ │4372 │ underlying |
180 | | /// │ len: ... ─ ─ ─│─ ┐ │4888 │ │ buffer with different offset/len |
181 | | /// │ buffers: [ │ │5882 │◀─ |
182 | | /// │ ... │ │ │4323 │ |
183 | | /// │ ] │ ─ ─ ─ ─▶│4859 │ |
184 | | /// │ child_data: [ │ │.... │ |
185 | | /// │ ... │ │ │ |
186 | | /// │ ] │ └───────┘ |
187 | | /// │} │ |
188 | | /// │ │ Shared Buffer uses |
189 | | /// │ │ │ bytes::Bytes to hold |
190 | | /// └───────────────────┘ actual data values |
191 | | /// ┌ ─ ─ ┘ |
192 | | /// |
193 | | /// ▼ |
194 | | /// ┌───────────────────┐ |
195 | | /// │ArrayData { │ |
196 | | /// │ ... │ |
197 | | /// │} │ |
198 | | /// │ │ |
199 | | /// └───────────────────┘ |
200 | | /// |
201 | | /// Child ArrayData may also have its own buffers and children |
202 | | /// ``` |
203 | | |
204 | | #[derive(Debug, Clone)] |
205 | | pub struct ArrayData { |
206 | | /// The data type |
207 | | data_type: DataType, |
208 | | |
209 | | /// The number of elements |
210 | | len: usize, |
211 | | |
212 | | /// The offset in number of items (not bytes). |
213 | | /// |
214 | | /// The offset applies to [`Self::child_data`] and [`Self::buffers`]. It |
215 | | /// does NOT apply to [`Self::nulls`]. |
216 | | offset: usize, |
217 | | |
218 | | /// The buffers that store the actual data for this array, as defined |
219 | | /// in the [Arrow Spec]. |
220 | | /// |
221 | | /// Depending on the array types, [`Self::buffers`] can hold different |
222 | | /// kinds of buffers (e.g., value buffer, value offset buffer) at different |
223 | | /// positions. |
224 | | /// |
225 | | /// The buffer may be larger than needed. Some items at the beginning may be skipped if |
226 | | /// there is an `offset`. Some items at the end may be skipped if the buffer is longer than |
227 | | /// we need to satisfy `len`. |
228 | | /// |
229 | | /// [Arrow Spec](https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout) |
230 | | buffers: Vec<Buffer>, |
231 | | |
232 | | /// The child(ren) of this array. |
233 | | /// |
234 | | /// Only non-empty for nested types, such as `ListArray` and |
235 | | /// `StructArray`. |
236 | | /// |
237 | | /// The first logical element in each child element begins at `offset`. |
238 | | /// |
239 | | /// If the child element also has an offset then these offsets are |
240 | | /// cumulative. |
241 | | child_data: Vec<ArrayData>, |
242 | | |
243 | | /// The null bitmap. |
244 | | /// |
245 | | /// `None` indicates all values are non-null in this array. |
246 | | /// |
247 | | /// [`Self::offset]` does not apply to the null bitmap. While the |
248 | | /// BooleanBuffer may be sliced (have its own offset) internally, this |
249 | | /// `NullBuffer` always represents exactly `len` elements. |
250 | | nulls: Option<NullBuffer>, |
251 | | } |
252 | | |
253 | | /// A thread-safe, shared reference to the Arrow array data. |
254 | | pub type ArrayDataRef = Arc<ArrayData>; |
255 | | |
256 | | impl ArrayData { |
257 | | /// Create a new ArrayData instance; |
258 | | /// |
259 | | /// If `null_count` is not specified, the number of nulls in |
260 | | /// null_bit_buffer is calculated. |
261 | | /// |
262 | | /// If the number of nulls is 0 then the null_bit_buffer |
263 | | /// is set to `None`. |
264 | | /// |
265 | | /// # Safety |
266 | | /// |
267 | | /// The input values *must* form a valid Arrow array for |
268 | | /// `data_type`, or undefined behavior can result. |
269 | | /// |
270 | | /// Note: This is a low level API and most users of the arrow |
271 | | /// crate should create arrays using the methods in the `array` |
272 | | /// module. |
273 | 34 | pub unsafe fn new_unchecked( |
274 | 34 | data_type: DataType, |
275 | 34 | len: usize, |
276 | 34 | null_count: Option<usize>, |
277 | 34 | null_bit_buffer: Option<Buffer>, |
278 | 34 | offset: usize, |
279 | 34 | buffers: Vec<Buffer>, |
280 | 34 | child_data: Vec<ArrayData>, |
281 | 34 | ) -> Self { |
282 | 34 | let mut skip_validation = UnsafeFlag::new(); |
283 | | // SAFETY: caller responsible for ensuring data is valid |
284 | 34 | skip_validation.set(true); |
285 | | |
286 | 34 | ArrayDataBuilder { |
287 | 34 | data_type, |
288 | 34 | len, |
289 | 34 | null_count, |
290 | 34 | null_bit_buffer, |
291 | 34 | nulls: None, |
292 | 34 | offset, |
293 | 34 | buffers, |
294 | 34 | child_data, |
295 | 34 | align_buffers: false, |
296 | 34 | skip_validation, |
297 | 34 | } |
298 | 34 | .build() |
299 | 34 | .unwrap() |
300 | 34 | } |
301 | | |
302 | | /// Create a new ArrayData, validating that the provided buffers form a valid |
303 | | /// Arrow array of the specified data type. |
304 | | /// |
305 | | /// If the number of nulls in `null_bit_buffer` is 0 then the null_bit_buffer |
306 | | /// is set to `None`. |
307 | | /// |
308 | | /// Internally this calls through to [`Self::validate_data`] |
309 | | /// |
310 | | /// Note: This is a low level API and most users of the arrow crate should create |
311 | | /// arrays using the builders found in [arrow_array](https://docs.rs/arrow-array) |
312 | 0 | pub fn try_new( |
313 | 0 | data_type: DataType, |
314 | 0 | len: usize, |
315 | 0 | null_bit_buffer: Option<Buffer>, |
316 | 0 | offset: usize, |
317 | 0 | buffers: Vec<Buffer>, |
318 | 0 | child_data: Vec<ArrayData>, |
319 | 0 | ) -> Result<Self, ArrowError> { |
320 | | // we must check the length of `null_bit_buffer` first |
321 | | // because we use this buffer to calculate `null_count` |
322 | | // in `Self::new_unchecked`. |
323 | 0 | if let Some(null_bit_buffer) = null_bit_buffer.as_ref() { |
324 | 0 | let needed_len = bit_util::ceil(len + offset, 8); |
325 | 0 | if null_bit_buffer.len() < needed_len { |
326 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
327 | 0 | "null_bit_buffer size too small. got {} needed {}", |
328 | 0 | null_bit_buffer.len(), |
329 | 0 | needed_len |
330 | 0 | ))); |
331 | 0 | } |
332 | 0 | } |
333 | | // Safety justification: `validate_full` is called below |
334 | 0 | let new_self = unsafe { |
335 | 0 | Self::new_unchecked( |
336 | 0 | data_type, |
337 | 0 | len, |
338 | 0 | None, |
339 | 0 | null_bit_buffer, |
340 | 0 | offset, |
341 | 0 | buffers, |
342 | 0 | child_data, |
343 | | ) |
344 | | }; |
345 | | |
346 | | // As the data is not trusted, do a full validation of its contents |
347 | | // We don't need to validate children as we can assume that the |
348 | | // [`ArrayData`] in `child_data` have already been validated through |
349 | | // a call to `ArrayData::try_new` or created using unsafe |
350 | 0 | new_self.validate_data()?; |
351 | 0 | Ok(new_self) |
352 | 0 | } |
353 | | |
354 | | /// Returns a builder to construct a [`ArrayData`] instance of the same [`DataType`] |
355 | | #[inline] |
356 | 162 | pub const fn builder(data_type: DataType) -> ArrayDataBuilder { |
357 | 162 | ArrayDataBuilder::new(data_type) |
358 | 162 | } |
359 | | |
360 | | /// Returns a reference to the [`DataType`] of this [`ArrayData`] |
361 | | #[inline] |
362 | 2.22k | pub const fn data_type(&self) -> &DataType { |
363 | 2.22k | &self.data_type |
364 | 2.22k | } |
365 | | |
366 | | /// Returns the [`Buffer`] storing data for this [`ArrayData`] |
367 | 2.17k | pub fn buffers(&self) -> &[Buffer] { |
368 | 2.17k | &self.buffers |
369 | 2.17k | } |
370 | | |
371 | | /// Returns a slice of children [`ArrayData`]. This will be non |
372 | | /// empty for type such as lists and structs. |
373 | 369 | pub fn child_data(&self) -> &[ArrayData] { |
374 | 369 | &self.child_data[..] |
375 | 369 | } |
376 | | |
377 | | /// Returns whether the element at index `i` is null |
378 | | #[inline] |
379 | 0 | pub fn is_null(&self, i: usize) -> bool { |
380 | 0 | match &self.nulls { |
381 | 0 | Some(v) => v.is_null(i), |
382 | 0 | None => false, |
383 | | } |
384 | 0 | } |
385 | | |
386 | | /// Returns a reference to the null buffer of this [`ArrayData`] if any |
387 | | /// |
388 | | /// Note: [`ArrayData::offset`] does NOT apply to the returned [`NullBuffer`] |
389 | | #[inline] |
390 | 2.65k | pub fn nulls(&self) -> Option<&NullBuffer> { |
391 | 2.65k | self.nulls.as_ref() |
392 | 2.65k | } |
393 | | |
394 | | /// Returns whether the element at index `i` is not null |
395 | | #[inline] |
396 | | pub fn is_valid(&self, i: usize) -> bool { |
397 | | !self.is_null(i) |
398 | | } |
399 | | |
400 | | /// Returns the length (i.e., number of elements) of this [`ArrayData`]. |
401 | | #[inline] |
402 | 2.10k | pub const fn len(&self) -> usize { |
403 | 2.10k | self.len |
404 | 2.10k | } |
405 | | |
406 | | /// Returns whether this [`ArrayData`] is empty |
407 | | #[inline] |
408 | 53 | pub const fn is_empty(&self) -> bool { |
409 | 53 | self.len == 0 |
410 | 53 | } |
411 | | |
412 | | /// Returns the offset of this [`ArrayData`] |
413 | | #[inline] |
414 | 1.15k | pub const fn offset(&self) -> usize { |
415 | 1.15k | self.offset |
416 | 1.15k | } |
417 | | |
418 | | /// Returns the total number of nulls in this array |
419 | | #[inline] |
420 | 943 | pub fn null_count(&self) -> usize { |
421 | 943 | self.nulls |
422 | 943 | .as_ref() |
423 | 943 | .map(|x| x107 .null_count107 ()) |
424 | 943 | .unwrap_or_default() |
425 | 943 | } |
426 | | |
427 | | /// Returns the total number of bytes of memory occupied by the |
428 | | /// buffers owned by this [`ArrayData`] and all of its |
429 | | /// children. (See also diagram on [`ArrayData`]). |
430 | | /// |
431 | | /// Note that this [`ArrayData`] may only refer to a subset of the |
432 | | /// data in the underlying [`Buffer`]s (due to `offset` and |
433 | | /// `length`), but the size returned includes the entire size of |
434 | | /// the buffers. |
435 | | /// |
436 | | /// If multiple [`ArrayData`]s refer to the same underlying |
437 | | /// [`Buffer`]s they will both report the same size. |
438 | 0 | pub fn get_buffer_memory_size(&self) -> usize { |
439 | 0 | let mut size = 0; |
440 | 0 | for buffer in &self.buffers { |
441 | 0 | size += buffer.capacity(); |
442 | 0 | } |
443 | 0 | if let Some(bitmap) = &self.nulls { |
444 | 0 | size += bitmap.buffer().capacity() |
445 | 0 | } |
446 | 0 | for child in &self.child_data { |
447 | 0 | size += child.get_buffer_memory_size(); |
448 | 0 | } |
449 | 0 | size |
450 | 0 | } |
451 | | |
452 | | /// Returns the total number of the bytes of memory occupied by |
453 | | /// the buffers by this slice of [`ArrayData`] (See also diagram on [`ArrayData`]). |
454 | | /// |
455 | | /// This is approximately the number of bytes if a new |
456 | | /// [`ArrayData`] was formed by creating new [`Buffer`]s with |
457 | | /// exactly the data needed. |
458 | | /// |
459 | | /// For example, a [`DataType::Int64`] with `100` elements, |
460 | | /// [`Self::get_slice_memory_size`] would return `100 * 8 = 800`. If |
461 | | /// the [`ArrayData`] was then [`Self::slice`]ed to refer to its |
462 | | /// first `20` elements, then [`Self::get_slice_memory_size`] on the |
463 | | /// sliced [`ArrayData`] would return `20 * 8 = 160`. |
464 | 0 | pub fn get_slice_memory_size(&self) -> Result<usize, ArrowError> { |
465 | 0 | let mut result: usize = 0; |
466 | 0 | let layout = layout(&self.data_type); |
467 | | |
468 | 0 | for spec in layout.buffers.iter() { |
469 | 0 | match spec { |
470 | 0 | BufferSpec::FixedWidth { byte_width, .. } => { |
471 | 0 | let buffer_size = self.len.checked_mul(*byte_width).ok_or_else(|| { |
472 | 0 | ArrowError::ComputeError( |
473 | 0 | "Integer overflow computing buffer size".to_string(), |
474 | 0 | ) |
475 | 0 | })?; |
476 | 0 | result += buffer_size; |
477 | | } |
478 | | BufferSpec::VariableWidth => { |
479 | | let buffer_len: usize; |
480 | 0 | match self.data_type { |
481 | | DataType::Utf8 | DataType::Binary => { |
482 | 0 | let offsets = self.typed_offsets::<i32>()?; |
483 | 0 | buffer_len = (offsets[self.len] - offsets[0] ) as usize; |
484 | | } |
485 | | DataType::LargeUtf8 | DataType::LargeBinary => { |
486 | 0 | let offsets = self.typed_offsets::<i64>()?; |
487 | 0 | buffer_len = (offsets[self.len] - offsets[0]) as usize; |
488 | | } |
489 | | _ => { |
490 | 0 | return Err(ArrowError::NotYetImplemented(format!( |
491 | 0 | "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}", |
492 | 0 | self.data_type |
493 | 0 | ))) |
494 | | } |
495 | | }; |
496 | 0 | result += buffer_len; |
497 | | } |
498 | 0 | BufferSpec::BitMap => { |
499 | 0 | let buffer_size = bit_util::ceil(self.len, 8); |
500 | 0 | result += buffer_size; |
501 | 0 | } |
502 | 0 | BufferSpec::AlwaysNull => { |
503 | 0 | // Nothing to do |
504 | 0 | } |
505 | | } |
506 | | } |
507 | | |
508 | 0 | if self.nulls().is_some() { |
509 | 0 | result += bit_util::ceil(self.len, 8); |
510 | 0 | } |
511 | | |
512 | 0 | for child in &self.child_data { |
513 | 0 | result += child.get_slice_memory_size()?; |
514 | | } |
515 | 0 | Ok(result) |
516 | 0 | } |
517 | | |
518 | | /// Returns the total number of bytes of memory occupied |
519 | | /// physically by this [`ArrayData`] and all its [`Buffer`]s and |
520 | | /// children. (See also diagram on [`ArrayData`]). |
521 | | /// |
522 | | /// Equivalent to: |
523 | | /// `size_of_val(self)` + |
524 | | /// [`Self::get_buffer_memory_size`] + |
525 | | /// `size_of_val(child)` for all children |
526 | 0 | pub fn get_array_memory_size(&self) -> usize { |
527 | 0 | let mut size = mem::size_of_val(self); |
528 | | |
529 | | // Calculate rest of the fields top down which contain actual data |
530 | 0 | for buffer in &self.buffers { |
531 | 0 | size += mem::size_of::<Buffer>(); |
532 | 0 | size += buffer.capacity(); |
533 | 0 | } |
534 | 0 | if let Some(nulls) = &self.nulls { |
535 | 0 | size += nulls.buffer().capacity(); |
536 | 0 | } |
537 | 0 | for child in &self.child_data { |
538 | 0 | size += child.get_array_memory_size(); |
539 | 0 | } |
540 | | |
541 | 0 | size |
542 | 0 | } |
543 | | |
544 | | /// Creates a zero-copy slice of itself. This creates a new |
545 | | /// [`ArrayData`] pointing at the same underlying [`Buffer`]s with a |
546 | | /// different offset and len |
547 | | /// |
548 | | /// # Panics |
549 | | /// |
550 | | /// Panics if `offset + length > self.len()`. |
551 | 0 | pub fn slice(&self, offset: usize, length: usize) -> ArrayData { |
552 | 0 | assert!((offset + length) <= self.len()); |
553 | | |
554 | 0 | if let DataType::Struct(_) = self.data_type() { |
555 | | // Slice into children |
556 | 0 | let new_offset = self.offset + offset; |
557 | 0 | let new_data = ArrayData { |
558 | 0 | data_type: self.data_type().clone(), |
559 | 0 | len: length, |
560 | 0 | offset: new_offset, |
561 | 0 | buffers: self.buffers.clone(), |
562 | | // Slice child data, to propagate offsets down to them |
563 | 0 | child_data: self |
564 | 0 | .child_data() |
565 | 0 | .iter() |
566 | 0 | .map(|data| data.slice(offset, length)) |
567 | 0 | .collect(), |
568 | 0 | nulls: self.nulls.as_ref().map(|x| x.slice(offset, length)), |
569 | | }; |
570 | | |
571 | 0 | new_data |
572 | | } else { |
573 | 0 | let mut new_data = self.clone(); |
574 | | |
575 | 0 | new_data.len = length; |
576 | 0 | new_data.offset = offset + self.offset; |
577 | 0 | new_data.nulls = self.nulls.as_ref().map(|x| x.slice(offset, length)); |
578 | | |
579 | 0 | new_data |
580 | | } |
581 | 0 | } |
582 | | |
583 | | /// Returns the `buffer` as a slice of type `T` starting at self.offset |
584 | | /// |
585 | | /// # Panics |
586 | | /// This function panics if: |
587 | | /// * the buffer is not byte-aligned with type T, or |
588 | | /// * the datatype is `Boolean` (it corresponds to a bit-packed buffer where the offset is not applicable) |
589 | 496 | pub fn buffer<T: ArrowNativeType>(&self, buffer: usize) -> &[T] { |
590 | 496 | &self.buffers()[buffer].typed_data()[self.offset..] |
591 | 496 | } |
592 | | |
593 | | /// Returns a new [`ArrayData`] valid for `data_type` containing `len` null values |
594 | 0 | pub fn new_null(data_type: &DataType, len: usize) -> Self { |
595 | 0 | let bit_len = bit_util::ceil(len, 8); |
596 | 0 | let zeroed = |len: usize| Buffer::from(MutableBuffer::from_len_zeroed(len)); |
597 | | |
598 | 0 | let (buffers, child_data, has_nulls) = match data_type.primitive_width() { |
599 | 0 | Some(width) => (vec![zeroed(width * len)], vec![], true), |
600 | 0 | None => match data_type { |
601 | 0 | DataType::Null => (vec![], vec![], false), |
602 | 0 | DataType::Boolean => (vec![zeroed(bit_len)], vec![], true), |
603 | | DataType::Binary | DataType::Utf8 => { |
604 | 0 | (vec![zeroed((len + 1) * 4), zeroed(0)], vec![], true) |
605 | | } |
606 | 0 | DataType::BinaryView | DataType::Utf8View => (vec![zeroed(len * 16)], vec![], true), |
607 | | DataType::LargeBinary | DataType::LargeUtf8 => { |
608 | 0 | (vec![zeroed((len + 1) * 8), zeroed(0)], vec![], true) |
609 | | } |
610 | 0 | DataType::FixedSizeBinary(i) => (vec![zeroed(*i as usize * len)], vec![], true), |
611 | 0 | DataType::List(f) | DataType::Map(f, _) => ( |
612 | 0 | vec![zeroed((len + 1) * 4)], |
613 | 0 | vec![ArrayData::new_empty(f.data_type())], |
614 | 0 | true, |
615 | 0 | ), |
616 | 0 | DataType::LargeList(f) => ( |
617 | 0 | vec![zeroed((len + 1) * 8)], |
618 | 0 | vec![ArrayData::new_empty(f.data_type())], |
619 | 0 | true, |
620 | 0 | ), |
621 | 0 | DataType::FixedSizeList(f, list_len) => ( |
622 | 0 | vec![], |
623 | 0 | vec![ArrayData::new_null(f.data_type(), *list_len as usize * len)], |
624 | 0 | true, |
625 | 0 | ), |
626 | 0 | DataType::Struct(fields) => ( |
627 | 0 | vec![], |
628 | 0 | fields |
629 | 0 | .iter() |
630 | 0 | .map(|f| Self::new_null(f.data_type(), len)) |
631 | 0 | .collect(), |
632 | | true, |
633 | | ), |
634 | 0 | DataType::Dictionary(k, v) => ( |
635 | 0 | vec![zeroed(k.primitive_width().unwrap() * len)], |
636 | 0 | vec![ArrayData::new_empty(v.as_ref())], |
637 | 0 | true, |
638 | 0 | ), |
639 | 0 | DataType::Union(f, mode) => { |
640 | 0 | let (id, _) = f.iter().next().unwrap(); |
641 | 0 | let ids = Buffer::from_iter(std::iter::repeat_n(id, len)); |
642 | 0 | let buffers = match mode { |
643 | 0 | UnionMode::Sparse => vec![ids], |
644 | | UnionMode::Dense => { |
645 | 0 | let end_offset = i32::from_usize(len).unwrap(); |
646 | 0 | vec![ids, Buffer::from_iter(0_i32..end_offset)] |
647 | | } |
648 | | }; |
649 | | |
650 | 0 | let children = f |
651 | 0 | .iter() |
652 | 0 | .enumerate() |
653 | 0 | .map(|(idx, (_, f))| { |
654 | 0 | if idx == 0 || *mode == UnionMode::Sparse { |
655 | 0 | Self::new_null(f.data_type(), len) |
656 | | } else { |
657 | 0 | Self::new_empty(f.data_type()) |
658 | | } |
659 | 0 | }) |
660 | 0 | .collect(); |
661 | | |
662 | 0 | (buffers, children, false) |
663 | | } |
664 | 0 | DataType::RunEndEncoded(r, v) => { |
665 | 0 | let runs = match r.data_type() { |
666 | | DataType::Int16 => { |
667 | 0 | let i = i16::from_usize(len).expect("run overflow"); |
668 | 0 | Buffer::from_slice_ref([i]) |
669 | | } |
670 | | DataType::Int32 => { |
671 | 0 | let i = i32::from_usize(len).expect("run overflow"); |
672 | 0 | Buffer::from_slice_ref([i]) |
673 | | } |
674 | | DataType::Int64 => { |
675 | 0 | let i = i64::from_usize(len).expect("run overflow"); |
676 | 0 | Buffer::from_slice_ref([i]) |
677 | | } |
678 | 0 | dt => unreachable!("Invalid run ends data type {dt}"), |
679 | | }; |
680 | | |
681 | 0 | let builder = ArrayData::builder(r.data_type().clone()) |
682 | 0 | .len(1) |
683 | 0 | .buffers(vec![runs]); |
684 | | |
685 | | // SAFETY: |
686 | | // Valid by construction |
687 | 0 | let runs = unsafe { builder.build_unchecked() }; |
688 | 0 | ( |
689 | 0 | vec![], |
690 | 0 | vec![runs, ArrayData::new_null(v.data_type(), 1)], |
691 | 0 | false, |
692 | 0 | ) |
693 | | } |
694 | 0 | d => unreachable!("{d}"), |
695 | | }, |
696 | | }; |
697 | | |
698 | 0 | let mut builder = ArrayDataBuilder::new(data_type.clone()) |
699 | 0 | .len(len) |
700 | 0 | .buffers(buffers) |
701 | 0 | .child_data(child_data); |
702 | | |
703 | 0 | if has_nulls { |
704 | 0 | builder = builder.nulls(Some(NullBuffer::new_null(len))) |
705 | 0 | } |
706 | | |
707 | | // SAFETY: |
708 | | // Data valid by construction |
709 | 0 | unsafe { builder.build_unchecked() } |
710 | 0 | } |
711 | | |
712 | | /// Returns a new empty [ArrayData] valid for `data_type`. |
713 | 0 | pub fn new_empty(data_type: &DataType) -> Self { |
714 | 0 | Self::new_null(data_type, 0) |
715 | 0 | } |
716 | | |
717 | | /// Verifies that the buffers meet the minimum alignment requirements for the data type |
718 | | /// |
719 | | /// Buffers that are not adequately aligned will be copied to a new aligned allocation |
720 | | /// |
721 | | /// This can be useful for when interacting with data sent over IPC or FFI, that may |
722 | | /// not meet the minimum alignment requirements |
723 | | /// |
724 | | /// This also aligns buffers of children data |
725 | 0 | pub fn align_buffers(&mut self) { |
726 | 0 | let layout = layout(&self.data_type); |
727 | 0 | for (buffer, spec) in self.buffers.iter_mut().zip(&layout.buffers) { |
728 | 0 | if let BufferSpec::FixedWidth { alignment, .. } = spec { |
729 | 0 | if buffer.as_ptr().align_offset(*alignment) != 0 { |
730 | 0 | *buffer = Buffer::from_slice_ref(buffer.as_ref()); |
731 | 0 | } |
732 | 0 | } |
733 | | } |
734 | | // align children data recursively |
735 | 0 | for data in self.child_data.iter_mut() { |
736 | 0 | data.align_buffers() |
737 | | } |
738 | 0 | } |
739 | | |
740 | | /// "cheap" validation of an `ArrayData`. Ensures buffers are |
741 | | /// sufficiently sized to store `len` + `offset` total elements of |
742 | | /// `data_type` and performs other inexpensive consistency checks. |
743 | | /// |
744 | | /// This check is "cheap" in the sense that it does not validate the |
745 | | /// contents of the buffers (e.g. that all offsets for UTF8 arrays |
746 | | /// are within the bounds of the values buffer). |
747 | | /// |
748 | | /// See [ArrayData::validate_data] to validate fully the offset content |
749 | | /// and the validity of utf8 data |
750 | 25 | pub fn validate(&self) -> Result<(), ArrowError> { |
751 | | // Need at least this mich space in each buffer |
752 | 25 | let len_plus_offset = self.len + self.offset; |
753 | | |
754 | | // Check that the data layout conforms to the spec |
755 | 25 | let layout = layout(&self.data_type); |
756 | | |
757 | 25 | if !layout.can_contain_null_mask && self.nulls0 .is_some0 () { |
758 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
759 | 0 | "Arrays of type {:?} cannot contain a null bitmask", |
760 | 0 | self.data_type, |
761 | 0 | ))); |
762 | 25 | } |
763 | | |
764 | | // Check data buffers length for view types and other types |
765 | 25 | if self.buffers.len() < layout.buffers.len() |
766 | 25 | || (!layout.variadic && self.buffers.len() != layout.buffers.len()) |
767 | | { |
768 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
769 | 0 | "Expected {} buffers in array of type {:?}, got {}", |
770 | 0 | layout.buffers.len(), |
771 | 0 | self.data_type, |
772 | 0 | self.buffers.len(), |
773 | 0 | ))); |
774 | 25 | } |
775 | | |
776 | 26 | for (i, (buffer, spec)) in self.buffers.iter()25 .zip25 (layout.buffers.iter()25 ).enumerate25 () { |
777 | 26 | match spec { |
778 | | BufferSpec::FixedWidth { |
779 | 19 | byte_width, |
780 | 19 | alignment, |
781 | | } => { |
782 | 19 | let min_buffer_size = len_plus_offset.saturating_mul(*byte_width); |
783 | | |
784 | 19 | if buffer.len() < min_buffer_size { |
785 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
786 | 0 | "Need at least {} bytes in buffers[{}] in array of type {:?}, but got {}", |
787 | 0 | min_buffer_size, i, self.data_type, buffer.len() |
788 | 0 | ))); |
789 | 19 | } |
790 | | |
791 | 19 | let align_offset = buffer.as_ptr().align_offset(*alignment); |
792 | 19 | if align_offset != 0 { |
793 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
794 | 0 | "Misaligned buffers[{i}] in array of type {:?}, offset from expected alignment of {alignment} by {}", |
795 | 0 | self.data_type, align_offset.min(alignment - align_offset) |
796 | 0 | ))); |
797 | 19 | } |
798 | | } |
799 | 6 | BufferSpec::VariableWidth => { |
800 | 6 | // not cheap to validate (need to look at the |
801 | 6 | // data). Partially checked in validate_offsets |
802 | 6 | // called below. Can check with `validate_full` |
803 | 6 | } |
804 | | BufferSpec::BitMap => { |
805 | 1 | let min_buffer_size = bit_util::ceil(len_plus_offset, 8); |
806 | 1 | if buffer.len() < min_buffer_size { |
807 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
808 | 0 | "Need at least {} bytes for bitmap in buffers[{}] in array of type {:?}, but got {}", |
809 | 0 | min_buffer_size, i, self.data_type, buffer.len() |
810 | 0 | ))); |
811 | 1 | } |
812 | | } |
813 | 0 | BufferSpec::AlwaysNull => { |
814 | 0 | // Nothing to validate |
815 | 0 | } |
816 | | } |
817 | | } |
818 | | |
819 | | // check null bit buffer size |
820 | 25 | if let Some(nulls9 ) = self.nulls() { |
821 | 9 | if nulls.null_count() > self.len { |
822 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
823 | 0 | "null_count {} for an array exceeds length of {} elements", |
824 | 0 | nulls.null_count(), |
825 | 0 | self.len |
826 | 0 | ))); |
827 | 9 | } |
828 | | |
829 | 9 | let actual_len = nulls.validity().len(); |
830 | 9 | let needed_len = bit_util::ceil(len_plus_offset, 8); |
831 | 9 | if actual_len < needed_len { |
832 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
833 | 0 | "null_bit_buffer size too small. got {actual_len} needed {needed_len}", |
834 | 0 | ))); |
835 | 9 | } |
836 | | |
837 | 9 | if nulls.len() != self.len { |
838 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
839 | 0 | "null buffer incorrect size. got {} expected {}", |
840 | 0 | nulls.len(), |
841 | 0 | self.len |
842 | 0 | ))); |
843 | 9 | } |
844 | 16 | } |
845 | | |
846 | 25 | self.validate_child_data()?0 ; |
847 | | |
848 | | // Additional Type specific checks |
849 | 25 | match &self.data_type { |
850 | | DataType::Utf8 | DataType::Binary => { |
851 | 6 | self.validate_offsets::<i32>(self.buffers[1].len())?0 ; |
852 | | } |
853 | | DataType::LargeUtf8 | DataType::LargeBinary => { |
854 | 0 | self.validate_offsets::<i64>(self.buffers[1].len())?; |
855 | | } |
856 | 0 | DataType::Dictionary(key_type, _value_type) => { |
857 | | // At the moment, constructing a DictionaryArray will also check this |
858 | 0 | if !DataType::is_dictionary_key_type(key_type) { |
859 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
860 | 0 | "Dictionary key type must be integer, but was {key_type}" |
861 | 0 | ))); |
862 | 0 | } |
863 | | } |
864 | 0 | DataType::RunEndEncoded(run_ends_type, _) => { |
865 | 0 | if run_ends_type.is_nullable() { |
866 | 0 | return Err(ArrowError::InvalidArgumentError( |
867 | 0 | "The nullable should be set to false for the field defining run_ends array.".to_string() |
868 | 0 | )); |
869 | 0 | } |
870 | 0 | if !DataType::is_run_ends_type(run_ends_type.data_type()) { |
871 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
872 | 0 | "RunArray run_ends types must be Int16, Int32 or Int64, but was {}", |
873 | 0 | run_ends_type.data_type() |
874 | 0 | ))); |
875 | 0 | } |
876 | | } |
877 | 19 | _ => {} |
878 | | }; |
879 | | |
880 | 25 | Ok(()) |
881 | 25 | } |
882 | | |
883 | | /// Returns a reference to the data in `buffer` as a typed slice |
884 | | /// (typically `&[i32]` or `&[i64]`) after validating. The |
885 | | /// returned slice is guaranteed to have at least `self.len + 1` |
886 | | /// entries. |
887 | | /// |
888 | | /// For an empty array, the `buffer` can also be empty. |
889 | 20 | fn typed_offsets<T: ArrowNativeType + num::Num>(&self) -> Result<&[T], ArrowError> { |
890 | | // An empty list-like array can have 0 offsets |
891 | 20 | if self.len == 0 && self.buffers[0]0 .is_empty0 () { |
892 | 0 | return Ok(&[]); |
893 | 20 | } |
894 | | |
895 | 20 | self.typed_buffer(0, self.len + 1) |
896 | 20 | } |
897 | | |
898 | | /// Returns a reference to the data in `buffers[idx]` as a typed slice after validating |
899 | 20 | fn typed_buffer<T: ArrowNativeType + num::Num>( |
900 | 20 | &self, |
901 | 20 | idx: usize, |
902 | 20 | len: usize, |
903 | 20 | ) -> Result<&[T], ArrowError> { |
904 | 20 | let buffer = &self.buffers[idx]; |
905 | | |
906 | 20 | let required_len = (len + self.offset) * mem::size_of::<T>(); |
907 | | |
908 | 20 | if buffer.len() < required_len { |
909 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
910 | 0 | "Buffer {} of {} isn't large enough. Expected {} bytes got {}", |
911 | 0 | idx, |
912 | 0 | self.data_type, |
913 | 0 | required_len, |
914 | 0 | buffer.len() |
915 | 0 | ))); |
916 | 20 | } |
917 | | |
918 | 20 | Ok(&buffer.typed_data::<T>()[self.offset..self.offset + len]) |
919 | 20 | } |
920 | | |
921 | | /// Does a cheap sanity check that the `self.len` values in `buffer` are valid |
922 | | /// offsets (of type T) into some other buffer of `values_length` bytes long |
923 | 15 | fn validate_offsets<T: ArrowNativeType + num::Num + std::fmt::Display>( |
924 | 15 | &self, |
925 | 15 | values_length: usize, |
926 | 15 | ) -> Result<(), ArrowError> { |
927 | | // Justification: buffer size was validated above |
928 | 15 | let offsets = self.typed_offsets::<T>()?0 ; |
929 | 15 | if offsets.is_empty() { |
930 | 0 | return Ok(()); |
931 | 15 | } |
932 | | |
933 | 15 | let first_offset = offsets[0].to_usize().ok_or_else(|| {0 |
934 | 0 | ArrowError::InvalidArgumentError(format!( |
935 | 0 | "Error converting offset[0] ({}) to usize for {}", |
936 | 0 | offsets[0], self.data_type |
937 | 0 | )) |
938 | 0 | })?; |
939 | | |
940 | 15 | let last_offset = offsets[self.len].to_usize().ok_or_else(|| {0 |
941 | 0 | ArrowError::InvalidArgumentError(format!( |
942 | 0 | "Error converting offset[{}] ({}) to usize for {}", |
943 | 0 | self.len, offsets[self.len], self.data_type |
944 | 0 | )) |
945 | 0 | })?; |
946 | | |
947 | 15 | if first_offset > values_length { |
948 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
949 | 0 | "First offset {} of {} is larger than values length {}", |
950 | 0 | first_offset, self.data_type, values_length, |
951 | 0 | ))); |
952 | 15 | } |
953 | | |
954 | 15 | if last_offset > values_length { |
955 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
956 | 0 | "Last offset {} of {} is larger than values length {}", |
957 | 0 | last_offset, self.data_type, values_length, |
958 | 0 | ))); |
959 | 15 | } |
960 | | |
961 | 15 | if first_offset > last_offset { |
962 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
963 | 0 | "First offset {} in {} is smaller than last offset {}", |
964 | 0 | first_offset, self.data_type, last_offset, |
965 | 0 | ))); |
966 | 15 | } |
967 | | |
968 | 15 | Ok(()) |
969 | 15 | } |
970 | | |
971 | | /// Does a cheap sanity check that the `self.len` values in `buffer` are valid |
972 | | /// offsets and sizes (of type T) into some other buffer of `values_length` bytes long |
973 | 0 | fn validate_offsets_and_sizes<T: ArrowNativeType + num::Num + std::fmt::Display>( |
974 | 0 | &self, |
975 | 0 | values_length: usize, |
976 | 0 | ) -> Result<(), ArrowError> { |
977 | 0 | let offsets: &[T] = self.typed_buffer(0, self.len)?; |
978 | 0 | let sizes: &[T] = self.typed_buffer(1, self.len)?; |
979 | 0 | for i in 0..values_length { |
980 | 0 | let size = sizes[i].to_usize().ok_or_else(|| { |
981 | 0 | ArrowError::InvalidArgumentError(format!( |
982 | 0 | "Error converting size[{}] ({}) to usize for {}", |
983 | 0 | i, sizes[i], self.data_type |
984 | 0 | )) |
985 | 0 | })?; |
986 | 0 | let offset = offsets[i].to_usize().ok_or_else(|| { |
987 | 0 | ArrowError::InvalidArgumentError(format!( |
988 | 0 | "Error converting offset[{}] ({}) to usize for {}", |
989 | 0 | i, offsets[i], self.data_type |
990 | 0 | )) |
991 | 0 | })?; |
992 | 0 | if size |
993 | 0 | .checked_add(offset) |
994 | 0 | .expect("Offset and size have exceeded the usize boundary") |
995 | 0 | > values_length |
996 | | { |
997 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
998 | 0 | "Size {} at index {} is larger than the remaining values for {}", |
999 | 0 | size, i, self.data_type |
1000 | 0 | ))); |
1001 | 0 | } |
1002 | | } |
1003 | 0 | Ok(()) |
1004 | 0 | } |
1005 | | |
1006 | | /// Validates the layout of `child_data` ArrayData structures |
1007 | 25 | fn validate_child_data(&self) -> Result<(), ArrowError> { |
1008 | 25 | match &self.data_type { |
1009 | 9 | DataType::List(field) | DataType::Map(field0 , _) => { |
1010 | 9 | let values_data = self.get_single_valid_child_data(field.data_type())?0 ; |
1011 | 9 | self.validate_offsets::<i32>(values_data.len)?0 ; |
1012 | 9 | Ok(()) |
1013 | | } |
1014 | 0 | DataType::LargeList(field) => { |
1015 | 0 | let values_data = self.get_single_valid_child_data(field.data_type())?; |
1016 | 0 | self.validate_offsets::<i64>(values_data.len)?; |
1017 | 0 | Ok(()) |
1018 | | } |
1019 | 0 | DataType::ListView(field) => { |
1020 | 0 | let values_data = self.get_single_valid_child_data(field.data_type())?; |
1021 | 0 | self.validate_offsets_and_sizes::<i32>(values_data.len)?; |
1022 | 0 | Ok(()) |
1023 | | } |
1024 | 0 | DataType::LargeListView(field) => { |
1025 | 0 | let values_data = self.get_single_valid_child_data(field.data_type())?; |
1026 | 0 | self.validate_offsets_and_sizes::<i64>(values_data.len)?; |
1027 | 0 | Ok(()) |
1028 | | } |
1029 | 0 | DataType::FixedSizeList(field, list_size) => { |
1030 | 0 | let values_data = self.get_single_valid_child_data(field.data_type())?; |
1031 | | |
1032 | 0 | let list_size: usize = (*list_size).try_into().map_err(|_| { |
1033 | 0 | ArrowError::InvalidArgumentError(format!( |
1034 | 0 | "{} has a negative list_size {}", |
1035 | 0 | self.data_type, list_size |
1036 | 0 | )) |
1037 | 0 | })?; |
1038 | | |
1039 | 0 | let expected_values_len = self.len |
1040 | 0 | .checked_mul(list_size) |
1041 | 0 | .expect("integer overflow computing expected number of expected values in FixedListSize"); |
1042 | | |
1043 | 0 | if values_data.len < expected_values_len { |
1044 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
1045 | 0 | "Values length {} is less than the length ({}) multiplied by the value size ({}) for {}", |
1046 | 0 | values_data.len, self.len, list_size, self.data_type |
1047 | 0 | ))); |
1048 | 0 | } |
1049 | | |
1050 | 0 | Ok(()) |
1051 | | } |
1052 | 5 | DataType::Struct(fields) => { |
1053 | 5 | self.validate_num_child_data(fields.len())?0 ; |
1054 | 9 | for (i, field) in fields.iter()5 .enumerate5 () { |
1055 | 9 | let field_data = self.get_valid_child_data(i, field.data_type())?0 ; |
1056 | | |
1057 | | // Ensure child field has sufficient size |
1058 | 9 | if field_data.len < self.len { |
1059 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
1060 | 0 | "{} child array #{} for field {} has length smaller than expected for struct array ({} < {})", |
1061 | 0 | self.data_type, i, field.name(), field_data.len, self.len |
1062 | 0 | ))); |
1063 | 9 | } |
1064 | | } |
1065 | 5 | Ok(()) |
1066 | | } |
1067 | 0 | DataType::RunEndEncoded(run_ends_field, values_field) => { |
1068 | 0 | self.validate_num_child_data(2)?; |
1069 | 0 | let run_ends_data = self.get_valid_child_data(0, run_ends_field.data_type())?; |
1070 | 0 | let values_data = self.get_valid_child_data(1, values_field.data_type())?; |
1071 | 0 | if run_ends_data.len != values_data.len { |
1072 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
1073 | 0 | "The run_ends array length should be the same as values array length. Run_ends array length is {}, values array length is {}", |
1074 | 0 | run_ends_data.len, values_data.len |
1075 | 0 | ))); |
1076 | 0 | } |
1077 | 0 | if run_ends_data.nulls.is_some() { |
1078 | 0 | return Err(ArrowError::InvalidArgumentError( |
1079 | 0 | "Found null values in run_ends array. The run_ends array should not have null values.".to_string(), |
1080 | 0 | )); |
1081 | 0 | } |
1082 | 0 | Ok(()) |
1083 | | } |
1084 | 0 | DataType::Union(fields, mode) => { |
1085 | 0 | self.validate_num_child_data(fields.len())?; |
1086 | | |
1087 | 0 | for (i, (_, field)) in fields.iter().enumerate() { |
1088 | 0 | let field_data = self.get_valid_child_data(i, field.data_type())?; |
1089 | | |
1090 | 0 | if mode == &UnionMode::Sparse && field_data.len < (self.len + self.offset) { |
1091 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
1092 | 0 | "Sparse union child array #{} has length smaller than expected for union array ({} < {})", |
1093 | 0 | i, field_data.len, self.len + self.offset |
1094 | 0 | ))); |
1095 | 0 | } |
1096 | | } |
1097 | 0 | Ok(()) |
1098 | | } |
1099 | 0 | DataType::Dictionary(_key_type, value_type) => { |
1100 | 0 | self.get_single_valid_child_data(value_type)?; |
1101 | 0 | Ok(()) |
1102 | | } |
1103 | | _ => { |
1104 | | // other types do not have child data |
1105 | 11 | if !self.child_data.is_empty() { |
1106 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
1107 | 0 | "Expected no child arrays for type {} but got {}", |
1108 | 0 | self.data_type, |
1109 | 0 | self.child_data.len() |
1110 | 0 | ))); |
1111 | 11 | } |
1112 | 11 | Ok(()) |
1113 | | } |
1114 | | } |
1115 | 25 | } |
1116 | | |
1117 | | /// Ensures that this array data has a single child_data with the |
1118 | | /// expected type, and calls `validate()` on it. Returns a |
1119 | | /// reference to that child_data |
1120 | 9 | fn get_single_valid_child_data( |
1121 | 9 | &self, |
1122 | 9 | expected_type: &DataType, |
1123 | 9 | ) -> Result<&ArrayData, ArrowError> { |
1124 | 9 | self.validate_num_child_data(1)?0 ; |
1125 | 9 | self.get_valid_child_data(0, expected_type) |
1126 | 9 | } |
1127 | | |
1128 | | /// Returns `Err` if self.child_data does not have exactly `expected_len` elements |
1129 | 14 | fn validate_num_child_data(&self, expected_len: usize) -> Result<(), ArrowError> { |
1130 | 14 | if self.child_data.len() != expected_len { |
1131 | 0 | Err(ArrowError::InvalidArgumentError(format!( |
1132 | 0 | "Value data for {} should contain {} child data array(s), had {}", |
1133 | 0 | self.data_type, |
1134 | 0 | expected_len, |
1135 | 0 | self.child_data.len() |
1136 | 0 | ))) |
1137 | | } else { |
1138 | 14 | Ok(()) |
1139 | | } |
1140 | 14 | } |
1141 | | |
1142 | | /// Ensures that `child_data[i]` has the expected type, calls |
1143 | | /// `validate()` on it, and returns a reference to that child_data |
1144 | 18 | fn get_valid_child_data( |
1145 | 18 | &self, |
1146 | 18 | i: usize, |
1147 | 18 | expected_type: &DataType, |
1148 | 18 | ) -> Result<&ArrayData, ArrowError> { |
1149 | 18 | let values_data = self.child_data.get(i).ok_or_else(|| {0 |
1150 | 0 | ArrowError::InvalidArgumentError(format!( |
1151 | 0 | "{} did not have enough child arrays. Expected at least {} but had only {}", |
1152 | 0 | self.data_type, |
1153 | 0 | i + 1, |
1154 | 0 | self.child_data.len() |
1155 | 0 | )) |
1156 | 0 | })?; |
1157 | | |
1158 | 18 | if expected_type != &values_data.data_type { |
1159 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
1160 | 0 | "Child type mismatch for {}. Expected {} but child data had {}", |
1161 | 0 | self.data_type, expected_type, values_data.data_type |
1162 | 0 | ))); |
1163 | 18 | } |
1164 | | |
1165 | 18 | values_data.validate()?0 ; |
1166 | 18 | Ok(values_data) |
1167 | 18 | } |
1168 | | |
1169 | | /// Validate that the data contained within this [`ArrayData`] is valid |
1170 | | /// |
1171 | | /// 1. Null count is correct |
1172 | | /// 2. All offsets are valid |
1173 | | /// 3. All String data is valid UTF-8 |
1174 | | /// 4. All dictionary offsets are valid |
1175 | | /// |
1176 | | /// Internally this calls: |
1177 | | /// |
1178 | | /// * [`Self::validate`] |
1179 | | /// * [`Self::validate_nulls`] |
1180 | | /// * [`Self::validate_values`] |
1181 | | /// |
1182 | | /// Note: this does not recurse into children, for a recursive variant |
1183 | | /// see [`Self::validate_full`] |
1184 | 7 | pub fn validate_data(&self) -> Result<(), ArrowError> { |
1185 | 7 | self.validate()?0 ; |
1186 | | |
1187 | 7 | self.validate_nulls()?0 ; |
1188 | 7 | self.validate_values()?0 ; |
1189 | 7 | Ok(()) |
1190 | 7 | } |
1191 | | |
1192 | | /// Performs a full recursive validation of this [`ArrayData`] and all its children |
1193 | | /// |
1194 | | /// This is equivalent to calling [`Self::validate_data`] on this [`ArrayData`] |
1195 | | /// and all its children recursively |
1196 | 0 | pub fn validate_full(&self) -> Result<(), ArrowError> { |
1197 | 0 | self.validate_data()?; |
1198 | | // validate all children recursively |
1199 | 0 | self.child_data |
1200 | 0 | .iter() |
1201 | 0 | .enumerate() |
1202 | 0 | .try_for_each(|(i, child_data)| { |
1203 | 0 | child_data.validate_full().map_err(|e| { |
1204 | 0 | ArrowError::InvalidArgumentError(format!( |
1205 | 0 | "{} child #{} invalid: {}", |
1206 | 0 | self.data_type, i, e |
1207 | 0 | )) |
1208 | 0 | }) |
1209 | 0 | })?; |
1210 | 0 | Ok(()) |
1211 | 0 | } |
1212 | | |
1213 | | /// Validates the values stored within this [`ArrayData`] are valid |
1214 | | /// without recursing into child [`ArrayData`] |
1215 | | /// |
1216 | | /// Does not (yet) check |
1217 | | /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85) |
1218 | | /// 2. the the null count is correct and that any |
1219 | | /// 3. nullability requirements of its children are correct |
1220 | | /// |
1221 | | /// [#85]: https://github.com/apache/arrow-rs/issues/85 |
1222 | 7 | pub fn validate_nulls(&self) -> Result<(), ArrowError> { |
1223 | 7 | if let Some(nulls3 ) = &self.nulls { |
1224 | 3 | let actual = nulls.len() - nulls.inner().count_set_bits(); |
1225 | 3 | if actual != nulls.null_count() { |
1226 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
1227 | 0 | "null_count value ({}) doesn't match actual number of nulls in array ({})", |
1228 | 0 | nulls.null_count(), |
1229 | 0 | actual |
1230 | 0 | ))); |
1231 | 3 | } |
1232 | 4 | } |
1233 | | |
1234 | | // In general non-nullable children should not contain nulls, however, for certain |
1235 | | // types, such as StructArray and FixedSizeList, nulls in the parent take up |
1236 | | // space in the child. As such we permit nulls in the children in the corresponding |
1237 | | // positions for such types |
1238 | 7 | match &self.data_type { |
1239 | 5 | DataType::List(f) | DataType::LargeList(f0 ) | DataType::Map(f0 , _) => { |
1240 | 5 | if !f.is_nullable() { |
1241 | 1 | self.validate_non_nullable(None, &self.child_data[0])?0 |
1242 | 4 | } |
1243 | | } |
1244 | 0 | DataType::FixedSizeList(field, len) => { |
1245 | 0 | let child = &self.child_data[0]; |
1246 | 0 | if !field.is_nullable() { |
1247 | 0 | match &self.nulls { |
1248 | 0 | Some(nulls) => { |
1249 | 0 | let element_len = *len as usize; |
1250 | 0 | let expanded = nulls.expand(element_len); |
1251 | 0 | self.validate_non_nullable(Some(&expanded), child)?; |
1252 | | } |
1253 | 0 | None => self.validate_non_nullable(None, child)?, |
1254 | | } |
1255 | 0 | } |
1256 | | } |
1257 | 2 | DataType::Struct(fields) => { |
1258 | 3 | for (field, child) in fields.iter()2 .zip2 (&self.child_data2 ) { |
1259 | 3 | if !field.is_nullable() { |
1260 | 0 | self.validate_non_nullable(self.nulls(), child)? |
1261 | 3 | } |
1262 | | } |
1263 | | } |
1264 | 0 | _ => {} |
1265 | | } |
1266 | | |
1267 | 7 | Ok(()) |
1268 | 7 | } |
1269 | | |
1270 | | /// Verifies that `child` contains no nulls not present in `mask` |
1271 | 1 | fn validate_non_nullable( |
1272 | 1 | &self, |
1273 | 1 | mask: Option<&NullBuffer>, |
1274 | 1 | child: &ArrayData, |
1275 | 1 | ) -> Result<(), ArrowError> { |
1276 | 1 | let mask0 = match mask { |
1277 | 0 | Some(mask) => mask, |
1278 | | None => { |
1279 | 1 | return match child.null_count() { |
1280 | 1 | 0 => Ok(()), |
1281 | 0 | _ => Err(ArrowError::InvalidArgumentError(format!( |
1282 | 0 | "non-nullable child of type {} contains nulls not present in parent {}", |
1283 | 0 | child.data_type, self.data_type |
1284 | 0 | ))), |
1285 | | } |
1286 | | } |
1287 | | }; |
1288 | | |
1289 | 0 | match child.nulls() { |
1290 | 0 | Some(nulls) if !mask.contains(nulls) => Err(ArrowError::InvalidArgumentError(format!( |
1291 | 0 | "non-nullable child of type {} contains nulls not present in parent", |
1292 | 0 | child.data_type |
1293 | 0 | ))), |
1294 | 0 | _ => Ok(()), |
1295 | | } |
1296 | 1 | } |
1297 | | |
1298 | | /// Validates the values stored within this [`ArrayData`] are valid |
1299 | | /// without recursing into child [`ArrayData`] |
1300 | | /// |
1301 | | /// Does not (yet) check |
1302 | | /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85) |
1303 | 7 | pub fn validate_values(&self) -> Result<(), ArrowError> { |
1304 | 7 | match &self.data_type { |
1305 | 0 | DataType::Utf8 => self.validate_utf8::<i32>(), |
1306 | 0 | DataType::LargeUtf8 => self.validate_utf8::<i64>(), |
1307 | 0 | DataType::Binary => self.validate_offsets_full::<i32>(self.buffers[1].len()), |
1308 | 0 | DataType::LargeBinary => self.validate_offsets_full::<i64>(self.buffers[1].len()), |
1309 | | DataType::BinaryView => { |
1310 | 0 | let views = self.typed_buffer::<u128>(0, self.len)?; |
1311 | 0 | validate_binary_view(views, &self.buffers[1..]) |
1312 | | } |
1313 | | DataType::Utf8View => { |
1314 | 0 | let views = self.typed_buffer::<u128>(0, self.len)?; |
1315 | 0 | validate_string_view(views, &self.buffers[1..]) |
1316 | | } |
1317 | | DataType::List(_) | DataType::Map(_, _) => { |
1318 | 5 | let child = &self.child_data[0]; |
1319 | 5 | self.validate_offsets_full::<i32>(child.len) |
1320 | | } |
1321 | | DataType::LargeList(_) => { |
1322 | 0 | let child = &self.child_data[0]; |
1323 | 0 | self.validate_offsets_full::<i64>(child.len) |
1324 | | } |
1325 | | DataType::Union(_, _) => { |
1326 | | // Validate Union Array as part of implementing new Union semantics |
1327 | | // See comments in `ArrayData::validate()` |
1328 | | // https://github.com/apache/arrow-rs/issues/85 |
1329 | | // |
1330 | | // TODO file follow on ticket for full union validation |
1331 | 0 | Ok(()) |
1332 | | } |
1333 | 0 | DataType::Dictionary(key_type, _value_type) => { |
1334 | 0 | let dictionary_length: i64 = self.child_data[0].len.try_into().unwrap(); |
1335 | 0 | let max_value = dictionary_length - 1; |
1336 | 0 | match key_type.as_ref() { |
1337 | 0 | DataType::UInt8 => self.check_bounds::<u8>(max_value), |
1338 | 0 | DataType::UInt16 => self.check_bounds::<u16>(max_value), |
1339 | 0 | DataType::UInt32 => self.check_bounds::<u32>(max_value), |
1340 | 0 | DataType::UInt64 => self.check_bounds::<u64>(max_value), |
1341 | 0 | DataType::Int8 => self.check_bounds::<i8>(max_value), |
1342 | 0 | DataType::Int16 => self.check_bounds::<i16>(max_value), |
1343 | 0 | DataType::Int32 => self.check_bounds::<i32>(max_value), |
1344 | 0 | DataType::Int64 => self.check_bounds::<i64>(max_value), |
1345 | 0 | _ => unreachable!(), |
1346 | | } |
1347 | | } |
1348 | 0 | DataType::RunEndEncoded(run_ends, _values) => { |
1349 | 0 | let run_ends_data = self.child_data()[0].clone(); |
1350 | 0 | match run_ends.data_type() { |
1351 | 0 | DataType::Int16 => run_ends_data.check_run_ends::<i16>(), |
1352 | 0 | DataType::Int32 => run_ends_data.check_run_ends::<i32>(), |
1353 | 0 | DataType::Int64 => run_ends_data.check_run_ends::<i64>(), |
1354 | 0 | _ => unreachable!(), |
1355 | | } |
1356 | | } |
1357 | | _ => { |
1358 | | // No extra validation check required for other types |
1359 | 2 | Ok(()) |
1360 | | } |
1361 | | } |
1362 | 7 | } |
1363 | | |
1364 | | /// Calls the `validate(item_index, range)` function for each of |
1365 | | /// the ranges specified in the arrow offsets buffer of type |
1366 | | /// `T`. Also validates that each offset is smaller than |
1367 | | /// `offset_limit` |
1368 | | /// |
1369 | | /// For an empty array, the offsets buffer can either be empty |
1370 | | /// or contain a single `0`. |
1371 | | /// |
1372 | | /// For example, the offsets buffer contained `[1, 2, 4]`, this |
1373 | | /// function would call `validate([1,2])`, and `validate([2,4])` |
1374 | 5 | fn validate_each_offset<T, V>(&self, offset_limit: usize, validate: V) -> Result<(), ArrowError> |
1375 | 5 | where |
1376 | 5 | T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display, |
1377 | 5 | V: Fn(usize, Range<usize>) -> Result<(), ArrowError>, |
1378 | | { |
1379 | 5 | self.typed_offsets::<T>()?0 |
1380 | 5 | .iter() |
1381 | 5 | .enumerate() |
1382 | 35 | .map5 (|(i, x)| { |
1383 | | // check if the offset can be converted to usize |
1384 | 35 | let r = x.to_usize().ok_or_else(|| {0 |
1385 | 0 | ArrowError::InvalidArgumentError(format!( |
1386 | 0 | "Offset invariant failure: Could not convert offset {x} to usize at position {i}"))} |
1387 | | ); |
1388 | | // check if the offset exceeds the limit |
1389 | 35 | match r { |
1390 | 35 | Ok(n) if n <= offset_limit => Ok((i, n)), |
1391 | 0 | Ok(_) => Err(ArrowError::InvalidArgumentError(format!( |
1392 | 0 | "Offset invariant failure: offset at position {i} out of bounds: {x} > {offset_limit}")) |
1393 | 0 | ), |
1394 | 0 | Err(e) => Err(e), |
1395 | | } |
1396 | 35 | }) |
1397 | 35 | .scan5 (0_usize, |start, end| { |
1398 | | // check offsets are monotonically increasing |
1399 | 35 | match end { |
1400 | 35 | Ok((i, end)) if *start <= end => { |
1401 | 35 | let range = Some(Ok((i, *start..end))); |
1402 | 35 | *start = end; |
1403 | 35 | range |
1404 | | } |
1405 | 0 | Ok((i, end)) => Some(Err(ArrowError::InvalidArgumentError(format!( |
1406 | 0 | "Offset invariant failure: non-monotonic offset at slot {}: {} > {}", |
1407 | 0 | i - 1, start, end)) |
1408 | 0 | )), |
1409 | 0 | Err(err) => Some(Err(err)), |
1410 | | } |
1411 | 35 | }) |
1412 | 5 | .skip(1) // the first element is meaningless |
1413 | 30 | .try_for_each5 (|res: Result<(usize, Range<usize>), ArrowError>| { |
1414 | 30 | let (item_index, range) = res?0 ; |
1415 | 30 | validate(item_index-1, range) |
1416 | 30 | }) |
1417 | 5 | } |
1418 | | |
1419 | | /// Ensures that all strings formed by the offsets in `buffers[0]` |
1420 | | /// into `buffers[1]` are valid utf8 sequences |
1421 | 0 | fn validate_utf8<T>(&self) -> Result<(), ArrowError> |
1422 | 0 | where |
1423 | 0 | T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display, |
1424 | | { |
1425 | 0 | let values_buffer = &self.buffers[1].as_slice(); |
1426 | 0 | if let Ok(values_str) = std::str::from_utf8(values_buffer) { |
1427 | | // Validate Offsets are correct |
1428 | 0 | self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| { |
1429 | 0 | if !values_str.is_char_boundary(range.start) |
1430 | 0 | || !values_str.is_char_boundary(range.end) |
1431 | | { |
1432 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
1433 | 0 | "incomplete utf-8 byte sequence from index {string_index}" |
1434 | 0 | ))); |
1435 | 0 | } |
1436 | 0 | Ok(()) |
1437 | 0 | }) |
1438 | | } else { |
1439 | | // find specific offset that failed utf8 validation |
1440 | 0 | self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| { |
1441 | 0 | std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| { |
1442 | 0 | ArrowError::InvalidArgumentError(format!( |
1443 | 0 | "Invalid UTF8 sequence at string index {string_index} ({range:?}): {e}" |
1444 | 0 | )) |
1445 | 0 | })?; |
1446 | 0 | Ok(()) |
1447 | 0 | }) |
1448 | | } |
1449 | 0 | } |
1450 | | |
1451 | | /// Ensures that all offsets in `buffers[0]` into `buffers[1]` are |
1452 | | /// between `0` and `offset_limit` |
1453 | 5 | fn validate_offsets_full<T>(&self, offset_limit: usize) -> Result<(), ArrowError> |
1454 | 5 | where |
1455 | 5 | T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display, |
1456 | | { |
1457 | 30 | self5 .validate_each_offset5 ::<T, _>(offset_limit5 , |_string_index, _range| { |
1458 | | // No validation applied to each value, but the iteration |
1459 | | // itself applies bounds checking to each range |
1460 | 30 | Ok(()) |
1461 | 30 | }) |
1462 | 5 | } |
1463 | | |
1464 | | /// Validates that each value in self.buffers (typed as T) |
1465 | | /// is within the range [0, max_value], inclusive |
1466 | 0 | fn check_bounds<T>(&self, max_value: i64) -> Result<(), ArrowError> |
1467 | 0 | where |
1468 | 0 | T: ArrowNativeType + TryInto<i64> + num::Num + std::fmt::Display, |
1469 | | { |
1470 | 0 | let required_len = self.len + self.offset; |
1471 | 0 | let buffer = &self.buffers[0]; |
1472 | | |
1473 | | // This should have been checked as part of `validate()` prior |
1474 | | // to calling `validate_full()` but double check to be sure |
1475 | 0 | assert!(buffer.len() / mem::size_of::<T>() >= required_len); |
1476 | | |
1477 | | // Justification: buffer size was validated above |
1478 | 0 | let indexes: &[T] = &buffer.typed_data::<T>()[self.offset..self.offset + self.len]; |
1479 | | |
1480 | 0 | indexes.iter().enumerate().try_for_each(|(i, &dict_index)| { |
1481 | | // Do not check the value is null (value can be arbitrary) |
1482 | 0 | if self.is_null(i) { |
1483 | 0 | return Ok(()); |
1484 | 0 | } |
1485 | 0 | let dict_index: i64 = dict_index.try_into().map_err(|_| { |
1486 | 0 | ArrowError::InvalidArgumentError(format!( |
1487 | 0 | "Value at position {i} out of bounds: {dict_index} (can not convert to i64)" |
1488 | 0 | )) |
1489 | 0 | })?; |
1490 | | |
1491 | 0 | if dict_index < 0 || dict_index > max_value { |
1492 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
1493 | 0 | "Value at position {i} out of bounds: {dict_index} (should be in [0, {max_value}])" |
1494 | 0 | ))); |
1495 | 0 | } |
1496 | 0 | Ok(()) |
1497 | 0 | }) |
1498 | 0 | } |
1499 | | |
1500 | | /// Validates that each value in run_ends array is positive and strictly increasing. |
1501 | 0 | fn check_run_ends<T>(&self) -> Result<(), ArrowError> |
1502 | 0 | where |
1503 | 0 | T: ArrowNativeType + TryInto<i64> + num::Num + std::fmt::Display, |
1504 | | { |
1505 | 0 | let values = self.typed_buffer::<T>(0, self.len)?; |
1506 | 0 | let mut prev_value: i64 = 0_i64; |
1507 | 0 | values.iter().enumerate().try_for_each(|(ix, &inp_value)| { |
1508 | 0 | let value: i64 = inp_value.try_into().map_err(|_| { |
1509 | 0 | ArrowError::InvalidArgumentError(format!( |
1510 | 0 | "Value at position {ix} out of bounds: {inp_value} (can not convert to i64)" |
1511 | 0 | )) |
1512 | 0 | })?; |
1513 | 0 | if value <= 0_i64 { |
1514 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
1515 | 0 | "The values in run_ends array should be strictly positive. Found value {value} at index {ix} that does not match the criteria." |
1516 | 0 | ))); |
1517 | 0 | } |
1518 | 0 | if ix > 0 && value <= prev_value { |
1519 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
1520 | 0 | "The values in run_ends array should be strictly increasing. Found value {value} at index {ix} with previous value {prev_value} that does not match the criteria." |
1521 | 0 | ))); |
1522 | 0 | } |
1523 | | |
1524 | 0 | prev_value = value; |
1525 | 0 | Ok(()) |
1526 | 0 | })?; |
1527 | | |
1528 | 0 | if prev_value.as_usize() < (self.offset + self.len) { |
1529 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
1530 | 0 | "The offset + length of array should be less or equal to last value in the run_ends array. The last value of run_ends array is {prev_value} and offset + length of array is {}.", |
1531 | 0 | self.offset + self.len |
1532 | 0 | ))); |
1533 | 0 | } |
1534 | 0 | Ok(()) |
1535 | 0 | } |
1536 | | |
1537 | | /// Returns true if this `ArrayData` is equal to `other`, using pointer comparisons |
1538 | | /// to determine buffer equality. This is cheaper than `PartialEq::eq` but may |
1539 | | /// return false when the arrays are logically equal |
1540 | 0 | pub fn ptr_eq(&self, other: &Self) -> bool { |
1541 | 0 | if self.offset != other.offset |
1542 | 0 | || self.len != other.len |
1543 | 0 | || self.data_type != other.data_type |
1544 | 0 | || self.buffers.len() != other.buffers.len() |
1545 | 0 | || self.child_data.len() != other.child_data.len() |
1546 | | { |
1547 | 0 | return false; |
1548 | 0 | } |
1549 | | |
1550 | 0 | match (&self.nulls, &other.nulls) { |
1551 | 0 | (Some(a), Some(b)) if !a.inner().ptr_eq(b.inner()) => return false, |
1552 | 0 | (Some(_), None) | (None, Some(_)) => return false, |
1553 | 0 | _ => {} |
1554 | | }; |
1555 | | |
1556 | 0 | if !self |
1557 | 0 | .buffers |
1558 | 0 | .iter() |
1559 | 0 | .zip(other.buffers.iter()) |
1560 | 0 | .all(|(a, b)| a.as_ptr() == b.as_ptr()) |
1561 | | { |
1562 | 0 | return false; |
1563 | 0 | } |
1564 | | |
1565 | 0 | self.child_data |
1566 | 0 | .iter() |
1567 | 0 | .zip(other.child_data.iter()) |
1568 | 0 | .all(|(a, b)| a.ptr_eq(b)) |
1569 | 0 | } |
1570 | | |
1571 | | /// Converts this [`ArrayData`] into an [`ArrayDataBuilder`] |
1572 | 13 | pub fn into_builder(self) -> ArrayDataBuilder { |
1573 | 13 | self.into() |
1574 | 13 | } |
1575 | | } |
1576 | | |
1577 | | /// Return the expected [`DataTypeLayout`] Arrays of this data |
1578 | | /// type are expected to have |
1579 | 25 | pub fn layout(data_type: &DataType) -> DataTypeLayout { |
1580 | | // based on C/C++ implementation in |
1581 | | // https://github.com/apache/arrow/blob/661c7d749150905a63dd3b52e0a04dac39030d95/cpp/src/arrow/type.h (and .cc) |
1582 | | use arrow_schema::IntervalUnit::*; |
1583 | | |
1584 | 0 | match data_type { |
1585 | 0 | DataType::Null => DataTypeLayout { |
1586 | 0 | buffers: vec![], |
1587 | 0 | can_contain_null_mask: false, |
1588 | 0 | variadic: false, |
1589 | 0 | }, |
1590 | 1 | DataType::Boolean => DataTypeLayout { |
1591 | 1 | buffers: vec![BufferSpec::BitMap], |
1592 | 1 | can_contain_null_mask: true, |
1593 | 1 | variadic: false, |
1594 | 1 | }, |
1595 | 0 | DataType::Int8 => DataTypeLayout::new_fixed_width::<i8>(), |
1596 | 0 | DataType::Int16 => DataTypeLayout::new_fixed_width::<i16>(), |
1597 | 0 | DataType::Int32 => DataTypeLayout::new_fixed_width::<i32>(), |
1598 | 3 | DataType::Int64 => DataTypeLayout::new_fixed_width::<i64>(), |
1599 | 0 | DataType::UInt8 => DataTypeLayout::new_fixed_width::<u8>(), |
1600 | 0 | DataType::UInt16 => DataTypeLayout::new_fixed_width::<u16>(), |
1601 | 0 | DataType::UInt32 => DataTypeLayout::new_fixed_width::<u32>(), |
1602 | 0 | DataType::UInt64 => DataTypeLayout::new_fixed_width::<u64>(), |
1603 | 0 | DataType::Float16 => DataTypeLayout::new_fixed_width::<half::f16>(), |
1604 | 1 | DataType::Float32 => DataTypeLayout::new_fixed_width::<f32>(), |
1605 | 0 | DataType::Float64 => DataTypeLayout::new_fixed_width::<f64>(), |
1606 | 0 | DataType::Timestamp(_, _) => DataTypeLayout::new_fixed_width::<i64>(), |
1607 | 0 | DataType::Date32 => DataTypeLayout::new_fixed_width::<i32>(), |
1608 | 0 | DataType::Date64 => DataTypeLayout::new_fixed_width::<i64>(), |
1609 | 0 | DataType::Time32(_) => DataTypeLayout::new_fixed_width::<i32>(), |
1610 | 0 | DataType::Time64(_) => DataTypeLayout::new_fixed_width::<i64>(), |
1611 | 0 | DataType::Interval(YearMonth) => DataTypeLayout::new_fixed_width::<i32>(), |
1612 | 0 | DataType::Interval(DayTime) => DataTypeLayout::new_fixed_width::<IntervalDayTime>(), |
1613 | | DataType::Interval(MonthDayNano) => { |
1614 | 0 | DataTypeLayout::new_fixed_width::<IntervalMonthDayNano>() |
1615 | | } |
1616 | 0 | DataType::Duration(_) => DataTypeLayout::new_fixed_width::<i64>(), |
1617 | 0 | DataType::Decimal32(_, _) => DataTypeLayout::new_fixed_width::<i32>(), |
1618 | 0 | DataType::Decimal64(_, _) => DataTypeLayout::new_fixed_width::<i64>(), |
1619 | 0 | DataType::Decimal128(_, _) => DataTypeLayout::new_fixed_width::<i128>(), |
1620 | 0 | DataType::Decimal256(_, _) => DataTypeLayout::new_fixed_width::<i256>(), |
1621 | 0 | DataType::FixedSizeBinary(size) => { |
1622 | 0 | let spec = BufferSpec::FixedWidth { |
1623 | 0 | byte_width: (*size).try_into().unwrap(), |
1624 | 0 | alignment: mem::align_of::<u8>(), |
1625 | 0 | }; |
1626 | 0 | DataTypeLayout { |
1627 | 0 | buffers: vec![spec], |
1628 | 0 | can_contain_null_mask: true, |
1629 | 0 | variadic: false, |
1630 | 0 | } |
1631 | | } |
1632 | 0 | DataType::Binary => DataTypeLayout::new_binary::<i32>(), |
1633 | 0 | DataType::LargeBinary => DataTypeLayout::new_binary::<i64>(), |
1634 | 6 | DataType::Utf8 => DataTypeLayout::new_binary::<i32>(), |
1635 | 0 | DataType::LargeUtf8 => DataTypeLayout::new_binary::<i64>(), |
1636 | 0 | DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(), |
1637 | 0 | DataType::FixedSizeList(_, _) => DataTypeLayout::new_nullable_empty(), // all in child data |
1638 | 9 | DataType::List(_) => DataTypeLayout::new_fixed_width::<i32>(), |
1639 | 0 | DataType::ListView(_) => DataTypeLayout::new_list_view::<i32>(), |
1640 | 0 | DataType::LargeListView(_) => DataTypeLayout::new_list_view::<i64>(), |
1641 | 0 | DataType::LargeList(_) => DataTypeLayout::new_fixed_width::<i64>(), |
1642 | 0 | DataType::Map(_, _) => DataTypeLayout::new_fixed_width::<i32>(), |
1643 | 5 | DataType::Struct(_) => DataTypeLayout::new_nullable_empty(), // all in child data, |
1644 | 0 | DataType::RunEndEncoded(_, _) => DataTypeLayout::new_empty(), // all in child data, |
1645 | 0 | DataType::Union(_, mode) => { |
1646 | 0 | let type_ids = BufferSpec::FixedWidth { |
1647 | 0 | byte_width: mem::size_of::<i8>(), |
1648 | 0 | alignment: mem::align_of::<i8>(), |
1649 | 0 | }; |
1650 | | |
1651 | | DataTypeLayout { |
1652 | 0 | buffers: match mode { |
1653 | | UnionMode::Sparse => { |
1654 | 0 | vec![type_ids] |
1655 | | } |
1656 | | UnionMode::Dense => { |
1657 | 0 | vec![ |
1658 | 0 | type_ids, |
1659 | 0 | BufferSpec::FixedWidth { |
1660 | 0 | byte_width: mem::size_of::<i32>(), |
1661 | 0 | alignment: mem::align_of::<i32>(), |
1662 | 0 | }, |
1663 | | ] |
1664 | | } |
1665 | | }, |
1666 | | can_contain_null_mask: false, |
1667 | | variadic: false, |
1668 | | } |
1669 | | } |
1670 | 0 | DataType::Dictionary(key_type, _value_type) => layout(key_type), |
1671 | | } |
1672 | 25 | } |
1673 | | |
/// Layout specification for a data type
#[derive(Debug, PartialEq, Eq)]
// Note: Follows structure from C++: https://github.com/apache/arrow/blob/master/cpp/src/arrow/type.h#L91
pub struct DataTypeLayout {
    /// A vector of buffer layout specifications, one for each expected buffer
    pub buffers: Vec<BufferSpec>,

    /// Can contain a null bitmask
    pub can_contain_null_mask: bool,

    /// If `variadic` is true, the number of buffers expected is only lower-bounded by
    /// `buffers.len()`. Buffers that exceed the lower bound are legal.
    ///
    /// This is set by [`Self::new_view`] (used for [`DataType::BinaryView`] and
    /// [`DataType::Utf8View`]) and by [`Self::new_list_view`]; every other
    /// constructor in this impl leaves it `false`.
    pub variadic: bool,
}

impl DataTypeLayout {
    /// Spec for a buffer of `T` values, using `T`'s own size and alignment
    fn fixed_width_spec<T>() -> BufferSpec {
        BufferSpec::FixedWidth {
            byte_width: mem::size_of::<T>(),
            alignment: mem::align_of::<T>(),
        }
    }

    /// Describes a basic numeric array where each element has type `T`
    pub fn new_fixed_width<T>() -> Self {
        Self {
            buffers: vec![Self::fixed_width_spec::<T>()],
            can_contain_null_mask: true,
            variadic: false,
        }
    }

    /// Describes arrays which have no data of their own
    /// but may still have a Null Bitmap (e.g. FixedSizeList)
    pub fn new_nullable_empty() -> Self {
        Self {
            buffers: Vec::new(),
            can_contain_null_mask: true,
            variadic: false,
        }
    }

    /// Describes arrays which have no data of their own
    /// (e.g. RunEndEncoded).
    pub fn new_empty() -> Self {
        Self {
            buffers: Vec::new(),
            can_contain_null_mask: false,
            variadic: false,
        }
    }

    /// Describes a variable width array: a fixed width offsets buffer
    /// with elements of type `T`, followed by a variable width data buffer
    pub fn new_binary<T>() -> Self {
        let offsets = Self::fixed_width_spec::<T>();
        let values = BufferSpec::VariableWidth;
        Self {
            buffers: vec![offsets, values],
            can_contain_null_mask: true,
            variadic: false,
        }
    }

    /// Describes a view type: one buffer of 16-byte (`u128`) elements,
    /// optionally followed by any number of additional buffers
    pub fn new_view() -> Self {
        Self {
            buffers: vec![Self::fixed_width_spec::<u128>()],
            can_contain_null_mask: true,
            variadic: true,
        }
    }

    /// Describes a list view type: two fixed width buffers with elements of type `T`
    pub fn new_list_view<T>() -> Self {
        Self {
            buffers: vec![
                Self::fixed_width_spec::<T>(),
                Self::fixed_width_spec::<T>(),
            ],
            can_contain_null_mask: true,
            // NOTE(review): `variadic` permits more than the two buffers listed
            // above; confirm this is intended for list views.
            variadic: true,
        }
    }
}

/// Layout specification for a single data type buffer
#[derive(Debug, PartialEq, Eq)]
pub enum BufferSpec {
    /// Each element is a fixed width primitive, with the given `byte_width` and `alignment`
    ///
    /// `alignment` is the alignment required by Rust for an array of the corresponding primitive,
    /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`].
    ///
    /// Arrow-rs requires that all buffers have at least this alignment, to allow for
    /// [slice](std::slice) based APIs. Alignment in excess of this is not required to allow
    /// for array slicing and interoperability with `Vec`, which cannot be over-aligned.
    ///
    /// Note that these alignment requirements will vary between architectures
    FixedWidth {
        /// The width of each element in bytes
        byte_width: usize,
        /// The alignment required by Rust for an array of the corresponding primitive
        alignment: usize,
    },
    /// Variable width, such as string data for utf8 data
    VariableWidth,
    /// Buffer holds a bitmap.
    ///
    /// Note: Unlike the C++ implementation, the null/validity buffer
    /// is handled specially rather than as another of the buffers in
    /// the spec, so this variant is only used for the Boolean type.
    BitMap,
    /// Buffer is always null. Unused currently in Rust implementation,
    /// (used in C++ for Union type)
    #[allow(dead_code)]
    AlwaysNull,
}
1805 | | |
/// Equality between two [`ArrayData`] values.
///
/// The comparison is delegated entirely to the crate's `equal::equal`.
impl PartialEq for ArrayData {
    fn eq(&self, other: &Self) -> bool {
        equal::equal(self, other)
    }
}
1811 | | |
/// A boolean flag that cannot be mutated outside of unsafe code.
///
/// Defaults to a value of false.
///
/// This structure is used to enforce safety in the [`ArrayDataBuilder`]
///
/// [`ArrayDataBuilder`]: super::ArrayDataBuilder
///
/// # Example
/// ```rust
/// use arrow_data::UnsafeFlag;
/// assert!(!UnsafeFlag::default().get()); // default is false
/// let mut flag = UnsafeFlag::new();
/// assert!(!flag.get()); // defaults to false
/// // can only set it to true in unsafe code
/// unsafe { flag.set(true) };
/// assert!(flag.get()); // now true
/// ```
#[derive(Debug, Clone)]
#[doc(hidden)]
pub struct UnsafeFlag(bool);

impl UnsafeFlag {
    /// Creates a new `UnsafeFlag` with the value set to `false`.
    ///
    /// See the example in the [`UnsafeFlag`] type documentation.
    #[inline]
    pub const fn new() -> Self {
        UnsafeFlag(false)
    }

    /// Overwrites the flag with `val`.
    ///
    /// This is deliberately only callable from `unsafe` code.
    ///
    /// # Safety
    ///
    /// Storing a value is not unsafe in itself. However, the flag can be used
    /// to subsequently bypass safety checks in the [`ArrayDataBuilder`], so the
    /// caller takes on responsibility for whatever those checks would have
    /// guaranteed.
    #[inline]
    pub unsafe fn set(&mut self, val: bool) {
        let Self(current) = self;
        *current = val;
    }

    /// Returns the current value of the flag
    #[inline]
    pub fn get(&self) -> bool {
        let Self(current) = *self;
        current
    }
}

// Written by hand (not derived) so it is explicit that the default goes
// through `new()` and therefore can never be `true`.
impl Default for UnsafeFlag {
    fn default() -> Self {
        UnsafeFlag::new()
    }
}
1870 | | |
/// Builder for [`ArrayData`] type
///
/// Every field is forwarded to the resulting [`ArrayData`] by `build`, except
/// for the null-related fields, which are first combined into one optional
/// [`NullBuffer`] (see the individual field docs).
#[derive(Debug)]
pub struct ArrayDataBuilder {
    /// Data type of the array being built
    data_type: DataType,
    /// Length of the array being built
    len: usize,
    /// Optional null count. `build` only reads it when the null buffer is
    /// created from `null_bit_buffer`, where it is passed to
    /// `NullBuffer::new_unchecked`.
    null_count: Option<usize>,
    /// Optional validity bitmap as a raw [`Buffer`]. `build` only reads it when
    /// `nulls` is `None`.
    null_bit_buffer: Option<Buffer>,
    /// Optional null buffer; when set, `build` uses it in preference to
    /// `null_bit_buffer`.
    nulls: Option<NullBuffer>,
    /// Offset of the array being built
    offset: usize,
    /// Buffers of the array being built
    buffers: Vec<Buffer>,
    /// Child data of the array being built
    child_data: Vec<ArrayData>,
    /// Should buffers be realigned (copying if necessary)?
    ///
    /// Defaults to false.
    align_buffers: bool,
    /// Should data validation be skipped for this [`ArrayData`]?
    ///
    /// Defaults to false.
    ///
    /// # Safety
    ///
    /// This flag can only be set to true using `unsafe` APIs. However, once true
    /// subsequent calls to `build()` may result in undefined behavior if the data
    /// is not valid.
    skip_validation: UnsafeFlag,
}
1897 | | |
1898 | | impl ArrayDataBuilder { |
1899 | | #[inline] |
1900 | | /// Creates a new array data builder |
1901 | 1.41k | pub const fn new(data_type: DataType) -> Self { |
1902 | 1.41k | Self { |
1903 | 1.41k | data_type, |
1904 | 1.41k | len: 0, |
1905 | 1.41k | null_count: None, |
1906 | 1.41k | null_bit_buffer: None, |
1907 | 1.41k | nulls: None, |
1908 | 1.41k | offset: 0, |
1909 | 1.41k | buffers: vec![], |
1910 | 1.41k | child_data: vec![], |
1911 | 1.41k | align_buffers: false, |
1912 | 1.41k | skip_validation: UnsafeFlag::new(), |
1913 | 1.41k | } |
1914 | 1.41k | } |
1915 | | |
1916 | | /// Creates a new array data builder from an existing one, changing the data type |
1917 | 13 | pub fn data_type(self, data_type: DataType) -> Self { |
1918 | 13 | Self { data_type, ..self } |
1919 | 13 | } |
1920 | | |
1921 | | #[inline] |
1922 | | #[allow(clippy::len_without_is_empty)] |
1923 | | /// Sets the length of the [ArrayData] |
1924 | 1.41k | pub const fn len(mut self, n: usize) -> Self { |
1925 | 1.41k | self.len = n; |
1926 | 1.41k | self |
1927 | 1.41k | } |
1928 | | |
1929 | | /// Sets the null buffer of the [ArrayData] |
1930 | 1.37k | pub fn nulls(mut self, nulls: Option<NullBuffer>) -> Self { |
1931 | 1.37k | self.nulls = nulls; |
1932 | 1.37k | self.null_count = None; |
1933 | 1.37k | self.null_bit_buffer = None; |
1934 | 1.37k | self |
1935 | 1.37k | } |
1936 | | |
1937 | | /// Sets the null count of the [ArrayData] |
1938 | 0 | pub fn null_count(mut self, null_count: usize) -> Self { |
1939 | 0 | self.null_count = Some(null_count); |
1940 | 0 | self |
1941 | 0 | } |
1942 | | |
1943 | | /// Sets the `null_bit_buffer` of the [ArrayData] |
1944 | 5 | pub fn null_bit_buffer(mut self, buf: Option<Buffer>) -> Self { |
1945 | 5 | self.nulls = None; |
1946 | 5 | self.null_bit_buffer = buf; |
1947 | 5 | self |
1948 | 5 | } |
1949 | | |
1950 | | /// Sets the offset of the [ArrayData] |
1951 | | #[inline] |
1952 | 93 | pub const fn offset(mut self, n: usize) -> Self { |
1953 | 93 | self.offset = n; |
1954 | 93 | self |
1955 | 93 | } |
1956 | | |
1957 | | /// Sets the buffers of the [ArrayData] |
1958 | 1.12k | pub fn buffers(mut self, v: Vec<Buffer>) -> Self { |
1959 | 1.12k | self.buffers = v; |
1960 | 1.12k | self |
1961 | 1.12k | } |
1962 | | |
1963 | | /// Adds a single buffer to the [ArrayData]'s buffers |
1964 | 214 | pub fn add_buffer(mut self, b: Buffer) -> Self { |
1965 | 214 | self.buffers.push(b); |
1966 | 214 | self |
1967 | 214 | } |
1968 | | |
1969 | | /// Adds multiple buffers to the [ArrayData]'s buffers |
1970 | 0 | pub fn add_buffers<I: IntoIterator<Item = Buffer>>(mut self, bs: I) -> Self { |
1971 | 0 | self.buffers.extend(bs); |
1972 | 0 | self |
1973 | 0 | } |
1974 | | |
1975 | | /// Sets the child data of the [ArrayData] |
1976 | 246 | pub fn child_data(mut self, v: Vec<ArrayData>) -> Self { |
1977 | 246 | self.child_data = v; |
1978 | 246 | self |
1979 | 246 | } |
1980 | | |
1981 | | /// Adds a single child data to the [ArrayData]'s child data |
1982 | 6 | pub fn add_child_data(mut self, r: ArrayData) -> Self { |
1983 | 6 | self.child_data.push(r); |
1984 | 6 | self |
1985 | 6 | } |
1986 | | |
1987 | | /// Creates an array data, without any validation |
1988 | | /// |
1989 | | /// Note: This is shorthand for |
1990 | | /// ```rust |
1991 | | /// # let mut builder = arrow_data::ArrayDataBuilder::new(arrow_schema::DataType::Null); |
1992 | | /// # let _ = unsafe { |
1993 | | /// builder.skip_validation(true).build().unwrap() |
1994 | | /// # }; |
1995 | | /// ``` |
1996 | | /// |
1997 | | /// # Safety |
1998 | | /// |
1999 | | /// The same caveats as [`ArrayData::new_unchecked`] |
2000 | | /// apply. |
2001 | 1.41k | pub unsafe fn build_unchecked(self) -> ArrayData { |
2002 | 1.41k | self.skip_validation(true).build().unwrap() |
2003 | 1.41k | } |
2004 | | |
2005 | | /// Creates an `ArrayData`, consuming `self` |
2006 | | /// |
2007 | | /// # Safety |
2008 | | /// |
2009 | | /// By default the underlying buffers are checked to ensure they are valid |
2010 | | /// Arrow data. However, if the [`Self::skip_validation`] flag has been set |
2011 | | /// to true (by the `unsafe` API) this validation is skipped. If the data is |
2012 | | /// not valid, undefined behavior will result. |
2013 | 1.45k | pub fn build(self) -> Result<ArrayData, ArrowError> { |
2014 | | let Self { |
2015 | 1.45k | data_type, |
2016 | 1.45k | len, |
2017 | 1.45k | null_count, |
2018 | 1.45k | null_bit_buffer, |
2019 | 1.45k | nulls, |
2020 | 1.45k | offset, |
2021 | 1.45k | buffers, |
2022 | 1.45k | child_data, |
2023 | 1.45k | align_buffers, |
2024 | 1.45k | skip_validation, |
2025 | 1.45k | } = self; |
2026 | | |
2027 | 1.45k | let nulls = nulls |
2028 | 1.45k | .or_else(|| {1.27k |
2029 | 1.27k | let buffer39 = null_bit_buffer?1.24k ; |
2030 | 39 | let buffer = BooleanBuffer::new(buffer, offset, len); |
2031 | 39 | Some(match null_count { |
2032 | 0 | Some(n) => { |
2033 | | // SAFETY: call to `data.validate_data()` below validates the null buffer is valid |
2034 | 0 | unsafe { NullBuffer::new_unchecked(buffer, n) } |
2035 | | } |
2036 | 39 | None => NullBuffer::new(buffer), |
2037 | | }) |
2038 | 1.27k | }) |
2039 | 1.45k | .filter(|b| b219 .null_count219 () != 0); |
2040 | | |
2041 | 1.45k | let mut data = ArrayData { |
2042 | 1.45k | data_type, |
2043 | 1.45k | len, |
2044 | 1.45k | offset, |
2045 | 1.45k | buffers, |
2046 | 1.45k | child_data, |
2047 | 1.45k | nulls, |
2048 | 1.45k | }; |
2049 | | |
2050 | 1.45k | if align_buffers { |
2051 | 0 | data.align_buffers(); |
2052 | 1.45k | } |
2053 | | |
2054 | | // SAFETY: `skip_validation` is only set to true using `unsafe` APIs |
2055 | 1.45k | if !skip_validation.get() || cfg!1.45k (feature = "force_validate") { |
2056 | 7 | data.validate_data()?0 ; |
2057 | 1.45k | } |
2058 | 1.45k | Ok(data) |
2059 | 1.45k | } |
2060 | | |
2061 | | /// Creates an array data, validating all inputs, and aligning any buffers |
2062 | | #[deprecated(since = "54.1.0", note = "Use ArrayData::align_buffers instead")] |
2063 | 0 | pub fn build_aligned(self) -> Result<ArrayData, ArrowError> { |
2064 | 0 | self.align_buffers(true).build() |
2065 | 0 | } |
2066 | | |
2067 | | /// Ensure that all buffers are aligned, copying data if necessary |
2068 | | /// |
2069 | | /// Rust requires that arrays are aligned to their corresponding primitive, |
2070 | | /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`]. |
2071 | | /// |
2072 | | /// [`ArrayData`] therefore requires that all buffers have at least this alignment, |
2073 | | /// to allow for [slice](std::slice) based APIs. See [`BufferSpec::FixedWidth`]. |
2074 | | /// |
2075 | | /// As this alignment is architecture specific, and not guaranteed by all arrow implementations, |
2076 | | /// this flag is provided to automatically copy buffers to a new correctly aligned allocation |
2077 | | /// when necessary, making it useful when interacting with buffers produced by other systems, |
2078 | | /// e.g. IPC or FFI. |
2079 | | /// |
2080 | | /// If this flag is not enabled, `[Self::build`] return an error on encountering |
2081 | | /// insufficiently aligned buffers. |
2082 | 0 | pub fn align_buffers(mut self, align_buffers: bool) -> Self { |
2083 | 0 | self.align_buffers = align_buffers; |
2084 | 0 | self |
2085 | 0 | } |
2086 | | |
2087 | | /// Skips validation of the data. |
2088 | | /// |
2089 | | /// If this flag is enabled, `[Self::build`] will skip validation of the |
2090 | | /// data |
2091 | | /// |
2092 | | /// If this flag is not enabled, `[Self::build`] will validate that all |
2093 | | /// buffers are valid and will return an error if any data is invalid. |
2094 | | /// Validation can be expensive. |
2095 | | /// |
2096 | | /// # Safety |
2097 | | /// |
2098 | | /// If validation is skipped, the buffers must form a valid Arrow array, |
2099 | | /// otherwise undefined behavior will result |
2100 | 1.41k | pub unsafe fn skip_validation(mut self, skip_validation: bool) -> Self { |
2101 | 1.41k | self.skip_validation.set(skip_validation); |
2102 | 1.41k | self |
2103 | 1.41k | } |
2104 | | } |
2105 | | |
2106 | | impl From<ArrayData> for ArrayDataBuilder { |
2107 | 13 | fn from(d: ArrayData) -> Self { |
2108 | 13 | Self { |
2109 | 13 | data_type: d.data_type, |
2110 | 13 | len: d.len, |
2111 | 13 | offset: d.offset, |
2112 | 13 | buffers: d.buffers, |
2113 | 13 | child_data: d.child_data, |
2114 | 13 | nulls: d.nulls, |
2115 | 13 | null_bit_buffer: None, |
2116 | 13 | null_count: None, |
2117 | 13 | align_buffers: false, |
2118 | 13 | skip_validation: UnsafeFlag::new(), |
2119 | 13 | } |
2120 | 13 | } |
2121 | | } |
2122 | | |
2123 | | #[cfg(test)] |
2124 | | mod tests { |
2125 | | use super::*; |
2126 | | use arrow_schema::{Field, Fields}; |
2127 | | |
2128 | | // See arrow/tests/array_data_validation.rs for test of array validation |
2129 | | |
2130 | | /// returns a buffer initialized with some constant value for tests |
2131 | | fn make_i32_buffer(n: usize) -> Buffer { |
2132 | | Buffer::from_slice_ref(vec![42i32; n]) |
2133 | | } |
2134 | | |
2135 | | /// returns a buffer initialized with some constant value for tests |
2136 | | fn make_f32_buffer(n: usize) -> Buffer { |
2137 | | Buffer::from_slice_ref(vec![42f32; n]) |
2138 | | } |
2139 | | |
#[test]
fn test_builder() {
    // offset (5) + len (20) means the values buffer must hold at least 25 elements
    let values: Vec<i32> = (0..25).collect();
    let values_buffer = Buffer::from_slice_ref(&values);
    let validity = Buffer::from([0b01011111, 0b10110101, 0b01100011, 0b00011110]);

    let built = ArrayData::builder(DataType::Int32)
        .len(20)
        .offset(5)
        .add_buffer(values_buffer)
        .null_bit_buffer(Some(validity))
        .build()
        .unwrap();

    assert_eq!(20, built.len());
    assert_eq!(10, built.null_count());
    assert_eq!(5, built.offset());
    assert_eq!(1, built.buffers().len());

    // the buffer is stored as given; the offset is not applied to it
    let expected = Buffer::from_slice_ref(&values);
    assert_eq!(expected.as_slice(), built.buffers()[0].as_slice());
}
2164 | | |
#[test]
fn test_builder_with_child_data() {
    // a plain five element Int32 child
    let child_values = Buffer::from_slice_ref([1i32, 2, 3, 4, 5]);
    let child = ArrayData::try_new(
        DataType::Int32,
        5,
        None,
        0,
        vec![child_values],
        vec![],
    )
    .unwrap();

    // struct with a single nullable Int32 field wrapping that child
    let fields = vec![Arc::new(Field::new("x", DataType::Int32, true))];
    let struct_type = DataType::Struct(fields.into());

    let parent = ArrayData::builder(struct_type)
        .len(5)
        .offset(0)
        .add_child_data(child.clone())
        .build()
        .unwrap();

    assert_eq!(5, parent.len());
    assert_eq!(1, parent.child_data().len());
    assert_eq!(child, parent.child_data()[0]);
}
2191 | | |
#[test]
fn test_null_count() {
    // Only bits 0, 3 and 10 are set (valid): 16 slots - 3 valid = 13 nulls
    let mut validity: [u8; 2] = [0; 2];
    for bit in [0, 3, 10] {
        bit_util::set_bit(&mut validity, bit);
    }
    let whole = ArrayData::builder(DataType::Int32)
        .len(16)
        .add_buffer(make_i32_buffer(16))
        .null_bit_buffer(Some(Buffer::from(validity)))
        .build()
        .unwrap();
    assert_eq!(13, whole.null_count());

    // Test with offset: the window covers bits 2..14, which contains the
    // valid bits 3 and 10, leaving 10 nulls
    let mut validity: [u8; 2] = [0; 2];
    for bit in [0, 3, 10] {
        bit_util::set_bit(&mut validity, bit);
    }
    let windowed = ArrayData::builder(DataType::Int32)
        .len(12)
        .offset(2)
        // offset (2) + len (12) requires at least 14 values
        .add_buffer(make_i32_buffer(14))
        .null_bit_buffer(Some(Buffer::from(validity)))
        .build()
        .unwrap();
    assert_eq!(10, windowed.null_count());
}
2220 | | |
#[test]
fn test_null_buffer_ref() {
    let mut validity: [u8; 2] = [0; 2];
    for bit in [0, 3, 10] {
        bit_util::set_bit(&mut validity, bit);
    }

    let built = ArrayData::builder(DataType::Int32)
        .len(16)
        .add_buffer(make_i32_buffer(16))
        .null_bit_buffer(Some(Buffer::from(validity)))
        .build()
        .unwrap();

    // the validity bytes come back exactly as they were passed in
    assert!(built.nulls().is_some());
    let nulls = built.nulls().unwrap();
    assert_eq!(&validity, nulls.validity());
}
2236 | | |
#[test]
fn test_slice() {
    let mut validity: [u8; 2] = [0; 2];
    for bit in [0, 3, 10] {
        bit_util::set_bit(&mut validity, bit);
    }
    let data = ArrayData::builder(DataType::Int32)
        .len(16)
        .add_buffer(make_i32_buffer(16))
        .null_bit_buffer(Some(Buffer::from(validity)))
        .build()
        .unwrap();

    // dropping slot 0 (a valid value) keeps the null count unchanged
    let first = data.slice(1, 15);
    assert_eq!(data.len() - 1, first.len());
    assert_eq!(1, first.offset());
    assert_eq!(data.null_count(), first.null_count());

    // slice of a slice (removes one null)
    let second = first.slice(1, 14);
    assert_eq!(data.len() - 2, second.len());
    assert_eq!(2, second.offset());
    assert_eq!(data.null_count() - 1, second.null_count());
}
2260 | | |
#[test]
fn test_equality() {
    let ints = ArrayData::builder(DataType::Int32)
        .len(1)
        .add_buffer(make_i32_buffer(1))
        .build()
        .unwrap();

    let floats = ArrayData::builder(DataType::Float32)
        .len(1)
        .add_buffer(make_f32_buffer(1))
        .build()
        .unwrap();

    // different data types are never equal, logically or by pointer
    assert_ne!(ints, floats);
    assert!(!ints.ptr_eq(&floats));
    assert!(ints.ptr_eq(&ints));

    // a clone is equal both logically and by pointer
    #[allow(clippy::redundant_clone)]
    let ints_clone = ints.clone();
    assert_eq!(ints, ints_clone);
    assert!(ints.ptr_eq(&ints_clone));
    assert!(ints_clone.ptr_eq(&ints));

    // a slice is pointer-equal to itself only
    let ints_slice = ints_clone.slice(1, 0);
    assert!(ints_slice.ptr_eq(&ints_slice));
    assert!(!ints.ptr_eq(&ints_slice));
    assert!(!ints_slice.ptr_eq(&ints));

    // repeat the same checks for a string array with a null buffer
    let values = Buffer::from_slice_ref("abcdef".as_bytes());
    let offsets = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
    let strings = ArrayData::try_new(
        DataType::Utf8,
        3,
        Some(Buffer::from_iter(vec![true, false, true])),
        0,
        vec![offsets, values],
        vec![],
    )
    .unwrap();

    assert_ne!(floats, strings);
    assert!(!floats.ptr_eq(&strings));

    assert!(strings.ptr_eq(&strings));

    #[allow(clippy::redundant_clone)]
    let strings_clone = strings.clone();
    assert!(strings_clone.ptr_eq(&strings));
    assert!(strings.ptr_eq(&strings_clone));

    let strings_slice = strings.slice(1, 2);
    assert!(strings_slice.ptr_eq(&strings_slice));
    assert!(!strings_slice.ptr_eq(&strings))
}
2315 | | |
#[test]
fn test_slice_memory_size() {
    let mut validity: [u8; 2] = [0; 2];
    for bit in [0, 3, 10] {
        bit_util::set_bit(&mut validity, bit);
    }
    let ints = ArrayData::builder(DataType::Int32)
        .len(16)
        .add_buffer(make_i32_buffer(16))
        .null_bit_buffer(Some(Buffer::from(validity)))
        .build()
        .unwrap();

    // the slice holds two fewer i32 values; the expected difference is 8 bytes
    let ints_slice = ints.slice(1, 14);
    let full_size = ints.get_slice_memory_size().unwrap();
    let sliced_size = ints_slice.get_slice_memory_size().unwrap();
    assert_eq!(full_size - 8, sliced_size);

    let values = Buffer::from_slice_ref("abcdef".as_bytes());
    let offsets = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
    let strings = ArrayData::try_new(
        DataType::Utf8,
        3,
        Some(Buffer::from_iter(vec![true, false, true])),
        0,
        vec![offsets, values],
        vec![],
    )
    .unwrap();
    let strings_slice = strings.slice(1, 2);

    // 4 bytes of offset and 2 bytes of data reduced by slicing.
    let full_size = strings.get_slice_memory_size().unwrap();
    let sliced_size = strings_slice.get_slice_memory_size().unwrap();
    assert_eq!(full_size - 6, sliced_size);
}
2351 | | |
#[test]
fn test_count_nulls() {
    // 16 validity bits, 9 of them set => 7 nulls overall
    let bits = Buffer::from([0b00010110, 0b10011111]);
    let nulls = NullBuffer::new(BooleanBuffer::new(bits, 0, 16));
    assert_eq!(count_nulls(Some(&nulls), 0, 16), 7);

    // bits 4..12 hold 5 set bits => 3 nulls
    assert_eq!(count_nulls(Some(&nulls), 4, 8), 3);
}
2362 | | |
#[test]
fn test_contains_nulls() {
    // validity: slots 3 and 4 are valid, every other slot is null
    let validity = [false, false, false, true, true, false];
    let bits: Buffer = MutableBuffer::from_iter(validity).into();
    let nulls = NullBuffer::new(BooleanBuffer::new(bits, 0, 6));

    // the whole range and the leading range both contain nulls
    assert!(contains_nulls(Some(&nulls), 0, 6));
    assert!(contains_nulls(Some(&nulls), 0, 3));
    // the two valid slots, and an empty range, contain none
    assert!(!contains_nulls(Some(&nulls), 3, 2));
    assert!(!contains_nulls(Some(&nulls), 0, 0));
}
2373 | | |
2374 | | #[test] |
         // Checks that `validate` rejects a misaligned Int32 values buffer and that
         // `align_buffers` makes the same data pass `validate_full` again.
2375 | | fn test_alignment() { |
2376 | | let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]); |
         // Same buffer sliced at 1; per the error asserted below this leaves it 1 off
         // the expected alignment of 4.
2377 | | let sliced = buffer.slice(1); |
2378 | | |
         // Built with a struct literal (not the builder) so the buffer can be swapped
         // afterwards; the array is empty (len 0).
2379 | | let mut data = ArrayData { |
2380 | | data_type: DataType::Int32, |
2381 | | len: 0, |
2382 | | offset: 0, |
2383 | | buffers: vec![buffer], |
2384 | | child_data: vec![], |
2385 | | nulls: None, |
2386 | | }; |
         // Sanity check: the unsliced buffer validates.
2387 | | data.validate_full().unwrap(); |
2388 | | |
2389 | | // break alignment in data |
2390 | | data.buffers[0] = sliced; |
2391 | | let err = data.validate().unwrap_err(); |
2392 | | |
         // The exact error text is part of what this test pins.
2393 | | assert_eq!( |
2394 | | err.to_string(), |
2395 | | "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1" |
2396 | | ); |
2397 | | |
         // After realigning, full validation must succeed.
2398 | | data.align_buffers(); |
2399 | | data.validate_full().unwrap(); |
2400 | | } |
2401 | | |
2402 | | #[test] |
         // Same scenario as a plain misaligned Int32 array, but the misaligned buffer
         // lives in the child of a Struct array: `validate` on the parent must report
         // it and `align_buffers` on the parent must repair it.
2403 | | fn test_alignment_struct() { |
2404 | | let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]); |
         // Same buffer sliced at 1; per the error asserted below this leaves it 1 off
         // the expected alignment of 4.
2405 | | let sliced = buffer.slice(1); |
2406 | | |
         // Empty (len 0) Int32 child, built with a struct literal.
2407 | | let child_data = ArrayData { |
2408 | | data_type: DataType::Int32, |
2409 | | len: 0, |
2410 | | offset: 0, |
2411 | | buffers: vec![buffer], |
2412 | | child_data: vec![], |
2413 | | nulls: None, |
2414 | | }; |
2415 | | |
         // Struct type with a single non-nullable Int32 field "a" wrapping that child.
2416 | | let schema = DataType::Struct(Fields::from(vec![Field::new("a", DataType::Int32, false)])); |
2417 | | let mut data = ArrayData { |
2418 | | data_type: schema, |
2419 | | len: 0, |
2420 | | offset: 0, |
2421 | | buffers: vec![], |
2422 | | child_data: vec![child_data], |
2423 | | nulls: None, |
2424 | | }; |
         // Sanity check: parent and child validate before the buffer is swapped.
2425 | | data.validate_full().unwrap(); |
2426 | | |
2427 | | // break alignment in child data |
2428 | | data.child_data[0].buffers[0] = sliced; |
2429 | | let err = data.validate().unwrap_err(); |
2430 | | |
         // The expected message names the child's type (Int32), not the Struct.
2431 | | assert_eq!( |
2432 | | err.to_string(), |
2433 | | "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1" |
2434 | | ); |
2435 | | |
         // Realigning through the parent must make full validation succeed.
2436 | | data.align_buffers(); |
2437 | | data.validate_full().unwrap(); |
2438 | | } |
2439 | | |
2440 | | #[test] |
         // Checks that `ArrayData::new_null` for the two view types (BinaryView and
         // Utf8View) yields an array of the requested length in which every slot is null.
2441 | | fn test_null_view_types() { |
2442 | | let array_len = 32; |
2443 | | let array = ArrayData::new_null(&DataType::BinaryView, array_len); |
2444 | | assert_eq!(array.len(), array_len); |
2445 | | for i in 0..array.len() { |
2446 | | assert!(array.is_null(i)); |
2447 | | } |
2448 | | |
         // Same checks for the string view type.
2449 | | let array = ArrayData::new_null(&DataType::Utf8View, array_len); |
2450 | | assert_eq!(array.len(), array_len); |
2451 | | for i in 0..array.len() { |
2452 | | assert!(array.is_null(i)); |
2453 | | } |
2454 | | } |
2455 | | } |