/Users/andrewlamb/Software/arrow-rs/arrow-array/src/array/list_array.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use crate::array::{get_offsets, make_array, print_long_array}; |
19 | | use crate::builder::{GenericListBuilder, PrimitiveBuilder}; |
20 | | use crate::{ |
21 | | Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType, FixedSizeListArray, |
22 | | iterator::GenericListArrayIter, new_empty_array, |
23 | | }; |
24 | | use arrow_buffer::{ArrowNativeType, NullBuffer, OffsetBuffer}; |
25 | | use arrow_data::{ArrayData, ArrayDataBuilder}; |
26 | | use arrow_schema::{ArrowError, DataType, FieldRef}; |
27 | | use num_integer::Integer; |
28 | | use std::any::Any; |
29 | | use std::sync::Arc; |
30 | | |
31 | | /// A type that can be used within a variable-size array to encode offset information |
32 | | /// |
33 | | /// See [`ListArray`], [`LargeListArray`], [`BinaryArray`], [`LargeBinaryArray`], |
34 | | /// [`StringArray`] and [`LargeStringArray`] |
35 | | /// |
36 | | /// [`BinaryArray`]: crate::array::BinaryArray |
37 | | /// [`LargeBinaryArray`]: crate::array::LargeBinaryArray |
38 | | /// [`StringArray`]: crate::array::StringArray |
39 | | /// [`LargeStringArray`]: crate::array::LargeStringArray |
40 | | pub trait OffsetSizeTrait: |
41 | | ArrowNativeType + std::ops::AddAssign + Integer + num_traits::CheckedAdd |
42 | | { |
43 | | /// True for 64 bit offset size and false for 32 bit offset size |
44 | | const IS_LARGE: bool; |
45 | | /// Prefix for the offset size |
46 | | const PREFIX: &'static str; |
47 | | /// The max `usize` offset |
48 | | const MAX_OFFSET: usize; |
49 | | } |
50 | | |
51 | | impl OffsetSizeTrait for i32 { |
52 | | const IS_LARGE: bool = false; |
53 | | const PREFIX: &'static str = ""; |
54 | | const MAX_OFFSET: usize = i32::MAX as usize; |
55 | | } |
56 | | |
57 | | impl OffsetSizeTrait for i64 { |
58 | | const IS_LARGE: bool = true; |
59 | | const PREFIX: &'static str = "Large"; |
60 | | const MAX_OFFSET: usize = i64::MAX as usize; |
61 | | } |
62 | | |
63 | | /// An array of [variable length lists], similar to JSON arrays |
64 | | /// (e.g. `["A", "B", "C"]`). This struct specifically represents |
65 | | /// the [list layout]. Refer to [`GenericListViewArray`] for the |
66 | | /// [list-view layout]. |
67 | | /// |
68 | | /// Lists are represented using `offsets` into a `values` child |
69 | | /// array. Each list's start and end offsets are stored in two adjacent entries of an |
70 | | /// [`OffsetBuffer`]. |
71 | | /// |
72 | | /// Arrow defines [`ListArray`] with `i32` offsets and |
73 | | /// [`LargeListArray`] with `i64` offsets. |
74 | | /// |
75 | | /// Use [`GenericListBuilder`] to construct a [`GenericListArray`]. |
76 | | /// |
77 | | /// # Representation |
78 | | /// |
79 | | /// A [`ListArray`] can represent a list of values of any other |
80 | | /// supported Arrow type. Each element of the `ListArray` itself is |
81 | | /// a list which may be empty, may contain NULL and non-null values, |
82 | | /// or may itself be NULL. |
83 | | /// |
84 | | /// For example, the `ListArray` shown in the following diagram stores |
85 | | /// lists of strings. Note that `[]` represents an empty (length |
86 | | /// 0) but non-NULL list. |
87 | | /// |
88 | | /// ```text |
89 | | /// ┌─────────────┐ |
90 | | /// │ [A,B,C] │ |
91 | | /// ├─────────────┤ |
92 | | /// │ [] │ |
93 | | /// ├─────────────┤ |
94 | | /// │ NULL │ |
95 | | /// ├─────────────┤ |
96 | | /// │ [D] │ |
97 | | /// ├─────────────┤ |
98 | | /// │ [NULL, F] │ |
99 | | /// └─────────────┘ |
100 | | /// ``` |
101 | | /// |
102 | | /// The `values` are stored in a child [`StringArray`] and the offsets |
103 | | /// are stored in an [`OffsetBuffer`] as shown in the following |
104 | | /// diagram. The logical values and offsets are shown on the left, and |
105 | | /// the actual `ListArray` encoding on the right. |
106 | | /// |
107 | | /// ```text |
108 | | /// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ |
109 | | /// ┌ ─ ─ ─ ─ ─ ─ ┐ │ |
110 | | /// ┌─────────────┐ ┌───────┐ │ ┌───┐ ┌───┐ ┌───┐ ┌───┐ |
111 | | /// │ [A,B,C] │ │ (0,3) │ │ 1 │ │ 0 │ │ │ 1 │ │ A │ │ 0 │ |
112 | | /// ├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ├───┤ ├───┤ |
113 | | /// │ [] (empty) │ │ (3,3) │ │ 1 │ │ 3 │ │ │ 1 │ │ B │ │ 1 │ |
114 | | /// ├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ├───┤ ├───┤ |
115 | | /// │ NULL │ │ (3,3) │ │ 0 │ │ 3 │ │ │ 1 │ │ C │ │ 2 │ |
116 | | /// ├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ├───┤ ├───┤ |
117 | | /// │ [D] │ │ (3,4) │ │ 1 │ │ 3 │ │ │ 1 │ │ D │ │ 3 │ |
118 | | /// ├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ├───┤ ├───┤ |
119 | | /// │ [NULL, F] │ │ (4,6) │ │ 1 │ │ 4 │ │ │ 0 │ │ ? │ │ 4 │ |
120 | | /// └─────────────┘ └───────┘ │ └───┘ ├───┤ ├───┤ ├───┤ |
121 | | /// │ 6 │ │ │ 1 │ │ F │ │ 5 │ |
122 | | /// │ Validity └───┘ └───┘ └───┘ |
123 | | /// Logical Logical (nulls) Offsets │ Values │ │ |
124 | | /// Values Offsets │ (Array) |
125 | | /// └ ─ ─ ─ ─ ─ ─ ┘ │ |
126 | | /// (offsets[i], │ ListArray |
127 | | /// offsets[i+1]) │ |
128 | | /// └ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ |
129 | | /// ``` |
130 | | /// |
131 | | /// # Slicing |
132 | | /// |
133 | | /// Slicing a `ListArray` creates a new `ListArray` without copying any data, |
134 | | /// but this means the [`Self::values`] and [`Self::offsets`] may have "unused" data. |
135 | | /// |
136 | | /// For example, calling `slice(1, 3)` on the `ListArray` in the above example |
137 | | /// would result in the following. Note: |
138 | | /// |
139 | | /// 1. `Values` array is unchanged |
140 | | /// 2. `Offsets` do not start at `0`, nor cover all values in the Values array. |
141 | | /// |
142 | | /// ```text |
143 | | /// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ |
144 | | /// ┌ ─ ─ ─ ─ ─ ─ ┐ │ ╔═══╗ |
145 | | /// │ ╔═══╗ ╔═══╗ ║ ║ Not used |
146 | | /// │ ║ 1 ║ ║ A ║ │ 0 │ ╚═══╝ |
147 | | /// ┌─────────────┐ ┌───────┐ │ ┌───┐ ┌───┐ ╠═══╣ ╠═══╣ |
148 | | /// │ [] (empty) │ │ (3,3) │ │ 1 │ │ 3 │ │ ║ 1 ║ ║ B ║ │ 1 │ |
149 | | /// ├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ╠═══╣ ╠═══╣ |
150 | | /// │ NULL │ │ (3,3) │ │ 0 │ │ 3 │ │ ║ 1 ║ ║ C ║ │ 2 │ |
151 | | /// ├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ╚═══╝ ╚═══╝ |
152 | | /// │ [D] │ │ (3,4) │ │ 1 │ │ 3 │ │ │ 1 │ │ D │ │ 3 │ |
153 | | /// └─────────────┘ └───────┘ │ └───┘ ├───┤ ╔═══╗ ╔═══╗ |
154 | | /// │ 4 │ │ ║ 0 ║ ║ ? ║ │ 4 │ |
155 | | /// │ └───┘ ╠═══╣ ╠═══╣ |
156 | | /// │ ║ 1 ║ ║ F ║ │ 5 │ |
157 | | /// │ Validity ╚═══╝ ╚═══╝ |
158 | | /// Logical Logical (nulls) Offsets │ Values │ │ |
159 | | /// Values Offsets │ (Array) |
160 | | /// └ ─ ─ ─ ─ ─ ─ ┘ │ |
161 | | /// (offsets[i], │ ListArray |
162 | | /// offsets[i+1]) │ |
163 | | /// └ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ |
164 | | /// ``` |
165 | | /// |
166 | | /// [`StringArray`]: crate::array::StringArray |
167 | | /// [`GenericListViewArray`]: crate::array::GenericListViewArray |
168 | | /// [variable length lists]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-list-layout |
169 | | /// [list layout]: https://arrow.apache.org/docs/format/Columnar.html#list-layout |
170 | | /// [list-view layout]: https://arrow.apache.org/docs/format/Columnar.html#listview-layout |
171 | | pub struct GenericListArray<OffsetSize: OffsetSizeTrait> { |
172 | | data_type: DataType, |
173 | | nulls: Option<NullBuffer>, |
174 | | values: ArrayRef, |
175 | | value_offsets: OffsetBuffer<OffsetSize>, |
176 | | } |
177 | | |
178 | | impl<OffsetSize: OffsetSizeTrait> Clone for GenericListArray<OffsetSize> { |
179 | 29 | fn clone(&self) -> Self { |
180 | 29 | Self { |
181 | 29 | data_type: self.data_type.clone(), |
182 | 29 | nulls: self.nulls.clone(), |
183 | 29 | values: self.values.clone(), |
184 | 29 | value_offsets: self.value_offsets.clone(), |
185 | 29 | } |
186 | 29 | } |
187 | | } |
188 | | |
189 | | impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> { |
190 | | /// The data type constructor of list array. |
191 | | /// The input is the schema of the child array and |
192 | | /// the output is the [`DataType`], List or LargeList. |
193 | | pub const DATA_TYPE_CONSTRUCTOR: fn(FieldRef) -> DataType = if OffsetSize::IS_LARGE { |
194 | | DataType::LargeList |
195 | | } else { |
196 | | DataType::List |
197 | | }; |
198 | | |
199 | | /// Create a new [`GenericListArray`] from the provided parts |
200 | | /// |
201 | | /// # Errors |
202 | | /// |
203 | | /// Errors if |
204 | | /// |
205 | | /// * `offsets.len() - 1 != nulls.len()` |
206 | | /// * `offsets.last() > values.len()` |
207 | | /// * `!field.is_nullable() && values.is_nullable()` |
208 | | /// * `field.data_type() != values.data_type()` |
209 | 80.0k | pub fn try_new( |
210 | 80.0k | field: FieldRef, |
211 | 80.0k | offsets: OffsetBuffer<OffsetSize>, |
212 | 80.0k | values: ArrayRef, |
213 | 80.0k | nulls: Option<NullBuffer>, |
214 | 80.0k | ) -> Result<Self, ArrowError> { |
215 | 80.0k | let len = offsets.len() - 1; // Offsets guaranteed to not be empty |
216 | 80.0k | let end_offset = offsets.last().unwrap().as_usize(); |
217 | | // don't need to check other values of `offsets` because they are checked |
218 | | // during construction of `OffsetBuffer` |
219 | 80.0k | if end_offset > values.len() { |
220 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
221 | 0 | "Max offset of {end_offset} exceeds length of values {}", |
222 | 0 | values.len() |
223 | 0 | ))); |
224 | 80.0k | } |
225 | | |
226 | 80.0k | if let Some(n15 ) = nulls.as_ref() { |
227 | 15 | if n.len() != len { |
228 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
229 | 0 | "Incorrect length of null buffer for {}ListArray, expected {len} got {}", |
230 | 0 | OffsetSize::PREFIX, |
231 | 0 | n.len(), |
232 | 0 | ))); |
233 | 15 | } |
234 | 80.0k | } |
235 | 80.0k | if !field.is_nullable() && values0 .is_nullable0 () { |
236 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
237 | 0 | "Non-nullable field of {}ListArray {:?} cannot contain nulls", |
238 | 0 | OffsetSize::PREFIX, |
239 | 0 | field.name() |
240 | 0 | ))); |
241 | 80.0k | } |
242 | | |
243 | 80.0k | if field.data_type() != values.data_type() { |
244 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
245 | 0 | "{}ListArray expected data type {} got {} for {:?}", |
246 | 0 | OffsetSize::PREFIX, |
247 | 0 | field.data_type(), |
248 | 0 | values.data_type(), |
249 | 0 | field.name() |
250 | 0 | ))); |
251 | 80.0k | } |
252 | | |
253 | 80.0k | Ok(Self { |
254 | 80.0k | data_type: Self::DATA_TYPE_CONSTRUCTOR(field), |
255 | 80.0k | nulls, |
256 | 80.0k | values, |
257 | 80.0k | value_offsets: offsets, |
258 | 80.0k | }) |
259 | 80.0k | } |
260 | | |
261 | | /// Create a new [`GenericListArray`] from the provided parts |
262 | | /// |
263 | | /// # Panics |
264 | | /// |
265 | | /// Panics if [`Self::try_new`] returns an error |
266 | 80.0k | pub fn new( |
267 | 80.0k | field: FieldRef, |
268 | 80.0k | offsets: OffsetBuffer<OffsetSize>, |
269 | 80.0k | values: ArrayRef, |
270 | 80.0k | nulls: Option<NullBuffer>, |
271 | 80.0k | ) -> Self { |
272 | 80.0k | Self::try_new(field, offsets, values, nulls).unwrap() |
273 | 80.0k | } |
274 | | |
275 | | /// Create a new [`GenericListArray`] of length `len` where all values are null |
276 | | pub fn new_null(field: FieldRef, len: usize) -> Self { |
277 | | let values = new_empty_array(field.data_type()); |
278 | | Self { |
279 | | data_type: Self::DATA_TYPE_CONSTRUCTOR(field), |
280 | | nulls: Some(NullBuffer::new_null(len)), |
281 | | value_offsets: OffsetBuffer::new_zeroed(len), |
282 | | values, |
283 | | } |
284 | | } |
285 | | |
286 | | /// Deconstruct this array into its constituent parts |
287 | | pub fn into_parts( |
288 | | self, |
289 | | ) -> ( |
290 | | FieldRef, |
291 | | OffsetBuffer<OffsetSize>, |
292 | | ArrayRef, |
293 | | Option<NullBuffer>, |
294 | | ) { |
295 | | let f = match self.data_type { |
296 | | DataType::List(f) | DataType::LargeList(f) => f, |
297 | | _ => unreachable!(), |
298 | | }; |
299 | | (f, self.value_offsets, self.values, self.nulls) |
300 | | } |
301 | | |
302 | | /// Returns a reference to the offsets of this list |
303 | | /// |
304 | | /// Unlike [`Self::value_offsets`] this returns the [`OffsetBuffer`] |
305 | | /// allowing for zero-copy cloning. |
306 | | /// |
307 | | /// Note: The `offsets` may not start at 0 and may not cover all values in |
308 | | /// [`Self::values`]. This can happen when the list array was sliced via |
309 | | /// [`Self::slice`]. See documentation for [`Self`] for more details. |
310 | | #[inline] |
311 | 240k | pub fn offsets(&self) -> &OffsetBuffer<OffsetSize> { |
312 | 240k | &self.value_offsets |
313 | 240k | } |
314 | | |
315 | | /// Returns a reference to the values of this list |
316 | | /// |
317 | | /// Note: The list array may not refer to all values in the `values` array. |
318 | | /// For example if the list array was sliced via [`Self::slice`] values will |
319 | | /// still contain values both before and after the slice. See documentation |
320 | | /// for [`Self`] for more details. |
321 | | #[inline] |
322 | 160k | pub fn values(&self) -> &ArrayRef { |
323 | 160k | &self.values |
324 | 160k | } |
325 | | |
326 | | /// Returns a clone of the value type of this list. |
327 | | pub fn value_type(&self) -> DataType { |
328 | | self.values.data_type().clone() |
329 | | } |
330 | | |
331 | | /// Returns the `i`th value of this list array. |
332 | | /// |
333 | | /// Note: This method does not check for nulls and the value is arbitrary |
334 | | /// if [`is_null`](Self::is_null) returns true for the index. |
335 | | /// |
336 | | /// # Safety |
337 | | /// Caller must ensure that the index is within the array bounds |
338 | | pub unsafe fn value_unchecked(&self, i: usize) -> ArrayRef { |
339 | | let end = unsafe { self.value_offsets().get_unchecked(i + 1).as_usize() }; |
340 | | let start = unsafe { self.value_offsets().get_unchecked(i).as_usize() }; |
341 | | self.values.slice(start, end - start) |
342 | | } |
343 | | |
344 | | /// Returns the `i`th value of this list array. |
345 | | /// |
346 | | /// Note: This method does not check for nulls and the value is arbitrary |
347 | | /// (but still well-defined) if [`is_null`](Self::is_null) returns true for the index. |
348 | | /// |
349 | | /// # Panics |
350 | | /// Panics if index `i` is out of bounds |
351 | 160k | pub fn value(&self, i: usize) -> ArrayRef { |
352 | 160k | let end = self.value_offsets()[i + 1].as_usize(); |
353 | 160k | let start = self.value_offsets()[i].as_usize(); |
354 | 160k | self.values.slice(start, end - start) |
355 | 160k | } |
356 | | |
357 | | /// Returns the offset values in the offsets buffer. |
358 | | /// |
359 | | /// See [`Self::offsets`] for more details. |
360 | | #[inline] |
361 | 320k | pub fn value_offsets(&self) -> &[OffsetSize] { |
362 | 320k | &self.value_offsets |
363 | 320k | } |
364 | | |
365 | | /// Returns the length for value at index `i`. |
366 | | #[inline] |
367 | | pub fn value_length(&self, i: usize) -> OffsetSize { |
368 | | let offsets = self.value_offsets(); |
369 | | offsets[i + 1] - offsets[i] |
370 | | } |
371 | | |
372 | | /// Constructs a new iterator over this array |
373 | 4 | pub fn iter<'a>(&'a self) -> GenericListArrayIter<'a, OffsetSize> { |
374 | 4 | GenericListArrayIter::<'a, OffsetSize>::new(self) |
375 | 4 | } |
376 | | |
377 | | #[inline] |
378 | 26 | fn get_type(data_type: &DataType) -> Option<&DataType> { |
379 | 26 | match (OffsetSize::IS_LARGE, data_type) { |
380 | 14 | (true, DataType::LargeList(child12 )) | (false, DataType::List(child)) => { |
381 | 26 | Some(child.data_type()) |
382 | | } |
383 | 0 | _ => None, |
384 | | } |
385 | 26 | } |
386 | | |
387 | | /// Returns a zero-copy slice of this array with the indicated offset and length. |
388 | | /// |
389 | | /// Note: this method does *NOT* slice the underlying values array or modify |
390 | | /// the values in the offsets buffer. See [`Self::values`] and |
391 | | /// [`Self::offsets`] for more information. |
392 | 2 | pub fn slice(&self, offset: usize, length: usize) -> Self { |
393 | | Self { |
394 | 2 | data_type: self.data_type.clone(), |
395 | 2 | nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)), |
396 | 2 | values: self.values.clone(), |
397 | 2 | value_offsets: self.value_offsets.slice(offset, length), |
398 | | } |
399 | 2 | } |
400 | | |
401 | | /// Creates a [`GenericListArray`] from an iterator of primitive values |
402 | | /// # Example |
403 | | /// ``` |
404 | | /// # use arrow_array::ListArray; |
405 | | /// # use arrow_array::types::Int32Type; |
406 | | /// |
407 | | /// let data = vec![ |
408 | | /// Some(vec![Some(0), Some(1), Some(2)]), |
409 | | /// None, |
410 | | /// Some(vec![Some(3), None, Some(5)]), |
411 | | /// Some(vec![Some(6), Some(7)]), |
412 | | /// ]; |
413 | | /// let list_array = ListArray::from_iter_primitive::<Int32Type, _, _>(data); |
414 | | /// println!("{:?}", list_array); |
415 | | /// ``` |
416 | 15 | pub fn from_iter_primitive<T, P, I>(iter: I) -> Self |
417 | 15 | where |
418 | 15 | T: ArrowPrimitiveType, |
419 | 15 | P: IntoIterator<Item = Option<<T as ArrowPrimitiveType>::Native>>, |
420 | 15 | I: IntoIterator<Item = Option<P>>, |
421 | | { |
422 | 15 | let iter = iter.into_iter(); |
423 | 15 | let size_hint = iter.size_hint().0; |
424 | 15 | let mut builder = |
425 | 15 | GenericListBuilder::with_capacity(PrimitiveBuilder::<T>::new(), size_hint); |
426 | | |
427 | 71 | for i56 in iter { |
428 | 56 | match i { |
429 | 44 | Some(p) => { |
430 | 148 | for t104 in p { |
431 | 104 | builder.values().append_option(t); |
432 | 104 | } |
433 | 44 | builder.append(true); |
434 | | } |
435 | 12 | None => builder.append(false), |
436 | | } |
437 | | } |
438 | 15 | builder.finish() |
439 | 15 | } |
440 | | } |
441 | | |
442 | | impl<OffsetSize: OffsetSizeTrait> From<ArrayData> for GenericListArray<OffsetSize> { |
443 | 26 | fn from(data: ArrayData) -> Self { |
444 | 26 | Self::try_new_from_array_data(data) |
445 | 26 | .expect("Expected infallible creation of GenericListArray from ArrayDataRef failed") |
446 | 26 | } |
447 | | } |
448 | | |
449 | | impl<OffsetSize: OffsetSizeTrait> From<GenericListArray<OffsetSize>> for ArrayData { |
450 | 30 | fn from(array: GenericListArray<OffsetSize>) -> Self { |
451 | 30 | let len = array.len(); |
452 | 30 | let builder = ArrayDataBuilder::new(array.data_type) |
453 | 30 | .len(len) |
454 | 30 | .nulls(array.nulls) |
455 | 30 | .buffers(vec![array.value_offsets.into_inner().into_inner()]) |
456 | 30 | .child_data(vec![array.values.to_data()]); |
457 | | |
458 | 30 | unsafe { builder.build_unchecked() } |
459 | 30 | } |
460 | | } |
461 | | |
462 | | impl<OffsetSize: OffsetSizeTrait> From<FixedSizeListArray> for GenericListArray<OffsetSize> { |
463 | | fn from(value: FixedSizeListArray) -> Self { |
464 | | let (field, size) = match value.data_type() { |
465 | | DataType::FixedSizeList(f, size) => (f, *size as usize), |
466 | | _ => unreachable!(), |
467 | | }; |
468 | | |
469 | | let offsets = OffsetBuffer::from_repeated_length(size, value.len()); |
470 | | |
471 | | Self { |
472 | | data_type: Self::DATA_TYPE_CONSTRUCTOR(field.clone()), |
473 | | nulls: value.nulls().cloned(), |
474 | | values: value.values().clone(), |
475 | | value_offsets: offsets, |
476 | | } |
477 | | } |
478 | | } |
479 | | |
480 | | impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> { |
481 | 26 | fn try_new_from_array_data(data: ArrayData) -> Result<Self, ArrowError> { |
482 | 26 | if data.buffers().len() != 1 { |
483 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
484 | 0 | "ListArray data should contain a single buffer only (value offsets), had {}", |
485 | 0 | data.buffers().len() |
486 | 0 | ))); |
487 | 26 | } |
488 | | |
489 | 26 | if data.child_data().len() != 1 { |
490 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
491 | 0 | "ListArray should contain a single child array (values array), had {}", |
492 | 0 | data.child_data().len() |
493 | 0 | ))); |
494 | 26 | } |
495 | | |
496 | 26 | let values = data.child_data()[0].clone(); |
497 | | |
498 | 26 | if let Some(child_data_type) = Self::get_type(data.data_type()) { |
499 | 26 | if values.data_type() != child_data_type { |
500 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
501 | 0 | "[Large]ListArray's child datatype {:?} does not \ |
502 | 0 | correspond to the List's datatype {:?}", |
503 | 0 | values.data_type(), |
504 | 0 | child_data_type |
505 | 0 | ))); |
506 | 26 | } |
507 | | } else { |
508 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
509 | 0 | "[Large]ListArray's datatype must be [Large]ListArray(). It is {:?}", |
510 | 0 | data.data_type() |
511 | 0 | ))); |
512 | | } |
513 | | |
514 | 26 | let values = make_array(values); |
515 | | // SAFETY: |
516 | | // ArrayData is valid, and verified type above |
517 | 26 | let value_offsets = unsafe { get_offsets(&data) }; |
518 | | |
519 | 26 | Ok(Self { |
520 | 26 | data_type: data.data_type().clone(), |
521 | 26 | nulls: data.nulls().cloned(), |
522 | 26 | values, |
523 | 26 | value_offsets, |
524 | 26 | }) |
525 | 26 | } |
526 | | } |
527 | | |
528 | | impl<OffsetSize: OffsetSizeTrait> Array for GenericListArray<OffsetSize> { |
529 | 80.0k | fn as_any(&self) -> &dyn Any { |
530 | 80.0k | self |
531 | 80.0k | } |
532 | | |
533 | 29 | fn to_data(&self) -> ArrayData { |
534 | 29 | self.clone().into() |
535 | 29 | } |
536 | | |
537 | 1 | fn into_data(self) -> ArrayData { |
538 | 1 | self.into() |
539 | 1 | } |
540 | | |
541 | 80.0k | fn data_type(&self) -> &DataType { |
542 | 80.0k | &self.data_type |
543 | 80.0k | } |
544 | | |
545 | 0 | fn slice(&self, offset: usize, length: usize) -> ArrayRef { |
546 | 0 | Arc::new(self.slice(offset, length)) |
547 | 0 | } |
548 | | |
549 | 80.0k | fn len(&self) -> usize { |
550 | 80.0k | self.value_offsets.len() - 1 |
551 | 80.0k | } |
552 | | |
553 | 0 | fn is_empty(&self) -> bool { |
554 | 0 | self.value_offsets.len() <= 1 |
555 | 0 | } |
556 | | |
557 | 0 | fn shrink_to_fit(&mut self) { |
558 | 0 | if let Some(nulls) = &mut self.nulls { |
559 | 0 | nulls.shrink_to_fit(); |
560 | 0 | } |
561 | 0 | self.values.shrink_to_fit(); |
562 | 0 | self.value_offsets.shrink_to_fit(); |
563 | 0 | } |
564 | | |
565 | 0 | fn offset(&self) -> usize { |
566 | 0 | 0 |
567 | 0 | } |
568 | | |
569 | 80.0k | fn nulls(&self) -> Option<&NullBuffer> { |
570 | 80.0k | self.nulls.as_ref() |
571 | 80.0k | } |
572 | | |
573 | 0 | fn logical_null_count(&self) -> usize { |
574 | | // More efficient than the default implementation |
575 | 0 | self.null_count() |
576 | 0 | } |
577 | | |
578 | 0 | fn get_buffer_memory_size(&self) -> usize { |
579 | 0 | let mut size = self.values.get_buffer_memory_size(); |
580 | 0 | size += self.value_offsets.inner().inner().capacity(); |
581 | 0 | if let Some(n) = self.nulls.as_ref() { |
582 | 0 | size += n.buffer().capacity(); |
583 | 0 | } |
584 | 0 | size |
585 | 0 | } |
586 | | |
587 | 0 | fn get_array_memory_size(&self) -> usize { |
588 | 0 | let mut size = std::mem::size_of::<Self>() + self.values.get_array_memory_size(); |
589 | 0 | size += self.value_offsets.inner().inner().capacity(); |
590 | 0 | if let Some(n) = self.nulls.as_ref() { |
591 | 0 | size += n.buffer().capacity(); |
592 | 0 | } |
593 | 0 | size |
594 | 0 | } |
595 | | } |
596 | | |
597 | | impl<OffsetSize: OffsetSizeTrait> ArrayAccessor for &GenericListArray<OffsetSize> { |
598 | | type Item = ArrayRef; |
599 | | |
600 | | fn value(&self, index: usize) -> Self::Item { |
601 | | GenericListArray::value(self, index) |
602 | | } |
603 | | |
604 | 160k | unsafe fn value_unchecked(&self, index: usize) -> Self::Item { |
605 | 160k | GenericListArray::value(self, index) |
606 | 160k | } |
607 | | } |
608 | | |
609 | | impl<OffsetSize: OffsetSizeTrait> std::fmt::Debug for GenericListArray<OffsetSize> { |
610 | 0 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { |
611 | 0 | let prefix = OffsetSize::PREFIX; |
612 | | |
613 | 0 | write!(f, "{prefix}ListArray\n[\n")?; |
614 | 0 | print_long_array(self, f, |array, index, f| { |
615 | 0 | std::fmt::Debug::fmt(&array.value(index), f) |
616 | 0 | })?; |
617 | 0 | write!(f, "]") |
618 | 0 | } |
619 | | } |
620 | | |
621 | | /// A [`GenericListArray`] of variable size lists, storing offsets as `i32`. |
622 | | /// |
623 | | /// See [`ListBuilder`](crate::builder::ListBuilder) for how to construct a [`ListArray`] |
624 | | pub type ListArray = GenericListArray<i32>; |
625 | | |
626 | | /// A [`GenericListArray`] of variable size lists, storing offsets as `i64`. |
627 | | /// |
628 | | /// See [`LargeListBuilder`](crate::builder::LargeListBuilder) for how to construct a [`LargeListArray`] |
629 | | pub type LargeListArray = GenericListArray<i64>; |
630 | | |
631 | | #[cfg(test)] |
632 | | mod tests { |
633 | | use super::*; |
634 | | use crate::builder::{FixedSizeListBuilder, Int32Builder, ListBuilder, UnionBuilder}; |
635 | | use crate::cast::AsArray; |
636 | | use crate::types::Int32Type; |
637 | | use crate::{Int32Array, Int64Array}; |
638 | | use arrow_buffer::{Buffer, ScalarBuffer, bit_util}; |
639 | | use arrow_schema::Field; |
640 | | |
641 | | fn create_from_buffers() -> ListArray { |
642 | | // [[0, 1, 2], [3, 4, 5], [6, 7]] |
643 | | let values = Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7]); |
644 | | let offsets = OffsetBuffer::new(ScalarBuffer::from(vec![0, 3, 6, 8])); |
645 | | let field = Arc::new(Field::new_list_field(DataType::Int32, true)); |
646 | | ListArray::new(field, offsets, Arc::new(values), None) |
647 | | } |
648 | | |
649 | | #[test] |
650 | | fn test_from_iter_primitive() { |
651 | | let data = vec![ |
652 | | Some(vec![Some(0), Some(1), Some(2)]), |
653 | | Some(vec![Some(3), Some(4), Some(5)]), |
654 | | Some(vec![Some(6), Some(7)]), |
655 | | ]; |
656 | | let list_array = ListArray::from_iter_primitive::<Int32Type, _, _>(data); |
657 | | |
658 | | let another = create_from_buffers(); |
659 | | assert_eq!(list_array, another) |
660 | | } |
661 | | |
662 | | #[test] |
663 | | fn test_empty_list_array() { |
664 | | // Construct an empty value array |
665 | | let value_data = ArrayData::builder(DataType::Int32) |
666 | | .len(0) |
667 | | .add_buffer(Buffer::from([])) |
668 | | .build() |
669 | | .unwrap(); |
670 | | |
671 | | // Construct an empty offset buffer |
672 | | let value_offsets = Buffer::from([]); |
673 | | |
674 | | // Construct a list array from the above two |
675 | | let list_data_type = |
676 | | DataType::List(Arc::new(Field::new_list_field(DataType::Int32, false))); |
677 | | let list_data = ArrayData::builder(list_data_type) |
678 | | .len(0) |
679 | | .add_buffer(value_offsets) |
680 | | .add_child_data(value_data) |
681 | | .build() |
682 | | .unwrap(); |
683 | | |
684 | | let list_array = ListArray::from(list_data); |
685 | | assert_eq!(list_array.len(), 0) |
686 | | } |
687 | | |
688 | | #[test] |
689 | | fn test_list_array() { |
690 | | // Construct a value array |
691 | | let value_data = ArrayData::builder(DataType::Int32) |
692 | | .len(8) |
693 | | .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) |
694 | | .build() |
695 | | .unwrap(); |
696 | | |
697 | | // Construct a buffer for value offsets, for the nested array: |
698 | | // [[0, 1, 2], [3, 4, 5], [6, 7]] |
699 | | let value_offsets = Buffer::from_slice_ref([0, 3, 6, 8]); |
700 | | |
701 | | // Construct a list array from the above two |
702 | | let list_data_type = |
703 | | DataType::List(Arc::new(Field::new_list_field(DataType::Int32, false))); |
704 | | let list_data = ArrayData::builder(list_data_type.clone()) |
705 | | .len(3) |
706 | | .add_buffer(value_offsets.clone()) |
707 | | .add_child_data(value_data.clone()) |
708 | | .build() |
709 | | .unwrap(); |
710 | | let list_array = ListArray::from(list_data); |
711 | | |
712 | | let values = list_array.values(); |
713 | | assert_eq!(value_data, values.to_data()); |
714 | | assert_eq!(DataType::Int32, list_array.value_type()); |
715 | | assert_eq!(3, list_array.len()); |
716 | | assert_eq!(0, list_array.null_count()); |
717 | | assert_eq!(6, list_array.value_offsets()[2]); |
718 | | assert_eq!(2, list_array.value_length(2)); |
719 | | assert_eq!(0, list_array.value(0).as_primitive::<Int32Type>().value(0)); |
720 | | assert_eq!( |
721 | | 0, |
722 | | unsafe { list_array.value_unchecked(0) } |
723 | | .as_primitive::<Int32Type>() |
724 | | .value(0) |
725 | | ); |
726 | | for i in 0..3 { |
727 | | assert!(list_array.is_valid(i)); |
728 | | assert!(!list_array.is_null(i)); |
729 | | } |
730 | | |
731 | | // Now test with a non-zero offset (skip first element) |
732 | | // [[3, 4, 5], [6, 7]] |
733 | | let list_data = ArrayData::builder(list_data_type) |
734 | | .len(2) |
735 | | .offset(1) |
736 | | .add_buffer(value_offsets) |
737 | | .add_child_data(value_data.clone()) |
738 | | .build() |
739 | | .unwrap(); |
740 | | let list_array = ListArray::from(list_data); |
741 | | |
742 | | let values = list_array.values(); |
743 | | assert_eq!(value_data, values.to_data()); |
744 | | assert_eq!(DataType::Int32, list_array.value_type()); |
745 | | assert_eq!(2, list_array.len()); |
746 | | assert_eq!(0, list_array.null_count()); |
747 | | assert_eq!(6, list_array.value_offsets()[1]); |
748 | | assert_eq!(2, list_array.value_length(1)); |
749 | | assert_eq!(3, list_array.value(0).as_primitive::<Int32Type>().value(0)); |
750 | | assert_eq!( |
751 | | 3, |
752 | | unsafe { list_array.value_unchecked(0) } |
753 | | .as_primitive::<Int32Type>() |
754 | | .value(0) |
755 | | ); |
756 | | } |
757 | | |
#[test]
fn test_large_list_array() {
    // Child values: eight consecutive Int32s.
    let child = ArrayData::builder(DataType::Int32)
        .len(8)
        .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7]))
        .build()
        .unwrap();

    // 64-bit offsets encoding the nested array [[0, 1, 2], [3, 4, 5], [6, 7]].
    let offsets = Buffer::from_slice_ref([0i64, 3, 6, 8]);
    let large_list_type = DataType::new_large_list(DataType::Int32, false);

    // Case 1: all three lists, no array offset.
    let full = LargeListArray::from(
        ArrayData::builder(large_list_type.clone())
            .len(3)
            .add_buffer(offsets.clone())
            .add_child_data(child.clone())
            .build()
            .unwrap(),
    );

    assert_eq!(child, full.values().to_data());
    assert_eq!(DataType::Int32, full.value_type());
    assert_eq!(3, full.len());
    assert_eq!(0, full.null_count());
    assert_eq!(6, full.value_offsets()[2]);
    assert_eq!(2, full.value_length(2));

    let first = full.value(0);
    assert_eq!(0, first.as_primitive::<Int32Type>().value(0));
    // SAFETY: index 0 is in bounds because `full` holds three lists.
    let first_unchecked = unsafe { full.value_unchecked(0) };
    assert_eq!(0, first_unchecked.as_primitive::<Int32Type>().value(0));

    (0..3).for_each(|row| {
        assert!(full.is_valid(row));
        assert!(!full.is_null(row));
    });

    // Case 2: same buffers viewed through an array offset of one,
    // i.e. [[3, 4, 5], [6, 7]].
    let shifted = LargeListArray::from(
        ArrayData::builder(large_list_type)
            .len(2)
            .offset(1)
            .add_buffer(offsets)
            .add_child_data(child.clone())
            .build()
            .unwrap(),
    );

    assert_eq!(child, shifted.values().to_data());
    assert_eq!(DataType::Int32, shifted.value_type());
    assert_eq!(2, shifted.len());
    assert_eq!(0, shifted.null_count());
    assert_eq!(6, shifted.value_offsets()[1]);
    assert_eq!(2, shifted.value_length(1));

    let first = shifted.value(0);
    assert_eq!(3, first.as_primitive::<Int32Type>().value(0));
    // SAFETY: index 0 is in bounds because `shifted` holds two lists.
    let first_unchecked = unsafe { shifted.value_unchecked(0) };
    assert_eq!(3, first_unchecked.as_primitive::<Int32Type>().value(0));
}
826 | | |
#[test]
fn test_list_array_slice() {
    // Child values: ten consecutive Int32s.
    let child = ArrayData::builder(DataType::Int32)
        .len(10)
        .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))
        .build()
        .unwrap();

    // Offsets for the nested array
    // [[0, 1], null, null, [2, 3], [4, 5], null, [6, 7, 8], null, [9]]
    let offsets = Buffer::from_slice_ref([0, 2, 2, 2, 4, 6, 6, 9, 9, 10]);

    // Validity bitmap 01011001 00000001: rows 0, 3, 4, 6 and 8 are valid.
    let mut null_bits: [u8; 2] = [0; 2];
    for valid_row in [0, 3, 4, 6, 8] {
        bit_util::set_bit(&mut null_bits, valid_row);
    }

    // Assemble the nine-row list array from offsets, child and validity.
    let list_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, false)));
    let list_array = ListArray::from(
        ArrayData::builder(list_type)
            .len(9)
            .add_buffer(offsets)
            .add_child_data(child.clone())
            .null_bit_buffer(Some(Buffer::from(null_bits)))
            .build()
            .unwrap(),
    );

    assert_eq!(child, list_array.values().to_data());
    assert_eq!(DataType::Int32, list_array.value_type());
    assert_eq!(9, list_array.len());
    assert_eq!(4, list_array.null_count());
    assert_eq!(2, list_array.value_offsets()[3]);
    assert_eq!(2, list_array.value_length(3));

    // Take rows 1..7 of the original array.
    let sliced_array = list_array.slice(1, 6);
    assert_eq!(6, sliced_array.len());
    assert_eq!(3, sliced_array.null_count());

    // Row `row` of the slice corresponds to bit `1 + row` of the bitmap.
    for row in 0..sliced_array.len() {
        let expect_valid = bit_util::get_bit(&null_bits, 1 + row);
        if expect_valid {
            assert!(sliced_array.is_valid(row));
        } else {
            assert!(sliced_array.is_null(row));
        }
    }

    // Offset and length of every non-null row in the slice,
    // given as (row, expected offset, expected length).
    let sliced_list_array = sliced_array.as_any().downcast_ref::<ListArray>().unwrap();
    for (row, offset, length) in [(2, 2, 2), (3, 4, 2), (5, 6, 3)] {
        assert_eq!(offset, sliced_list_array.value_offsets()[row]);
        assert_eq!(length, sliced_list_array.value_length(row));
    }
}
888 | | |
#[test]
fn test_large_list_array_slice() {
    // Child values: ten consecutive Int32s.
    let child = ArrayData::builder(DataType::Int32)
        .len(10)
        .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))
        .build()
        .unwrap();

    // 64-bit offsets for the nested array
    // [[0, 1], null, null, [2, 3], [4, 5], null, [6, 7, 8], null, [9]]
    let offsets = Buffer::from_slice_ref([0i64, 2, 2, 2, 4, 6, 6, 9, 9, 10]);

    // Validity bitmap 01011001 00000001: rows 0, 3, 4, 6 and 8 are valid.
    let mut null_bits: [u8; 2] = [0; 2];
    for valid_row in [0, 3, 4, 6, 8] {
        bit_util::set_bit(&mut null_bits, valid_row);
    }

    // Assemble the nine-row large list array from offsets, child and validity.
    let large_list_type = DataType::new_large_list(DataType::Int32, false);
    let list_array = LargeListArray::from(
        ArrayData::builder(large_list_type)
            .len(9)
            .add_buffer(offsets)
            .add_child_data(child.clone())
            .null_bit_buffer(Some(Buffer::from(null_bits)))
            .build()
            .unwrap(),
    );

    assert_eq!(child, list_array.values().to_data());
    assert_eq!(DataType::Int32, list_array.value_type());
    assert_eq!(9, list_array.len());
    assert_eq!(4, list_array.null_count());
    assert_eq!(2, list_array.value_offsets()[3]);
    assert_eq!(2, list_array.value_length(3));

    // Take rows 1..7 of the original array.
    let sliced_array = list_array.slice(1, 6);
    assert_eq!(6, sliced_array.len());
    assert_eq!(3, sliced_array.null_count());

    // Row `row` of the slice corresponds to bit `1 + row` of the bitmap.
    for row in 0..sliced_array.len() {
        let expect_valid = bit_util::get_bit(&null_bits, 1 + row);
        if expect_valid {
            assert!(sliced_array.is_valid(row));
        } else {
            assert!(sliced_array.is_null(row));
        }
    }

    // Offset and length of every non-null row in the slice,
    // given as (row, expected offset, expected length).
    let sliced_list_array = sliced_array
        .as_any()
        .downcast_ref::<LargeListArray>()
        .unwrap();
    for (row, offset, length) in [(2, 2, 2), (3, 4, 2), (5, 6, 3)] {
        assert_eq!(offset, sliced_list_array.value_offsets()[row]);
        assert_eq!(length, sliced_list_array.value_length(row));
    }
}
952 | | |
#[test]
#[should_panic(expected = "index out of bounds: the len is 10 but the index is 11")]
fn test_list_array_index_out_of_bound() {
    // Child values: ten consecutive Int32s.
    let child = ArrayData::builder(DataType::Int32)
        .len(10)
        .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))
        .build()
        .unwrap();

    // 64-bit offsets for the nested array
    // [[0, 1], null, null, [2, 3], [4, 5], null, [6, 7, 8], null, [9]]
    let offsets = Buffer::from_slice_ref([0i64, 2, 2, 2, 4, 6, 6, 9, 9, 10]);

    // Validity bitmap 01011001 00000001: rows 0, 3, 4, 6 and 8 are valid.
    let mut null_bits: [u8; 2] = [0; 2];
    for valid_row in [0, 3, 4, 6, 8] {
        bit_util::set_bit(&mut null_bits, valid_row);
    }

    // Assemble the nine-row large list array from offsets, child and validity.
    let large_list_type = DataType::new_large_list(DataType::Int32, false);
    let list_array = LargeListArray::from(
        ArrayData::builder(large_list_type)
            .len(9)
            .add_buffer(offsets)
            .add_child_data(child)
            .null_bit_buffer(Some(Buffer::from(null_bits)))
            .build()
            .unwrap(),
    );
    assert_eq!(9, list_array.len());

    // Row 10 does not exist; the access is expected to panic with the
    // message named in `should_panic` above.
    list_array.value(10);
}
#[test]
#[should_panic(expected = "ListArray data should contain a single buffer only (value offsets)")]
// The panic message differs under `force_validate`, so this test is skipped there for now:
// https://github.com/apache/arrow-rs/issues/1545
#[cfg(not(feature = "force_validate"))]
fn test_list_array_invalid_buffer_len() {
    // SAFETY: validation is skipped on purpose; this data is only used as the
    // child of the malformed list data built below.
    let child = unsafe {
        ArrayData::builder(DataType::Int32)
            .len(8)
            .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7]))
            .build_unchecked()
    };

    let list_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, false)));

    // SAFETY: validation is skipped on purpose. No offsets buffer is added, and
    // the data is only handed to `ListArray::from`, which is expected to panic.
    let malformed = unsafe {
        ArrayData::builder(list_type)
            .len(3)
            .add_child_data(child)
            .build_unchecked()
    };

    drop(ListArray::from(malformed));
}
1010 | | |
#[test]
#[should_panic(expected = "ListArray should contain a single child array (values array)")]
// The panic message differs under `force_validate`, so this test is skipped there for now:
// https://github.com/apache/arrow-rs/issues/1545
#[cfg(not(feature = "force_validate"))]
fn test_list_array_invalid_child_array_len() {
    let list_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, false)));
    let offsets = Buffer::from_slice_ref([0, 2, 5, 7]);

    // SAFETY: validation is skipped on purpose. No child data is added, and the
    // data is only handed to `ListArray::from`, which is expected to panic.
    let malformed = unsafe {
        ArrayData::builder(list_type)
            .len(3)
            .add_buffer(offsets)
            .build_unchecked()
    };

    drop(ListArray::from(malformed));
}
1028 | | |
#[test]
#[should_panic(expected = "[Large]ListArray's datatype must be [Large]ListArray(). It is List")]
fn test_from_array_data_validation() {
    // Build a one-row `ListArray` holding [[1]].
    let mut list_builder = ListBuilder::new(Int32Builder::new());
    list_builder.values().append_value(1);
    list_builder.append(true);
    let small_list_data = list_builder.finish().into_data();

    // Converting `List` data into a `LargeListArray` is expected to panic.
    let _ = LargeListArray::from(small_list_data);
}
1038 | | |
#[test]
fn test_list_array_offsets_need_not_start_at_zero() {
    let child = ArrayData::builder(DataType::Int32)
        .len(8)
        .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7]))
        .build()
        .unwrap();

    // The first offset is 2 rather than 0.
    let offsets = Buffer::from_slice_ref([2, 2, 5, 7]);
    let list_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, false)));

    let list_array = ListArray::from(
        ArrayData::builder(list_type)
            .len(3)
            .add_buffer(offsets)
            .add_child_data(child)
            .build()
            .unwrap(),
    );

    // (row, expected length) derived from consecutive offset differences.
    for (row, expected_len) in [(0, 0), (1, 3), (2, 2)] {
        assert_eq!(list_array.value_length(row), expected_len);
    }
}
1063 | | |
#[test]
#[should_panic(expected = "Memory pointer is not aligned with the specified scalar type")]
// The panic message differs under `force_validate`, so this test is skipped there for now:
// https://github.com/apache/arrow-rs/issues/1545
#[cfg(not(feature = "force_validate"))]
fn test_primitive_array_alignment() {
    // Slice a u64-backed buffer at offset 1 to get the misaligned buffer the
    // expected panic refers to.
    let misaligned = Buffer::from_slice_ref([0_u64]).slice(1);

    // SAFETY: validation is skipped on purpose; the data is only handed to
    // `Int32Array::from`, which is expected to panic.
    let malformed = unsafe {
        ArrayData::builder(DataType::Int32)
            .add_buffer(misaligned)
            .build_unchecked()
    };

    drop(Int32Array::from(malformed));
}
1079 | | |
#[test]
#[should_panic(expected = "Memory pointer is not aligned with the specified scalar type")]
// The panic message differs under `force_validate`, so this test is skipped there for now:
// https://github.com/apache/arrow-rs/issues/1545
#[cfg(not(feature = "force_validate"))]
fn test_list_array_alignment() {
    // Slice a u64-backed buffer at offset 1 to get the misaligned offsets
    // buffer the expected panic refers to.
    let misaligned_offsets = Buffer::from_slice_ref([0_u64]).slice(1);

    let zeros: [i32; 8] = [0; 8];
    // SAFETY: validation is skipped on purpose; this data is only used as the
    // child of the malformed list data built below.
    let child = unsafe {
        ArrayData::builder(DataType::Int32)
            .add_buffer(Buffer::from_slice_ref(zeros))
            .build_unchecked()
    };

    let list_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, false)));
    // SAFETY: validation is skipped on purpose; the data is only handed to
    // `ListArray::from`, which is expected to panic.
    let malformed = unsafe {
        ArrayData::builder(list_type)
            .add_buffer(misaligned_offsets)
            .add_child_data(child)
            .build_unchecked()
    };

    drop(ListArray::from(malformed));
}
1106 | | |
#[test]
fn list_array_equality() {
    /// Row-wise description of a nullable list of nullable `i32`s.
    type Rows = Vec<Option<Vec<Option<i32>>>>;

    /// Builds both inputs as `ListArray` and as `LargeListArray` and checks
    /// that `==` yields `expect_equal` for each offset width.
    fn check(left: Rows, right: Rows, expect_equal: bool) {
        let small_left = ListArray::from_iter_primitive::<Int32Type, _, _>(left.clone());
        let small_right = ListArray::from_iter_primitive::<Int32Type, _, _>(right.clone());
        assert_eq!(small_left == small_right, expect_equal);

        let large_left = LargeListArray::from_iter_primitive::<Int32Type, _, _>(left);
        let large_right = LargeListArray::from_iter_primitive::<Int32Type, _, _>(right);
        assert_eq!(large_left == large_right, expect_equal);
    }

    let leading_list: Rows = vec![
        Some(vec![Some(0), Some(1), Some(2)]),
        None,
        Some(vec![Some(3), None, Some(5)]),
        Some(vec![Some(6), Some(7)]),
    ];
    let leading_nulls: Rows = vec![
        None,
        None,
        Some(vec![Some(3), None, Some(5)]),
        Some(vec![Some(6), Some(7)]),
    ];
    let zeroed_tail: Rows = vec![
        None,
        None,
        Some(vec![Some(3), None, Some(5)]),
        Some(vec![Some(0), Some(0)]),
    ];

    // Identical contents compare equal.
    check(leading_list.clone(), leading_list.clone(), true);

    // First row is null on one side and a list on the other.
    check(leading_nulls.clone(), leading_list, false);

    // Same validity, but the last row holds different values.
    check(leading_nulls, zeroed_tail, false);

    // Same shape, different single value in the only non-null row.
    check(
        vec![None, None, Some(vec![Some(1)])],
        vec![None, None, Some(vec![Some(2)])],
        false,
    );
}
1178 | | |
#[test]
fn test_empty_offsets() {
    // Checks that array data with an empty offsets buffer yields an empty
    // list array whose `value_offsets()` is the single entry `[0]`, for both
    // 32-bit and 64-bit offsets.
    let f = Arc::new(Field::new("element", DataType::Int32, true));

    let list = ListArray::from(
        ArrayData::builder(DataType::List(f.clone()))
            .buffers(vec![Buffer::from(&[])])
            .add_child_data(ArrayData::new_empty(&DataType::Int32))
            .build()
            .unwrap(),
    );
    // Mirror the length check made for the large variant below.
    assert_eq!(list.len(), 0);
    assert_eq!(list.value_offsets(), &[0]);

    let large_list = LargeListArray::from(
        ArrayData::builder(DataType::LargeList(f))
            .buffers(vec![Buffer::from(&[])])
            .add_child_data(ArrayData::new_empty(&DataType::Int32))
            .build()
            .unwrap(),
    );
    assert_eq!(large_list.len(), 0);
    assert_eq!(large_list.value_offsets(), &[0]);
}
1200 | | |
#[test]
fn test_try_new() {
    let element_i32 = Arc::new(Field::new("element", DataType::Int32, false));
    let i32_values = Arc::new(Int32Array::new(vec![1, 2, 3, 4, 5].into(), None)) as ArrayRef;
    let three_lists = OffsetBuffer::new(vec![0, 1, 4, 5].into());

    // Matching inputs construct without panicking, with or without a
    // validity buffer.
    ListArray::new(
        element_i32.clone(),
        three_lists.clone(),
        i32_values.clone(),
        None,
    );
    ListArray::new(
        element_i32.clone(),
        three_lists,
        i32_values.clone(),
        Some(NullBuffer::new_null(3)),
    );

    // Four lists paired with a three-entry validity buffer.
    let four_lists = OffsetBuffer::new(vec![0, 1, 2, 4, 5].into());
    let length_mismatch = LargeListArray::try_new(
        element_i32,
        four_lists.clone(),
        i32_values.clone(),
        Some(NullBuffer::new_null(3)),
    )
    .unwrap_err();
    assert_eq!(
        length_mismatch.to_string(),
        "Invalid argument error: Incorrect length of null buffer for LargeListArray, expected 4 got 3"
    );

    // Field declares Int64 while the values are Int32.
    let element_i64 = Arc::new(Field::new("element", DataType::Int64, false));
    let type_mismatch = LargeListArray::try_new(
        element_i64.clone(),
        four_lists.clone(),
        i32_values.clone(),
        None,
    )
    .unwrap_err();
    assert_eq!(
        type_mismatch.to_string(),
        "Invalid argument error: LargeListArray expected data type Int64 got Int32 for \"element\""
    );

    // Non-nullable field paired with values that are all null.
    let all_null_i64 = Arc::new(Int64Array::new(
        vec![0; 7].into(),
        Some(NullBuffer::new_null(7)),
    ));
    let nullability_mismatch = LargeListArray::try_new(
        element_i64,
        four_lists.clone(),
        all_null_i64.clone(),
        None,
    )
    .unwrap_err();
    assert_eq!(
        nullability_mismatch.to_string(),
        "Invalid argument error: Non-nullable field of LargeListArray \"element\" cannot contain nulls"
    );

    // The same values construct without panicking once the field is nullable.
    let nullable_i64 = Arc::new(Field::new("element", DataType::Int64, true));
    LargeListArray::new(
        nullable_i64.clone(),
        four_lists.clone(),
        all_null_i64,
        None,
    );

    // Offsets reach index 5 but only two values are available.
    let too_few_values = Int64Array::new(vec![0; 2].into(), None);
    let offset_overflow =
        LargeListArray::try_new(nullable_i64, four_lists, Arc::new(too_few_values), None)
            .unwrap_err();
    assert_eq!(
        offset_overflow.to_string(),
        "Invalid argument error: Max offset of 5 exceeds length of values 2"
    );
}
1255 | | |
#[test]
fn test_from_fixed_size_list() {
    // Three fixed-size rows of width 3; the middle row is marked null.
    let mut fixed_builder = FixedSizeListBuilder::new(Int32Builder::new(), 3);
    for (items, is_valid) in [([1, 2, 3], true), ([0, 0, 0], false), ([4, 5, 6], true)] {
        fixed_builder.values().append_slice(&items);
        fixed_builder.append(is_valid);
    }

    // Convert the fixed-size list into a variable-size list.
    let list = ListArray::from(fixed_builder.finish());

    let rows: Vec<_> = list
        .iter()
        .map(|row| row.map(|items| items.as_primitive::<Int32Type>().values().to_vec()))
        .collect();
    assert_eq!(rows, vec![Some(vec![1, 2, 3]), None, Some(vec![4, 5, 6])])
}
1273 | | |
#[test]
fn test_nullable_union() {
    // Dense union with five Int32 entries spread over the fields "a" and "b".
    let mut union_builder = UnionBuilder::new_dense();
    for (field_name, value) in [("a", 1), ("b", 2), ("b", 3), ("a", 4), ("a", 5)] {
        union_builder.append::<Int32Type>(field_name, value).unwrap();
    }
    let union_values = union_builder.build().unwrap();

    // Wrap the union in a list whose element field is declared non-nullable;
    // construction is expected not to panic.
    let element = Arc::new(Field::new(
        "element",
        union_values.data_type().clone(),
        false,
    ));
    let offsets = OffsetBuffer::new(vec![0, 1, 4, 5].into());
    ListArray::new(element.clone(), offsets, Arc::new(union_values), None);
}
1287 | | } |