/Users/andrewlamb/Software/arrow-rs/arrow-array/src/array/struct_array.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use crate::array::print_long_array; |
19 | | use crate::{make_array, new_null_array, Array, ArrayRef, RecordBatch}; |
20 | | use arrow_buffer::{BooleanBuffer, Buffer, NullBuffer}; |
21 | | use arrow_data::{ArrayData, ArrayDataBuilder}; |
22 | | use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields}; |
23 | | use std::sync::Arc; |
24 | | use std::{any::Any, ops::Index}; |
25 | | |
26 | | /// An array of [structs](https://arrow.apache.org/docs/format/Columnar.html#struct-layout) |
27 | | /// |
28 | | /// Each child (called *field*) is represented by a separate array. |
29 | | /// |
30 | | /// # Comparison with [RecordBatch] |
31 | | /// |
32 | | /// Both [`RecordBatch`] and [`StructArray`] represent a collection of columns / arrays with the |
33 | | /// same length. |
34 | | /// |
35 | | /// However, there are a couple of key differences: |
36 | | /// |
37 | | /// * [`StructArray`] can be nested within other [`Array`], including itself |
38 | | /// * [`RecordBatch`] can contain top-level metadata on its associated [`Schema`][arrow_schema::Schema] |
39 | | /// * [`StructArray`] can contain top-level nulls, i.e. `null` |
40 | | /// * [`RecordBatch`] can only represent nulls in its child columns, i.e. `{"field": null}` |
41 | | /// |
42 | | /// [`StructArray`] is therefore a more general data container than [`RecordBatch`], and as such |
43 | | /// code that needs to handle both will typically share an implementation in terms of |
44 | | /// [`StructArray`] and convert to/from [`RecordBatch`] as necessary. |
45 | | /// |
46 | | /// [`From`] implementations are provided to facilitate this conversion, however, converting |
47 | | /// from a [`StructArray`] containing top-level nulls to a [`RecordBatch`] will panic, as there |
48 | | /// is no way to preserve them. |
49 | | /// |
50 | | /// # Example: Create an array from a vector of fields |
51 | | /// |
52 | | /// ``` |
53 | | /// use std::sync::Arc; |
54 | | /// use arrow_array::{Array, ArrayRef, BooleanArray, Int32Array, StructArray}; |
55 | | /// use arrow_schema::{DataType, Field}; |
56 | | /// |
57 | | /// let boolean = Arc::new(BooleanArray::from(vec![false, false, true, true])); |
58 | | /// let int = Arc::new(Int32Array::from(vec![42, 28, 19, 31])); |
59 | | /// |
60 | | /// let struct_array = StructArray::from(vec![ |
61 | | /// ( |
62 | | /// Arc::new(Field::new("b", DataType::Boolean, false)), |
63 | | /// boolean.clone() as ArrayRef, |
64 | | /// ), |
65 | | /// ( |
66 | | /// Arc::new(Field::new("c", DataType::Int32, false)), |
67 | | /// int.clone() as ArrayRef, |
68 | | /// ), |
69 | | /// ]); |
70 | | /// assert_eq!(struct_array.column(0).as_ref(), boolean.as_ref()); |
71 | | /// assert_eq!(struct_array.column(1).as_ref(), int.as_ref()); |
72 | | /// assert_eq!(4, struct_array.len()); |
73 | | /// assert_eq!(0, struct_array.null_count()); |
74 | | /// assert_eq!(0, struct_array.offset()); |
75 | | /// ``` |
76 | | #[derive(Clone)] |
77 | | pub struct StructArray { |
78 | | len: usize, |
79 | | data_type: DataType, |
80 | | nulls: Option<NullBuffer>, |
81 | | fields: Vec<ArrayRef>, |
82 | | } |
83 | | |
84 | | impl StructArray { |
85 | | /// Create a new [`StructArray`] from the provided parts, panicking on failure |
86 | | /// |
87 | | /// # Panics |
88 | | /// |
89 | | /// Panics if [`Self::try_new`] returns an error |
90 | 87 | pub fn new(fields: Fields, arrays: Vec<ArrayRef>, nulls: Option<NullBuffer>) -> Self { |
91 | 87 | Self::try_new(fields, arrays, nulls).unwrap() |
92 | 87 | } |
93 | | |
94 | | /// Create a new [`StructArray`] from the provided parts, returning an error on failure |
95 | | /// |
96 | | /// The length will be inferred from the length of the child arrays. Returns an error if |
97 | | /// there are no child arrays. Consider using [`Self::try_new_with_length`] if the length |
98 | | /// is known to avoid this. |
99 | | /// |
100 | | /// # Errors |
101 | | /// |
102 | | /// Errors if |
103 | | /// |
104 | | /// * `fields.len() == 0` |
105 | | /// * Any reason that [`Self::try_new_with_length`] would error |
106 | 87 | pub fn try_new( |
107 | 87 | fields: Fields, |
108 | 87 | arrays: Vec<ArrayRef>, |
109 | 87 | nulls: Option<NullBuffer>, |
110 | 87 | ) -> Result<Self, ArrowError> { |
111 | 87 | let len = arrays.first().map(|x| x.len()).ok_or_else(||ArrowError::InvalidArgumentError("use StructArray::try_new_with_length or StructArray::new_empty_fields to create a struct array with no fields so that the length can be set correctly"0 .to_string0 ()))?0 ; |
112 | | |
113 | 87 | Self::try_new_with_length(fields, arrays, nulls, len) |
114 | 87 | } |
115 | | |
116 | | /// Create a new [`StructArray`] from the provided parts, returning an error on failure |
117 | | /// |
118 | | /// # Errors |
119 | | /// |
120 | | /// Errors if |
121 | | /// |
122 | | /// * `fields.len() != arrays.len()` |
123 | | /// * `fields[i].data_type() != arrays[i].data_type()` |
124 | | /// * `arrays[i].len() != arrays[j].len()` |
125 | | /// * `arrays[i].len() != nulls.len()` |
126 | | /// * `!fields[i].is_nullable() && !nulls.contains(arrays[i].nulls())` |
127 | 92 | pub fn try_new_with_length( |
128 | 92 | fields: Fields, |
129 | 92 | arrays: Vec<ArrayRef>, |
130 | 92 | nulls: Option<NullBuffer>, |
131 | 92 | len: usize, |
132 | 92 | ) -> Result<Self, ArrowError> { |
133 | 92 | if fields.len() != arrays.len() { |
134 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
135 | 0 | "Incorrect number of arrays for StructArray fields, expected {} got {}", |
136 | 0 | fields.len(), |
137 | 0 | arrays.len() |
138 | 0 | ))); |
139 | 92 | } |
140 | | |
141 | 92 | if let Some(n23 ) = nulls.as_ref() { |
142 | 23 | if n.len() != len { |
143 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
144 | 0 | "Incorrect number of nulls for StructArray, expected {len} got {}", |
145 | 0 | n.len(), |
146 | 0 | ))); |
147 | 23 | } |
148 | 69 | } |
149 | | |
150 | 161 | for (f, a) in fields.iter()92 .zip92 (&arrays92 ) { |
151 | 161 | if f.data_type() != a.data_type() { |
152 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
153 | 0 | "Incorrect datatype for StructArray field {:?}, expected {} got {}", |
154 | 0 | f.name(), |
155 | 0 | f.data_type(), |
156 | 0 | a.data_type() |
157 | 0 | ))); |
158 | 161 | } |
159 | | |
160 | 161 | if a.len() != len { |
161 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
162 | 0 | "Incorrect array length for StructArray field {:?}, expected {} got {}", |
163 | 0 | f.name(), |
164 | 0 | len, |
165 | 0 | a.len() |
166 | 0 | ))); |
167 | 161 | } |
168 | | |
169 | 161 | if !f.is_nullable() { |
170 | 56 | if let Some(a2 ) = a.logical_nulls() { |
171 | 2 | if !nulls.as_ref().map(|n| n.contains(&a)).unwrap_or_default() |
172 | 0 | && a.null_count() > 0 |
173 | | { |
174 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
175 | 0 | "Found unmasked nulls for non-nullable StructArray field {:?}", |
176 | 0 | f.name() |
177 | 0 | ))); |
178 | 2 | } |
179 | 54 | } |
180 | 105 | } |
181 | | } |
182 | | |
183 | | Ok(Self { |
184 | 92 | len, |
185 | 92 | data_type: DataType::Struct(fields), |
186 | 92 | nulls: nulls.filter(|n| n23 .null_count23 () > 0), |
187 | 92 | fields: arrays, |
188 | | }) |
189 | 92 | } |
190 | | |
191 | | /// Create a new [`StructArray`] of length `len` where all values are null |
192 | 0 | pub fn new_null(fields: Fields, len: usize) -> Self { |
193 | 0 | let arrays = fields |
194 | 0 | .iter() |
195 | 0 | .map(|f| new_null_array(f.data_type(), len)) |
196 | 0 | .collect(); |
197 | | |
198 | 0 | Self { |
199 | 0 | len, |
200 | 0 | data_type: DataType::Struct(fields), |
201 | 0 | nulls: Some(NullBuffer::new_null(len)), |
202 | 0 | fields: arrays, |
203 | 0 | } |
204 | 0 | } |
205 | | |
206 | | /// Create a new [`StructArray`] from the provided parts without validation |
207 | | /// |
208 | | /// The length will be inferred from the length of the child arrays. Panics if there are no |
209 | | /// child arrays. Consider using [`Self::new_unchecked_with_length`] if the length is known |
210 | | /// to avoid this. |
211 | | /// |
212 | | /// # Safety |
213 | | /// |
214 | | /// Safe if [`Self::new`] would not panic with the given arguments |
215 | 0 | pub unsafe fn new_unchecked( |
216 | 0 | fields: Fields, |
217 | 0 | arrays: Vec<ArrayRef>, |
218 | 0 | nulls: Option<NullBuffer>, |
219 | 0 | ) -> Self { |
220 | 0 | if cfg!(feature = "force_validate") { |
221 | 0 | return Self::new(fields, arrays, nulls); |
222 | 0 | } |
223 | | |
224 | 0 | let len = arrays.first().map(|x| x.len()).expect( |
225 | 0 | "cannot use StructArray::new_unchecked if there are no fields, length is unknown", |
226 | | ); |
227 | 0 | Self { |
228 | 0 | len, |
229 | 0 | data_type: DataType::Struct(fields), |
230 | 0 | nulls, |
231 | 0 | fields: arrays, |
232 | 0 | } |
233 | 0 | } |
234 | | |
235 | | /// Create a new [`StructArray`] from the provided parts without validation |
236 | | /// |
237 | | /// # Safety |
238 | | /// |
239 | | /// Safe if [`Self::new`] would not panic with the given arguments |
240 | 0 | pub unsafe fn new_unchecked_with_length( |
241 | 0 | fields: Fields, |
242 | 0 | arrays: Vec<ArrayRef>, |
243 | 0 | nulls: Option<NullBuffer>, |
244 | 0 | len: usize, |
245 | 0 | ) -> Self { |
246 | 0 | if cfg!(feature = "force_validate") { |
247 | 0 | return Self::try_new_with_length(fields, arrays, nulls, len).unwrap(); |
248 | 0 | } |
249 | | |
250 | 0 | Self { |
251 | 0 | len, |
252 | 0 | data_type: DataType::Struct(fields), |
253 | 0 | nulls, |
254 | 0 | fields: arrays, |
255 | 0 | } |
256 | 0 | } |
257 | | |
258 | | /// Create a new [`StructArray`] containing no fields |
259 | | /// |
260 | | /// # Panics |
261 | | /// |
262 | | /// If `len != nulls.len()` |
263 | 0 | pub fn new_empty_fields(len: usize, nulls: Option<NullBuffer>) -> Self { |
264 | 0 | if let Some(n) = &nulls { |
265 | 0 | assert_eq!(len, n.len()) |
266 | 0 | } |
267 | 0 | Self { |
268 | 0 | len, |
269 | 0 | data_type: DataType::Struct(Fields::empty()), |
270 | 0 | fields: vec![], |
271 | 0 | nulls, |
272 | 0 | } |
273 | 0 | } |
274 | | |
275 | | /// Deconstruct this array into its constituent parts |
276 | 0 | pub fn into_parts(self) -> (Fields, Vec<ArrayRef>, Option<NullBuffer>) { |
277 | 0 | let f = match self.data_type { |
278 | 0 | DataType::Struct(f) => f, |
279 | 0 | _ => unreachable!(), |
280 | | }; |
281 | 0 | (f, self.fields, self.nulls) |
282 | 0 | } |
283 | | |
284 | | /// Returns the field at `pos`. |
285 | 39 | pub fn column(&self, pos: usize) -> &ArrayRef { |
286 | 39 | &self.fields[pos] |
287 | 39 | } |
288 | | |
289 | | /// Return the number of fields in this struct array |
290 | 0 | pub fn num_columns(&self) -> usize { |
291 | 0 | self.fields.len() |
292 | 0 | } |
293 | | |
294 | | /// Returns the fields of the struct array |
295 | 20 | pub fn columns(&self) -> &[ArrayRef] { |
296 | 20 | &self.fields |
297 | 20 | } |
298 | | |
299 | | /// Return field names in this struct array |
300 | 12 | pub fn column_names(&self) -> Vec<&str> { |
301 | 12 | match self.data_type() { |
302 | 12 | DataType::Struct(fields) => fields |
303 | 12 | .iter() |
304 | 19 | .map12 (|f| f.name().as_str()) |
305 | 12 | .collect::<Vec<&str>>(), |
306 | 0 | _ => unreachable!("Struct array's data type is not struct!"), |
307 | | } |
308 | 12 | } |
309 | | |
310 | | /// Returns the [`Fields`] of this [`StructArray`] |
311 | 0 | pub fn fields(&self) -> &Fields { |
312 | 0 | match self.data_type() { |
313 | 0 | DataType::Struct(f) => f, |
314 | 0 | _ => unreachable!(), |
315 | | } |
316 | 0 | } |
317 | | |
318 | | /// Return child array whose field name equals to column_name |
319 | | /// |
320 | | /// Note: A schema can currently have duplicate field names, in which case |
321 | | /// the first field will always be selected. |
322 | | /// This issue will be addressed in [ARROW-11178](https://issues.apache.org/jira/browse/ARROW-11178) |
323 | 12 | pub fn column_by_name(&self, column_name: &str) -> Option<&ArrayRef> { |
324 | 12 | self.column_names() |
325 | 12 | .iter() |
326 | 14 | .position12 (|c| c == &column_name) |
327 | 12 | .map(|pos| self.column(pos)) |
328 | 12 | } |
329 | | |
330 | | /// Returns a zero-copy slice of this array with the indicated offset and length. |
331 | 17 | pub fn slice(&self, offset: usize, len: usize) -> Self { |
332 | 17 | assert!( |
333 | 17 | offset.saturating_add(len) <= self.len, |
334 | 0 | "the length + offset of the sliced StructArray cannot exceed the existing length" |
335 | | ); |
336 | | |
337 | 31 | let fields17 = self.fields.iter()17 .map17 (|a| a.slice(offset, len)).collect17 (); |
338 | | |
339 | | Self { |
340 | 17 | len, |
341 | 17 | data_type: self.data_type.clone(), |
342 | 17 | nulls: self.nulls.as_ref().map(|n| n5 .slice5 (offset5 , len5 )), |
343 | 17 | fields, |
344 | | } |
345 | 17 | } |
346 | | } |
347 | | |
348 | | impl From<ArrayData> for StructArray { |
349 | 15 | fn from(data: ArrayData) -> Self { |
350 | 15 | let parent_offset = data.offset(); |
351 | 15 | let parent_len = data.len(); |
352 | | |
353 | 15 | let fields = data |
354 | 15 | .child_data() |
355 | 15 | .iter() |
356 | 25 | .map15 (|cd| { |
357 | 25 | if parent_offset != 0 || parent_len != cd.len() { |
358 | 0 | make_array(cd.slice(parent_offset, parent_len)) |
359 | | } else { |
360 | 25 | make_array(cd.clone()) |
361 | | } |
362 | 25 | }) |
363 | 15 | .collect(); |
364 | | |
365 | 15 | Self { |
366 | 15 | len: data.len(), |
367 | 15 | data_type: data.data_type().clone(), |
368 | 15 | nulls: data.nulls().cloned(), |
369 | 15 | fields, |
370 | 15 | } |
371 | 15 | } |
372 | | } |
373 | | |
374 | | impl From<StructArray> for ArrayData { |
375 | 99 | fn from(array: StructArray) -> Self { |
376 | 99 | let builder = ArrayDataBuilder::new(array.data_type) |
377 | 99 | .len(array.len) |
378 | 99 | .nulls(array.nulls) |
379 | 172 | .child_data99 (array.fields.iter()99 .map99 (|x| x.to_data()).collect99 ()); |
380 | | |
381 | 99 | unsafe { builder.build_unchecked() } |
382 | 99 | } |
383 | | } |
384 | | |
385 | | impl TryFrom<Vec<(&str, ArrayRef)>> for StructArray { |
386 | | type Error = ArrowError; |
387 | | |
388 | | /// builds a StructArray from a vector of names and arrays. |
389 | 0 | fn try_from(values: Vec<(&str, ArrayRef)>) -> Result<Self, ArrowError> { |
390 | 0 | let (fields, arrays): (Vec<_>, _) = values |
391 | 0 | .into_iter() |
392 | 0 | .map(|(name, array)| { |
393 | 0 | ( |
394 | 0 | Field::new(name, array.data_type().clone(), array.is_nullable()), |
395 | 0 | array, |
396 | 0 | ) |
397 | 0 | }) |
398 | 0 | .unzip(); |
399 | | |
400 | 0 | StructArray::try_new(fields.into(), arrays, None) |
401 | 0 | } |
402 | | } |
403 | | |
404 | | impl Array for StructArray { |
405 | 23 | fn as_any(&self) -> &dyn Any { |
406 | 23 | self |
407 | 23 | } |
408 | | |
409 | 95 | fn to_data(&self) -> ArrayData { |
410 | 95 | self.clone().into() |
411 | 95 | } |
412 | | |
413 | 4 | fn into_data(self) -> ArrayData { |
414 | 4 | self.into() |
415 | 4 | } |
416 | | |
417 | 139 | fn data_type(&self) -> &DataType { |
418 | 139 | &self.data_type |
419 | 139 | } |
420 | | |
421 | 16 | fn slice(&self, offset: usize, length: usize) -> ArrayRef { |
422 | 16 | Arc::new(self.slice(offset, length)) |
423 | 16 | } |
424 | | |
425 | 151 | fn len(&self) -> usize { |
426 | 151 | self.len |
427 | 151 | } |
428 | | |
429 | 0 | fn is_empty(&self) -> bool { |
430 | 0 | self.len == 0 |
431 | 0 | } |
432 | | |
433 | 0 | fn shrink_to_fit(&mut self) { |
434 | 0 | if let Some(nulls) = &mut self.nulls { |
435 | 0 | nulls.shrink_to_fit(); |
436 | 0 | } |
437 | 0 | self.fields.iter_mut().for_each(|n| n.shrink_to_fit()); |
438 | 0 | } |
439 | | |
440 | 0 | fn offset(&self) -> usize { |
441 | 0 | 0 |
442 | 0 | } |
443 | | |
444 | 59 | fn nulls(&self) -> Option<&NullBuffer> { |
445 | 59 | self.nulls.as_ref() |
446 | 59 | } |
447 | | |
448 | 3 | fn logical_null_count(&self) -> usize { |
449 | | // More efficient that the default implementation |
450 | 3 | self.null_count() |
451 | 3 | } |
452 | | |
453 | 0 | fn get_buffer_memory_size(&self) -> usize { |
454 | 0 | let mut size = self.fields.iter().map(|a| a.get_buffer_memory_size()).sum(); |
455 | 0 | if let Some(n) = self.nulls.as_ref() { |
456 | 0 | size += n.buffer().capacity(); |
457 | 0 | } |
458 | 0 | size |
459 | 0 | } |
460 | | |
461 | 0 | fn get_array_memory_size(&self) -> usize { |
462 | 0 | let mut size = self.fields.iter().map(|a| a.get_array_memory_size()).sum(); |
463 | 0 | size += std::mem::size_of::<Self>(); |
464 | 0 | if let Some(n) = self.nulls.as_ref() { |
465 | 0 | size += n.buffer().capacity(); |
466 | 0 | } |
467 | 0 | size |
468 | 0 | } |
469 | | } |
470 | | |
471 | | impl From<Vec<(FieldRef, ArrayRef)>> for StructArray { |
472 | 5 | fn from(v: Vec<(FieldRef, ArrayRef)>) -> Self { |
473 | 5 | let (fields, arrays): (Vec<_>, _) = v.into_iter().unzip(); |
474 | 5 | StructArray::new(fields.into(), arrays, None) |
475 | 5 | } |
476 | | } |
477 | | |
478 | | impl std::fmt::Debug for StructArray { |
479 | 0 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { |
480 | 0 | writeln!(f, "StructArray")?; |
481 | 0 | writeln!(f, "-- validity:")?; |
482 | 0 | writeln!(f, "[")?; |
483 | 0 | print_long_array(self, f, |_array, _index, f| write!(f, "valid"))?; |
484 | 0 | writeln!(f, "]\n[")?; |
485 | 0 | for (child_index, name) in self.column_names().iter().enumerate() { |
486 | 0 | let column = self.column(child_index); |
487 | 0 | writeln!( |
488 | 0 | f, |
489 | 0 | "-- child {}: \"{}\" ({:?})", |
490 | | child_index, |
491 | | name, |
492 | 0 | column.data_type() |
493 | 0 | )?; |
494 | 0 | std::fmt::Debug::fmt(column, f)?; |
495 | 0 | writeln!(f)?; |
496 | | } |
497 | 0 | write!(f, "]") |
498 | 0 | } |
499 | | } |
500 | | |
501 | | impl From<(Vec<(FieldRef, ArrayRef)>, Buffer)> for StructArray { |
502 | 0 | fn from(pair: (Vec<(FieldRef, ArrayRef)>, Buffer)) -> Self { |
503 | 0 | let len = pair.0.first().map(|x| x.1.len()).unwrap_or_default(); |
504 | 0 | let (fields, arrays): (Vec<_>, Vec<_>) = pair.0.into_iter().unzip(); |
505 | 0 | let nulls = NullBuffer::new(BooleanBuffer::new(pair.1, 0, len)); |
506 | 0 | Self::new(fields.into(), arrays, Some(nulls)) |
507 | 0 | } |
508 | | } |
509 | | |
510 | | impl From<RecordBatch> for StructArray { |
511 | 0 | fn from(value: RecordBatch) -> Self { |
512 | 0 | Self { |
513 | 0 | len: value.num_rows(), |
514 | 0 | data_type: DataType::Struct(value.schema().fields().clone()), |
515 | 0 | nulls: None, |
516 | 0 | fields: value.columns().to_vec(), |
517 | 0 | } |
518 | 0 | } |
519 | | } |
520 | | |
521 | | impl Index<&str> for StructArray { |
522 | | type Output = ArrayRef; |
523 | | |
524 | | /// Get a reference to a column's array by name. |
525 | | /// |
526 | | /// Note: A schema can currently have duplicate field names, in which case |
527 | | /// the first field will always be selected. |
528 | | /// This issue will be addressed in [ARROW-11178](https://issues.apache.org/jira/browse/ARROW-11178) |
529 | | /// |
530 | | /// # Panics |
531 | | /// |
532 | | /// Panics if the name is not in the schema. |
533 | 0 | fn index(&self, name: &str) -> &Self::Output { |
534 | 0 | self.column_by_name(name).unwrap() |
535 | 0 | } |
536 | | } |
537 | | |
538 | | #[cfg(test)] |
539 | | mod tests { |
540 | | use super::*; |
541 | | |
542 | | use crate::{BooleanArray, Float32Array, Float64Array, Int32Array, Int64Array, StringArray}; |
543 | | use arrow_buffer::ToByteSlice; |
544 | | |
545 | | #[test] |
546 | | fn test_struct_array_builder() { |
547 | | let boolean_array = BooleanArray::from(vec![false, false, true, true]); |
548 | | let int_array = Int64Array::from(vec![42, 28, 19, 31]); |
549 | | |
550 | | let fields = vec![ |
551 | | Field::new("a", DataType::Boolean, false), |
552 | | Field::new("b", DataType::Int64, false), |
553 | | ]; |
554 | | let struct_array_data = ArrayData::builder(DataType::Struct(fields.into())) |
555 | | .len(4) |
556 | | .add_child_data(boolean_array.to_data()) |
557 | | .add_child_data(int_array.to_data()) |
558 | | .build() |
559 | | .unwrap(); |
560 | | let struct_array = StructArray::from(struct_array_data); |
561 | | |
562 | | assert_eq!(struct_array.column(0).as_ref(), &boolean_array); |
563 | | assert_eq!(struct_array.column(1).as_ref(), &int_array); |
564 | | } |
565 | | |
566 | | #[test] |
567 | | fn test_struct_array_from() { |
568 | | let boolean = Arc::new(BooleanArray::from(vec![false, false, true, true])); |
569 | | let int = Arc::new(Int32Array::from(vec![42, 28, 19, 31])); |
570 | | |
571 | | let struct_array = StructArray::from(vec![ |
572 | | ( |
573 | | Arc::new(Field::new("b", DataType::Boolean, false)), |
574 | | boolean.clone() as ArrayRef, |
575 | | ), |
576 | | ( |
577 | | Arc::new(Field::new("c", DataType::Int32, false)), |
578 | | int.clone() as ArrayRef, |
579 | | ), |
580 | | ]); |
581 | | assert_eq!(struct_array.column(0).as_ref(), boolean.as_ref()); |
582 | | assert_eq!(struct_array.column(1).as_ref(), int.as_ref()); |
583 | | assert_eq!(4, struct_array.len()); |
584 | | assert_eq!(0, struct_array.null_count()); |
585 | | assert_eq!(0, struct_array.offset()); |
586 | | } |
587 | | |
588 | | #[test] |
589 | | fn test_struct_array_from_data_with_offset_and_length() { |
590 | | // Various ways to make the struct array: |
591 | | // |
592 | | // [{x: 2}, {x: 3}, None] |
593 | | // |
594 | | // from slicing larger buffers/arrays with offsets and lengths |
595 | | let int_arr = Int32Array::from(vec![1, 2, 3, 4, 5]); |
596 | | let int_field = Field::new("x", DataType::Int32, false); |
597 | | let struct_nulls = NullBuffer::new(BooleanBuffer::from(vec![true, true, false])); |
598 | | let int_data = int_arr.to_data(); |
599 | | // Case 1: Offset + length, nulls are not sliced |
600 | | let case1 = ArrayData::builder(DataType::Struct(Fields::from(vec![int_field.clone()]))) |
601 | | .len(3) |
602 | | .offset(1) |
603 | | .nulls(Some(struct_nulls)) |
604 | | .add_child_data(int_data.clone()) |
605 | | .build() |
606 | | .unwrap(); |
607 | | |
608 | | // Case 2: Offset + length, nulls are sliced |
609 | | let struct_nulls = |
610 | | NullBuffer::new(BooleanBuffer::from(vec![true, true, true, false, true]).slice(1, 3)); |
611 | | let case2 = ArrayData::builder(DataType::Struct(Fields::from(vec![int_field.clone()]))) |
612 | | .len(3) |
613 | | .offset(1) |
614 | | .nulls(Some(struct_nulls.clone())) |
615 | | .add_child_data(int_data.clone()) |
616 | | .build() |
617 | | .unwrap(); |
618 | | |
619 | | // Case 3: struct length is smaller than child length but no offset |
620 | | let offset_int_data = int_data.slice(1, 4); |
621 | | let case3 = ArrayData::builder(DataType::Struct(Fields::from(vec![int_field.clone()]))) |
622 | | .len(3) |
623 | | .nulls(Some(struct_nulls)) |
624 | | .add_child_data(offset_int_data) |
625 | | .build() |
626 | | .unwrap(); |
627 | | |
628 | | let expected = StructArray::new( |
629 | | Fields::from(vec![int_field.clone()]), |
630 | | vec![Arc::new(int_arr)], |
631 | | Some(NullBuffer::new(BooleanBuffer::from(vec![ |
632 | | true, true, true, false, true, |
633 | | ]))), |
634 | | ) |
635 | | .slice(1, 3); |
636 | | |
637 | | for case in [case1, case2, case3] { |
638 | | let struct_arr_from_data = StructArray::from(case); |
639 | | assert_eq!(struct_arr_from_data, expected); |
640 | | assert_eq!(struct_arr_from_data.column(0), expected.column(0)); |
641 | | } |
642 | | } |
643 | | |
644 | | #[test] |
645 | | #[should_panic(expected = "assertion failed: (offset + length) <= self.len()")] |
646 | | fn test_struct_array_from_data_with_offset_and_length_error() { |
647 | | let int_arr = Int32Array::from(vec![1, 2, 3, 4, 5]); |
648 | | let int_field = Field::new("x", DataType::Int32, false); |
649 | | let struct_nulls = NullBuffer::new(BooleanBuffer::from(vec![true, true, false])); |
650 | | let int_data = int_arr.to_data(); |
651 | | // If parent offset is 3 and len is 3 then child must have 6 items |
652 | | let struct_data = |
653 | | ArrayData::builder(DataType::Struct(Fields::from(vec![int_field.clone()]))) |
654 | | .len(3) |
655 | | .offset(3) |
656 | | .nulls(Some(struct_nulls)) |
657 | | .add_child_data(int_data) |
658 | | .build() |
659 | | .unwrap(); |
660 | | let _ = StructArray::from(struct_data); |
661 | | } |
662 | | |
663 | | /// validates that struct can be accessed using `column_name` as index i.e. `struct_array["column_name"]`. |
664 | | #[test] |
665 | | fn test_struct_array_index_access() { |
666 | | let boolean = Arc::new(BooleanArray::from(vec![false, false, true, true])); |
667 | | let int = Arc::new(Int32Array::from(vec![42, 28, 19, 31])); |
668 | | |
669 | | let struct_array = StructArray::from(vec![ |
670 | | ( |
671 | | Arc::new(Field::new("b", DataType::Boolean, false)), |
672 | | boolean.clone() as ArrayRef, |
673 | | ), |
674 | | ( |
675 | | Arc::new(Field::new("c", DataType::Int32, false)), |
676 | | int.clone() as ArrayRef, |
677 | | ), |
678 | | ]); |
679 | | assert_eq!(struct_array["b"].as_ref(), boolean.as_ref()); |
680 | | assert_eq!(struct_array["c"].as_ref(), int.as_ref()); |
681 | | } |
682 | | |
683 | | /// validates that the in-memory representation follows [the spec](https://arrow.apache.org/docs/format/Columnar.html#struct-layout) |
684 | | #[test] |
685 | | fn test_struct_array_from_vec() { |
686 | | let strings: ArrayRef = Arc::new(StringArray::from(vec![ |
687 | | Some("joe"), |
688 | | None, |
689 | | None, |
690 | | Some("mark"), |
691 | | ])); |
692 | | let ints: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), Some(2), None, Some(4)])); |
693 | | |
694 | | let arr = |
695 | | StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]).unwrap(); |
696 | | |
697 | | let struct_data = arr.into_data(); |
698 | | assert_eq!(4, struct_data.len()); |
699 | | assert_eq!(0, struct_data.null_count()); |
700 | | |
701 | | let expected_string_data = ArrayData::builder(DataType::Utf8) |
702 | | .len(4) |
703 | | .null_bit_buffer(Some(Buffer::from(&[9_u8]))) |
704 | | .add_buffer(Buffer::from([0, 3, 3, 3, 7].to_byte_slice())) |
705 | | .add_buffer(Buffer::from(b"joemark")) |
706 | | .build() |
707 | | .unwrap(); |
708 | | |
709 | | let expected_int_data = ArrayData::builder(DataType::Int32) |
710 | | .len(4) |
711 | | .null_bit_buffer(Some(Buffer::from(&[11_u8]))) |
712 | | .add_buffer(Buffer::from([1, 2, 0, 4].to_byte_slice())) |
713 | | .build() |
714 | | .unwrap(); |
715 | | |
716 | | assert_eq!(expected_string_data, struct_data.child_data()[0]); |
717 | | assert_eq!(expected_int_data, struct_data.child_data()[1]); |
718 | | } |
719 | | |
720 | | #[test] |
721 | | fn test_struct_array_from_vec_error() { |
722 | | let strings: ArrayRef = Arc::new(StringArray::from(vec![ |
723 | | Some("joe"), |
724 | | None, |
725 | | None, |
726 | | // 3 elements, not 4 |
727 | | ])); |
728 | | let ints: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), Some(2), None, Some(4)])); |
729 | | |
730 | | let err = StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) |
731 | | .unwrap_err() |
732 | | .to_string(); |
733 | | |
734 | | assert_eq!( |
735 | | err, |
736 | | "Invalid argument error: Incorrect array length for StructArray field \"f2\", expected 3 got 4" |
737 | | ) |
738 | | } |
739 | | |
740 | | #[test] |
741 | | #[should_panic( |
742 | | expected = "Incorrect datatype for StructArray field \\\"b\\\", expected Int16 got Boolean" |
743 | | )] |
744 | | fn test_struct_array_from_mismatched_types_single() { |
745 | | drop(StructArray::from(vec![( |
746 | | Arc::new(Field::new("b", DataType::Int16, false)), |
747 | | Arc::new(BooleanArray::from(vec![false, false, true, true])) as Arc<dyn Array>, |
748 | | )])); |
749 | | } |
750 | | |
751 | | #[test] |
752 | | #[should_panic( |
753 | | expected = "Incorrect datatype for StructArray field \\\"b\\\", expected Int16 got Boolean" |
754 | | )] |
755 | | fn test_struct_array_from_mismatched_types_multiple() { |
756 | | drop(StructArray::from(vec![ |
757 | | ( |
758 | | Arc::new(Field::new("b", DataType::Int16, false)), |
759 | | Arc::new(BooleanArray::from(vec![false, false, true, true])) as Arc<dyn Array>, |
760 | | ), |
761 | | ( |
762 | | Arc::new(Field::new("c", DataType::Utf8, false)), |
763 | | Arc::new(Int32Array::from(vec![42, 28, 19, 31])), |
764 | | ), |
765 | | ])); |
766 | | } |
767 | | |
768 | | #[test] |
769 | | fn test_struct_array_slice() { |
770 | | let boolean_data = ArrayData::builder(DataType::Boolean) |
771 | | .len(5) |
772 | | .add_buffer(Buffer::from([0b00010000])) |
773 | | .null_bit_buffer(Some(Buffer::from([0b00010001]))) |
774 | | .build() |
775 | | .unwrap(); |
776 | | let int_data = ArrayData::builder(DataType::Int32) |
777 | | .len(5) |
778 | | .add_buffer(Buffer::from([0, 28, 42, 0, 0].to_byte_slice())) |
779 | | .null_bit_buffer(Some(Buffer::from([0b00000110]))) |
780 | | .build() |
781 | | .unwrap(); |
782 | | |
783 | | let field_types = vec![ |
784 | | Field::new("a", DataType::Boolean, true), |
785 | | Field::new("b", DataType::Int32, true), |
786 | | ]; |
787 | | let struct_array_data = ArrayData::builder(DataType::Struct(field_types.into())) |
788 | | .len(5) |
789 | | .add_child_data(boolean_data.clone()) |
790 | | .add_child_data(int_data.clone()) |
791 | | .null_bit_buffer(Some(Buffer::from([0b00010111]))) |
792 | | .build() |
793 | | .unwrap(); |
794 | | let struct_array = StructArray::from(struct_array_data); |
795 | | |
796 | | assert_eq!(5, struct_array.len()); |
797 | | assert_eq!(1, struct_array.null_count()); |
798 | | assert!(struct_array.is_valid(0)); |
799 | | assert!(struct_array.is_valid(1)); |
800 | | assert!(struct_array.is_valid(2)); |
801 | | assert!(struct_array.is_null(3)); |
802 | | assert!(struct_array.is_valid(4)); |
803 | | assert_eq!(boolean_data, struct_array.column(0).to_data()); |
804 | | assert_eq!(int_data, struct_array.column(1).to_data()); |
805 | | |
806 | | let c0 = struct_array.column(0); |
807 | | let c0 = c0.as_any().downcast_ref::<BooleanArray>().unwrap(); |
808 | | assert_eq!(5, c0.len()); |
809 | | assert_eq!(3, c0.null_count()); |
810 | | assert!(c0.is_valid(0)); |
811 | | assert!(!c0.value(0)); |
812 | | assert!(c0.is_null(1)); |
813 | | assert!(c0.is_null(2)); |
814 | | assert!(c0.is_null(3)); |
815 | | assert!(c0.is_valid(4)); |
816 | | assert!(c0.value(4)); |
817 | | |
818 | | let c1 = struct_array.column(1); |
819 | | let c1 = c1.as_any().downcast_ref::<Int32Array>().unwrap(); |
820 | | assert_eq!(5, c1.len()); |
821 | | assert_eq!(3, c1.null_count()); |
822 | | assert!(c1.is_null(0)); |
823 | | assert!(c1.is_valid(1)); |
824 | | assert_eq!(28, c1.value(1)); |
825 | | assert!(c1.is_valid(2)); |
826 | | assert_eq!(42, c1.value(2)); |
827 | | assert!(c1.is_null(3)); |
828 | | assert!(c1.is_null(4)); |
829 | | |
830 | | let sliced_array = struct_array.slice(2, 3); |
831 | | let sliced_array = sliced_array.as_any().downcast_ref::<StructArray>().unwrap(); |
832 | | assert_eq!(3, sliced_array.len()); |
833 | | assert_eq!(1, sliced_array.null_count()); |
834 | | assert!(sliced_array.is_valid(0)); |
835 | | assert!(sliced_array.is_null(1)); |
836 | | assert!(sliced_array.is_valid(2)); |
837 | | |
838 | | let sliced_c0 = sliced_array.column(0); |
839 | | let sliced_c0 = sliced_c0.as_any().downcast_ref::<BooleanArray>().unwrap(); |
840 | | assert_eq!(3, sliced_c0.len()); |
841 | | assert!(sliced_c0.is_null(0)); |
842 | | assert!(sliced_c0.is_null(1)); |
843 | | assert!(sliced_c0.is_valid(2)); |
844 | | assert!(sliced_c0.value(2)); |
845 | | |
846 | | let sliced_c1 = sliced_array.column(1); |
847 | | let sliced_c1 = sliced_c1.as_any().downcast_ref::<Int32Array>().unwrap(); |
848 | | assert_eq!(3, sliced_c1.len()); |
849 | | assert!(sliced_c1.is_valid(0)); |
850 | | assert_eq!(42, sliced_c1.value(0)); |
851 | | assert!(sliced_c1.is_null(1)); |
852 | | assert!(sliced_c1.is_null(2)); |
853 | | } |
854 | | |
/// Converting children of unequal length into a `StructArray` must panic, and the panic
/// message must name the offending field together with the expected and actual lengths.
#[test]
#[should_panic(
    expected = "Incorrect array length for StructArray field \\\"c\\\", expected 1 got 2"
)]
fn test_invalid_struct_child_array_lengths() {
    // Child "b" holds a single row ...
    let one_row: Arc<dyn Array> = Arc::new(Float32Array::from(vec![1.1]));
    // ... while child "c" holds two, so the lengths disagree.
    let two_rows: Arc<dyn Array> = Arc::new(Float64Array::from(vec![2.2, 3.3]));

    let columns = vec![
        (Arc::new(Field::new("b", DataType::Float32, false)), one_row),
        (Arc::new(Field::new("c", DataType::Float64, false)), two_rows),
    ];

    // The conversion is expected to panic; its result is discarded.
    let _ = StructArray::from(columns);
}
871 | | |
/// Converting an empty `Vec` into a `StructArray` must panic with a message that mentions
/// `StructArray::try_new_with_length`.
#[test]
#[should_panic(expected = "use StructArray::try_new_with_length")]
fn test_struct_array_from_empty() {
    // Without any child array there is no way to know how many rows the result should
    // have. An earlier behavior inferred 0 rows, but that often led to bugs.
    drop(StructArray::from(Vec::new()));
}
879 | | |
/// Exercises the constructors for a `StructArray` that has no fields.
#[test]
fn test_empty_struct_array() {
    // No columns and no explicit length: `try_new` reports an error.
    let without_length = StructArray::try_new(Fields::empty(), vec![], None);
    assert!(without_length.is_err());

    // Run the same checks twice: once without a null buffer (0 nulls expected) and once
    // with a 10-row all-null buffer (10 nulls expected).
    let cases = [(None, 0), (Some(NullBuffer::new_null(10)), 10)];
    for (nulls, expected_null_count) in cases {
        let via_empty_fields = StructArray::new_empty_fields(10, nulls.clone());
        assert_eq!(via_empty_fields.len(), 10);
        assert_eq!(via_empty_fields.null_count(), expected_null_count);
        assert_eq!(via_empty_fields.num_columns(), 0);

        // The length-taking constructor must succeed and report the same length.
        let via_explicit_length =
            StructArray::try_new_with_length(Fields::empty(), vec![], nulls, 10).unwrap();
        assert_eq!(via_explicit_length.len(), 10);
    }
}
906 | | |
/// Pairing a field declared non-nullable with a child array that contains a null must
/// panic, and the panic message must name the field.
#[test]
#[should_panic(expected = "Found unmasked nulls for non-nullable StructArray field \\\"c\\\"")]
fn test_struct_array_from_mismatched_nullability() {
    // Field "c" is declared non-nullable ...
    let non_nullable_field = Arc::new(Field::new("c", DataType::Int32, false));
    // ... yet the middle entry of its data is null.
    let data_with_null: ArrayRef = Arc::new(Int32Array::from(vec![Some(42), None, Some(19)]));

    // The conversion is expected to panic; its result is discarded.
    let _ = StructArray::from(vec![(non_nullable_field, data_with_null)]);
}
915 | | |
/// Pins the `Debug` rendering of a 30-row `StructArray` with a single nullable `Int32`
/// child named "c".
#[test]
fn test_struct_array_fmt_debug() {
    let fields: Fields = vec![Arc::new(Field::new("c", DataType::Int32, true))].into();

    // Child values are 0..30.
    let values: Vec<i32> = (0..30).collect();
    let child: ArrayRef = Arc::new(Int32Array::from(values));

    // Struct-level validity: even rows are valid, odd rows are null.
    let validity: Vec<bool> = (0..30).map(|row| row % 2 == 0).collect();
    let nulls = NullBuffer::new(BooleanBuffer::from(validity));

    let arr: StructArray = StructArray::new(fields, vec![child], Some(nulls));

    // The expected text shows the first and last ten entries of both the validity list and
    // the child, with the middle replaced by an "...10 elements..." marker.
    let expected = "StructArray\n-- validity:\n[\n valid,\n null,\n valid,\n null,\n valid,\n null,\n valid,\n null,\n valid,\n null,\n ...10 elements...,\n valid,\n null,\n valid,\n null,\n valid,\n null,\n valid,\n null,\n valid,\n null,\n]\n[\n-- child 0: \"c\" (Int32)\nPrimitiveArray<Int32>\n[\n 0,\n 1,\n 2,\n 3,\n 4,\n 5,\n 6,\n 7,\n 8,\n 9,\n ...10 elements...,\n 20,\n 21,\n 22,\n 23,\n 24,\n 25,\n 26,\n 27,\n 28,\n 29,\n]\n]";
    assert_eq!(format!("{arr:?}"), expected);
}
927 | | |
/// A non-nullable field must accept a child that carries a null buffer, provided that
/// buffer marks no value as null.
#[test]
fn test_struct_array_logical_nulls() {
    // All three validity bits are set: the child has a null buffer but zero nulls.
    let all_valid = NullBuffer::from(vec![true, true, true]);
    let child: ArrayRef = Arc::new(Int32Array::new(vec![1, 2, 3].into(), Some(all_valid)));

    let logical_nulls = child.logical_nulls();
    assert!(logical_nulls.is_some());
    assert_eq!(logical_nulls.unwrap().null_count(), 0);

    // The field is declared non-nullable and the struct itself gets no null buffer.
    let fields = Fields::from(vec![Field::new("a", DataType::Int32, false)]);
    StructArray::try_new(fields, vec![child], None).expect("should not error");
}
946 | | } |