/Users/andrewlamb/Software/arrow-rs/arrow-select/src/concat.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! Defines concat kernel for `ArrayRef` |
19 | | //! |
20 | | //! Example: |
21 | | //! |
22 | | //! ``` |
23 | | //! use arrow_array::{ArrayRef, StringArray}; |
24 | | //! use arrow_select::concat::concat; |
25 | | //! |
26 | | //! let arr = concat(&[ |
27 | | //! &StringArray::from(vec!["hello", "world"]), |
28 | | //! &StringArray::from(vec!["!"]), |
29 | | //! ]).unwrap(); |
30 | | //! assert_eq!(arr.len(), 3); |
31 | | //! ``` |
32 | | |
33 | | use crate::dictionary::{merge_dictionary_values, should_merge_dictionary_values}; |
34 | | use arrow_array::builder::{ |
35 | | BooleanBuilder, GenericByteBuilder, GenericByteViewBuilder, PrimitiveBuilder, |
36 | | }; |
37 | | use arrow_array::cast::AsArray; |
38 | | use arrow_array::types::*; |
39 | | use arrow_array::*; |
40 | | use arrow_buffer::{ArrowNativeType, BooleanBufferBuilder, NullBuffer, OffsetBuffer}; |
41 | | use arrow_data::transform::{Capacities, MutableArrayData}; |
42 | | use arrow_data::ArrayDataBuilder; |
43 | | use arrow_schema::{ArrowError, DataType, FieldRef, Fields, SchemaRef}; |
44 | | use std::{collections::HashSet, ops::Add, sync::Arc}; |
45 | | |
46 | 13 | fn binary_capacity<T: ByteArrayType>(arrays: &[&dyn Array]) -> Capacities { |
47 | 13 | let mut item_capacity = 0; |
48 | 13 | let mut bytes_capacity = 0; |
49 | 50 | for array37 in arrays { |
50 | 37 | let a = array.as_bytes::<T>(); |
51 | | |
52 | | // Guaranteed to always have at least one element |
53 | 37 | let offsets = a.value_offsets(); |
54 | 37 | bytes_capacity += offsets[offsets.len() - 1].as_usize() - offsets[0].as_usize(); |
55 | 37 | item_capacity += a.len() |
56 | | } |
57 | | |
58 | 13 | Capacities::Binary(item_capacity, Some(bytes_capacity)) |
59 | 13 | } |
60 | | |
61 | 0 | fn fixed_size_list_capacity(arrays: &[&dyn Array], data_type: &DataType) -> Capacities { |
62 | 0 | if let DataType::FixedSizeList(f, _) = data_type { |
63 | 0 | let item_capacity = arrays.iter().map(|a| a.len()).sum(); |
64 | 0 | let child_data_type = f.data_type(); |
65 | 0 | match child_data_type { |
66 | | // These types should match the types that `get_capacity` |
67 | | // has special handling for. |
68 | | DataType::Utf8 |
69 | | | DataType::LargeUtf8 |
70 | | | DataType::Binary |
71 | | | DataType::LargeBinary |
72 | | | DataType::FixedSizeList(_, _) => { |
73 | 0 | let values: Vec<&dyn arrow_array::Array> = arrays |
74 | 0 | .iter() |
75 | 0 | .map(|a| a.as_fixed_size_list().values().as_ref()) |
76 | 0 | .collect(); |
77 | 0 | Capacities::List( |
78 | 0 | item_capacity, |
79 | 0 | Some(Box::new(get_capacity(&values, child_data_type))), |
80 | 0 | ) |
81 | | } |
82 | 0 | _ => Capacities::Array(item_capacity), |
83 | | } |
84 | | } else { |
85 | 0 | unreachable!("illegal data type for fixed size list") |
86 | | } |
87 | 0 | } |
88 | | |
89 | 0 | fn concat_byte_view<B: ByteViewType>(arrays: &[&dyn Array]) -> Result<ArrayRef, ArrowError> { |
90 | 0 | let mut builder = |
91 | 0 | GenericByteViewBuilder::<B>::with_capacity(arrays.iter().map(|a| a.len()).sum()); |
92 | 0 | for &array in arrays.iter() { |
93 | 0 | builder.append_array(array.as_byte_view()); |
94 | 0 | } |
95 | 0 | Ok(Arc::new(builder.finish())) |
96 | 0 | } |
97 | | |
98 | 3 | fn concat_dictionaries<K: ArrowDictionaryKeyType>( |
99 | 3 | arrays: &[&dyn Array], |
100 | 3 | ) -> Result<ArrayRef, ArrowError> { |
101 | 3 | let mut output_len = 0; |
102 | 3 | let dictionaries: Vec<_> = arrays |
103 | 3 | .iter() |
104 | 6 | .map3 (|x| x.as_dictionary::<K>()) |
105 | 6 | .inspect3 (|d| output_len += d.len()) |
106 | 3 | .collect(); |
107 | | |
108 | 3 | if !should_merge_dictionary_values::<K>(&dictionaries, output_len) { |
109 | 0 | return concat_fallback(arrays, Capacities::Array(output_len)); |
110 | 3 | } |
111 | | |
112 | 3 | let merged = merge_dictionary_values(&dictionaries, None)?0 ; |
113 | | |
114 | | // Recompute keys |
115 | 3 | let mut key_values = Vec::with_capacity(output_len); |
116 | | |
117 | 3 | let mut has_nulls = false; |
118 | 6 | for (d, mapping) in dictionaries.iter()3 .zip3 (merged.key_mappings3 ) { |
119 | 6 | has_nulls |= d.null_count() != 0; |
120 | 12 | for key in d.keys()6 .values6 () { |
121 | | // Use get to safely handle nulls |
122 | 12 | key_values.push(mapping.get(key.as_usize()).copied().unwrap_or_default()) |
123 | | } |
124 | | } |
125 | | |
126 | 3 | let nulls = has_nulls.then(|| {1 |
127 | 1 | let mut nulls = BooleanBufferBuilder::new(output_len); |
128 | 3 | for d2 in &dictionaries { |
129 | 2 | match d.nulls() { |
130 | 1 | Some(n) => nulls.append_buffer(n.inner()), |
131 | 1 | None => nulls.append_n(d.len(), true), |
132 | | } |
133 | | } |
134 | 1 | NullBuffer::new(nulls.finish()) |
135 | 1 | }); |
136 | | |
137 | 3 | let keys = PrimitiveArray::<K>::new(key_values.into(), nulls); |
138 | | // Sanity check |
139 | 3 | assert_eq!(keys.len(), output_len); |
140 | | |
141 | 3 | let array = unsafe { DictionaryArray::new_unchecked(keys, merged.values) }; |
142 | 3 | Ok(Arc::new(array)) |
143 | 3 | } |
144 | | |
145 | 8 | fn concat_lists<OffsetSize: OffsetSizeTrait>( |
146 | 8 | arrays: &[&dyn Array], |
147 | 8 | field: &FieldRef, |
148 | 8 | ) -> Result<ArrayRef, ArrowError> { |
149 | 8 | let mut output_len = 0; |
150 | 8 | let mut list_has_nulls = false; |
151 | 8 | let mut list_has_slices = false; |
152 | | |
153 | 8 | let lists = arrays |
154 | 8 | .iter() |
155 | 23 | .map8 (|x| x.as_list::<OffsetSize>()) |
156 | 23 | .inspect8 (|l| { |
157 | 23 | output_len += l.len(); |
158 | 23 | list_has_nulls |= l.null_count() != 0; |
159 | 23 | list_has_slices |= l.offsets()[0] > OffsetSize::zero() |
160 | 23 | || l.offsets().last().unwrap().as_usize() < l.values().len(); |
161 | 23 | }) |
162 | 8 | .collect::<Vec<_>>(); |
163 | | |
164 | 8 | let lists_nulls = list_has_nulls.then(|| { |
165 | 8 | let mut nulls = BooleanBufferBuilder::new(output_len); |
166 | 31 | for l23 in &lists { |
167 | 23 | match l.nulls() { |
168 | 13 | Some(n) => nulls.append_buffer(n.inner()), |
169 | 10 | None => nulls.append_n(l.len(), true), |
170 | | } |
171 | | } |
172 | 8 | NullBuffer::new(nulls.finish()) |
173 | 8 | }); |
174 | | |
175 | | // If any of the lists have slices, we need to slice the values |
176 | | // to ensure that the offsets are correct |
177 | | let mut sliced_values; |
178 | 8 | let values: Vec<&dyn Array> = if list_has_slices { |
179 | 0 | sliced_values = Vec::with_capacity(lists.len()); |
180 | 0 | for l in &lists { |
181 | 0 | // if the first offset is non-zero, we need to slice the values so when |
182 | 0 | // we concatenate them below only the relevant values are included |
183 | 0 | let offsets = l.offsets(); |
184 | 0 | let start_offset = offsets[0].as_usize(); |
185 | 0 | let end_offset = offsets.last().unwrap().as_usize(); |
186 | 0 | sliced_values.push(l.values().slice(start_offset, end_offset - start_offset)); |
187 | 0 | } |
188 | 0 | sliced_values.iter().map(|a| a.as_ref()).collect() |
189 | | } else { |
190 | 23 | lists.iter()8 .map8 (|x| x.values().as_ref()).collect8 () |
191 | | }; |
192 | | |
193 | 8 | let concatenated_values = concat(values.as_slice())?0 ; |
194 | | |
195 | | // Merge value offsets from the lists |
196 | 8 | let value_offset_buffer = |
197 | 23 | OffsetBuffer::<OffsetSize>::from_lengths8 (lists.iter()8 .flat_map8 (|x| x.offsets().lengths())); |
198 | | |
199 | 8 | let array = GenericListArray::<OffsetSize>::try_new( |
200 | 8 | Arc::clone(field), |
201 | 8 | value_offset_buffer, |
202 | 8 | concatenated_values, |
203 | 8 | lists_nulls, |
204 | 0 | )?; |
205 | | |
206 | 8 | Ok(Arc::new(array)) |
207 | 8 | } |
208 | | |
209 | 57 | fn concat_primitives<T: ArrowPrimitiveType>(arrays: &[&dyn Array]) -> Result<ArrayRef, ArrowError> { |
210 | 188 | let mut builder57 = PrimitiveBuilder::<T>::with_capacity57 (arrays57 .iter57 ().map57 (|a| a.len()).sum57 ()) |
211 | 57 | .with_data_type(arrays[0].data_type().clone()); |
212 | | |
213 | 245 | for array188 in arrays { |
214 | 188 | builder.append_array(array.as_primitive()); |
215 | 188 | } |
216 | | |
217 | 57 | Ok(Arc::new(builder.finish())) |
218 | 57 | } |
219 | | |
220 | 5 | fn concat_boolean(arrays: &[&dyn Array]) -> Result<ArrayRef, ArrowError> { |
221 | 15 | let mut builder5 = BooleanBuilder::with_capacity5 (arrays5 .iter5 ().map5 (|a| a.len()).sum5 ()); |
222 | | |
223 | 20 | for array15 in arrays { |
224 | 15 | builder.append_array(array.as_boolean()); |
225 | 15 | } |
226 | | |
227 | 5 | Ok(Arc::new(builder.finish())) |
228 | 5 | } |
229 | | |
230 | 13 | fn concat_bytes<T: ByteArrayType>(arrays: &[&dyn Array]) -> Result<ArrayRef, ArrowError> { |
231 | 13 | let (item_capacity, bytes_capacity) = match binary_capacity::<T>(arrays) { |
232 | 13 | Capacities::Binary(item_capacity, Some(bytes_capacity)) => (item_capacity, bytes_capacity), |
233 | 0 | _ => unreachable!(), |
234 | | }; |
235 | | |
236 | 13 | let mut builder = GenericByteBuilder::<T>::with_capacity(item_capacity, bytes_capacity); |
237 | | |
238 | 50 | for array37 in arrays { |
239 | 37 | builder.append_array(array.as_bytes::<T>()); |
240 | 37 | } |
241 | | |
242 | 13 | Ok(Arc::new(builder.finish())) |
243 | 13 | } |
244 | | |
245 | 5 | fn concat_structs(arrays: &[&dyn Array], fields: &Fields) -> Result<ArrayRef, ArrowError> { |
246 | 5 | let mut len = 0; |
247 | 5 | let mut has_nulls = false; |
248 | 5 | let structs = arrays |
249 | 5 | .iter() |
250 | 13 | .map5 (|a| { |
251 | 13 | len += a.len(); |
252 | 13 | has_nulls |= a.null_count() > 0; |
253 | 13 | a.as_struct() |
254 | 13 | }) |
255 | 5 | .collect::<Vec<_>>(); |
256 | | |
257 | 5 | let nulls = has_nulls.then(|| {4 |
258 | 4 | let mut b = BooleanBufferBuilder::new(len); |
259 | 15 | for s11 in &structs { |
260 | 11 | match s.nulls() { |
261 | 5 | Some(n) => b.append_buffer(n.inner()), |
262 | 6 | None => b.append_n(s.len(), true), |
263 | | } |
264 | | } |
265 | 4 | NullBuffer::new(b.finish()) |
266 | 4 | }); |
267 | | |
268 | 5 | let column_concat_result = (0..fields.len()) |
269 | 10 | .map5 (|i| { |
270 | 10 | let extracted_cols = structs |
271 | 10 | .iter() |
272 | 27 | .map10 (|s| s.column(i).as_ref()) |
273 | 10 | .collect::<Vec<_>>(); |
274 | 10 | concat(&extracted_cols) |
275 | 10 | }) |
276 | 5 | .collect::<Result<Vec<_>, ArrowError>>()?0 ; |
277 | | |
278 | 5 | Ok(Arc::new(StructArray::try_new_with_length( |
279 | 5 | fields.clone(), |
280 | 5 | column_concat_result, |
281 | 5 | nulls, |
282 | 5 | len, |
283 | 0 | )?)) |
284 | 5 | } |
285 | | |
286 | | /// Concatenate multiple RunArray instances into a single RunArray. |
287 | | /// |
288 | | /// This function handles the special case of concatenating RunArrays by: |
289 | | /// 1. Collecting all run ends and values from input arrays |
290 | | /// 2. Adjusting run ends to account for the length of previous arrays |
291 | | /// 3. Creating a new RunArray with the combined data |
292 | 0 | fn concat_run_arrays<R: RunEndIndexType>(arrays: &[&dyn Array]) -> Result<ArrayRef, ArrowError> |
293 | 0 | where |
294 | 0 | R::Native: Add<Output = R::Native>, |
295 | | { |
296 | 0 | let run_arrays: Vec<_> = arrays |
297 | 0 | .iter() |
298 | 0 | .map(|x| x.as_run::<R>()) |
299 | 0 | .filter(|x| !x.run_ends().is_empty()) |
300 | 0 | .collect(); |
301 | | |
302 | | // The run ends need to be adjusted by the sum of the lengths of the previous arrays. |
303 | 0 | let needed_run_end_adjustments = std::iter::once(R::default_value()) |
304 | 0 | .chain( |
305 | 0 | run_arrays |
306 | 0 | .iter() |
307 | 0 | .scan(R::default_value(), |acc, run_array| { |
308 | 0 | *acc = *acc + *run_array.run_ends().values().last().unwrap(); |
309 | 0 | Some(*acc) |
310 | 0 | }), |
311 | | ) |
312 | 0 | .collect::<Vec<_>>(); |
313 | | |
314 | | // This works out nicely to be the total (logical) length of the resulting array. |
315 | 0 | let total_len = needed_run_end_adjustments.last().unwrap().as_usize(); |
316 | | |
317 | 0 | let run_ends_array = |
318 | 0 | PrimitiveArray::<R>::from_iter_values(run_arrays.iter().enumerate().flat_map( |
319 | 0 | move |(i, run_array)| { |
320 | 0 | let adjustment = needed_run_end_adjustments[i]; |
321 | 0 | run_array |
322 | 0 | .run_ends() |
323 | 0 | .values() |
324 | 0 | .iter() |
325 | 0 | .map(move |run_end| *run_end + adjustment) |
326 | 0 | }, |
327 | | )); |
328 | | |
329 | 0 | let all_values = concat( |
330 | 0 | &run_arrays |
331 | 0 | .iter() |
332 | 0 | .map(|x| x.values().as_ref()) |
333 | 0 | .collect::<Vec<_>>(), |
334 | 0 | )?; |
335 | | |
336 | 0 | let builder = ArrayDataBuilder::new(run_arrays[0].data_type().clone()) |
337 | 0 | .len(total_len) |
338 | 0 | .child_data(vec![run_ends_array.into_data(), all_values.into_data()]); |
339 | | |
340 | | // `build_unchecked` is used to avoid recursive validation of child arrays. |
341 | 0 | let array_data = unsafe { builder.build_unchecked() }; |
342 | 0 | array_data.validate_data()?; |
343 | | |
344 | 0 | Ok(Arc::<RunArray<R>>::new(array_data.into())) |
345 | 0 | } |
346 | | |
/// Dispatches to `concat_dictionaries` for the dictionary key type `$t`.
///
/// Expands to a `return`, so it may only be used inside a function whose return
/// type matches that of `concat_dictionaries`.
macro_rules! dict_helper {
    ($t:ty, $arrays:expr) => {
        // `concat_dictionaries` already produces an `ArrayRef`; returning it as-is
        // avoids wrapping it in a second `Arc` (an extra allocation and indirection).
        return concat_dictionaries::<$t>($arrays)
    };
}
352 | | |
/// Dispatches to `concat_primitives` for the primitive type `$t`.
///
/// Expands to a `return`, so it may only be used inside a function whose return
/// type matches that of `concat_primitives`.
macro_rules! primitive_concat {
    ($t:ty, $arrays:expr) => {
        // `concat_primitives` already produces an `ArrayRef`; returning it as-is
        // avoids wrapping it in a second `Arc` (an extra allocation and indirection).
        return concat_primitives::<$t>($arrays)
    };
}
358 | | |
359 | 6 | fn get_capacity(arrays: &[&dyn Array], data_type: &DataType) -> Capacities { |
360 | 6 | match data_type { |
361 | 0 | DataType::Utf8 => binary_capacity::<Utf8Type>(arrays), |
362 | 0 | DataType::LargeUtf8 => binary_capacity::<LargeUtf8Type>(arrays), |
363 | 0 | DataType::Binary => binary_capacity::<BinaryType>(arrays), |
364 | 0 | DataType::LargeBinary => binary_capacity::<LargeBinaryType>(arrays), |
365 | 0 | DataType::FixedSizeList(_, _) => fixed_size_list_capacity(arrays, data_type), |
366 | 15 | _ => Capacities::Array(arrays6 .iter6 ().map6 (|a| a.len()).sum6 ()), |
367 | | } |
368 | 6 | } |
369 | | |
370 | | /// Concatenate multiple [Array] of the same type into a single [ArrayRef]. |
371 | 523 | pub fn concat(arrays: &[&dyn Array]) -> Result<ArrayRef, ArrowError> { |
372 | 523 | if arrays.is_empty() { |
373 | 0 | return Err(ArrowError::ComputeError( |
374 | 0 | "concat requires input of at least one array".to_string(), |
375 | 0 | )); |
376 | 523 | } else if arrays.len() == 1 { |
377 | 426 | let array = arrays[0]; |
378 | 426 | return Ok(array.slice(0, array.len())); |
379 | 97 | } |
380 | | |
381 | 97 | let d = arrays[0].data_type(); |
382 | 200 | if arrays97 .iter().skip(1).any97 (|array| array.data_type() != d) { |
383 | | // Create error message with up to 10 unique data types in the order they appear |
384 | 0 | let error_message = { |
385 | | // 10 max unique data types to print and another 1 to know if there are more |
386 | 0 | let mut unique_data_types = HashSet::with_capacity(11); |
387 | | |
388 | 0 | let mut error_message = |
389 | 0 | format!("It is not possible to concatenate arrays of different data types ({d}"); |
390 | 0 | unique_data_types.insert(d); |
391 | | |
392 | 0 | for array in arrays { |
393 | 0 | let is_unique = unique_data_types.insert(array.data_type()); |
394 | | |
395 | 0 | if unique_data_types.len() == 11 { |
396 | 0 | error_message.push_str(", ..."); |
397 | 0 | break; |
398 | 0 | } |
399 | | |
400 | 0 | if is_unique { |
401 | 0 | error_message.push_str(", "); |
402 | 0 | error_message.push_str(&array.data_type().to_string()); |
403 | 0 | } |
404 | | } |
405 | | |
406 | 0 | error_message.push_str(")."); |
407 | | |
408 | 0 | error_message |
409 | | }; |
410 | | |
411 | 0 | return Err(ArrowError::InvalidArgumentError(error_message)); |
412 | 97 | } |
413 | | |
414 | 0 | downcast_primitive! { |
415 | 5 | d => (primitive_concat, arrays0 ), |
416 | 5 | DataType::Boolean => concat_boolean(arrays), |
417 | 3 | DataType::Dictionary(k, _) => { |
418 | 0 | downcast_integer! { |
419 | 3 | k.as_ref() => (dict_helper, arrays0 ), |
420 | 0 | _ => unreachable!("illegal dictionary key type {k}") |
421 | | } |
422 | | } |
423 | 8 | DataType::List(field) => concat_lists::<i32>(arrays, field), |
424 | 0 | DataType::LargeList(field) => concat_lists::<i64>(arrays, field), |
425 | 5 | DataType::Struct(fields) => concat_structs(arrays, fields), |
426 | 2 | DataType::Utf8 => concat_bytes::<Utf8Type>(arrays), |
427 | 0 | DataType::LargeUtf8 => concat_bytes::<LargeUtf8Type>(arrays), |
428 | 11 | DataType::Binary => concat_bytes::<BinaryType>(arrays), |
429 | 0 | DataType::LargeBinary => concat_bytes::<LargeBinaryType>(arrays), |
430 | 0 | DataType::RunEndEncoded(r, _) => { |
431 | | // Handle RunEndEncoded arrays with special concat function |
432 | | // We need to downcast based on the run end type |
433 | 0 | match r.data_type() { |
434 | 0 | DataType::Int16 => concat_run_arrays::<Int16Type>(arrays), |
435 | 0 | DataType::Int32 => concat_run_arrays::<Int32Type>(arrays), |
436 | 0 | DataType::Int64 => concat_run_arrays::<Int64Type>(arrays), |
437 | 0 | _ => unreachable!("Unsupported run end index type: {r:?}"), |
438 | | } |
439 | | } |
440 | 0 | DataType::Utf8View => concat_byte_view::<StringViewType>(arrays), |
441 | 0 | DataType::BinaryView => concat_byte_view::<BinaryViewType>(arrays), |
442 | | _ => { |
443 | 6 | let capacity = get_capacity(arrays, d); |
444 | 6 | concat_fallback(arrays, capacity) |
445 | | } |
446 | | } |
447 | 523 | } |
448 | | |
449 | | /// Concatenates arrays using MutableArrayData |
450 | | /// |
451 | | /// This will naively concatenate dictionaries |
452 | 6 | fn concat_fallback(arrays: &[&dyn Array], capacity: Capacities) -> Result<ArrayRef, ArrowError> { |
453 | 15 | let array_data6 : Vec<_>6 = arrays6 .iter6 ().map6 (|a| a.to_data()).collect6 ::<Vec<_>>(); |
454 | 6 | let array_data = array_data.iter().collect(); |
455 | 6 | let mut mutable = MutableArrayData::with_capacities(array_data, false, capacity); |
456 | | |
457 | 15 | for (i, a) in arrays6 .iter6 ().enumerate6 () { |
458 | 15 | mutable.extend(i, 0, a.len()) |
459 | | } |
460 | | |
461 | 6 | Ok(make_array(mutable.freeze())) |
462 | 6 | } |
463 | | |
464 | | /// Concatenates `batches` together into a single [`RecordBatch`]. |
465 | | /// |
466 | | /// The output batch has the specified `schemas`; The schema of the |
467 | | /// input are ignored. |
468 | | /// |
469 | | /// Returns an error if the types of underlying arrays are different. |
470 | 74 | pub fn concat_batches<'a>( |
471 | 74 | schema: &SchemaRef, |
472 | 74 | input_batches: impl IntoIterator<Item = &'a RecordBatch>, |
473 | 74 | ) -> Result<RecordBatch, ArrowError> { |
474 | | // When schema is empty, sum the number of the rows of all batches |
475 | 74 | if schema.fields().is_empty() { |
476 | 0 | let num_rows: usize = input_batches.into_iter().map(RecordBatch::num_rows).sum(); |
477 | 0 | let mut options = RecordBatchOptions::default(); |
478 | 0 | options.row_count = Some(num_rows); |
479 | 0 | return RecordBatch::try_new_with_options(schema.clone(), vec![], &options); |
480 | 74 | } |
481 | | |
482 | 74 | let batches: Vec<&RecordBatch> = input_batches.into_iter().collect(); |
483 | 74 | if batches.is_empty() { |
484 | 0 | return Ok(RecordBatch::new_empty(schema.clone())); |
485 | 74 | } |
486 | 74 | let field_num = schema.fields().len(); |
487 | 74 | let mut arrays = Vec::with_capacity(field_num); |
488 | 505 | for i in 0..field_num74 { |
489 | 505 | let array = concat( |
490 | 505 | &batches |
491 | 505 | .iter() |
492 | 673 | .map505 (|batch| batch.column(i).as_ref()) |
493 | 505 | .collect::<Vec<_>>(), |
494 | 0 | )?; |
495 | 505 | arrays.push(array); |
496 | | } |
497 | 74 | RecordBatch::try_new(schema.clone(), arrays) |
498 | 74 | } |
499 | | |
500 | | #[cfg(test)] |
501 | | mod tests { |
502 | | use super::*; |
503 | | use arrow_array::builder::{GenericListBuilder, StringDictionaryBuilder}; |
504 | | use arrow_schema::{Field, Schema}; |
505 | | use std::fmt::Debug; |
506 | | |
#[test]
fn test_concat_empty_vec() {
    // Concatenating no arrays at all must be reported as an error.
    assert!(concat(&[]).is_err());
}
512 | | |
#[test]
fn test_concat_batches_no_columns() {
    // Batches without columns still carry a row count, which must be summed.
    let schema = Arc::new(Schema::empty());

    let mut options = RecordBatchOptions::default();
    options.row_count = Some(100);
    let first = RecordBatch::try_new_with_options(schema.clone(), vec![], &options).unwrap();
    let second = first.clone();

    // two batches of 100 rows each
    let combined = concat_batches(&schema, &[first, second]).unwrap();

    assert_eq!(combined.num_rows(), 200);
}
526 | | |
#[test]
fn test_concat_one_element_vec() {
    let input: ArrayRef = Arc::new(Int64Array::from(vec![Some(-1), Some(2), None]));

    let output = concat(&[input.as_ref()]).unwrap();

    assert_eq!(
        &input, &output,
        "concatenating single element array gives back the same result"
    );
}
540 | | |
#[test]
fn test_concat_incompatible_datatypes() {
    let int64 = Int64Array::from(vec![Some(-1), Some(2), None]);
    // Two string arrays to make sure only unique types are mentioned
    let utf8_first = StringArray::from(vec![Some("hello"), Some("bar"), Some("world")]);
    let utf8_second = StringArray::from(vec![Some("hey"), Some(""), Some("you")]);
    // Another type to make sure all the incompatible types are shown
    let int32 = Int32Array::from(vec![Some(-1), Some(2), None]);

    let error = concat(&[&int64, &utf8_first, &utf8_second, &int32]).unwrap_err();

    assert_eq!(error.to_string(), "Invalid argument error: It is not possible to concatenate arrays of different data types (Int64, Utf8, Int32).");
}
554 | | |
#[test]
fn test_concat_10_incompatible_datatypes_should_include_all_of_them() {
    let inputs: Vec<ArrayRef> = vec![
        Arc::new(Int64Array::from(vec![Some(-1), Some(2), None])),
        // Two string arrays to make sure only unique types are mentioned
        Arc::new(StringArray::from(vec![Some("hello"), Some("bar"), Some("world")])),
        Arc::new(StringArray::from(vec![Some("hey"), Some(""), Some("you")])),
        // More types to make sure all the incompatible types are shown
        Arc::new(Int32Array::from(vec![Some(-1), Some(2), None])),
        Arc::new(Int8Array::from(vec![Some(-1), Some(2), None])),
        Arc::new(Int16Array::from(vec![Some(-1), Some(2), None])),
        Arc::new(UInt8Array::from(vec![Some(1), Some(2), None])),
        Arc::new(UInt16Array::from(vec![Some(1), Some(2), None])),
        Arc::new(UInt32Array::from(vec![Some(1), Some(2), None])),
        // Non unique
        Arc::new(UInt16Array::from(vec![Some(1), Some(2), None])),
        Arc::new(UInt64Array::from(vec![Some(1), Some(2), None])),
        Arc::new(Float32Array::from(vec![Some(1.0), Some(2.0), None])),
    ];
    let input_refs: Vec<&dyn Array> = inputs.iter().map(|a| a.as_ref()).collect();

    let error = concat(&input_refs).unwrap_err();

    assert_eq!(error.to_string(), "Invalid argument error: It is not possible to concatenate arrays of different data types (Int64, Utf8, Int32, Int8, Int16, UInt8, UInt16, UInt32, UInt64, Float32).");
}
577 | | |
#[test]
fn test_concat_11_incompatible_datatypes_should_only_include_10() {
    let inputs: Vec<ArrayRef> = vec![
        Arc::new(Int64Array::from(vec![Some(-1), Some(2), None])),
        // Two string arrays to make sure only unique types are mentioned
        Arc::new(StringArray::from(vec![Some("hello"), Some("bar"), Some("world")])),
        Arc::new(StringArray::from(vec![Some("hey"), Some(""), Some("you")])),
        // More types to make sure all the incompatible types are shown
        Arc::new(Int32Array::from(vec![Some(-1), Some(2), None])),
        Arc::new(Int8Array::from(vec![Some(-1), Some(2), None])),
        Arc::new(Int16Array::from(vec![Some(-1), Some(2), None])),
        Arc::new(UInt8Array::from(vec![Some(1), Some(2), None])),
        Arc::new(UInt16Array::from(vec![Some(1), Some(2), None])),
        Arc::new(UInt32Array::from(vec![Some(1), Some(2), None])),
        // Non unique
        Arc::new(UInt16Array::from(vec![Some(1), Some(2), None])),
        Arc::new(UInt64Array::from(vec![Some(1), Some(2), None])),
        Arc::new(Float32Array::from(vec![Some(1.0), Some(2.0), None])),
        // The eleventh unique type, which must be replaced by an ellipsis
        Arc::new(Float64Array::from(vec![Some(1.0), Some(2.0), None])),
    ];
    let input_refs: Vec<&dyn Array> = inputs.iter().map(|a| a.as_ref()).collect();

    let error = concat(&input_refs).unwrap_err();

    assert_eq!(error.to_string(), "Invalid argument error: It is not possible to concatenate arrays of different data types (Int64, Utf8, Int32, Int8, Int16, UInt8, UInt16, UInt32, UInt64, Float32, ...).");
}
601 | | |
#[test]
fn test_concat_13_incompatible_datatypes_should_not_include_all_of_them() {
    let inputs: Vec<ArrayRef> = vec![
        Arc::new(Int64Array::from(vec![Some(-1), Some(2), None])),
        // Two string arrays to make sure only unique types are mentioned
        Arc::new(StringArray::from(vec![Some("hello"), Some("bar"), Some("world")])),
        Arc::new(StringArray::from(vec![Some("hey"), Some(""), Some("you")])),
        // More types to make sure all the incompatible types are shown
        Arc::new(Int32Array::from(vec![Some(-1), Some(2), None])),
        Arc::new(Int8Array::from(vec![Some(-1), Some(2), None])),
        Arc::new(Int16Array::from(vec![Some(-1), Some(2), None])),
        Arc::new(UInt8Array::from(vec![Some(1), Some(2), None])),
        Arc::new(UInt16Array::from(vec![Some(1), Some(2), None])),
        Arc::new(UInt32Array::from(vec![Some(1), Some(2), None])),
        // Non unique
        Arc::new(UInt16Array::from(vec![Some(1), Some(2), None])),
        Arc::new(UInt64Array::from(vec![Some(1), Some(2), None])),
        Arc::new(Float32Array::from(vec![Some(1.0), Some(2.0), None])),
        // Unique types beyond the tenth, none of which may be listed
        Arc::new(Float64Array::from(vec![Some(1.0), Some(2.0), None])),
        Arc::new(Float16Array::new_null(3)),
        Arc::new(BooleanArray::from(vec![Some(true), Some(false), None])),
    ];
    let input_refs: Vec<&dyn Array> = inputs.iter().map(|a| a.as_ref()).collect();

    let error = concat(&input_refs).unwrap_err();

    assert_eq!(error.to_string(), "Invalid argument error: It is not possible to concatenate arrays of different data types (Int64, Utf8, Int32, Int8, Int16, UInt8, UInt16, UInt32, UInt64, Float32, ...).");
}
627 | | |
#[test]
fn test_concat_string_arrays() {
    let first = StringArray::from(vec!["hello", "world"]);
    let second = StringArray::from(vec!["2", "3", "4"]);
    let third = StringArray::from(vec![Some("foo"), Some("bar"), None, Some("baz")]);

    let actual = concat(&[&first, &second, &third]).unwrap();

    // All values in input order, with the null preserved
    let expected: ArrayRef = Arc::new(StringArray::from(vec![
        Some("hello"),
        Some("world"),
        Some("2"),
        Some("3"),
        Some("4"),
        Some("foo"),
        Some("bar"),
        None,
        Some("baz"),
    ]));

    assert_eq!(&actual, &expected);
}
651 | | |
#[test]
fn test_concat_string_view_arrays() {
    let first = StringViewArray::from(vec!["helloxxxxxxxxxxa", "world____________"]);
    let second = StringViewArray::from(vec!["helloxxxxxxxxxxy", "3", "4"]);
    let third = StringViewArray::from(vec![Some("foo"), Some("bar"), None, Some("baz")]);

    let actual = concat(&[&first, &second, &third]).unwrap();

    // All values in input order, with the null preserved
    let expected: ArrayRef = Arc::new(StringViewArray::from(vec![
        Some("helloxxxxxxxxxxa"),
        Some("world____________"),
        Some("helloxxxxxxxxxxy"),
        Some("3"),
        Some("4"),
        Some("foo"),
        Some("bar"),
        None,
        Some("baz"),
    ]));

    assert_eq!(&actual, &expected);
}
675 | | |
#[test]
fn test_concat_primitive_arrays() {
    let first = Int64Array::from(vec![Some(-1), Some(-1), Some(2), None, None]);
    let second = Int64Array::from(vec![Some(101), Some(102), Some(103), None]);
    let third = Int64Array::from(vec![Some(256), Some(512), Some(1024)]);

    let actual = concat(&[&first, &second, &third]).unwrap();

    // All values in input order, with nulls preserved
    let expected: ArrayRef = Arc::new(Int64Array::from(vec![
        Some(-1),
        Some(-1),
        Some(2),
        None,
        None,
        Some(101),
        Some(102),
        Some(103),
        None,
        Some(256),
        Some(512),
        Some(1024),
    ]));

    assert_eq!(&actual, &expected);
}
702 | | |
#[test]
fn test_concat_primitive_array_slices() {
    // Both inputs are slices that skip their first element.
    let first_slice =
        Int64Array::from(vec![Some(-1), Some(-1), Some(2), None, None]).slice(1, 3);
    let second_slice = Int64Array::from(vec![Some(101), Some(102), Some(103), None]).slice(1, 3);

    let actual = concat(&[&first_slice, &second_slice]).unwrap();

    let expected: ArrayRef = Arc::new(Int64Array::from(vec![
        Some(-1),
        Some(2),
        None,
        Some(102),
        Some(103),
        None,
    ]));

    assert_eq!(&actual, &expected);
}
725 | | |
#[test]
fn test_concat_boolean_primitive_arrays() {
    let first = BooleanArray::from(vec![
        Some(true),
        Some(true),
        Some(false),
        None,
        None,
        Some(false),
    ]);
    let second = BooleanArray::from(vec![None, Some(false), Some(true), Some(false)]);

    let actual = concat(&[&first, &second]).unwrap();

    // All values in input order, with nulls preserved
    let expected: ArrayRef = Arc::new(BooleanArray::from(vec![
        Some(true),
        Some(true),
        Some(false),
        None,
        None,
        Some(false),
        None,
        Some(false),
        Some(true),
        Some(false),
    ]));

    assert_eq!(&actual, &expected);
}
756 | | |
#[test]
fn test_concat_primitive_list_arrays() {
    let rows_a = vec![
        Some(vec![Some(-1), Some(-1), Some(2), None, None]),
        Some(vec![]),
        None,
        Some(vec![Some(10)]),
    ];
    let rows_b = vec![
        None,
        Some(vec![Some(100), None, Some(101)]),
        Some(vec![Some(102)]),
    ];
    let rows_c = vec![Some(vec![Some(1000), Some(1001)])];

    let array_a = ListArray::from_iter_primitive::<Int64Type, _, _>(rows_a.clone());
    let array_b = ListArray::from_iter_primitive::<Int64Type, _, _>(rows_b.clone());
    let array_c = ListArray::from_iter_primitive::<Int64Type, _, _>(rows_c.clone());

    let actual = concat(&[&array_a, &array_b, &array_c]).unwrap();

    // The expected array is built directly from all the rows chained together.
    let all_rows = rows_a.into_iter().chain(rows_b).chain(rows_c);
    let expected = ListArray::from_iter_primitive::<Int64Type, _, _>(all_rows);

    assert_eq!(actual.as_ref(), &expected as &dyn Array);
}
784 | | |
785 | | #[test] |
786 | | fn test_concat_primitive_list_arrays_slices() { |
787 | | let list1 = vec![ |
788 | | Some(vec![Some(-1), Some(-1), Some(2), None, None]), |
789 | | Some(vec![]), // In slice |
790 | | None, // In slice |
791 | | Some(vec![Some(10)]), |
792 | | ]; |
793 | | let list1_array = ListArray::from_iter_primitive::<Int64Type, _, _>(list1.clone()); |
794 | | let list1_array = list1_array.slice(1, 2); |
795 | | let list1_values = list1.into_iter().skip(1).take(2); |
796 | | |
797 | | let list2 = vec![ |
798 | | None, |
799 | | Some(vec![Some(100), None, Some(101)]), |
800 | | Some(vec![Some(102)]), |
801 | | ]; |
802 | | let list2_array = ListArray::from_iter_primitive::<Int64Type, _, _>(list2.clone()); |
803 | | |
804 | | // verify that this test covers the case when the first offset is non zero |
805 | | assert!(list1_array.offsets()[0].as_usize() > 0); |
806 | | let array_result = concat(&[&list1_array, &list2_array]).unwrap(); |
807 | | |
808 | | let expected = list1_values.chain(list2); |
809 | | let array_expected = ListArray::from_iter_primitive::<Int64Type, _, _>(expected); |
810 | | |
811 | | assert_eq!(array_result.as_ref(), &array_expected as &dyn Array); |
812 | | } |
813 | | |
814 | | #[test] |
815 | | fn test_concat_primitive_list_arrays_sliced_lengths() { |
816 | | let list1 = vec![ |
817 | | Some(vec![Some(-1), Some(-1), Some(2), None, None]), // In slice |
818 | | Some(vec![]), // In slice |
819 | | None, // In slice |
820 | | Some(vec![Some(10)]), |
821 | | ]; |
822 | | let list1_array = ListArray::from_iter_primitive::<Int64Type, _, _>(list1.clone()); |
823 | | let list1_array = list1_array.slice(0, 3); // no offset, but not all values |
824 | | let list1_values = list1.into_iter().take(3); |
825 | | |
826 | | let list2 = vec![ |
827 | | None, |
828 | | Some(vec![Some(100), None, Some(101)]), |
829 | | Some(vec![Some(102)]), |
830 | | ]; |
831 | | let list2_array = ListArray::from_iter_primitive::<Int64Type, _, _>(list2.clone()); |
832 | | |
833 | | // verify that this test covers the case when the first offset is zero, but the |
834 | | // last offset doesn't cover the entire array |
835 | | assert_eq!(list1_array.offsets()[0].as_usize(), 0); |
836 | | assert!(list1_array.offsets().last().unwrap().as_usize() < list1_array.values().len()); |
837 | | let array_result = concat(&[&list1_array, &list2_array]).unwrap(); |
838 | | |
839 | | let expected = list1_values.chain(list2); |
840 | | let array_expected = ListArray::from_iter_primitive::<Int64Type, _, _>(expected); |
841 | | |
842 | | assert_eq!(array_result.as_ref(), &array_expected as &dyn Array); |
843 | | } |
844 | | |
845 | | #[test] |
846 | | fn test_concat_primitive_fixed_size_list_arrays() { |
847 | | let list1 = vec![ |
848 | | Some(vec![Some(-1), None]), |
849 | | None, |
850 | | Some(vec![Some(10), Some(20)]), |
851 | | ]; |
852 | | let list1_array = |
853 | | FixedSizeListArray::from_iter_primitive::<Int64Type, _, _>(list1.clone(), 2); |
854 | | |
855 | | let list2 = vec![ |
856 | | None, |
857 | | Some(vec![Some(100), None]), |
858 | | Some(vec![Some(102), Some(103)]), |
859 | | ]; |
860 | | let list2_array = |
861 | | FixedSizeListArray::from_iter_primitive::<Int64Type, _, _>(list2.clone(), 2); |
862 | | |
863 | | let list3 = vec![Some(vec![Some(1000), Some(1001)])]; |
864 | | let list3_array = |
865 | | FixedSizeListArray::from_iter_primitive::<Int64Type, _, _>(list3.clone(), 2); |
866 | | |
867 | | let array_result = concat(&[&list1_array, &list2_array, &list3_array]).unwrap(); |
868 | | |
869 | | let expected = list1.into_iter().chain(list2).chain(list3); |
870 | | let array_expected = |
871 | | FixedSizeListArray::from_iter_primitive::<Int64Type, _, _>(expected, 2); |
872 | | |
873 | | assert_eq!(array_result.as_ref(), &array_expected as &dyn Array); |
874 | | } |
875 | | |
876 | | #[test] |
877 | | fn test_concat_struct_arrays() { |
878 | | let field = Arc::new(Field::new("field", DataType::Int64, true)); |
879 | | let input_primitive_1: ArrayRef = Arc::new(PrimitiveArray::<Int64Type>::from(vec![ |
880 | | Some(-1), |
881 | | Some(-1), |
882 | | Some(2), |
883 | | None, |
884 | | None, |
885 | | ])); |
886 | | let input_struct_1 = StructArray::from(vec![(field.clone(), input_primitive_1)]); |
887 | | |
888 | | let input_primitive_2: ArrayRef = Arc::new(PrimitiveArray::<Int64Type>::from(vec![ |
889 | | Some(101), |
890 | | Some(102), |
891 | | Some(103), |
892 | | None, |
893 | | ])); |
894 | | let input_struct_2 = StructArray::from(vec![(field.clone(), input_primitive_2)]); |
895 | | |
896 | | let input_primitive_3: ArrayRef = Arc::new(PrimitiveArray::<Int64Type>::from(vec![ |
897 | | Some(256), |
898 | | Some(512), |
899 | | Some(1024), |
900 | | ])); |
901 | | let input_struct_3 = StructArray::from(vec![(field, input_primitive_3)]); |
902 | | |
903 | | let arr = concat(&[&input_struct_1, &input_struct_2, &input_struct_3]).unwrap(); |
904 | | |
905 | | let expected_primitive_output = Arc::new(PrimitiveArray::<Int64Type>::from(vec![ |
906 | | Some(-1), |
907 | | Some(-1), |
908 | | Some(2), |
909 | | None, |
910 | | None, |
911 | | Some(101), |
912 | | Some(102), |
913 | | Some(103), |
914 | | None, |
915 | | Some(256), |
916 | | Some(512), |
917 | | Some(1024), |
918 | | ])) as ArrayRef; |
919 | | |
920 | | let actual_primitive = arr |
921 | | .as_any() |
922 | | .downcast_ref::<StructArray>() |
923 | | .unwrap() |
924 | | .column(0); |
925 | | assert_eq!(actual_primitive, &expected_primitive_output); |
926 | | } |
927 | | |
928 | | #[test] |
929 | | fn test_concat_struct_array_slices() { |
930 | | let field = Arc::new(Field::new("field", DataType::Int64, true)); |
931 | | let input_primitive_1: ArrayRef = Arc::new(PrimitiveArray::<Int64Type>::from(vec![ |
932 | | Some(-1), |
933 | | Some(-1), |
934 | | Some(2), |
935 | | None, |
936 | | None, |
937 | | ])); |
938 | | let input_struct_1 = StructArray::from(vec![(field.clone(), input_primitive_1)]); |
939 | | |
940 | | let input_primitive_2: ArrayRef = Arc::new(PrimitiveArray::<Int64Type>::from(vec![ |
941 | | Some(101), |
942 | | Some(102), |
943 | | Some(103), |
944 | | None, |
945 | | ])); |
946 | | let input_struct_2 = StructArray::from(vec![(field, input_primitive_2)]); |
947 | | |
948 | | let arr = concat(&[&input_struct_1.slice(1, 3), &input_struct_2.slice(1, 2)]).unwrap(); |
949 | | |
950 | | let expected_primitive_output = Arc::new(PrimitiveArray::<Int64Type>::from(vec![ |
951 | | Some(-1), |
952 | | Some(2), |
953 | | None, |
954 | | Some(102), |
955 | | Some(103), |
956 | | ])) as ArrayRef; |
957 | | |
958 | | let actual_primitive = arr |
959 | | .as_any() |
960 | | .downcast_ref::<StructArray>() |
961 | | .unwrap() |
962 | | .column(0); |
963 | | assert_eq!(actual_primitive, &expected_primitive_output); |
964 | | } |
965 | | |
966 | | #[test] |
967 | | fn test_concat_struct_arrays_no_nulls() { |
968 | | let input_1a = vec![1, 2, 3]; |
969 | | let input_1b = vec!["one", "two", "three"]; |
970 | | let input_2a = vec![4, 5, 6, 7]; |
971 | | let input_2b = vec!["four", "five", "six", "seven"]; |
972 | | |
973 | | let struct_from_primitives = |ints: Vec<i64>, strings: Vec<&str>| { |
974 | | StructArray::try_from(vec![ |
975 | | ("ints", Arc::new(Int64Array::from(ints)) as _), |
976 | | ("strings", Arc::new(StringArray::from(strings)) as _), |
977 | | ]) |
978 | | }; |
979 | | |
980 | | let expected_output = struct_from_primitives( |
981 | | [input_1a.clone(), input_2a.clone()].concat(), |
982 | | [input_1b.clone(), input_2b.clone()].concat(), |
983 | | ) |
984 | | .unwrap(); |
985 | | |
986 | | let input_1 = struct_from_primitives(input_1a, input_1b).unwrap(); |
987 | | let input_2 = struct_from_primitives(input_2a, input_2b).unwrap(); |
988 | | |
989 | | let arr = concat(&[&input_1, &input_2]).unwrap(); |
990 | | let struct_result = arr.as_struct(); |
991 | | |
992 | | assert_eq!(struct_result, &expected_output); |
993 | | assert_eq!(arr.null_count(), 0); |
994 | | } |
995 | | |
996 | | #[test] |
997 | | fn test_concat_struct_no_fields() { |
998 | | let input_1 = StructArray::new_empty_fields(10, None); |
999 | | let input_2 = StructArray::new_empty_fields(10, None); |
1000 | | let arr = concat(&[&input_1, &input_2]).unwrap(); |
1001 | | |
1002 | | assert_eq!(arr.len(), 20); |
1003 | | assert_eq!(arr.null_count(), 0); |
1004 | | |
1005 | | let input1_valid = StructArray::new_empty_fields(10, Some(NullBuffer::new_valid(10))); |
1006 | | let input2_null = StructArray::new_empty_fields(10, Some(NullBuffer::new_null(10))); |
1007 | | let arr = concat(&[&input1_valid, &input2_null]).unwrap(); |
1008 | | |
1009 | | assert_eq!(arr.len(), 20); |
1010 | | assert_eq!(arr.null_count(), 10); |
1011 | | } |
1012 | | |
1013 | | #[test] |
1014 | | fn test_string_array_slices() { |
1015 | | let input_1 = StringArray::from(vec!["hello", "A", "B", "C"]); |
1016 | | let input_2 = StringArray::from(vec!["world", "D", "E", "Z"]); |
1017 | | |
1018 | | let arr = concat(&[&input_1.slice(1, 3), &input_2.slice(1, 2)]).unwrap(); |
1019 | | |
1020 | | let expected_output = StringArray::from(vec!["A", "B", "C", "D", "E"]); |
1021 | | |
1022 | | let actual_output = arr.as_any().downcast_ref::<StringArray>().unwrap(); |
1023 | | assert_eq!(actual_output, &expected_output); |
1024 | | } |
1025 | | |
1026 | | #[test] |
1027 | | fn test_string_array_with_null_slices() { |
1028 | | let input_1 = StringArray::from(vec![Some("hello"), None, Some("A"), Some("C")]); |
1029 | | let input_2 = StringArray::from(vec![None, Some("world"), Some("D"), None]); |
1030 | | |
1031 | | let arr = concat(&[&input_1.slice(1, 3), &input_2.slice(1, 2)]).unwrap(); |
1032 | | |
1033 | | let expected_output = |
1034 | | StringArray::from(vec![None, Some("A"), Some("C"), Some("world"), Some("D")]); |
1035 | | |
1036 | | let actual_output = arr.as_any().downcast_ref::<StringArray>().unwrap(); |
1037 | | assert_eq!(actual_output, &expected_output); |
1038 | | } |
1039 | | |
1040 | | fn collect_string_dictionary(array: &DictionaryArray<Int32Type>) -> Vec<Option<&str>> { |
1041 | | let concrete = array.downcast_dict::<StringArray>().unwrap(); |
1042 | | concrete.into_iter().collect() |
1043 | | } |
1044 | | |
1045 | | #[test] |
1046 | | fn test_string_dictionary_array() { |
1047 | | let input_1: DictionaryArray<Int32Type> = vec!["hello", "A", "B", "hello", "hello", "C"] |
1048 | | .into_iter() |
1049 | | .collect(); |
1050 | | let input_2: DictionaryArray<Int32Type> = vec!["hello", "E", "E", "hello", "F", "E"] |
1051 | | .into_iter() |
1052 | | .collect(); |
1053 | | |
1054 | | let expected: Vec<_> = vec![ |
1055 | | "hello", "A", "B", "hello", "hello", "C", "hello", "E", "E", "hello", "F", "E", |
1056 | | ] |
1057 | | .into_iter() |
1058 | | .map(Some) |
1059 | | .collect(); |
1060 | | |
1061 | | let concat = concat(&[&input_1 as _, &input_2 as _]).unwrap(); |
1062 | | let dictionary = concat.as_dictionary::<Int32Type>(); |
1063 | | let actual = collect_string_dictionary(dictionary); |
1064 | | assert_eq!(actual, expected); |
1065 | | |
1066 | | // Should have concatenated inputs together |
1067 | | assert_eq!( |
1068 | | dictionary.values().len(), |
1069 | | input_1.values().len() + input_2.values().len(), |
1070 | | ) |
1071 | | } |
1072 | | |
1073 | | #[test] |
1074 | | fn test_string_dictionary_array_nulls() { |
1075 | | let input_1: DictionaryArray<Int32Type> = vec![Some("foo"), Some("bar"), None, Some("fiz")] |
1076 | | .into_iter() |
1077 | | .collect(); |
1078 | | let input_2: DictionaryArray<Int32Type> = vec![None].into_iter().collect(); |
1079 | | let expected = vec![Some("foo"), Some("bar"), None, Some("fiz"), None]; |
1080 | | |
1081 | | let concat = concat(&[&input_1 as _, &input_2 as _]).unwrap(); |
1082 | | let dictionary = concat.as_dictionary::<Int32Type>(); |
1083 | | let actual = collect_string_dictionary(dictionary); |
1084 | | assert_eq!(actual, expected); |
1085 | | |
1086 | | // Should have concatenated inputs together |
1087 | | assert_eq!( |
1088 | | dictionary.values().len(), |
1089 | | input_1.values().len() + input_2.values().len(), |
1090 | | ) |
1091 | | } |
1092 | | |
1093 | | #[test] |
1094 | | fn test_string_dictionary_array_nulls_in_values() { |
1095 | | let input_1_keys = Int32Array::from_iter_values([0, 2, 1, 3]); |
1096 | | let input_1_values = StringArray::from(vec![Some("foo"), None, Some("bar"), Some("fiz")]); |
1097 | | let input_1 = DictionaryArray::new(input_1_keys, Arc::new(input_1_values)); |
1098 | | |
1099 | | let input_2_keys = Int32Array::from_iter_values([0]); |
1100 | | let input_2_values = StringArray::from(vec![None, Some("hello")]); |
1101 | | let input_2 = DictionaryArray::new(input_2_keys, Arc::new(input_2_values)); |
1102 | | |
1103 | | let expected = vec![Some("foo"), Some("bar"), None, Some("fiz"), None]; |
1104 | | |
1105 | | let concat = concat(&[&input_1 as _, &input_2 as _]).unwrap(); |
1106 | | let dictionary = concat.as_dictionary::<Int32Type>(); |
1107 | | let actual = collect_string_dictionary(dictionary); |
1108 | | assert_eq!(actual, expected); |
1109 | | } |
1110 | | |
1111 | | #[test] |
1112 | | fn test_string_dictionary_merge() { |
1113 | | let mut builder = StringDictionaryBuilder::<Int32Type>::new(); |
1114 | | for i in 0..20 { |
1115 | | builder.append(i.to_string()).unwrap(); |
1116 | | } |
1117 | | let input_1 = builder.finish(); |
1118 | | |
1119 | | let mut builder = StringDictionaryBuilder::<Int32Type>::new(); |
1120 | | for i in 0..30 { |
1121 | | builder.append(i.to_string()).unwrap(); |
1122 | | } |
1123 | | let input_2 = builder.finish(); |
1124 | | |
1125 | | let expected: Vec<_> = (0..20).chain(0..30).map(|x| x.to_string()).collect(); |
1126 | | let expected: Vec<_> = expected.iter().map(|x| Some(x.as_str())).collect(); |
1127 | | |
1128 | | let concat = concat(&[&input_1 as _, &input_2 as _]).unwrap(); |
1129 | | let dictionary = concat.as_dictionary::<Int32Type>(); |
1130 | | let actual = collect_string_dictionary(dictionary); |
1131 | | assert_eq!(actual, expected); |
1132 | | |
1133 | | // Should have merged inputs together |
1134 | | // Not 30 as this is done on a best-effort basis |
1135 | | let values_len = dictionary.values().len(); |
1136 | | assert!((30..40).contains(&values_len), "{values_len}") |
1137 | | } |
1138 | | |
1139 | | #[test] |
1140 | | fn test_primitive_dictionary_merge() { |
1141 | | // Same value repeated 5 times. |
1142 | | let keys = vec![1; 5]; |
1143 | | let values = (10..20).collect::<Vec<_>>(); |
1144 | | let dict = DictionaryArray::new( |
1145 | | Int8Array::from(keys.clone()), |
1146 | | Arc::new(Int32Array::from(values.clone())), |
1147 | | ); |
1148 | | let other = DictionaryArray::new( |
1149 | | Int8Array::from(keys.clone()), |
1150 | | Arc::new(Int32Array::from(values.clone())), |
1151 | | ); |
1152 | | |
1153 | | let result_same_dictionary = concat(&[&dict, &dict]).unwrap(); |
1154 | | // Verify pointer equality check succeeds, and therefore the |
1155 | | // dictionaries are not merged. A single values buffer should be reused |
1156 | | // in this case. |
1157 | | assert!(dict.values().to_data().ptr_eq( |
1158 | | &result_same_dictionary |
1159 | | .as_dictionary::<Int8Type>() |
1160 | | .values() |
1161 | | .to_data() |
1162 | | )); |
1163 | | assert_eq!( |
1164 | | result_same_dictionary |
1165 | | .as_dictionary::<Int8Type>() |
1166 | | .values() |
1167 | | .len(), |
1168 | | values.len(), |
1169 | | ); |
1170 | | |
1171 | | let result_cloned_dictionary = concat(&[&dict, &other]).unwrap(); |
1172 | | // Should have only 1 underlying value since all keys reference it. |
1173 | | assert_eq!( |
1174 | | result_cloned_dictionary |
1175 | | .as_dictionary::<Int8Type>() |
1176 | | .values() |
1177 | | .len(), |
1178 | | 1 |
1179 | | ); |
1180 | | } |
1181 | | |
1182 | | #[test] |
1183 | | fn test_concat_string_sizes() { |
1184 | | let a: LargeStringArray = ((0..150).map(|_| Some("foo"))).collect(); |
1185 | | let b: LargeStringArray = ((0..150).map(|_| Some("foo"))).collect(); |
1186 | | let c = LargeStringArray::from(vec![Some("foo"), Some("bar"), None, Some("baz")]); |
1187 | | // 150 * 3 = 450 |
1188 | | // 150 * 3 = 450 |
1189 | | // 3 * 3 = 9 |
1190 | | // ------------+ |
1191 | | // 909 |
1192 | | |
1193 | | let arr = concat(&[&a, &b, &c]).unwrap(); |
1194 | | assert_eq!(arr.to_data().buffers()[1].capacity(), 909); |
1195 | | } |
1196 | | |
1197 | | #[test] |
1198 | | fn test_dictionary_concat_reuse() { |
1199 | | let array: DictionaryArray<Int8Type> = vec!["a", "a", "b", "c"].into_iter().collect(); |
1200 | | let copy: DictionaryArray<Int8Type> = array.clone(); |
1201 | | |
1202 | | // dictionary is "a", "b", "c" |
1203 | | assert_eq!( |
1204 | | array.values(), |
1205 | | &(Arc::new(StringArray::from(vec!["a", "b", "c"])) as ArrayRef) |
1206 | | ); |
1207 | | assert_eq!(array.keys(), &Int8Array::from(vec![0, 0, 1, 2])); |
1208 | | |
1209 | | // concatenate it with itself |
1210 | | let combined = concat(&[&copy as _, &array as _]).unwrap(); |
1211 | | let combined = combined.as_dictionary::<Int8Type>(); |
1212 | | |
1213 | | assert_eq!( |
1214 | | combined.values(), |
1215 | | &(Arc::new(StringArray::from(vec!["a", "b", "c"])) as ArrayRef), |
1216 | | "Actual: {combined:#?}" |
1217 | | ); |
1218 | | |
1219 | | assert_eq!( |
1220 | | combined.keys(), |
1221 | | &Int8Array::from(vec![0, 0, 1, 2, 0, 0, 1, 2]) |
1222 | | ); |
1223 | | |
1224 | | // Should have reused the dictionary |
1225 | | assert!(array |
1226 | | .values() |
1227 | | .to_data() |
1228 | | .ptr_eq(&combined.values().to_data())); |
1229 | | assert!(copy.values().to_data().ptr_eq(&combined.values().to_data())); |
1230 | | |
1231 | | let new: DictionaryArray<Int8Type> = vec!["d"].into_iter().collect(); |
1232 | | let combined = concat(&[&copy as _, &array as _, &new as _]).unwrap(); |
1233 | | let com = combined.as_dictionary::<Int8Type>(); |
1234 | | |
1235 | | // Should not have reused the dictionary |
1236 | | assert!(!array.values().to_data().ptr_eq(&com.values().to_data())); |
1237 | | assert!(!copy.values().to_data().ptr_eq(&com.values().to_data())); |
1238 | | assert!(!new.values().to_data().ptr_eq(&com.values().to_data())); |
1239 | | } |
1240 | | |
1241 | | #[test] |
1242 | | fn concat_record_batches() { |
1243 | | let schema = Arc::new(Schema::new(vec![ |
1244 | | Field::new("a", DataType::Int32, false), |
1245 | | Field::new("b", DataType::Utf8, false), |
1246 | | ])); |
1247 | | let batch1 = RecordBatch::try_new( |
1248 | | schema.clone(), |
1249 | | vec![ |
1250 | | Arc::new(Int32Array::from(vec![1, 2])), |
1251 | | Arc::new(StringArray::from(vec!["a", "b"])), |
1252 | | ], |
1253 | | ) |
1254 | | .unwrap(); |
1255 | | let batch2 = RecordBatch::try_new( |
1256 | | schema.clone(), |
1257 | | vec![ |
1258 | | Arc::new(Int32Array::from(vec![3, 4])), |
1259 | | Arc::new(StringArray::from(vec!["c", "d"])), |
1260 | | ], |
1261 | | ) |
1262 | | .unwrap(); |
1263 | | let new_batch = concat_batches(&schema, [&batch1, &batch2]).unwrap(); |
1264 | | assert_eq!(new_batch.schema().as_ref(), schema.as_ref()); |
1265 | | assert_eq!(2, new_batch.num_columns()); |
1266 | | assert_eq!(4, new_batch.num_rows()); |
1267 | | let new_batch_owned = concat_batches(&schema, &[batch1, batch2]).unwrap(); |
1268 | | assert_eq!(new_batch_owned.schema().as_ref(), schema.as_ref()); |
1269 | | assert_eq!(2, new_batch_owned.num_columns()); |
1270 | | assert_eq!(4, new_batch_owned.num_rows()); |
1271 | | } |
1272 | | |
1273 | | #[test] |
1274 | | fn concat_empty_record_batch() { |
1275 | | let schema = Arc::new(Schema::new(vec![ |
1276 | | Field::new("a", DataType::Int32, false), |
1277 | | Field::new("b", DataType::Utf8, false), |
1278 | | ])); |
1279 | | let batch = concat_batches(&schema, []).unwrap(); |
1280 | | assert_eq!(batch.schema().as_ref(), schema.as_ref()); |
1281 | | assert_eq!(0, batch.num_rows()); |
1282 | | } |
1283 | | |
1284 | | #[test] |
1285 | | fn concat_record_batches_of_different_schemas_but_compatible_data() { |
1286 | | let schema1 = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])); |
1287 | | // column names differ |
1288 | | let schema2 = Arc::new(Schema::new(vec![Field::new("c", DataType::Int32, false)])); |
1289 | | let batch1 = RecordBatch::try_new( |
1290 | | schema1.clone(), |
1291 | | vec![Arc::new(Int32Array::from(vec![1, 2]))], |
1292 | | ) |
1293 | | .unwrap(); |
1294 | | let batch2 = |
1295 | | RecordBatch::try_new(schema2, vec![Arc::new(Int32Array::from(vec![3, 4]))]).unwrap(); |
1296 | | // concat_batches simply uses the schema provided |
1297 | | let batch = concat_batches(&schema1, [&batch1, &batch2]).unwrap(); |
1298 | | assert_eq!(batch.schema().as_ref(), schema1.as_ref()); |
1299 | | assert_eq!(4, batch.num_rows()); |
1300 | | } |
1301 | | |
1302 | | #[test] |
1303 | | fn concat_record_batches_of_different_schemas_incompatible_data() { |
1304 | | let schema1 = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])); |
1305 | | // column names match, but the data types differ (Int32 vs Utf8) |
1306 | | let schema2 = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8, false)])); |
1307 | | let batch1 = RecordBatch::try_new( |
1308 | | schema1.clone(), |
1309 | | vec![Arc::new(Int32Array::from(vec![1, 2]))], |
1310 | | ) |
1311 | | .unwrap(); |
1312 | | let batch2 = RecordBatch::try_new( |
1313 | | schema2, |
1314 | | vec![Arc::new(StringArray::from(vec!["foo", "bar"]))], |
1315 | | ) |
1316 | | .unwrap(); |
1317 | | |
1318 | | let error = concat_batches(&schema1, [&batch1, &batch2]).unwrap_err(); |
1319 | | assert_eq!(error.to_string(), "Invalid argument error: It is not possible to concatenate arrays of different data types (Int32, Utf8)."); |
1320 | | } |
1321 | | |
1322 | | #[test] |
1323 | | fn concat_capacity() { |
1324 | | let a = Int32Array::from_iter_values(0..100); |
1325 | | let b = Int32Array::from_iter_values(10..20); |
1326 | | let a = concat(&[&a, &b]).unwrap(); |
1327 | | let data = a.to_data(); |
1328 | | assert_eq!(data.buffers()[0].len(), 440); |
1329 | | assert_eq!(data.buffers()[0].capacity(), 440); |
1330 | | |
1331 | | let a = concat(&[&a.slice(10, 20), &b]).unwrap(); |
1332 | | let data = a.to_data(); |
1333 | | assert_eq!(data.buffers()[0].len(), 120); |
1334 | | assert_eq!(data.buffers()[0].capacity(), 120); |
1335 | | |
1336 | | let a = StringArray::from_iter_values(std::iter::repeat_n("foo", 100)); |
1337 | | let b = StringArray::from(vec!["bingo", "bongo", "lorem", ""]); |
1338 | | |
1339 | | let a = concat(&[&a, &b]).unwrap(); |
1340 | | let data = a.to_data(); |
1341 | | // (100 + 4 + 1) * size_of<i32>() |
1342 | | assert_eq!(data.buffers()[0].len(), 420); |
1343 | | assert_eq!(data.buffers()[0].capacity(), 420); |
1344 | | |
1345 | | // len("foo") * 100 + len("bingo") + len("bongo") + len("lorem") |
1346 | | assert_eq!(data.buffers()[1].len(), 315); |
1347 | | assert_eq!(data.buffers()[1].capacity(), 315); |
1348 | | |
1349 | | let a = concat(&[&a.slice(10, 40), &b]).unwrap(); |
1350 | | let data = a.to_data(); |
1351 | | // (40 + 4 + 1) * size_of<i32>() |
1352 | | assert_eq!(data.buffers()[0].len(), 180); |
1353 | | assert_eq!(data.buffers()[0].capacity(), 180); |
1354 | | |
1355 | | // len("foo") * 40 + len("bingo") + len("bongo") + len("lorem") |
1356 | | assert_eq!(data.buffers()[1].len(), 135); |
1357 | | assert_eq!(data.buffers()[1].capacity(), 135); |
1358 | | |
1359 | | let a = LargeBinaryArray::from_iter_values(std::iter::repeat_n(b"foo", 100)); |
1360 | | let b = LargeBinaryArray::from_iter_values(std::iter::repeat_n(b"cupcakes", 10)); |
1361 | | |
1362 | | let a = concat(&[&a, &b]).unwrap(); |
1363 | | let data = a.to_data(); |
1364 | | // (100 + 10 + 1) * size_of<i64>() |
1365 | | assert_eq!(data.buffers()[0].len(), 888); |
1366 | | assert_eq!(data.buffers()[0].capacity(), 888); |
1367 | | |
1368 | | // len("foo") * 100 + len("cupcakes") * 10 |
1369 | | assert_eq!(data.buffers()[1].len(), 380); |
1370 | | assert_eq!(data.buffers()[1].capacity(), 380); |
1371 | | |
1372 | | let a = concat(&[&a.slice(10, 40), &b]).unwrap(); |
1373 | | let data = a.to_data(); |
1374 | | // (40 + 10 + 1) * size_of<i64>() |
1375 | | assert_eq!(data.buffers()[0].len(), 408); |
1376 | | assert_eq!(data.buffers()[0].capacity(), 408); |
1377 | | |
1378 | | // len("foo") * 40 + len("cupcakes") * 10 |
1379 | | assert_eq!(data.buffers()[1].len(), 200); |
1380 | | assert_eq!(data.buffers()[1].capacity(), 200); |
1381 | | } |
1382 | | |
1383 | | #[test] |
1384 | | fn concat_sparse_nulls() { |
1385 | | let values = StringArray::from_iter_values((0..100).map(|x| x.to_string())); |
1386 | | let keys = Int32Array::from(vec![1; 10]); |
1387 | | let dict_a = DictionaryArray::new(keys, Arc::new(values)); |
1388 | | let values = StringArray::new_null(0); |
1389 | | let keys = Int32Array::new_null(10); |
1390 | | let dict_b = DictionaryArray::new(keys, Arc::new(values)); |
1391 | | let array = concat(&[&dict_a, &dict_b]).unwrap(); |
1392 | | assert_eq!(array.null_count(), 10); |
1393 | | assert_eq!(array.logical_null_count(), 10); |
1394 | | } |
1395 | | |
1396 | | #[test] |
1397 | | fn concat_dictionary_list_array_simple() { |
1398 | | let scalars = vec![ |
1399 | | create_single_row_list_of_dict(vec![Some("a")]), |
1400 | | create_single_row_list_of_dict(vec![Some("a")]), |
1401 | | create_single_row_list_of_dict(vec![Some("b")]), |
1402 | | ]; |
1403 | | |
1404 | | let arrays = scalars |
1405 | | .iter() |
1406 | | .map(|a| a as &(dyn Array)) |
1407 | | .collect::<Vec<_>>(); |
1408 | | let concat_res = concat(arrays.as_slice()).unwrap(); |
1409 | | |
1410 | | let expected_list = create_list_of_dict(vec![ |
1411 | | // Row 1 |
1412 | | Some(vec![Some("a")]), |
1413 | | Some(vec![Some("a")]), |
1414 | | Some(vec![Some("b")]), |
1415 | | ]); |
1416 | | |
1417 | | let list = concat_res.as_list::<i32>(); |
1418 | | |
1419 | | // Assert that the list is equal to the expected list |
1420 | | list.iter().zip(expected_list.iter()).for_each(|(a, b)| { |
1421 | | assert_eq!(a, b); |
1422 | | }); |
1423 | | |
1424 | | assert_dictionary_has_unique_values::<_, StringArray>( |
1425 | | list.values().as_dictionary::<Int32Type>(), |
1426 | | ); |
1427 | | } |
1428 | | |
1429 | | #[test] |
1430 | | fn concat_many_dictionary_list_arrays() { |
1431 | | let number_of_unique_values = 8; |
1432 | | let scalars = (0..80000) |
1433 | | .map(|i| { |
1434 | | create_single_row_list_of_dict(vec![Some( |
1435 | | (i % number_of_unique_values).to_string(), |
1436 | | )]) |
1437 | | }) |
1438 | | .collect::<Vec<_>>(); |
1439 | | |
1440 | | let arrays = scalars |
1441 | | .iter() |
1442 | | .map(|a| a as &(dyn Array)) |
1443 | | .collect::<Vec<_>>(); |
1444 | | let concat_res = concat(arrays.as_slice()).unwrap(); |
1445 | | |
1446 | | let expected_list = create_list_of_dict( |
1447 | | (0..80000) |
1448 | | .map(|i| Some(vec![Some((i % number_of_unique_values).to_string())])) |
1449 | | .collect::<Vec<_>>(), |
1450 | | ); |
1451 | | |
1452 | | let list = concat_res.as_list::<i32>(); |
1453 | | |
1454 | | // Assert that the list is equal to the expected list |
1455 | | list.iter().zip(expected_list.iter()).for_each(|(a, b)| { |
1456 | | assert_eq!(a, b); |
1457 | | }); |
1458 | | |
1459 | | assert_dictionary_has_unique_values::<_, StringArray>( |
1460 | | list.values().as_dictionary::<Int32Type>(), |
1461 | | ); |
1462 | | } |
1463 | | |
1464 | | fn create_single_row_list_of_dict( |
1465 | | list_items: Vec<Option<impl AsRef<str>>>, |
1466 | | ) -> GenericListArray<i32> { |
1467 | | let rows = list_items.into_iter().map(Some).collect(); |
1468 | | |
1469 | | create_list_of_dict(vec![rows]) |
1470 | | } |
1471 | | |
1472 | | fn create_list_of_dict( |
1473 | | rows: Vec<Option<Vec<Option<impl AsRef<str>>>>>, |
1474 | | ) -> GenericListArray<i32> { |
1475 | | let mut builder = |
1476 | | GenericListBuilder::<i32, _>::new(StringDictionaryBuilder::<Int32Type>::new()); |
1477 | | |
1478 | | for row in rows { |
1479 | | builder.append_option(row); |
1480 | | } |
1481 | | |
1482 | | builder.finish() |
1483 | | } |
1484 | | |
1485 | | fn assert_dictionary_has_unique_values<'a, K, V>(array: &'a DictionaryArray<K>) |
1486 | | where |
1487 | | K: ArrowDictionaryKeyType, |
1488 | | V: Sync + Send + 'static, |
1489 | | &'a V: ArrayAccessor + IntoIterator, |
1490 | | |
1491 | | <&'a V as ArrayAccessor>::Item: Default + Clone + PartialEq + Debug + Ord, |
1492 | | <&'a V as IntoIterator>::Item: Clone + PartialEq + Debug + Ord, |
1493 | | { |
1494 | | let dict = array.downcast_dict::<V>().unwrap(); |
1495 | | let mut values = dict.values().into_iter().collect::<Vec<_>>(); |
1496 | | |
1497 | | // `dedup` only removes consecutive duplicates, so the values must be sorted first |
1498 | | values.sort(); |
1499 | | |
1500 | | let mut unique_values = values.clone(); |
1501 | | |
1502 | | unique_values.dedup(); |
1503 | | |
1504 | | assert_eq!( |
1505 | | values, unique_values, |
1506 | | "There are duplicates in the value list (the value list here is sorted which is only for the assertion)" |
1507 | | ); |
1508 | | } |
1509 | | |
1510 | | // Test the simple case of concatenating two RunArrays |
1511 | | #[test] |
1512 | | fn test_concat_run_array() { |
1513 | | // Create simple run arrays |
1514 | | let run_ends1 = Int32Array::from(vec![2, 4]); |
1515 | | let values1 = Int32Array::from(vec![10, 20]); |
1516 | | let array1 = RunArray::try_new(&run_ends1, &values1).unwrap(); |
1517 | | |
1518 | | let run_ends2 = Int32Array::from(vec![1, 4]); |
1519 | | let values2 = Int32Array::from(vec![30, 40]); |
1520 | | let array2 = RunArray::try_new(&run_ends2, &values2).unwrap(); |
1521 | | |
1522 | | // Concatenate the arrays - this should now work properly |
1523 | | let result = concat(&[&array1, &array2]).unwrap(); |
1524 | | let result_run_array: &arrow_array::RunArray<Int32Type> = result.as_run(); |
1525 | | |
1526 | | // Check that the result has the correct length |
1527 | | assert_eq!(result_run_array.len(), 8); // 4 + 4 |
1528 | | |
1529 | | // Check the run ends |
1530 | | let run_ends = result_run_array.run_ends().values(); |
1531 | | assert_eq!(run_ends.len(), 4); |
1532 | | assert_eq!(&[2, 4, 5, 8], run_ends); |
1533 | | |
1534 | | // Check the values |
1535 | | let values = result_run_array |
1536 | | .values() |
1537 | | .as_any() |
1538 | | .downcast_ref::<Int32Array>() |
1539 | | .unwrap(); |
1540 | | assert_eq!(values.len(), 4); |
1541 | | assert_eq!(&[10, 20, 30, 40], values.values()); |
1542 | | } |
1543 | | |
#[test]
fn test_concat_run_array_matching_first_last_value() {
    // The first input ends with a run of 30 and the second input starts with a
    // run of 30, so equal values sit on both sides of the input boundary.
    let left = RunArray::try_new(
        &Int32Array::from(vec![2, 4, 7]),
        &Int32Array::from(vec![10, 20, 30]),
    )
    .unwrap();
    let right = RunArray::try_new(
        &Int32Array::from(vec![3, 5]),
        &Int32Array::from(vec![30, 40]),
    )
    .unwrap();

    let concatenated = concat(&[&left, &right]).unwrap();
    let runs = concatenated.as_run::<Int32Type>();

    // 7 logical values from `left` plus 5 from `right`.
    assert_eq!(runs.len(), 12);

    // The run ends of `right` are shifted by the length of `left`; the two
    // adjacent runs of 30 are expected to remain separate runs.
    assert_eq!(&[2, 4, 7, 10, 12], runs.run_ends().values());

    let run_values = runs.values().as_primitive::<Int32Type>();
    assert_eq!(&[10, 20, 30, 30, 40], run_values.values());
}
1578 | | |
#[test]
fn test_concat_run_array_with_nulls() {
    // First input: run ends [2, 4, 7] over values [10, null, 30]
    let values1 = Int32Array::from(vec![Some(10), None, Some(30)]);
    let run_ends1 = Int32Array::from(vec![2, 4, 7]);
    let array1 = RunArray::try_new(&run_ends1, &values1).unwrap();

    // Second input: run ends [3, 5] over values [30, null]
    let values2 = Int32Array::from(vec![Some(30), None]);
    let run_ends2 = Int32Array::from(vec![3, 5]);
    let array2 = RunArray::try_new(&run_ends2, &values2).unwrap();

    // Concatenate the two arrays
    let result = concat(&[&array1, &array2]).unwrap();
    let result_run_array: &arrow_array::RunArray<Int32Type> = result.as_run();

    // The result should have length 12 (7 + 5)
    assert_eq!(result_run_array.len(), 12);

    // The run ends of the second input are offset by the length of the first
    let run_ends_values = result_run_array.run_ends().values();
    assert_eq!(&[2, 4, 7, 10, 12], run_ends_values);

    // Compare the values child logically. Array equality checks the length,
    // the position of every null and every non-null value, and it does not
    // depend on whatever bytes happen to be stored underneath null slots.
    let expected = Int32Array::from(vec![Some(10), None, Some(30), Some(30), None]);
    let actual = result_run_array
        .values()
        .as_any()
        .downcast_ref::<Int32Array>()
        .unwrap();
    assert_eq!(actual, &expected);
}
1619 | | |
#[test]
fn test_concat_run_array_single() {
    // A lone input with run ends [2, 4] over values [10, 20].
    let only = RunArray::try_new(
        &Int32Array::from(vec![2, 4]),
        &Int32Array::from(vec![10, 20]),
    )
    .unwrap();

    // Concatenating a single array should yield the same runs back.
    let concatenated = concat(&[&only]).unwrap();
    let runs = concatenated.as_run::<Int32Type>();

    assert_eq!(runs.len(), 4);
    assert_eq!(&[2, 4], runs.run_ends().values());

    let run_values = runs.values().as_primitive::<Int32Type>();
    assert_eq!(&[10, 20], run_values.values());
}
1649 | | |
#[test]
fn test_concat_run_array_with_3_arrays() {
    // Builds a run array from plain run-end / value vectors.
    let run_array = |ends: Vec<i32>, vals: Vec<i32>| {
        RunArray::try_new(&Int32Array::from(ends), &Int32Array::from(vals)).unwrap()
    };

    // Three inputs of four logical values each, with no value shared between them.
    let first = run_array(vec![2, 4], vec![10, 20]);
    let second = run_array(vec![1, 4], vec![30, 40]);
    let third = run_array(vec![1, 4], vec![50, 60]);

    let concatenated = concat(&[&first, &second, &third]).unwrap();
    let runs = concatenated.as_run::<Int32Type>();

    // 4 + 4 + 4 logical values.
    assert_eq!(runs.len(), 12);

    // Each input's run ends are offset by the total length of the inputs before it.
    let ends = runs.run_ends().values();
    assert_eq!(ends.len(), 6);
    assert_eq!(&[2, 4, 5, 8, 9, 12], ends);

    // The run values are the inputs' values in order.
    let run_values = runs.values().as_primitive::<Int32Type>();
    assert_eq!(run_values.len(), 6);
    assert_eq!(&[10, 20, 30, 40, 50, 60], run_values.values());
}
1683 | | } |