/Users/andrewlamb/Software/arrow-rs/arrow-select/src/interleave.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! Interleave elements from multiple arrays |
19 | | |
20 | | use crate::dictionary::{merge_dictionary_values, should_merge_dictionary_values}; |
21 | | use arrow_array::builder::{BooleanBufferBuilder, PrimitiveBuilder}; |
22 | | use arrow_array::cast::AsArray; |
23 | | use arrow_array::types::*; |
24 | | use arrow_array::*; |
25 | | use arrow_buffer::{ArrowNativeType, BooleanBuffer, MutableBuffer, NullBuffer, OffsetBuffer}; |
26 | | use arrow_data::transform::MutableArrayData; |
27 | | use arrow_data::ByteView; |
28 | | use arrow_schema::{ArrowError, DataType, Fields}; |
29 | | use std::sync::Arc; |
30 | | |
31 | | macro_rules! primitive_helper { |
32 | | ($t:ty, $values:ident, $indices:ident, $data_type:ident) => { |
33 | | interleave_primitive::<$t>($values, $indices, $data_type) |
34 | | }; |
35 | | } |
36 | | |
37 | | macro_rules! dict_helper { |
38 | | ($t:ty, $values:expr, $indices:expr) => { |
39 | | Ok(Arc::new(interleave_dictionaries::<$t>($values, $indices)?) as _) |
40 | | }; |
41 | | } |
42 | | |
43 | | /// |
44 | | /// Takes elements by index from a list of [`Array`], creating a new [`Array`] from those values. |
45 | | /// |
46 | | /// Each element in `indices` is a pair of `usize` with the first identifying the index |
47 | | /// of the [`Array`] in `values`, and the second the index of the value within that [`Array`] |
48 | | /// |
49 | | /// ```text |
50 | | /// ┌─────────────────┐ ┌─────────┐ ┌─────────────────┐ |
51 | | /// │ A │ │ (0, 0) │ interleave( │ A │ |
52 | | /// ├─────────────────┤ ├─────────┤ [values0, values1], ├─────────────────┤ |
53 | | /// │ D │ │ (1, 0) │ indices │ B │ |
54 | | /// └─────────────────┘ ├─────────┤ ) ├─────────────────┤ |
55 | | /// values array 0 │ (1, 1) │ ─────────────────────────▶ │ C │ |
56 | | /// ├─────────┤ ├─────────────────┤ |
57 | | /// │ (0, 1) │ │ D │ |
58 | | /// └─────────┘ └─────────────────┘ |
59 | | /// ┌─────────────────┐ indices |
60 | | /// │ B │ array |
61 | | /// ├─────────────────┤ result |
62 | | /// │ C │ |
63 | | /// ├─────────────────┤ |
64 | | /// │ E │ |
65 | | /// └─────────────────┘ |
66 | | /// values array 1 |
67 | | /// ``` |
68 | | /// |
69 | | /// For selecting values by index from a single array see [`crate::take`] |
70 | 3 | pub fn interleave( |
71 | 3 | values: &[&dyn Array], |
72 | 3 | indices: &[(usize, usize)], |
73 | 3 | ) -> Result<ArrayRef, ArrowError> { |
74 | 3 | if values.is_empty() { |
75 | 0 | return Err(ArrowError::InvalidArgumentError( |
76 | 0 | "interleave requires input of at least one array".to_string(), |
77 | 0 | )); |
78 | 3 | } |
79 | 3 | let data_type = values[0].data_type(); |
80 | | |
81 | 3 | for array in values.iter().skip(1) { |
82 | 3 | if array.data_type() != data_type { |
83 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
84 | 0 | "It is not possible to interleave arrays of different data types ({} and {})", |
85 | 0 | data_type, |
86 | 0 | array.data_type() |
87 | 0 | ))); |
88 | 3 | } |
89 | | } |
90 | | |
91 | 3 | if indices.is_empty() { |
92 | 0 | return Ok(new_empty_array(data_type)); |
93 | 3 | } |
94 | | |
95 | 0 | downcast_primitive! { |
96 | 0 | data_type => (primitive_helper, values, indices, data_type), |
97 | 3 | DataType::Utf8 => interleave_bytes::<Utf8Type>(values, indices), |
98 | 0 | DataType::LargeUtf8 => interleave_bytes::<LargeUtf8Type>(values, indices), |
99 | 0 | DataType::Binary => interleave_bytes::<BinaryType>(values, indices), |
100 | 0 | DataType::LargeBinary => interleave_bytes::<LargeBinaryType>(values, indices), |
101 | 0 | DataType::BinaryView => interleave_views::<BinaryViewType>(values, indices), |
102 | 0 | DataType::Utf8View => interleave_views::<StringViewType>(values, indices), |
103 | 0 | DataType::Dictionary(k, _) => downcast_integer! { |
104 | 0 | k.as_ref() => (dict_helper, values, indices), |
105 | 0 | _ => unreachable!("illegal dictionary key type {k}") |
106 | | }, |
107 | 0 | DataType::Struct(fields) => interleave_struct(fields, values, indices), |
108 | 0 | _ => interleave_fallback(values, indices) |
109 | | } |
110 | 3 | } |
111 | | |
112 | | /// Common functionality for interleaving arrays |
113 | | /// |
114 | | /// T is the concrete Array type |
115 | | struct Interleave<'a, T> { |
116 | | /// The input arrays downcast to T |
117 | | arrays: Vec<&'a T>, |
118 | | /// The null buffer of the interleaved output |
119 | | nulls: Option<NullBuffer>, |
120 | | } |
121 | | |
122 | | impl<'a, T: Array + 'static> Interleave<'a, T> { |
123 | 3 | fn new(values: &[&'a dyn Array], indices: &'a [(usize, usize)]) -> Self { |
124 | 3 | let mut has_nulls = false; |
125 | 3 | let arrays: Vec<&T> = values |
126 | 3 | .iter() |
127 | 6 | .map3 (|x| { |
128 | 6 | has_nulls = has_nulls || x.null_count() != 0; |
129 | 6 | x.as_any().downcast_ref().unwrap() |
130 | 6 | }) |
131 | 3 | .collect(); |
132 | | |
133 | 3 | let nulls = match has_nulls { |
134 | | true => { |
135 | 0 | let nulls = BooleanBuffer::collect_bool(indices.len(), |i| { |
136 | 0 | let (a, b) = indices[i]; |
137 | 0 | arrays[a].is_valid(b) |
138 | 0 | }); |
139 | 0 | Some(nulls.into()) |
140 | | } |
141 | 3 | false => None, |
142 | | }; |
143 | | |
144 | 3 | Self { arrays, nulls } |
145 | 3 | } |
146 | | } |
147 | | |
148 | 0 | fn interleave_primitive<T: ArrowPrimitiveType>( |
149 | 0 | values: &[&dyn Array], |
150 | 0 | indices: &[(usize, usize)], |
151 | 0 | data_type: &DataType, |
152 | 0 | ) -> Result<ArrayRef, ArrowError> { |
153 | 0 | let interleaved = Interleave::<'_, PrimitiveArray<T>>::new(values, indices); |
154 | | |
155 | 0 | let values = indices |
156 | 0 | .iter() |
157 | 0 | .map(|(a, b)| interleaved.arrays[*a].value(*b)) |
158 | 0 | .collect::<Vec<_>>(); |
159 | | |
160 | 0 | let array = PrimitiveArray::<T>::new(values.into(), interleaved.nulls); |
161 | 0 | Ok(Arc::new(array.with_data_type(data_type.clone()))) |
162 | 0 | } |
163 | | |
164 | 3 | fn interleave_bytes<T: ByteArrayType>( |
165 | 3 | values: &[&dyn Array], |
166 | 3 | indices: &[(usize, usize)], |
167 | 3 | ) -> Result<ArrayRef, ArrowError> { |
168 | 3 | let interleaved = Interleave::<'_, GenericByteArray<T>>::new(values, indices); |
169 | | |
170 | 3 | let mut capacity = 0; |
171 | 3 | let mut offsets = Vec::with_capacity(indices.len() + 1); |
172 | 3 | offsets.push(T::Offset::from_usize(0).unwrap()); |
173 | 11 | offsets3 .extend3 (indices3 .iter3 ().map3 (|(a, b)| { |
174 | 11 | let o = interleaved.arrays[*a].value_offsets(); |
175 | 11 | let element_len = o[*b + 1].as_usize() - o[*b].as_usize(); |
176 | 11 | capacity += element_len; |
177 | 11 | T::Offset::from_usize(capacity).expect("overflow") |
178 | 11 | })); |
179 | | |
180 | 3 | let mut values = Vec::with_capacity(capacity); |
181 | 14 | for (a11 , b11 ) in indices { |
182 | 11 | values.extend_from_slice(interleaved.arrays[*a].value(*b).as_ref()); |
183 | 11 | } |
184 | | |
185 | | // Safety: safe by construction |
186 | 3 | let array = unsafe { |
187 | 3 | let offsets = OffsetBuffer::new_unchecked(offsets.into()); |
188 | 3 | GenericByteArray::<T>::new_unchecked(offsets, values.into(), interleaved.nulls) |
189 | | }; |
190 | 3 | Ok(Arc::new(array)) |
191 | 3 | } |
192 | | |
193 | 0 | fn interleave_dictionaries<K: ArrowDictionaryKeyType>( |
194 | 0 | arrays: &[&dyn Array], |
195 | 0 | indices: &[(usize, usize)], |
196 | 0 | ) -> Result<ArrayRef, ArrowError> { |
197 | 0 | let dictionaries: Vec<_> = arrays.iter().map(|x| x.as_dictionary::<K>()).collect(); |
198 | 0 | if !should_merge_dictionary_values::<K>(&dictionaries, indices.len()) { |
199 | 0 | return interleave_fallback(arrays, indices); |
200 | 0 | } |
201 | | |
202 | 0 | let masks: Vec<_> = dictionaries |
203 | 0 | .iter() |
204 | 0 | .enumerate() |
205 | 0 | .map(|(a_idx, dictionary)| { |
206 | 0 | let mut key_mask = BooleanBufferBuilder::new_from_buffer( |
207 | 0 | MutableBuffer::new_null(dictionary.len()), |
208 | 0 | dictionary.len(), |
209 | | ); |
210 | | |
211 | 0 | for (_, key_idx) in indices.iter().filter(|(a, _)| *a == a_idx) { |
212 | 0 | key_mask.set_bit(*key_idx, true); |
213 | 0 | } |
214 | 0 | key_mask.finish() |
215 | 0 | }) |
216 | 0 | .collect(); |
217 | | |
218 | 0 | let merged = merge_dictionary_values(&dictionaries, Some(&masks))?; |
219 | | |
220 | | // Recompute keys |
221 | 0 | let mut keys = PrimitiveBuilder::<K>::with_capacity(indices.len()); |
222 | 0 | for (a, b) in indices { |
223 | 0 | let old_keys: &PrimitiveArray<K> = dictionaries[*a].keys(); |
224 | 0 | match old_keys.is_valid(*b) { |
225 | | true => { |
226 | 0 | let old_key = old_keys.values()[*b]; |
227 | 0 | keys.append_value(merged.key_mappings[*a][old_key.as_usize()]) |
228 | | } |
229 | 0 | false => keys.append_null(), |
230 | | } |
231 | | } |
232 | 0 | let array = unsafe { DictionaryArray::new_unchecked(keys.finish(), merged.values) }; |
233 | 0 | Ok(Arc::new(array)) |
234 | 0 | } |
235 | | |
236 | 0 | fn interleave_views<T: ByteViewType>( |
237 | 0 | values: &[&dyn Array], |
238 | 0 | indices: &[(usize, usize)], |
239 | 0 | ) -> Result<ArrayRef, ArrowError> { |
240 | 0 | let interleaved = Interleave::<'_, GenericByteViewArray<T>>::new(values, indices); |
241 | 0 | let mut buffers = Vec::new(); |
242 | | |
243 | | // Contains the offsets of start buffer in `buffer_to_new_index` |
244 | 0 | let mut offsets = Vec::with_capacity(interleaved.arrays.len() + 1); |
245 | 0 | offsets.push(0); |
246 | 0 | let mut total_buffers = 0; |
247 | 0 | for a in interleaved.arrays.iter() { |
248 | 0 | total_buffers += a.data_buffers().len(); |
249 | 0 | offsets.push(total_buffers); |
250 | 0 | } |
251 | | |
252 | | // contains the mapping from old buffer index to new buffer index |
253 | 0 | let mut buffer_to_new_index = vec![None; total_buffers]; |
254 | | |
255 | 0 | let views: Vec<u128> = indices |
256 | 0 | .iter() |
257 | 0 | .map(|(array_idx, value_idx)| { |
258 | 0 | let array = interleaved.arrays[*array_idx]; |
259 | 0 | let view = array.views().get(*value_idx).unwrap(); |
260 | 0 | let view_len = *view as u32; |
261 | 0 | if view_len <= 12 { |
262 | 0 | return *view; |
263 | 0 | } |
264 | | // value is big enough to be in a variadic buffer |
265 | 0 | let view = ByteView::from(*view); |
266 | 0 | let buffer_to_new_idx = offsets[*array_idx] + view.buffer_index as usize; |
267 | 0 | let new_buffer_idx: u32 = |
268 | 0 | *buffer_to_new_index[buffer_to_new_idx].get_or_insert_with(|| { |
269 | 0 | buffers.push(array.data_buffers()[view.buffer_index as usize].clone()); |
270 | 0 | (buffers.len() - 1) as u32 |
271 | 0 | }); |
272 | 0 | view.with_buffer_index(new_buffer_idx).as_u128() |
273 | 0 | }) |
274 | 0 | .collect(); |
275 | | |
276 | 0 | let array = unsafe { |
277 | 0 | GenericByteViewArray::<T>::new_unchecked(views.into(), buffers, interleaved.nulls) |
278 | | }; |
279 | 0 | Ok(Arc::new(array)) |
280 | 0 | } |
281 | | |
282 | 0 | fn interleave_struct( |
283 | 0 | fields: &Fields, |
284 | 0 | values: &[&dyn Array], |
285 | 0 | indices: &[(usize, usize)], |
286 | 0 | ) -> Result<ArrayRef, ArrowError> { |
287 | 0 | let interleaved = Interleave::<'_, StructArray>::new(values, indices); |
288 | | |
289 | 0 | let mut struct_fields_array = vec![]; |
290 | | |
291 | 0 | for i in 0..fields.len() { |
292 | 0 | let field_values: Vec<&dyn Array> = interleaved |
293 | 0 | .arrays |
294 | 0 | .iter() |
295 | 0 | .map(|x| x.column(i).as_ref()) |
296 | 0 | .collect(); |
297 | 0 | let interleaved = interleave(&field_values, indices)?; |
298 | 0 | struct_fields_array.push(interleaved); |
299 | | } |
300 | | |
301 | 0 | let struct_array = |
302 | 0 | StructArray::try_new(fields.clone(), struct_fields_array, interleaved.nulls)?; |
303 | | |
304 | 0 | Ok(Arc::new(struct_array)) |
305 | 0 | } |
306 | | |
307 | | /// Fallback implementation of interleave using [`MutableArrayData`] |
308 | 0 | fn interleave_fallback( |
309 | 0 | values: &[&dyn Array], |
310 | 0 | indices: &[(usize, usize)], |
311 | 0 | ) -> Result<ArrayRef, ArrowError> { |
312 | 0 | let arrays: Vec<_> = values.iter().map(|x| x.to_data()).collect(); |
313 | 0 | let arrays: Vec<_> = arrays.iter().collect(); |
314 | 0 | let mut array_data = MutableArrayData::new(arrays, false, indices.len()); |
315 | | |
316 | 0 | let mut cur_array = indices[0].0; |
317 | 0 | let mut start_row_idx = indices[0].1; |
318 | 0 | let mut end_row_idx = start_row_idx + 1; |
319 | | |
320 | 0 | for (array, row) in indices.iter().skip(1).copied() { |
321 | 0 | if array == cur_array && row == end_row_idx { |
322 | | // subsequent row in same batch |
323 | 0 | end_row_idx += 1; |
324 | 0 | continue; |
325 | 0 | } |
326 | | |
327 | | // emit current batch of rows for current buffer |
328 | 0 | array_data.extend(cur_array, start_row_idx, end_row_idx); |
329 | | |
330 | | // start new batch of rows |
331 | 0 | cur_array = array; |
332 | 0 | start_row_idx = row; |
333 | 0 | end_row_idx = start_row_idx + 1; |
334 | | } |
335 | | |
336 | | // emit final batch of rows |
337 | 0 | array_data.extend(cur_array, start_row_idx, end_row_idx); |
338 | 0 | Ok(make_array(array_data.freeze())) |
339 | 0 | } |
340 | | |
341 | | /// Interleave rows by index from multiple [`RecordBatch`] instances and return a new [`RecordBatch`]. |
342 | | /// |
343 | | /// This function will call [`interleave`] on each array of the [`RecordBatch`] instances and assemble a new [`RecordBatch`]. |
344 | | /// |
345 | | /// # Example |
346 | | /// ``` |
347 | | /// # use std::sync::Arc; |
348 | | /// # use arrow_array::{StringArray, Int32Array, RecordBatch, UInt32Array}; |
349 | | /// # use arrow_schema::{DataType, Field, Schema}; |
350 | | /// # use arrow_select::interleave::interleave_record_batch; |
351 | | /// |
352 | | /// let schema = Arc::new(Schema::new(vec![ |
353 | | /// Field::new("a", DataType::Int32, true), |
354 | | /// Field::new("b", DataType::Utf8, true), |
355 | | /// ])); |
356 | | /// |
357 | | /// let batch1 = RecordBatch::try_new( |
358 | | /// schema.clone(), |
359 | | /// vec![ |
360 | | /// Arc::new(Int32Array::from(vec![0, 1, 2])), |
361 | | /// Arc::new(StringArray::from(vec!["a", "b", "c"])), |
362 | | /// ], |
363 | | /// ).unwrap(); |
364 | | /// |
365 | | /// let batch2 = RecordBatch::try_new( |
366 | | /// schema.clone(), |
367 | | /// vec![ |
368 | | /// Arc::new(Int32Array::from(vec![3, 4, 5])), |
369 | | /// Arc::new(StringArray::from(vec!["d", "e", "f"])), |
370 | | /// ], |
371 | | /// ).unwrap(); |
372 | | /// |
373 | | /// let indices = vec![(0, 1), (1, 2), (0, 0), (1, 1)]; |
374 | | /// let interleaved = interleave_record_batch(&[&batch1, &batch2], &indices).unwrap(); |
375 | | /// |
376 | | /// let expected = RecordBatch::try_new( |
377 | | /// schema, |
378 | | /// vec![ |
379 | | /// Arc::new(Int32Array::from(vec![1, 5, 0, 4])), |
380 | | /// Arc::new(StringArray::from(vec!["b", "f", "a", "e"])), |
381 | | /// ], |
382 | | /// ).unwrap(); |
383 | | /// assert_eq!(interleaved, expected); |
384 | | /// ``` |
385 | 0 | pub fn interleave_record_batch( |
386 | 0 | record_batches: &[&RecordBatch], |
387 | 0 | indices: &[(usize, usize)], |
388 | 0 | ) -> Result<RecordBatch, ArrowError> { |
389 | 0 | let schema = record_batches[0].schema(); |
390 | 0 | let columns = (0..schema.fields().len()) |
391 | 0 | .map(|i| { |
392 | 0 | let column_values: Vec<&dyn Array> = record_batches |
393 | 0 | .iter() |
394 | 0 | .map(|batch| batch.column(i).as_ref()) |
395 | 0 | .collect(); |
396 | 0 | interleave(&column_values, indices) |
397 | 0 | }) |
398 | 0 | .collect::<Result<Vec<_>, _>>()?; |
399 | 0 | RecordBatch::try_new(schema, columns) |
400 | 0 | } |
401 | | |
402 | | #[cfg(test)] |
403 | | mod tests { |
404 | | use super::*; |
405 | | use arrow_array::builder::{Int32Builder, ListBuilder, PrimitiveRunBuilder}; |
406 | | use arrow_array::Int32RunArray; |
407 | | use arrow_schema::Field; |
408 | | |
409 | | #[test] |
410 | | fn test_primitive() { |
411 | | let a = Int32Array::from_iter_values([1, 2, 3, 4]); |
412 | | let b = Int32Array::from_iter_values([5, 6, 7]); |
413 | | let c = Int32Array::from_iter_values([8, 9, 10]); |
414 | | let values = interleave(&[&a, &b, &c], &[(0, 3), (0, 3), (2, 2), (2, 0), (1, 1)]).unwrap(); |
415 | | let v = values.as_primitive::<Int32Type>(); |
416 | | assert_eq!(v.values(), &[4, 4, 10, 8, 6]); |
417 | | } |
418 | | |
419 | | #[test] |
420 | | fn test_primitive_nulls() { |
421 | | let a = Int32Array::from_iter_values([1, 2, 3, 4]); |
422 | | let b = Int32Array::from_iter([Some(1), Some(4), None]); |
423 | | let values = interleave(&[&a, &b], &[(0, 1), (1, 2), (1, 2), (0, 3), (0, 2)]).unwrap(); |
424 | | let v: Vec<_> = values.as_primitive::<Int32Type>().into_iter().collect(); |
425 | | assert_eq!(&v, &[Some(2), None, None, Some(4), Some(3)]) |
426 | | } |
427 | | |
428 | | #[test] |
429 | | fn test_primitive_empty() { |
430 | | let a = Int32Array::from_iter_values([1, 2, 3, 4]); |
431 | | let v = interleave(&[&a], &[]).unwrap(); |
432 | | assert!(v.is_empty()); |
433 | | assert_eq!(v.data_type(), &DataType::Int32); |
434 | | } |
435 | | |
436 | | #[test] |
437 | | fn test_strings() { |
438 | | let a = StringArray::from_iter_values(["a", "b", "c"]); |
439 | | let b = StringArray::from_iter_values(["hello", "world", "foo"]); |
440 | | let values = interleave(&[&a, &b], &[(0, 2), (0, 2), (1, 0), (1, 1), (0, 1)]).unwrap(); |
441 | | let v = values.as_string::<i32>(); |
442 | | let values: Vec<_> = v.into_iter().collect(); |
443 | | assert_eq!( |
444 | | &values, |
445 | | &[ |
446 | | Some("c"), |
447 | | Some("c"), |
448 | | Some("hello"), |
449 | | Some("world"), |
450 | | Some("b") |
451 | | ] |
452 | | ) |
453 | | } |
454 | | |
455 | | #[test] |
456 | | fn test_interleave_dictionary() { |
457 | | let a = DictionaryArray::<Int32Type>::from_iter(["a", "b", "c", "a", "b"]); |
458 | | let b = DictionaryArray::<Int32Type>::from_iter(["a", "c", "a", "c", "a"]); |
459 | | |
460 | | // Should not recompute dictionary |
461 | | let values = |
462 | | interleave(&[&a, &b], &[(0, 2), (0, 2), (0, 2), (1, 0), (1, 1), (0, 1)]).unwrap(); |
463 | | let v = values.as_dictionary::<Int32Type>(); |
464 | | assert_eq!(v.values().len(), 5); |
465 | | |
466 | | let vc = v.downcast_dict::<StringArray>().unwrap(); |
467 | | let collected: Vec<_> = vc.into_iter().map(Option::unwrap).collect(); |
468 | | assert_eq!(&collected, &["c", "c", "c", "a", "c", "b"]); |
469 | | |
470 | | // Should recompute dictionary |
471 | | let values = interleave(&[&a, &b], &[(0, 2), (0, 2), (1, 1)]).unwrap(); |
472 | | let v = values.as_dictionary::<Int32Type>(); |
473 | | assert_eq!(v.values().len(), 1); |
474 | | |
475 | | let vc = v.downcast_dict::<StringArray>().unwrap(); |
476 | | let collected: Vec<_> = vc.into_iter().map(Option::unwrap).collect(); |
477 | | assert_eq!(&collected, &["c", "c", "c"]); |
478 | | } |
479 | | |
480 | | #[test] |
481 | | fn test_interleave_dictionary_nulls() { |
482 | | let input_1_keys = Int32Array::from_iter_values([0, 2, 1, 3]); |
483 | | let input_1_values = StringArray::from(vec![Some("foo"), None, Some("bar"), Some("fiz")]); |
484 | | let input_1 = DictionaryArray::new(input_1_keys, Arc::new(input_1_values)); |
485 | | let input_2: DictionaryArray<Int32Type> = vec![None].into_iter().collect(); |
486 | | |
487 | | let expected = vec![Some("fiz"), None, None, Some("foo")]; |
488 | | |
489 | | let values = interleave( |
490 | | &[&input_1 as _, &input_2 as _], |
491 | | &[(0, 3), (0, 2), (1, 0), (0, 0)], |
492 | | ) |
493 | | .unwrap(); |
494 | | let dictionary = values.as_dictionary::<Int32Type>(); |
495 | | let actual: Vec<Option<&str>> = dictionary |
496 | | .downcast_dict::<StringArray>() |
497 | | .unwrap() |
498 | | .into_iter() |
499 | | .collect(); |
500 | | |
501 | | assert_eq!(actual, expected); |
502 | | } |
503 | | |
504 | | #[test] |
505 | | fn test_lists() { |
506 | | // [[1, 2], null, [3]] |
507 | | let mut a = ListBuilder::new(Int32Builder::new()); |
508 | | a.values().append_value(1); |
509 | | a.values().append_value(2); |
510 | | a.append(true); |
511 | | a.append(false); |
512 | | a.values().append_value(3); |
513 | | a.append(true); |
514 | | let a = a.finish(); |
515 | | |
516 | | // [[4], null, [5, 6, null]] |
517 | | let mut b = ListBuilder::new(Int32Builder::new()); |
518 | | b.values().append_value(4); |
519 | | b.append(true); |
520 | | b.append(false); |
521 | | b.values().append_value(5); |
522 | | b.values().append_value(6); |
523 | | b.values().append_null(); |
524 | | b.append(true); |
525 | | let b = b.finish(); |
526 | | |
527 | | let values = interleave(&[&a, &b], &[(0, 2), (0, 1), (1, 0), (1, 2), (1, 1)]).unwrap(); |
528 | | let v = values.as_any().downcast_ref::<ListArray>().unwrap(); |
529 | | |
530 | | // [[3], null, [4], [5, 6, null], null] |
531 | | let mut expected = ListBuilder::new(Int32Builder::new()); |
532 | | expected.values().append_value(3); |
533 | | expected.append(true); |
534 | | expected.append(false); |
535 | | expected.values().append_value(4); |
536 | | expected.append(true); |
537 | | expected.values().append_value(5); |
538 | | expected.values().append_value(6); |
539 | | expected.values().append_null(); |
540 | | expected.append(true); |
541 | | expected.append(false); |
542 | | let expected = expected.finish(); |
543 | | |
544 | | assert_eq!(v, &expected); |
545 | | } |
546 | | |
547 | | #[test] |
548 | | fn test_struct_without_nulls() { |
549 | | let fields = Fields::from(vec![ |
550 | | Field::new("number_col", DataType::Int32, false), |
551 | | Field::new("string_col", DataType::Utf8, false), |
552 | | ]); |
553 | | let a = { |
554 | | let number_col = Int32Array::from_iter_values([1, 2, 3, 4]); |
555 | | let string_col = StringArray::from_iter_values(["a", "b", "c", "d"]); |
556 | | |
557 | | StructArray::try_new( |
558 | | fields.clone(), |
559 | | vec![Arc::new(number_col), Arc::new(string_col)], |
560 | | None, |
561 | | ) |
562 | | .unwrap() |
563 | | }; |
564 | | |
565 | | let b = { |
566 | | let number_col = Int32Array::from_iter_values([5, 6, 7]); |
567 | | let string_col = StringArray::from_iter_values(["hello", "world", "foo"]); |
568 | | |
569 | | StructArray::try_new( |
570 | | fields.clone(), |
571 | | vec![Arc::new(number_col), Arc::new(string_col)], |
572 | | None, |
573 | | ) |
574 | | .unwrap() |
575 | | }; |
576 | | |
577 | | let c = { |
578 | | let number_col = Int32Array::from_iter_values([8, 9, 10]); |
579 | | let string_col = StringArray::from_iter_values(["x", "y", "z"]); |
580 | | |
581 | | StructArray::try_new( |
582 | | fields.clone(), |
583 | | vec![Arc::new(number_col), Arc::new(string_col)], |
584 | | None, |
585 | | ) |
586 | | .unwrap() |
587 | | }; |
588 | | |
589 | | let values = interleave(&[&a, &b, &c], &[(0, 3), (0, 3), (2, 2), (2, 0), (1, 1)]).unwrap(); |
590 | | let values_struct = values.as_struct(); |
591 | | assert_eq!(values_struct.data_type(), &DataType::Struct(fields)); |
592 | | assert_eq!(values_struct.null_count(), 0); |
593 | | |
594 | | let values_number = values_struct.column(0).as_primitive::<Int32Type>(); |
595 | | assert_eq!(values_number.values(), &[4, 4, 10, 8, 6]); |
596 | | let values_string = values_struct.column(1).as_string::<i32>(); |
597 | | let values_string: Vec<_> = values_string.into_iter().collect(); |
598 | | assert_eq!( |
599 | | &values_string, |
600 | | &[Some("d"), Some("d"), Some("z"), Some("x"), Some("world")] |
601 | | ); |
602 | | } |
603 | | |
604 | | #[test] |
605 | | fn test_struct_with_nulls_in_values() { |
606 | | let fields = Fields::from(vec![ |
607 | | Field::new("number_col", DataType::Int32, true), |
608 | | Field::new("string_col", DataType::Utf8, true), |
609 | | ]); |
610 | | let a = { |
611 | | let number_col = Int32Array::from_iter_values([1, 2, 3, 4]); |
612 | | let string_col = StringArray::from_iter_values(["a", "b", "c", "d"]); |
613 | | |
614 | | StructArray::try_new( |
615 | | fields.clone(), |
616 | | vec![Arc::new(number_col), Arc::new(string_col)], |
617 | | None, |
618 | | ) |
619 | | .unwrap() |
620 | | }; |
621 | | |
622 | | let b = { |
623 | | let number_col = Int32Array::from_iter([Some(1), Some(4), None]); |
624 | | let string_col = StringArray::from(vec![Some("hello"), None, Some("foo")]); |
625 | | |
626 | | StructArray::try_new( |
627 | | fields.clone(), |
628 | | vec![Arc::new(number_col), Arc::new(string_col)], |
629 | | None, |
630 | | ) |
631 | | .unwrap() |
632 | | }; |
633 | | |
634 | | let values = interleave(&[&a, &b], &[(0, 1), (1, 2), (1, 2), (0, 3), (1, 1)]).unwrap(); |
635 | | let values_struct = values.as_struct(); |
636 | | assert_eq!(values_struct.data_type(), &DataType::Struct(fields)); |
637 | | |
638 | | // The struct itself has no nulls, but the values do |
639 | | assert_eq!(values_struct.null_count(), 0); |
640 | | |
641 | | let values_number: Vec<_> = values_struct |
642 | | .column(0) |
643 | | .as_primitive::<Int32Type>() |
644 | | .into_iter() |
645 | | .collect(); |
646 | | assert_eq!(values_number, &[Some(2), None, None, Some(4), Some(4)]); |
647 | | |
648 | | let values_string = values_struct.column(1).as_string::<i32>(); |
649 | | let values_string: Vec<_> = values_string.into_iter().collect(); |
650 | | assert_eq!( |
651 | | &values_string, |
652 | | &[Some("b"), Some("foo"), Some("foo"), Some("d"), None] |
653 | | ); |
654 | | } |
655 | | |
656 | | #[test] |
657 | | fn test_struct_with_nulls() { |
658 | | let fields = Fields::from(vec![ |
659 | | Field::new("number_col", DataType::Int32, false), |
660 | | Field::new("string_col", DataType::Utf8, false), |
661 | | ]); |
662 | | let a = { |
663 | | let number_col = Int32Array::from_iter_values([1, 2, 3, 4]); |
664 | | let string_col = StringArray::from_iter_values(["a", "b", "c", "d"]); |
665 | | |
666 | | StructArray::try_new( |
667 | | fields.clone(), |
668 | | vec![Arc::new(number_col), Arc::new(string_col)], |
669 | | None, |
670 | | ) |
671 | | .unwrap() |
672 | | }; |
673 | | |
674 | | let b = { |
675 | | let number_col = Int32Array::from_iter_values([5, 6, 7]); |
676 | | let string_col = StringArray::from_iter_values(["hello", "world", "foo"]); |
677 | | |
678 | | StructArray::try_new( |
679 | | fields.clone(), |
680 | | vec![Arc::new(number_col), Arc::new(string_col)], |
681 | | Some(NullBuffer::from(&[true, false, true])), |
682 | | ) |
683 | | .unwrap() |
684 | | }; |
685 | | |
686 | | let c = { |
687 | | let number_col = Int32Array::from_iter_values([8, 9, 10]); |
688 | | let string_col = StringArray::from_iter_values(["x", "y", "z"]); |
689 | | |
690 | | StructArray::try_new( |
691 | | fields.clone(), |
692 | | vec![Arc::new(number_col), Arc::new(string_col)], |
693 | | None, |
694 | | ) |
695 | | .unwrap() |
696 | | }; |
697 | | |
698 | | let values = interleave(&[&a, &b, &c], &[(0, 3), (0, 3), (2, 2), (1, 1), (2, 0)]).unwrap(); |
699 | | let values_struct = values.as_struct(); |
700 | | assert_eq!(values_struct.data_type(), &DataType::Struct(fields)); |
701 | | |
702 | | let validity: Vec<bool> = { |
703 | | let null_buffer = values_struct.nulls().expect("should_have_nulls"); |
704 | | |
705 | | null_buffer.iter().collect() |
706 | | }; |
707 | | assert_eq!(validity, &[true, true, true, false, true]); |
708 | | let values_number = values_struct.column(0).as_primitive::<Int32Type>(); |
709 | | assert_eq!(values_number.values(), &[4, 4, 10, 6, 8]); |
710 | | let values_string = values_struct.column(1).as_string::<i32>(); |
711 | | let values_string: Vec<_> = values_string.into_iter().collect(); |
712 | | assert_eq!( |
713 | | &values_string, |
714 | | &[Some("d"), Some("d"), Some("z"), Some("world"), Some("x"),] |
715 | | ); |
716 | | } |
717 | | |
718 | | #[test] |
719 | | fn test_struct_empty() { |
720 | | let fields = Fields::from(vec![ |
721 | | Field::new("number_col", DataType::Int32, false), |
722 | | Field::new("string_col", DataType::Utf8, false), |
723 | | ]); |
724 | | let a = { |
725 | | let number_col = Int32Array::from_iter_values([1, 2, 3, 4]); |
726 | | let string_col = StringArray::from_iter_values(["a", "b", "c", "d"]); |
727 | | |
728 | | StructArray::try_new( |
729 | | fields.clone(), |
730 | | vec![Arc::new(number_col), Arc::new(string_col)], |
731 | | None, |
732 | | ) |
733 | | .unwrap() |
734 | | }; |
735 | | let v = interleave(&[&a], &[]).unwrap(); |
736 | | assert!(v.is_empty()); |
737 | | assert_eq!(v.data_type(), &DataType::Struct(fields)); |
738 | | } |
739 | | |
740 | | #[test] |
741 | | fn interleave_sparse_nulls() { |
742 | | let values = StringArray::from_iter_values((0..100).map(|x| x.to_string())); |
743 | | let keys = Int32Array::from_iter_values(0..10); |
744 | | let dict_a = DictionaryArray::new(keys, Arc::new(values)); |
745 | | let values = StringArray::new_null(0); |
746 | | let keys = Int32Array::new_null(10); |
747 | | let dict_b = DictionaryArray::new(keys, Arc::new(values)); |
748 | | |
749 | | let indices = &[(0, 0), (0, 1), (0, 2), (1, 0)]; |
750 | | let array = interleave(&[&dict_a, &dict_b], indices).unwrap(); |
751 | | |
752 | | let expected = |
753 | | DictionaryArray::<Int32Type>::from_iter(vec![Some("0"), Some("1"), Some("2"), None]); |
754 | | assert_eq!(array.as_ref(), &expected) |
755 | | } |
756 | | |
757 | | #[test] |
758 | | fn test_interleave_views() { |
759 | | let values = StringArray::from_iter_values([ |
760 | | "hello", |
761 | | "world_long_string_not_inlined", |
762 | | "foo", |
763 | | "bar", |
764 | | "baz", |
765 | | ]); |
766 | | let view_a = StringViewArray::from(&values); |
767 | | |
768 | | let values = StringArray::from_iter_values([ |
769 | | "test", |
770 | | "data", |
771 | | "more_long_string_not_inlined", |
772 | | "views", |
773 | | "here", |
774 | | ]); |
775 | | let view_b = StringViewArray::from(&values); |
776 | | |
777 | | let indices = &[ |
778 | | (0, 2), // "foo" |
779 | | (1, 0), // "test" |
780 | | (0, 4), // "baz" |
781 | | (1, 3), // "views" |
782 | | (0, 1), // "world_long_string_not_inlined" |
783 | | ]; |
784 | | |
785 | | // Test specialized implementation |
786 | | let values = interleave(&[&view_a, &view_b], indices).unwrap(); |
787 | | let result = values.as_string_view(); |
788 | | assert_eq!(result.data_buffers().len(), 1); |
789 | | |
790 | | let fallback = interleave_fallback(&[&view_a, &view_b], indices).unwrap(); |
791 | | let fallback_result = fallback.as_string_view(); |
792 | | // note that fallback_result has 2 buffers, but only one long enough string to warrant a buffer |
793 | | assert_eq!(fallback_result.data_buffers().len(), 2); |
794 | | |
795 | | // Convert to strings for easier assertion |
796 | | let collected: Vec<_> = result.iter().map(|x| x.map(|s| s.to_string())).collect(); |
797 | | |
798 | | let fallback_collected: Vec<_> = fallback_result |
799 | | .iter() |
800 | | .map(|x| x.map(|s| s.to_string())) |
801 | | .collect(); |
802 | | |
803 | | assert_eq!(&collected, &fallback_collected); |
804 | | |
805 | | assert_eq!( |
806 | | &collected, |
807 | | &[ |
808 | | Some("foo".to_string()), |
809 | | Some("test".to_string()), |
810 | | Some("baz".to_string()), |
811 | | Some("views".to_string()), |
812 | | Some("world_long_string_not_inlined".to_string()), |
813 | | ] |
814 | | ); |
815 | | } |
816 | | |
817 | | #[test] |
818 | | fn test_interleave_views_with_nulls() { |
819 | | let values = StringArray::from_iter([ |
820 | | Some("hello"), |
821 | | None, |
822 | | Some("foo_long_string_not_inlined"), |
823 | | Some("bar"), |
824 | | None, |
825 | | ]); |
826 | | let view_a = StringViewArray::from(&values); |
827 | | |
828 | | let values = StringArray::from_iter([ |
829 | | Some("test"), |
830 | | Some("data_long_string_not_inlined"), |
831 | | None, |
832 | | None, |
833 | | Some("here"), |
834 | | ]); |
835 | | let view_b = StringViewArray::from(&values); |
836 | | |
837 | | let indices = &[ |
838 | | (0, 1), // null |
839 | | (1, 2), // null |
840 | | (0, 2), // "foo_long_string_not_inlined" |
841 | | (1, 3), // null |
842 | | (0, 4), // null |
843 | | ]; |
844 | | |
845 | | // Test specialized implementation |
846 | | let values = interleave(&[&view_a, &view_b], indices).unwrap(); |
847 | | let result = values.as_string_view(); |
848 | | assert_eq!(result.data_buffers().len(), 1); |
849 | | |
850 | | let fallback = interleave_fallback(&[&view_a, &view_b], indices).unwrap(); |
851 | | let fallback_result = fallback.as_string_view(); |
852 | | |
853 | | // Convert to strings for easier assertion |
854 | | let collected: Vec<_> = result.iter().map(|x| x.map(|s| s.to_string())).collect(); |
855 | | |
856 | | let fallback_collected: Vec<_> = fallback_result |
857 | | .iter() |
858 | | .map(|x| x.map(|s| s.to_string())) |
859 | | .collect(); |
860 | | |
861 | | assert_eq!(&collected, &fallback_collected); |
862 | | |
863 | | assert_eq!( |
864 | | &collected, |
865 | | &[ |
866 | | None, |
867 | | None, |
868 | | Some("foo_long_string_not_inlined".to_string()), |
869 | | None, |
870 | | None, |
871 | | ] |
872 | | ); |
873 | | } |
874 | | |
875 | | #[test] |
876 | | fn test_interleave_views_multiple_buffers() { |
877 | | let str1 = "very_long_string_from_first_buffer".as_bytes(); |
878 | | let str2 = "very_long_string_from_second_buffer".as_bytes(); |
879 | | let buffer1 = str1.to_vec().into(); |
880 | | let buffer2 = str2.to_vec().into(); |
881 | | |
882 | | let view1 = ByteView::new(str1.len() as u32, &str1[..4]) |
883 | | .with_buffer_index(0) |
884 | | .with_offset(0) |
885 | | .as_u128(); |
886 | | let view2 = ByteView::new(str2.len() as u32, &str2[..4]) |
887 | | .with_buffer_index(1) |
888 | | .with_offset(0) |
889 | | .as_u128(); |
890 | | let view_a = |
891 | | StringViewArray::try_new(vec![view1, view2].into(), vec![buffer1, buffer2], None) |
892 | | .unwrap(); |
893 | | |
894 | | let str3 = "another_very_long_string_buffer_three".as_bytes(); |
895 | | let str4 = "different_long_string_in_buffer_four".as_bytes(); |
896 | | let buffer3 = str3.to_vec().into(); |
897 | | let buffer4 = str4.to_vec().into(); |
898 | | |
899 | | let view3 = ByteView::new(str3.len() as u32, &str3[..4]) |
900 | | .with_buffer_index(0) |
901 | | .with_offset(0) |
902 | | .as_u128(); |
903 | | let view4 = ByteView::new(str4.len() as u32, &str4[..4]) |
904 | | .with_buffer_index(1) |
905 | | .with_offset(0) |
906 | | .as_u128(); |
907 | | let view_b = |
908 | | StringViewArray::try_new(vec![view3, view4].into(), vec![buffer3, buffer4], None) |
909 | | .unwrap(); |
910 | | |
911 | | let indices = &[ |
912 | | (0, 0), // String from first buffer of array A |
913 | | (1, 0), // String from first buffer of array B |
914 | | (0, 1), // String from second buffer of array A |
915 | | (1, 1), // String from second buffer of array B |
916 | | (0, 0), // String from first buffer of array A again |
917 | | (1, 1), // String from second buffer of array B again |
918 | | ]; |
919 | | |
920 | | // Test interleave |
921 | | let values = interleave(&[&view_a, &view_b], indices).unwrap(); |
922 | | let result = values.as_string_view(); |
923 | | |
924 | | assert_eq!( |
925 | | result.data_buffers().len(), |
926 | | 4, |
927 | | "Expected four buffers (two from each input array)" |
928 | | ); |
929 | | |
930 | | let result_strings: Vec<_> = result.iter().map(|x| x.map(|s| s.to_string())).collect(); |
931 | | assert_eq!( |
932 | | result_strings, |
933 | | vec![ |
934 | | Some("very_long_string_from_first_buffer".to_string()), |
935 | | Some("another_very_long_string_buffer_three".to_string()), |
936 | | Some("very_long_string_from_second_buffer".to_string()), |
937 | | Some("different_long_string_in_buffer_four".to_string()), |
938 | | Some("very_long_string_from_first_buffer".to_string()), |
939 | | Some("different_long_string_in_buffer_four".to_string()), |
940 | | ] |
941 | | ); |
942 | | |
943 | | let views = result.views(); |
944 | | let buffer_indices: Vec<_> = views |
945 | | .iter() |
946 | | .map(|raw_view| ByteView::from(*raw_view).buffer_index) |
947 | | .collect(); |
948 | | |
949 | | assert_eq!( |
950 | | buffer_indices, |
951 | | vec![ |
952 | | 0, // First buffer from array A |
953 | | 1, // First buffer from array B |
954 | | 2, // Second buffer from array A |
955 | | 3, // Second buffer from array B |
956 | | 0, // First buffer from array A (reused) |
957 | | 3, // Second buffer from array B (reused) |
958 | | ] |
959 | | ); |
960 | | } |
961 | | |
962 | | #[test] |
963 | | fn test_interleave_run_end_encoded_primitive() { |
964 | | let mut builder = PrimitiveRunBuilder::<Int32Type, Int32Type>::new(); |
965 | | builder.extend([1, 1, 2, 2, 2, 3].into_iter().map(Some)); |
966 | | let a = builder.finish(); |
967 | | |
968 | | let mut builder = PrimitiveRunBuilder::<Int32Type, Int32Type>::new(); |
969 | | builder.extend([4, 5, 5, 6, 6, 6].into_iter().map(Some)); |
970 | | let b = builder.finish(); |
971 | | |
972 | | let indices = &[(0, 1), (1, 0), (0, 4), (1, 2), (0, 5)]; |
973 | | let result = interleave(&[&a, &b], indices).unwrap(); |
974 | | |
975 | | // The result should be a RunEndEncoded array |
976 | | assert!(matches!(result.data_type(), DataType::RunEndEncoded(_, _))); |
977 | | |
978 | | // Cast to RunArray to access values |
979 | | let result_run_array: &Int32RunArray = result.as_any().downcast_ref().unwrap(); |
980 | | |
981 | | // Verify the logical values by accessing the logical array directly |
982 | | let expected = vec![1, 4, 2, 5, 3]; |
983 | | let mut actual = Vec::new(); |
984 | | for i in 0..result_run_array.len() { |
985 | | let physical_idx = result_run_array.get_physical_index(i); |
986 | | let value = result_run_array |
987 | | .values() |
988 | | .as_primitive::<Int32Type>() |
989 | | .value(physical_idx); |
990 | | actual.push(value); |
991 | | } |
992 | | assert_eq!(actual, expected); |
993 | | } |
994 | | |
995 | | #[test] |
996 | | fn test_interleave_run_end_encoded_string() { |
997 | | let a: Int32RunArray = vec!["hello", "hello", "world", "world", "foo"] |
998 | | .into_iter() |
999 | | .collect(); |
1000 | | let b: Int32RunArray = vec!["bar", "baz", "baz", "qux"].into_iter().collect(); |
1001 | | |
1002 | | let indices = &[(0, 0), (1, 1), (0, 3), (1, 3), (0, 4)]; |
1003 | | let result = interleave(&[&a, &b], indices).unwrap(); |
1004 | | |
1005 | | // The result should be a RunEndEncoded array |
1006 | | assert!(matches!(result.data_type(), DataType::RunEndEncoded(_, _))); |
1007 | | |
1008 | | // Cast to RunArray to access values |
1009 | | let result_run_array: &Int32RunArray = result.as_any().downcast_ref().unwrap(); |
1010 | | |
1011 | | // Verify the logical values by accessing the logical array directly |
1012 | | let expected = vec!["hello", "baz", "world", "qux", "foo"]; |
1013 | | let mut actual = Vec::new(); |
1014 | | for i in 0..result_run_array.len() { |
1015 | | let physical_idx = result_run_array.get_physical_index(i); |
1016 | | let value = result_run_array |
1017 | | .values() |
1018 | | .as_string::<i32>() |
1019 | | .value(physical_idx); |
1020 | | actual.push(value); |
1021 | | } |
1022 | | assert_eq!(actual, expected); |
1023 | | } |
1024 | | |
1025 | | #[test] |
1026 | | fn test_interleave_run_end_encoded_with_nulls() { |
1027 | | let a: Int32RunArray = vec![Some("a"), Some("a"), None, None, Some("b")] |
1028 | | .into_iter() |
1029 | | .collect(); |
1030 | | let b: Int32RunArray = vec![None, Some("c"), Some("c"), Some("d")] |
1031 | | .into_iter() |
1032 | | .collect(); |
1033 | | |
1034 | | let indices = &[(0, 1), (1, 0), (0, 2), (1, 3), (0, 4)]; |
1035 | | let result = interleave(&[&a, &b], indices).unwrap(); |
1036 | | |
1037 | | // The result should be a RunEndEncoded array |
1038 | | assert!(matches!(result.data_type(), DataType::RunEndEncoded(_, _))); |
1039 | | |
1040 | | // Cast to RunArray to access values |
1041 | | let result_run_array: &Int32RunArray = result.as_any().downcast_ref().unwrap(); |
1042 | | |
1043 | | // Verify the logical values by accessing the logical array directly |
1044 | | let expected = vec![Some("a"), None, None, Some("d"), Some("b")]; |
1045 | | let mut actual = Vec::new(); |
1046 | | for i in 0..result_run_array.len() { |
1047 | | let physical_idx = result_run_array.get_physical_index(i); |
1048 | | if result_run_array.values().is_null(physical_idx) { |
1049 | | actual.push(None); |
1050 | | } else { |
1051 | | let value = result_run_array |
1052 | | .values() |
1053 | | .as_string::<i32>() |
1054 | | .value(physical_idx); |
1055 | | actual.push(Some(value)); |
1056 | | } |
1057 | | } |
1058 | | assert_eq!(actual, expected); |
1059 | | } |
1060 | | |
1061 | | #[test] |
1062 | | fn test_interleave_run_end_encoded_different_run_types() { |
1063 | | let mut builder = PrimitiveRunBuilder::<Int16Type, Int32Type>::new(); |
1064 | | builder.extend([1, 1, 2, 3, 3].into_iter().map(Some)); |
1065 | | let a = builder.finish(); |
1066 | | |
1067 | | let mut builder = PrimitiveRunBuilder::<Int16Type, Int32Type>::new(); |
1068 | | builder.extend([4, 5, 5, 6].into_iter().map(Some)); |
1069 | | let b = builder.finish(); |
1070 | | |
1071 | | let indices = &[(0, 0), (1, 1), (0, 3), (1, 3)]; |
1072 | | let result = interleave(&[&a, &b], indices).unwrap(); |
1073 | | |
1074 | | // The result should be a RunEndEncoded array |
1075 | | assert!(matches!(result.data_type(), DataType::RunEndEncoded(_, _))); |
1076 | | |
1077 | | // Cast to RunArray to access values |
1078 | | let result_run_array: &RunArray<Int16Type> = result.as_any().downcast_ref().unwrap(); |
1079 | | |
1080 | | // Verify the logical values by accessing the logical array directly |
1081 | | let expected = vec![1, 5, 3, 6]; |
1082 | | let mut actual = Vec::new(); |
1083 | | for i in 0..result_run_array.len() { |
1084 | | let physical_idx = result_run_array.get_physical_index(i); |
1085 | | let value = result_run_array |
1086 | | .values() |
1087 | | .as_primitive::<Int32Type>() |
1088 | | .value(physical_idx); |
1089 | | actual.push(value); |
1090 | | } |
1091 | | assert_eq!(actual, expected); |
1092 | | } |
1093 | | |
1094 | | #[test] |
1095 | | fn test_interleave_run_end_encoded_mixed_run_lengths() { |
1096 | | let mut builder = PrimitiveRunBuilder::<Int64Type, Int32Type>::new(); |
1097 | | builder.extend([1, 2, 2, 2, 2, 3, 3, 4].into_iter().map(Some)); |
1098 | | let a = builder.finish(); |
1099 | | |
1100 | | let mut builder = PrimitiveRunBuilder::<Int64Type, Int32Type>::new(); |
1101 | | builder.extend([5, 5, 5, 6, 7, 7, 8, 8].into_iter().map(Some)); |
1102 | | let b = builder.finish(); |
1103 | | |
1104 | | let indices = &[ |
1105 | | (0, 0), // 1 |
1106 | | (1, 2), // 5 |
1107 | | (0, 3), // 2 |
1108 | | (1, 3), // 6 |
1109 | | (0, 6), // 3 |
1110 | | (1, 6), // 8 |
1111 | | (0, 7), // 4 |
1112 | | (1, 4), // 7 |
1113 | | ]; |
1114 | | let result = interleave(&[&a, &b], indices).unwrap(); |
1115 | | |
1116 | | // The result should be a RunEndEncoded array |
1117 | | assert!(matches!(result.data_type(), DataType::RunEndEncoded(_, _))); |
1118 | | |
1119 | | // Cast to RunArray to access values |
1120 | | let result_run_array: &RunArray<Int64Type> = result.as_any().downcast_ref().unwrap(); |
1121 | | |
1122 | | // Verify the logical values by accessing the logical array directly |
1123 | | let expected = vec![1, 5, 2, 6, 3, 8, 4, 7]; |
1124 | | let mut actual = Vec::new(); |
1125 | | for i in 0..result_run_array.len() { |
1126 | | let physical_idx = result_run_array.get_physical_index(i); |
1127 | | let value = result_run_array |
1128 | | .values() |
1129 | | .as_primitive::<Int32Type>() |
1130 | | .value(physical_idx); |
1131 | | actual.push(value); |
1132 | | } |
1133 | | assert_eq!(actual, expected); |
1134 | | } |
1135 | | |
1136 | | #[test] |
1137 | | fn test_interleave_run_end_encoded_empty_runs() { |
1138 | | let mut builder = PrimitiveRunBuilder::<Int32Type, Int32Type>::new(); |
1139 | | builder.extend([1].into_iter().map(Some)); |
1140 | | let a = builder.finish(); |
1141 | | |
1142 | | let mut builder = PrimitiveRunBuilder::<Int32Type, Int32Type>::new(); |
1143 | | builder.extend([2, 2, 2].into_iter().map(Some)); |
1144 | | let b = builder.finish(); |
1145 | | |
1146 | | let indices = &[(0, 0), (1, 1), (1, 2)]; |
1147 | | let result = interleave(&[&a, &b], indices).unwrap(); |
1148 | | |
1149 | | // The result should be a RunEndEncoded array |
1150 | | assert!(matches!(result.data_type(), DataType::RunEndEncoded(_, _))); |
1151 | | |
1152 | | // Cast to RunArray to access values |
1153 | | let result_run_array: &Int32RunArray = result.as_any().downcast_ref().unwrap(); |
1154 | | |
1155 | | // Verify the logical values by accessing the logical array directly |
1156 | | let expected = vec![1, 2, 2]; |
1157 | | let mut actual = Vec::new(); |
1158 | | for i in 0..result_run_array.len() { |
1159 | | let physical_idx = result_run_array.get_physical_index(i); |
1160 | | let value = result_run_array |
1161 | | .values() |
1162 | | .as_primitive::<Int32Type>() |
1163 | | .value(physical_idx); |
1164 | | actual.push(value); |
1165 | | } |
1166 | | assert_eq!(actual, expected); |
1167 | | } |
1168 | | } |