/Users/andrewlamb/Software/arrow-rs/arrow-select/src/zip.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! [`zip`]: Combine values from two arrays based on a boolean mask |
19 | | |
20 | | use crate::filter::{SlicesIterator, prep_null_mask_filter}; |
21 | | use arrow_array::cast::AsArray; |
22 | | use arrow_array::types::{BinaryType, ByteArrayType, LargeBinaryType, LargeUtf8Type, Utf8Type}; |
23 | | use arrow_array::*; |
24 | | use arrow_buffer::{ |
25 | | BooleanBuffer, Buffer, MutableBuffer, NullBuffer, OffsetBuffer, OffsetBufferBuilder, |
26 | | ScalarBuffer, |
27 | | }; |
28 | | use arrow_data::ArrayData; |
29 | | use arrow_data::transform::MutableArrayData; |
30 | | use arrow_schema::{ArrowError, DataType}; |
31 | | use std::fmt::{Debug, Formatter}; |
32 | | use std::hash::Hash; |
33 | | use std::marker::PhantomData; |
34 | | use std::ops::Not; |
35 | | use std::sync::Arc; |
36 | | |
37 | | /// Zip two arrays by some boolean mask. |
38 | | /// |
39 | | /// - Where `mask` is `true`, values of `truthy` are taken |
40 | | /// - Where `mask` is `false` or `NULL`, values of `falsy` are taken |
41 | | /// |
42 | | /// # Example: `zip` two arrays |
43 | | /// ``` |
44 | | /// # use std::sync::Arc; |
45 | | /// # use arrow_array::{ArrayRef, BooleanArray, Int32Array}; |
46 | | /// # use arrow_select::zip::zip; |
47 | | /// // mask: [true, true, false, NULL, true] |
48 | | /// let mask = BooleanArray::from(vec![ |
49 | | /// Some(true), Some(true), Some(false), None, Some(true) |
50 | | /// ]); |
51 | | /// // truthy array: [1, NULL, 3, 4, 5] |
52 | | /// let truthy = Int32Array::from(vec![ |
53 | | /// Some(1), None, Some(3), Some(4), Some(5) |
54 | | /// ]); |
55 | | /// // falsy array: [10, 20, 30, 40, 50] |
56 | | /// let falsy = Int32Array::from(vec![ |
57 | | /// Some(10), Some(20), Some(30), Some(40), Some(50) |
58 | | /// ]); |
59 | | /// // zipping with this mask selects the first, second and last values from `truthy` |
60 | | /// // and the third and fourth values from `falsy` |
61 | | /// let result = zip(&mask, &truthy, &falsy).unwrap(); |
62 | | /// // Expected: [1, NULL, 30, 40, 5] |
63 | | /// let expected: ArrayRef = Arc::new(Int32Array::from(vec![ |
64 | | /// Some(1), None, Some(30), Some(40), Some(5) |
65 | | /// ])); |
66 | | /// assert_eq!(&result, &expected); |
67 | | /// ``` |
68 | | /// |
69 | | /// # Example: `zip` an array with a scalar |
70 | | /// |
71 | | /// Use `zip` to replace certain values in an array with a scalar |
72 | | /// |
73 | | /// ``` |
74 | | /// # use std::sync::Arc; |
75 | | /// # use arrow_array::{ArrayRef, BooleanArray, Int32Array}; |
76 | | /// # use arrow_select::zip::zip; |
77 | | /// // mask: [true, true, false, NULL, true] |
78 | | /// let mask = BooleanArray::from(vec![ |
79 | | /// Some(true), Some(true), Some(false), None, Some(true) |
80 | | /// ]); |
81 | | /// // array: [1, NULL, 3, 4, 5] |
82 | | /// let arr = Int32Array::from(vec![ |
83 | | /// Some(1), None, Some(3), Some(4), Some(5) |
84 | | /// ]); |
85 | | /// // scalar: 42 |
86 | | /// let scalar = Int32Array::new_scalar(42); |
87 | | /// // zipping the array with this mask selects the first, second and last values from `arr` |
88 | | /// // and fills the third and fourth values with the scalar 42 |
89 | | /// let result = zip(&mask, &arr, &scalar).unwrap(); |
90 | | /// // Expected: [1, NULL, 42, 42, 5] |
91 | | /// let expected: ArrayRef = Arc::new(Int32Array::from(vec![ |
92 | | /// Some(1), None, Some(42), Some(42), Some(5) |
93 | | /// ])); |
94 | | /// assert_eq!(&result, &expected); |
95 | | /// ``` |
96 | 78 | pub fn zip( |
97 | 78 | mask: &BooleanArray, |
98 | 78 | truthy: &dyn Datum, |
99 | 78 | falsy: &dyn Datum, |
100 | 78 | ) -> Result<ArrayRef, ArrowError> { |
101 | 78 | let (truthy_array, truthy_is_scalar) = truthy.get(); |
102 | 78 | let (falsy_array, falsy_is_scalar) = falsy.get(); |
103 | | |
104 | 78 | if falsy_is_scalar && truthy_is_scalar54 { |
105 | 39 | let zipper = ScalarZipper::try_new(truthy, falsy)?0 ; |
106 | 39 | return zipper.zip_impl.create_output(mask); |
107 | 39 | } |
108 | | |
109 | 39 | let truthy = truthy_array; |
110 | 39 | let falsy = falsy_array; |
111 | | |
112 | 39 | if truthy.data_type() != falsy.data_type() { |
113 | 0 | return Err(ArrowError::InvalidArgumentError( |
114 | 0 | "arguments need to have the same data type".into(), |
115 | 0 | )); |
116 | 39 | } |
117 | | |
118 | 39 | if truthy_is_scalar && truthy.len() != 114 { |
119 | 0 | return Err(ArrowError::InvalidArgumentError( |
120 | 0 | "scalar arrays must have 1 element".into(), |
121 | 0 | )); |
122 | 39 | } |
123 | 39 | if !truthy_is_scalar && truthy25 .len25 () != mask.len() { |
124 | 0 | return Err(ArrowError::InvalidArgumentError( |
125 | 0 | "all arrays should have the same length".into(), |
126 | 0 | )); |
127 | 39 | } |
128 | 39 | if falsy_is_scalar && falsy.len() != 115 { |
129 | 0 | return Err(ArrowError::InvalidArgumentError( |
130 | 0 | "scalar arrays must have 1 element".into(), |
131 | 0 | )); |
132 | 39 | } |
133 | 39 | if !falsy_is_scalar && falsy24 .len24 () != mask.len() { |
134 | 0 | return Err(ArrowError::InvalidArgumentError( |
135 | 0 | "all arrays should have the same length".into(), |
136 | 0 | )); |
137 | 39 | } |
138 | | |
139 | 39 | let falsy = falsy.to_data(); |
140 | 39 | let truthy = truthy.to_data(); |
141 | | |
142 | 39 | zip_impl(mask, &truthy, truthy_is_scalar, &falsy, falsy_is_scalar) |
143 | 78 | } |
144 | | |
145 | 40 | fn zip_impl( |
146 | 40 | mask: &BooleanArray, |
147 | 40 | truthy: &ArrayData, |
148 | 40 | truthy_is_scalar: bool, |
149 | 40 | falsy: &ArrayData, |
150 | 40 | falsy_is_scalar: bool, |
151 | 40 | ) -> Result<ArrayRef, ArrowError> { |
152 | 40 | let mut mutable = MutableArrayData::new(vec![truthy, falsy], false, truthy.len()); |
153 | | |
154 | | // The SlicesIterator yields only the runs of true values, so the gaps left between those runs |
155 | | // must be filled with falsy values |
156 | | |
157 | | // keep track of how much is filled |
158 | 40 | let mut filled = 0; |
159 | | |
160 | 40 | let mask_buffer = maybe_prep_null_mask_filter(mask); |
161 | 40 | SlicesIterator::from(&mask_buffer).for_each(|(start, end)| {35 |
162 | | // the gap needs to be filled with falsy values |
163 | 35 | if start > filled { |
164 | 8 | if falsy_is_scalar { |
165 | 7 | for _ in filled4 ..start4 { |
166 | 7 | // Copy the first item from the 'falsy' array into the output buffer. |
167 | 7 | mutable.extend(1, 0, 1); |
168 | 7 | } |
169 | 4 | } else { |
170 | 4 | mutable.extend(1, filled, start); |
171 | 4 | } |
172 | 27 | } |
173 | | // fill with truthy values |
174 | 35 | if truthy_is_scalar { |
175 | 20 | for _ in start14 ..end14 { |
176 | 20 | // Copy the first item from the 'truthy' array into the output buffer. |
177 | 20 | mutable.extend(0, 0, 1); |
178 | 20 | } |
179 | 21 | } else { |
180 | 21 | mutable.extend(0, start, end); |
181 | 21 | } |
182 | 35 | filled = end; |
183 | 35 | }); |
184 | | // the remaining part is falsy |
185 | 40 | if filled < mask.len() { |
186 | 27 | if falsy_is_scalar { |
187 | 15 | for _ in filled11 ..mask11 .len11 () { |
188 | 15 | // Copy the first item from the 'falsy' array into the output buffer. |
189 | 15 | mutable.extend(1, 0, 1); |
190 | 15 | } |
191 | 16 | } else { |
192 | 16 | mutable.extend(1, filled, mask.len()); |
193 | 16 | } |
194 | 13 | } |
195 | | |
196 | 40 | let data = mutable.freeze(); |
197 | 40 | Ok(make_array(data)) |
198 | 40 | } |
199 | | |
200 | | /// Zipper for 2 scalars |
201 | | /// |
202 | | /// Useful in `IF <expr> THEN <scalar> ELSE <scalar> END` expressions |
203 | | /// |
204 | | /// # Example |
205 | | /// ``` |
206 | | /// # use std::sync::Arc; |
207 | | /// # use arrow_array::{ArrayRef, BooleanArray, Int32Array, Scalar, cast::AsArray, types::Int32Type}; |
208 | | /// |
209 | | /// # use arrow_select::zip::ScalarZipper; |
210 | | /// let scalar_truthy = Scalar::new(Int32Array::from_value(42, 1)); |
211 | | /// let scalar_falsy = Scalar::new(Int32Array::from_value(123, 1)); |
212 | | /// let zipper = ScalarZipper::try_new(&scalar_truthy, &scalar_falsy).unwrap(); |
213 | | /// |
214 | | /// // Later when we have a boolean mask |
215 | | /// let mask = BooleanArray::from(vec![true, false, true, false, true]); |
216 | | /// let result = zipper.zip(&mask).unwrap(); |
217 | | /// let actual = result.as_primitive::<Int32Type>(); |
218 | | /// let expected = Int32Array::from(vec![Some(42), Some(123), Some(42), Some(123), Some(42)]); |
219 | | /// ``` |
220 | | /// |
221 | | #[derive(Debug, Clone)] |
222 | | pub struct ScalarZipper { |
223 | | zip_impl: Arc<dyn ZipImpl>, |
224 | | } |
225 | | |
226 | | impl ScalarZipper { |
227 | | /// Try to create a new ScalarZipper from two scalar Datum |
228 | | /// |
229 | | /// # Errors |
230 | | /// returns error if: |
231 | | /// - the two Datum have different data types |
232 | | /// - either Datum is not a scalar (or has more than 1 element) |
233 | | /// |
234 | 40 | pub fn try_new(truthy: &dyn Datum, falsy: &dyn Datum) -> Result<Self, ArrowError> { |
235 | 40 | let (truthy, truthy_is_scalar) = truthy.get(); |
236 | 40 | let (falsy, falsy_is_scalar) = falsy.get(); |
237 | | |
238 | 40 | if truthy.data_type() != falsy.data_type() { |
239 | 0 | return Err(ArrowError::InvalidArgumentError( |
240 | 0 | "arguments need to have the same data type".into(), |
241 | 0 | )); |
242 | 40 | } |
243 | | |
244 | 40 | if !truthy_is_scalar { |
245 | 0 | return Err(ArrowError::InvalidArgumentError( |
246 | 0 | "only scalar arrays are supported".into(), |
247 | 0 | )); |
248 | 40 | } |
249 | | |
250 | 40 | if !falsy_is_scalar { |
251 | 0 | return Err(ArrowError::InvalidArgumentError( |
252 | 0 | "only scalar arrays are supported".into(), |
253 | 0 | )); |
254 | 40 | } |
255 | | |
256 | 40 | if truthy.len() != 1 { |
257 | 0 | return Err(ArrowError::InvalidArgumentError( |
258 | 0 | "scalar arrays must have 1 element".into(), |
259 | 0 | )); |
260 | 40 | } |
261 | 40 | if falsy.len() != 1 { |
262 | 0 | return Err(ArrowError::InvalidArgumentError( |
263 | 0 | "scalar arrays must have 1 element".into(), |
264 | 0 | )); |
265 | 40 | } |
266 | | |
267 | | macro_rules! primitive_size_helper { |
268 | | ($t:ty) => { |
269 | | Arc::new(PrimitiveScalarImpl::<$t>::new(truthy, falsy)) as Arc<dyn ZipImpl> |
270 | | }; |
271 | | } |
272 | | |
273 | 40 | let zip_impl = downcast_primitive!0 { |
274 | 24 | truthy.data_type() => (primitive_size_helper), |
275 | | DataType::Utf8 => { |
276 | 11 | Arc::new(BytesScalarImpl::<Utf8Type>::new(truthy, falsy)) as Arc<dyn ZipImpl> |
277 | | }, |
278 | | DataType::LargeUtf8 => { |
279 | 1 | Arc::new(BytesScalarImpl::<LargeUtf8Type>::new(truthy, falsy)) as Arc<dyn ZipImpl> |
280 | | }, |
281 | | DataType::Binary => { |
282 | 1 | Arc::new(BytesScalarImpl::<BinaryType>::new(truthy, falsy)) as Arc<dyn ZipImpl> |
283 | | }, |
284 | | DataType::LargeBinary => { |
285 | 1 | Arc::new(BytesScalarImpl::<LargeBinaryType>::new(truthy, falsy)) as Arc<dyn ZipImpl> |
286 | | }, |
287 | | // TODO: Handle Utf8View https://github.com/apache/arrow-rs/issues/8724 |
288 | | _ => { |
289 | 1 | Arc::new(FallbackImpl::new(truthy, falsy)) as Arc<dyn ZipImpl> |
290 | | }, |
291 | | }; |
292 | | |
293 | 40 | Ok(Self { zip_impl }) |
294 | 40 | } |
295 | | |
296 | | /// Creates the output array from the input boolean array and the two scalar values the zipper was created with. |
297 | | /// See the struct-level documentation for examples. |
298 | 2 | pub fn zip(&self, mask: &BooleanArray) -> Result<ArrayRef, ArrowError> { |
299 | 2 | self.zip_impl.create_output(mask) |
300 | 2 | } |
301 | | } |
302 | | |
303 | | /// Impl for creating output array based on a mask |
304 | | trait ZipImpl: Debug + Send + Sync { |
305 | | /// Creating output array based on input boolean array |
306 | | fn create_output(&self, input: &BooleanArray) -> Result<ArrayRef, ArrowError>; |
307 | | } |
308 | | |
309 | | #[derive(Debug, PartialEq)] |
310 | | struct FallbackImpl { |
311 | | truthy: ArrayData, |
312 | | falsy: ArrayData, |
313 | | } |
314 | | |
315 | | impl FallbackImpl { |
316 | 1 | fn new(left: &dyn Array, right: &dyn Array) -> Self { |
317 | 1 | Self { |
318 | 1 | truthy: left.to_data(), |
319 | 1 | falsy: right.to_data(), |
320 | 1 | } |
321 | 1 | } |
322 | | } |
323 | | |
324 | | impl ZipImpl for FallbackImpl { |
325 | 1 | fn create_output(&self, predicate: &BooleanArray) -> Result<ArrayRef, ArrowError> { |
326 | 1 | zip_impl(predicate, &self.truthy, true, &self.falsy, true) |
327 | 1 | } |
328 | | } |
329 | | |
330 | | struct PrimitiveScalarImpl<T: ArrowPrimitiveType> { |
331 | | data_type: DataType, |
332 | | truthy: Option<T::Native>, |
333 | | falsy: Option<T::Native>, |
334 | | } |
335 | | |
336 | | impl<T: ArrowPrimitiveType> Debug for PrimitiveScalarImpl<T> { |
337 | 0 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { |
338 | 0 | f.debug_struct("PrimitiveScalarImpl") |
339 | 0 | .field("data_type", &self.data_type) |
340 | 0 | .field("truthy", &self.truthy) |
341 | 0 | .field("falsy", &self.falsy) |
342 | 0 | .finish() |
343 | 0 | } |
344 | | } |
345 | | |
346 | | impl<T: ArrowPrimitiveType> PrimitiveScalarImpl<T> { |
347 | 25 | fn new(truthy: &dyn Array, falsy: &dyn Array) -> Self { |
348 | 25 | Self { |
349 | 25 | data_type: truthy.data_type().clone(), |
350 | 25 | truthy: Self::get_value_from_scalar(truthy), |
351 | 25 | falsy: Self::get_value_from_scalar(falsy), |
352 | 25 | } |
353 | 25 | } |
354 | | |
355 | 50 | fn get_value_from_scalar(scalar: &dyn Array) -> Option<T::Native> { |
356 | 50 | if scalar.is_null(0) { |
357 | 16 | None |
358 | | } else { |
359 | 34 | let value = scalar.as_primitive::<T>().value(0); |
360 | | |
361 | 34 | Some(value) |
362 | | } |
363 | 50 | } |
364 | | |
365 | | /// Returns the values and the null buffer of an output array that has |
366 | | /// `value` in all locations where `predicate` is true and |
367 | | /// `null` everywhere else |
368 | 14 | fn get_scalar_and_null_buffer_for_single_non_nullable( |
369 | 14 | predicate: BooleanBuffer, |
370 | 14 | value: T::Native, |
371 | 14 | ) -> (Vec<T::Native>, Option<NullBuffer>) { |
372 | 14 | let result_len = predicate.len(); |
373 | 14 | let nulls = NullBuffer::new(predicate); |
374 | 14 | let scalars = vec![value; result_len]; |
375 | | |
376 | 14 | (scalars, Some(nulls)) |
377 | 14 | } |
378 | | } |
379 | | |
380 | | impl<T: ArrowPrimitiveType> ZipImpl for PrimitiveScalarImpl<T> { |
381 | 26 | fn create_output(&self, predicate: &BooleanArray) -> Result<ArrayRef, ArrowError> { |
382 | 26 | let result_len = predicate.len(); |
383 | | // Nulls are treated as false |
384 | 26 | let predicate = maybe_prep_null_mask_filter(predicate); |
385 | | |
386 | 26 | let (scalars, nulls): (Vec<T::Native>, Option<NullBuffer>) = match (self.truthy, self.falsy) |
387 | | { |
388 | 11 | (Some(truthy_val), Some(falsy_val)) => { |
389 | 11 | let scalars: Vec<T::Native> = predicate |
390 | 11 | .iter() |
391 | 85 | .map11 (|b| if b { truthy_val42 } else { falsy_val43 }) |
392 | 11 | .collect(); |
393 | | |
394 | 11 | (scalars, None) |
395 | | } |
396 | 8 | (Some(truthy_val), None) => { |
397 | | // If a value is true we need the TRUTHY and the null buffer will have 1 (meaning not null) |
398 | | // If a value is false we need the FALSY and the null buffer will have 0 (meaning null) |
399 | | |
400 | 8 | Self::get_scalar_and_null_buffer_for_single_non_nullable(predicate, truthy_val) |
401 | | } |
402 | 6 | (None, Some(falsy_val)) => { |
403 | | // Flipping the boolean buffer as we want the opposite of the TRUE case |
404 | | // |
405 | | // if the condition is true we want null so we need to NOT the value so we get 0 (meaning null) |
406 | | // if the condition is false we want the FALSY value so we need to NOT the value so we get 1 (meaning not null) |
407 | 6 | let predicate = predicate.not(); |
408 | | |
409 | 6 | Self::get_scalar_and_null_buffer_for_single_non_nullable(predicate, falsy_val) |
410 | | } |
411 | | (None, None) => { |
412 | | // All values are null |
413 | 1 | let nulls = NullBuffer::new_null(result_len); |
414 | 1 | let scalars = vec![T::default_value(); result_len]; |
415 | | |
416 | 1 | (scalars, Some(nulls)) |
417 | | } |
418 | | }; |
419 | | |
420 | 26 | let scalars = ScalarBuffer::<T::Native>::from(scalars); |
421 | 26 | let output = PrimitiveArray::<T>::try_new(scalars, nulls)?0 ; |
422 | | |
423 | | // Keep decimal precisions, scales or timestamps timezones |
424 | 26 | let output = output.with_data_type(self.data_type.clone()); |
425 | | |
426 | 26 | Ok(Arc::new(output)) |
427 | 26 | } |
428 | | } |
429 | | |
430 | | #[derive(PartialEq, Hash)] |
431 | | struct BytesScalarImpl<T: ByteArrayType> { |
432 | | truthy: Option<Vec<u8>>, |
433 | | falsy: Option<Vec<u8>>, |
434 | | phantom: PhantomData<T>, |
435 | | } |
436 | | |
437 | | impl<T: ByteArrayType> Debug for BytesScalarImpl<T> { |
438 | 0 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { |
439 | 0 | f.debug_struct("BytesScalarImpl") |
440 | 0 | .field("truthy", &self.truthy) |
441 | 0 | .field("falsy", &self.falsy) |
442 | 0 | .finish() |
443 | 0 | } |
444 | | } |
445 | | |
446 | | impl<T: ByteArrayType> BytesScalarImpl<T> { |
447 | 14 | fn new(truthy_value: &dyn Array, falsy_value: &dyn Array) -> Self { |
448 | 14 | Self { |
449 | 14 | truthy: Self::get_value_from_scalar(truthy_value), |
450 | 14 | falsy: Self::get_value_from_scalar(falsy_value), |
451 | 14 | phantom: PhantomData, |
452 | 14 | } |
453 | 14 | } |
454 | | |
455 | 28 | fn get_value_from_scalar(scalar: &dyn Array) -> Option<Vec<u8>> { |
456 | 28 | if scalar.is_null(0) { |
457 | 8 | None |
458 | | } else { |
459 | 20 | let bytes: &[u8] = scalar.as_bytes::<T>().value(0).as_ref(); |
460 | | |
461 | 20 | Some(bytes.to_vec()) |
462 | | } |
463 | 28 | } |
464 | | |
465 | | /// Returns the bytes, the offsets and the null buffer of an output array that has |
466 | | /// `value` in all locations where `predicate` is true and |
467 | | /// `null` everywhere else |
468 | 6 | fn get_scalar_and_null_buffer_for_single_non_nullable( |
469 | 6 | predicate: BooleanBuffer, |
470 | 6 | value: &[u8], |
471 | 6 | ) -> (Buffer, OffsetBuffer<T::Offset>, Option<NullBuffer>) { |
472 | 6 | let value_length = value.len(); |
473 | | |
474 | 6 | let number_of_true = predicate.count_set_bits(); |
475 | | |
476 | | // Fast path for all nulls |
477 | 6 | if number_of_true == 0 { |
478 | | // All values are null |
479 | 2 | let nulls = NullBuffer::new_null(predicate.len()); |
480 | | |
481 | 2 | return ( |
482 | 2 | // Empty bytes |
483 | 2 | Buffer::from(&[]), |
484 | 2 | // All nulls so all lengths are 0 |
485 | 2 | OffsetBuffer::<T::Offset>::new_zeroed(predicate.len()), |
486 | 2 | Some(nulls), |
487 | 2 | ); |
488 | 4 | } |
489 | | |
490 | 4 | let offsets = OffsetBuffer::<T::Offset>::from_lengths( |
491 | 20 | predicate4 .iter4 ().map4 (|b| if b { value_length15 } else { 05 }), |
492 | | ); |
493 | | |
494 | 4 | let mut bytes = MutableBuffer::with_capacity(0); |
495 | 4 | bytes.repeat_slice_n_times(value, number_of_true); |
496 | | |
497 | 4 | let bytes = Buffer::from(bytes); |
498 | | |
499 | | // If a value is true we need the TRUTHY and the null buffer will have 1 (meaning not null) |
500 | | // If a value is false we need the FALSY and the null buffer will have 0 (meaning null) |
501 | 4 | let nulls = NullBuffer::new(predicate); |
502 | | |
503 | 4 | (bytes, offsets, Some(nulls)) |
504 | 6 | } |
505 | | |
506 | | /// Creates a [`Buffer`] in which the `value` slice is repeated `number_of_values` times |
507 | | /// and an [`OffsetBuffer`] holding `number_of_values` lengths, each equal to the length of `value` |
508 | 2 | fn get_bytes_and_offset_for_all_same_value( |
509 | 2 | number_of_values: usize, |
510 | 2 | value: &[u8], |
511 | 2 | ) -> (Buffer, OffsetBuffer<T::Offset>) { |
512 | 2 | let value_length = value.len(); |
513 | | |
514 | 2 | let offsets = |
515 | 2 | OffsetBuffer::<T::Offset>::from_repeated_length(value_length, number_of_values); |
516 | | |
517 | 2 | let mut bytes = MutableBuffer::with_capacity(0); |
518 | 2 | bytes.repeat_slice_n_times(value, number_of_values); |
519 | 2 | let bytes = Buffer::from(bytes); |
520 | | |
521 | 2 | (bytes, offsets) |
522 | 2 | } |
523 | | |
524 | 7 | fn create_output_on_non_nulls( |
525 | 7 | predicate: &BooleanBuffer, |
526 | 7 | truthy_val: &[u8], |
527 | 7 | falsy_val: &[u8], |
528 | 7 | ) -> (Buffer, OffsetBuffer<<T as ByteArrayType>::Offset>) { |
529 | 7 | let true_count = predicate.count_set_bits(); |
530 | | |
531 | 6 | match true_count { |
532 | | 0 => { |
533 | | // All values are falsy |
534 | | |
535 | 1 | let (bytes, offsets) = |
536 | 1 | Self::get_bytes_and_offset_for_all_same_value(predicate.len(), falsy_val); |
537 | | |
538 | 1 | return (bytes, offsets); |
539 | | } |
540 | 6 | n1 if n == predicate.len()1 => { |
541 | | // All values are truthy |
542 | 1 | let (bytes, offsets) = |
543 | 1 | Self::get_bytes_and_offset_for_all_same_value(predicate.len(), truthy_val); |
544 | | |
545 | 1 | return (bytes, offsets); |
546 | | } |
547 | | |
548 | 5 | _ => { |
549 | 5 | // Fallback |
550 | 5 | } |
551 | | } |
552 | | |
553 | 5 | let total_number_of_bytes = |
554 | 5 | true_count * truthy_val.len() + (predicate.len() - true_count) * falsy_val.len(); |
555 | 5 | let mut mutable = MutableBuffer::with_capacity(total_number_of_bytes); |
556 | 5 | let mut offset_buffer_builder = OffsetBufferBuilder::<T::Offset>::new(predicate.len()); |
557 | | |
558 | | // keep track of how much is filled |
559 | 5 | let mut filled = 0; |
560 | | |
561 | 5 | let truthy_len = truthy_val.len(); |
562 | 5 | let falsy_len = falsy_val.len(); |
563 | | |
564 | 12 | SlicesIterator::from5 (predicate5 ).for_each5 (|(start, end)| { |
565 | | // the gap needs to be filled with falsy values |
566 | 12 | if start > filled { |
567 | 7 | let false_repeat_count = start - filled; |
568 | | // Push the falsy value `false_repeat_count` times |
569 | 7 | mutable.repeat_slice_n_times(falsy_val, false_repeat_count); |
570 | | |
571 | 7 | for _ in 0..false_repeat_count { |
572 | 7 | offset_buffer_builder.push_length(falsy_len) |
573 | | } |
574 | 5 | } |
575 | | |
576 | 12 | let true_repeat_count = end - start; |
577 | | // fill with truthy values |
578 | 12 | mutable.repeat_slice_n_times(truthy_val, true_repeat_count); |
579 | | |
580 | 12 | for _ in 0..true_repeat_count { |
581 | 14 | offset_buffer_builder.push_length(truthy_len) |
582 | | } |
583 | 12 | filled = end; |
584 | 12 | }); |
585 | | // the remaining part is falsy |
586 | 5 | if filled < predicate.len() { |
587 | 2 | let false_repeat_count = predicate.len() - filled; |
588 | | // Fill the remaining slots by repeating the falsy value `false_repeat_count` times. |
589 | 2 | mutable.repeat_slice_n_times(falsy_val, false_repeat_count); |
590 | | |
591 | 2 | for _ in 0..false_repeat_count { |
592 | 6 | offset_buffer_builder.push_length(falsy_len) |
593 | | } |
594 | 3 | } |
595 | | |
596 | 5 | (mutable.into(), offset_buffer_builder.finish()) |
597 | 7 | } |
598 | | } |
599 | | |
600 | | impl<T: ByteArrayType> ZipImpl for BytesScalarImpl<T> { |
601 | 14 | fn create_output(&self, predicate: &BooleanArray) -> Result<ArrayRef, ArrowError> { |
602 | 14 | let result_len = predicate.len(); |
603 | | // Nulls are treated as false |
604 | 14 | let predicate = maybe_prep_null_mask_filter(predicate); |
605 | | |
606 | 14 | let (bytes, offsets, nulls): (Buffer, OffsetBuffer<T::Offset>, Option<NullBuffer>) = |
607 | 14 | match (self.truthy.as_deref(), self.falsy.as_deref()) { |
608 | 7 | (Some(truthy_val), Some(falsy_val)) => { |
609 | 7 | let (bytes, offsets) = |
610 | 7 | Self::create_output_on_non_nulls(&predicate, truthy_val, falsy_val); |
611 | | |
612 | 7 | (bytes, offsets, None) |
613 | | } |
614 | 3 | (Some(truthy_val), None) => { |
615 | 3 | Self::get_scalar_and_null_buffer_for_single_non_nullable(predicate, truthy_val) |
616 | | } |
617 | 3 | (None, Some(falsy_val)) => { |
618 | | // Flipping the boolean buffer as we want the opposite of the TRUE case |
619 | | // |
620 | | // if the condition is true we want null so we need to NOT the value so we get 0 (meaning null) |
621 | | // if the condition is false we want the FALSY value so we need to NOT the value so we get 1 (meaning not null) |
622 | 3 | let predicate = predicate.not(); |
623 | 3 | Self::get_scalar_and_null_buffer_for_single_non_nullable(predicate, falsy_val) |
624 | | } |
625 | | (None, None) => { |
626 | | // All values are null |
627 | 1 | let nulls = NullBuffer::new_null(result_len); |
628 | | |
629 | 1 | ( |
630 | 1 | // Empty bytes |
631 | 1 | Buffer::from(&[]), |
632 | 1 | // All nulls so all lengths are 0 |
633 | 1 | OffsetBuffer::<T::Offset>::new_zeroed(predicate.len()), |
634 | 1 | Some(nulls), |
635 | 1 | ) |
636 | | } |
637 | | }; |
638 | | |
639 | 14 | let output = unsafe { |
640 | | // Safety: the values are based on valid inputs |
641 | | // and `try_new` is expensive for strings as it validates that the input is valid UTF-8 |
642 | 14 | GenericByteArray::<T>::new_unchecked(offsets, bytes, nulls) |
643 | | }; |
644 | | |
645 | 14 | Ok(Arc::new(output)) |
646 | 14 | } |
647 | | } |
648 | | |
649 | 80 | fn maybe_prep_null_mask_filter(predicate: &BooleanArray) -> BooleanBuffer { |
650 | | // Nulls are treated as false |
651 | 80 | if predicate.null_count() == 0 { |
652 | 76 | predicate.values().clone() |
653 | | } else { |
654 | 4 | let cleaned = prep_null_mask_filter(predicate); |
655 | 4 | let (boolean_buffer, _) = cleaned.into_parts(); |
656 | 4 | boolean_buffer |
657 | | } |
658 | 80 | } |
659 | | |
660 | | #[cfg(test)] |
661 | | mod test { |
662 | | use super::*; |
663 | | use arrow_array::types::Int32Type; |
664 | | |
665 | | #[test] |
666 | 1 | fn test_zip_kernel_one() { |
667 | 1 | let a = Int32Array::from(vec![Some(5), None, Some(7), None, Some(1)]); |
668 | 1 | let b = Int32Array::from(vec![None, Some(3), Some(6), Some(7), Some(3)]); |
669 | 1 | let mask = BooleanArray::from(vec![true, true, false, false, true]); |
670 | 1 | let out = zip(&mask, &a, &b).unwrap(); |
671 | 1 | let actual = out.as_any().downcast_ref::<Int32Array>().unwrap(); |
672 | 1 | let expected = Int32Array::from(vec![Some(5), None, Some(6), Some(7), Some(1)]); |
673 | 1 | assert_eq!(actual, &expected); |
674 | 1 | } |
675 | | |
676 | | #[test] |
677 | 1 | fn test_zip_kernel_two() { |
678 | 1 | let a = Int32Array::from(vec![Some(5), None, Some(7), None, Some(1)]); |
679 | 1 | let b = Int32Array::from(vec![None, Some(3), Some(6), Some(7), Some(3)]); |
680 | 1 | let mask = BooleanArray::from(vec![false, false, true, true, false]); |
681 | 1 | let out = zip(&mask, &a, &b).unwrap(); |
682 | 1 | let actual = out.as_any().downcast_ref::<Int32Array>().unwrap(); |
683 | 1 | let expected = Int32Array::from(vec![None, Some(3), Some(7), None, Some(3)]); |
684 | 1 | assert_eq!(actual, &expected); |
685 | 1 | } |
686 | | |
687 | | #[test] |
688 | 1 | fn test_zip_kernel_scalar_falsy_1() { |
689 | 1 | let a = Int32Array::from(vec![Some(5), None, Some(7), None, Some(1)]); |
690 | | |
691 | 1 | let fallback = Scalar::new(Int32Array::from_value(42, 1)); |
692 | | |
693 | 1 | let mask = BooleanArray::from(vec![true, true, false, false, true]); |
694 | 1 | let out = zip(&mask, &a, &fallback).unwrap(); |
695 | 1 | let actual = out.as_any().downcast_ref::<Int32Array>().unwrap(); |
696 | 1 | let expected = Int32Array::from(vec![Some(5), None, Some(42), Some(42), Some(1)]); |
697 | 1 | assert_eq!(actual, &expected); |
698 | 1 | } |
699 | | |
700 | | #[test] |
701 | 1 | fn test_zip_kernel_scalar_falsy_2() { |
702 | 1 | let a = Int32Array::from(vec![Some(5), None, Some(7), None, Some(1)]); |
703 | | |
704 | 1 | let fallback = Scalar::new(Int32Array::from_value(42, 1)); |
705 | | |
706 | 1 | let mask = BooleanArray::from(vec![false, false, true, true, false]); |
707 | 1 | let out = zip(&mask, &a, &fallback).unwrap(); |
708 | 1 | let actual = out.as_any().downcast_ref::<Int32Array>().unwrap(); |
709 | 1 | let expected = Int32Array::from(vec![Some(42), Some(42), Some(7), None, Some(42)]); |
710 | 1 | assert_eq!(actual, &expected); |
711 | 1 | } |
712 | | |
713 | | #[test] |
714 | 1 | fn test_zip_kernel_scalar_truthy_1() { |
715 | 1 | let a = Int32Array::from(vec![Some(5), None, Some(7), None, Some(1)]); |
716 | | |
717 | 1 | let fallback = Scalar::new(Int32Array::from_value(42, 1)); |
718 | | |
719 | 1 | let mask = BooleanArray::from(vec![true, true, false, false, true]); |
720 | 1 | let out = zip(&mask, &fallback, &a).unwrap(); |
721 | 1 | let actual = out.as_any().downcast_ref::<Int32Array>().unwrap(); |
722 | 1 | let expected = Int32Array::from(vec![Some(42), Some(42), Some(7), None, Some(42)]); |
723 | 1 | assert_eq!(actual, &expected); |
724 | 1 | } |
725 | | |
726 | | #[test] |
727 | 1 | fn test_zip_kernel_scalar_truthy_2() { |
728 | 1 | let a = Int32Array::from(vec![Some(5), None, Some(7), None, Some(1)]); |
729 | | |
730 | 1 | let fallback = Scalar::new(Int32Array::from_value(42, 1)); |
731 | | |
732 | 1 | let mask = BooleanArray::from(vec![false, false, true, true, false]); |
733 | 1 | let out = zip(&mask, &fallback, &a).unwrap(); |
734 | 1 | let actual = out.as_any().downcast_ref::<Int32Array>().unwrap(); |
735 | 1 | let expected = Int32Array::from(vec![Some(5), None, Some(42), Some(42), Some(1)]); |
736 | 1 | assert_eq!(actual, &expected); |
737 | 1 | } |
738 | | |
739 | | #[test] |
740 | 1 | fn test_zip_kernel_scalar_both_mask_ends_with_true() { |
741 | 1 | let scalar_truthy = Scalar::new(Int32Array::from_value(42, 1)); |
742 | 1 | let scalar_falsy = Scalar::new(Int32Array::from_value(123, 1)); |
743 | | |
744 | 1 | let mask = BooleanArray::from(vec![true, true, false, false, true]); |
745 | 1 | let out = zip(&mask, &scalar_truthy, &scalar_falsy).unwrap(); |
746 | 1 | let actual = out.as_any().downcast_ref::<Int32Array>().unwrap(); |
747 | 1 | let expected = Int32Array::from(vec![Some(42), Some(42), Some(123), Some(123), Some(42)]); |
748 | 1 | assert_eq!(actual, &expected); |
749 | 1 | } |
750 | | |
751 | | #[test] |
752 | 1 | fn test_zip_kernel_scalar_both_mask_ends_with_false() { |
753 | 1 | let scalar_truthy = Scalar::new(Int32Array::from_value(42, 1)); |
754 | 1 | let scalar_falsy = Scalar::new(Int32Array::from_value(123, 1)); |
755 | | |
756 | 1 | let mask = BooleanArray::from(vec![true, true, false, true, false, false]); |
757 | 1 | let out = zip(&mask, &scalar_truthy, &scalar_falsy).unwrap(); |
758 | 1 | let actual = out.as_any().downcast_ref::<Int32Array>().unwrap(); |
759 | 1 | let expected = Int32Array::from(vec![ |
760 | 1 | Some(42), |
761 | 1 | Some(42), |
762 | 1 | Some(123), |
763 | 1 | Some(42), |
764 | 1 | Some(123), |
765 | 1 | Some(123), |
766 | | ]); |
767 | 1 | assert_eq!(actual, &expected); |
768 | 1 | } |
769 | | |
// Non-null truthy scalar (42) against a null falsy scalar: `false` slots must come out null.
#[test]
fn test_zip_kernel_primitive_scalar_none_1() {
    let mask = BooleanArray::from(vec![true, true, false, false, true]);
    let when_true = Scalar::new(Int32Array::from_value(42, 1));
    let when_false = Scalar::new(Int32Array::new_null(1));

    let zipped = zip(&mask, &when_true, &when_false).unwrap();

    let want = Int32Array::from(vec![Some(42), Some(42), None, None, Some(42)]);
    let got = zipped.as_any().downcast_ref::<Int32Array>().unwrap();
    assert_eq!(got, &want);
}
781 | | |
// Same scalars as the `_none_1` case (42 / null) but with a mask that starts and ends with
// `false`, so the output starts and ends with nulls.
#[test]
fn test_zip_kernel_primitive_scalar_none_2() {
    let mask = BooleanArray::from(vec![false, false, true, true, false]);
    let when_true = Scalar::new(Int32Array::from_value(42, 1));
    let when_false = Scalar::new(Int32Array::new_null(1));

    let zipped = zip(&mask, &when_true, &when_false).unwrap();

    let want = Int32Array::from(vec![None, None, Some(42), Some(42), None]);
    let got = zipped.as_any().downcast_ref::<Int32Array>().unwrap();
    assert_eq!(got, &want);
}
793 | | |
// Both scalars are null, so every output slot is null regardless of the mask.
#[test]
fn test_zip_kernel_primitive_scalar_both_null() {
    let mask = BooleanArray::from(vec![false, false, true, true, false]);
    let when_true = Scalar::new(Int32Array::new_null(1));
    let when_false = Scalar::new(Int32Array::new_null(1));

    let zipped = zip(&mask, &when_true, &when_false).unwrap();

    // One null per mask slot.
    let want = Int32Array::from(vec![None; 5]);
    let got = zipped.as_any().downcast_ref::<Int32Array>().unwrap();
    assert_eq!(got, &want);
}
805 | | |
// Array inputs with a mask containing a null slot: that slot must select from `falsy`
// even though its underlying boolean value is `true`.
#[test]
fn test_zip_primitive_array_with_nulls_is_mask_should_be_treated_as_false() {
    // Slot 3 is `true` in the value buffer but marked null in the validity buffer.
    let mask_values = BooleanBuffer::from(vec![true, true, false, true, false, false]);
    let mask_validity = NullBuffer::from(vec![true, true, true, false, true, true]);
    let mask = BooleanArray::new(mask_values, Some(mask_validity));

    let truthy = Int32Array::from_iter_values(vec![1, 2, 3, 4, 5, 6]);
    let falsy = Int32Array::from_iter_values(vec![7, 8, 9, 10, 11, 12]);

    let zipped = zip(&mask, &truthy, &falsy).unwrap();

    // Slot 3 yields 10 (from `falsy`): true in mask but null.
    let want = Int32Array::from(vec![
        Some(1),
        Some(2),
        Some(9),
        Some(10),
        Some(11),
        Some(12),
    ]);
    let got = zipped.as_any().downcast_ref::<Int32Array>().unwrap();
    assert_eq!(got, &want);
}
832 | | |
// Two non-null Int32 scalars with a mask containing a null slot: that slot must select the
// falsy scalar even though its underlying boolean value is `true`.
#[test]
fn test_zip_kernel_primitive_scalar_with_boolean_array_mask_with_nulls_should_be_treated_as_false()
{
    // Slot 3 is `true` in the value buffer but marked null in the validity buffer.
    let mask_values = BooleanBuffer::from(vec![true, true, false, true, false, false]);
    let mask_validity = NullBuffer::from(vec![true, true, true, false, true, true]);
    let mask = BooleanArray::new(mask_values, Some(mask_validity));

    let when_true = Scalar::new(Int32Array::from_value(42, 1));
    let when_false = Scalar::new(Int32Array::from_value(123, 1));

    let zipped = zip(&mask, &when_true, &when_false).unwrap();

    // Slot 3 yields 123: true in mask but null.
    let want = Int32Array::from(vec![
        Some(42),
        Some(42),
        Some(123),
        Some(123),
        Some(123),
        Some(123),
    ]);
    let got = zipped.as_any().downcast_ref::<Int32Array>().unwrap();
    assert_eq!(got, &want);
}
860 | | |
// String-array inputs with a mask containing a null slot: that slot must select from `falsy`
// even though its underlying boolean value is `true`.
#[test]
fn test_zip_string_array_with_nulls_is_mask_should_be_treated_as_false() {
    // Slot 3 is `true` in the value buffer but marked null in the validity buffer.
    let mask_values = BooleanBuffer::from(vec![true, true, false, true, false, false]);
    let mask_validity = NullBuffer::from(vec![true, true, true, false, true, true]);
    let mask = BooleanArray::new(mask_values, Some(mask_validity));

    let truthy = StringArray::from_iter_values(vec!["1", "2", "3", "4", "5", "6"]);
    let falsy = StringArray::from_iter_values(vec!["7", "8", "9", "10", "11", "12"]);

    let zipped = zip(&mask, &truthy, &falsy).unwrap();

    // Slot 3 yields "10" (from `falsy`): true in mask but null.
    let want = StringArray::from_iter_values(vec!["1", "2", "9", "10", "11", "12"]);
    let got = zipped.as_string::<i32>();
    assert_eq!(got, &want);
}
883 | | |
// Two non-null LargeString scalars with a mask containing a null slot: that slot must select
// the falsy scalar even though its underlying boolean value is `true`.
#[test]
fn test_zip_kernel_large_string_scalar_with_boolean_array_mask_with_nulls_should_be_treated_as_false()
{
    // Slot 3 is `true` in the value buffer but marked null in the validity buffer.
    let mask_values = BooleanBuffer::from(vec![true, true, false, true, false, false]);
    let mask_validity = NullBuffer::from(vec![true, true, true, false, true, true]);
    let mask = BooleanArray::new(mask_values, Some(mask_validity));

    let when_true = Scalar::new(LargeStringArray::from_iter_values(["test"]));
    let when_false = Scalar::new(LargeStringArray::from_iter_values(["something else"]));

    let zipped = zip(&mask, &when_true, &when_false).unwrap();

    // Slot 3 yields "something else": true in mask but null.
    let want = LargeStringArray::from_iter(vec![
        Some("test"),
        Some("test"),
        Some("something else"),
        Some("something else"),
        Some("something else"),
        Some("something else"),
    ]);
    let got = zipped.as_any().downcast_ref::<LargeStringArray>().unwrap();
    assert_eq!(got, &want);
}
911 | | |
// Non-null Utf8 truthy scalar against a null falsy scalar: `false` slots must come out null.
#[test]
fn test_zip_kernel_bytes_scalar_none_1() {
    let mask = BooleanArray::from(vec![true, true, false, false, true]);
    let when_true = Scalar::new(StringArray::from_iter_values(["hello"]));
    let when_false = Scalar::new(StringArray::new_null(1));

    let zipped = zip(&mask, &when_true, &when_false).unwrap();

    let want = StringArray::from_iter(vec![
        Some("hello"),
        Some("hello"),
        None,
        None,
        Some("hello"),
    ]);
    let got = zipped.as_any().downcast_ref::<StringArray>().unwrap();
    assert_eq!(got, &want);
}
929 | | |
// Null Utf8 truthy scalar against a non-null falsy scalar: `true` slots must come out null.
#[test]
fn test_zip_kernel_bytes_scalar_none_2() {
    let mask = BooleanArray::from(vec![true, true, false, false, true]);
    let when_true = Scalar::new(StringArray::new_null(1));
    let when_false = Scalar::new(StringArray::from_iter_values(["hello"]));

    let zipped = zip(&mask, &when_true, &when_false).unwrap();

    let want = StringArray::from_iter(vec![None, None, Some("hello"), Some("hello"), None]);
    let got = zipped.as_any().downcast_ref::<StringArray>().unwrap();
    assert_eq!(got, &want);
}
941 | | |
// Two non-null Utf8 scalars of different lengths zipped over a null-free mask.
#[test]
fn test_zip_kernel_bytes_scalar_both() {
    // mask ends with false
    let mask = BooleanArray::from(vec![true, true, false, true, false, false]);
    let when_true = Scalar::new(StringArray::from_iter_values(["test"]));
    let when_false = Scalar::new(StringArray::from_iter_values(["something else"]));

    let zipped = zip(&mask, &when_true, &when_false).unwrap();

    let want = StringArray::from_iter(vec![
        Some("test"),
        Some("test"),
        Some("something else"),
        Some("test"),
        Some("something else"),
        Some("something else"),
    ]);
    let got = zipped.as_any().downcast_ref::<StringArray>().unwrap();
    assert_eq!(got, &want);
}
961 | | |
// Utf8 scalar inputs where the output is a single repeated value: the mask is uniform
// (all `true` / all `false`), or both scalars are null.
#[test]
fn test_zip_scalar_bytes_only_taking_one_side() {
    let mask_len = 5;
    let all_true_mask = BooleanArray::from(vec![true; mask_len]);
    let all_false_mask = BooleanArray::from(vec![false; mask_len]);
    // Only used by the "both null" case; it has `mask_len` slots as well.
    let mixed_mask = BooleanArray::from(vec![true, false, true, false, true]);

    let null_scalar = Scalar::new(StringArray::new_null(1));
    let non_null_scalar_1 = Scalar::new(StringArray::from_iter_values(["test"]));
    let non_null_scalar_2 = Scalar::new(StringArray::from_iter_values(["something else"]));

    // Each entry is (mask, left/truthy, right/falsy, value expected in every output slot).
    let cases = [
        // 1. left is null and right is non-null, mask is all true
        (&all_true_mask, &null_scalar, &non_null_scalar_1, None::<&str>),
        // 2. left is null and right is non-null, mask is all false
        (&all_false_mask, &null_scalar, &non_null_scalar_1, Some("test")),
        // 3. left is non-null and right is null, mask is all true
        (&all_true_mask, &non_null_scalar_1, &null_scalar, Some("test")),
        // 4. left is non-null and right is null, mask is all false
        (&all_false_mask, &non_null_scalar_1, &null_scalar, None),
        // 5. both left and right are not null, mask is all true
        (&all_true_mask, &non_null_scalar_1, &non_null_scalar_2, Some("test")),
        // 6. both left and right are not null, mask is all false
        (
            &all_false_mask,
            &non_null_scalar_1,
            &non_null_scalar_2,
            Some("something else"),
        ),
        // 7. both left and right are null, mask is mixed
        (&mixed_mask, &null_scalar, &null_scalar, None),
    ];

    for &(mask, truthy, falsy, repeated) in &cases {
        let out = zip(mask, truthy, falsy).unwrap();
        let expected = StringArray::from_iter(std::iter::repeat_n(repeated, mask_len));
        assert_eq!(out.as_string::<i32>(), &expected);
    }
}
1037 | | |
// Builds one `ScalarZipper` from two non-null Int32 scalars and reuses it for masks of
// different lengths.
#[test]
fn test_scalar_zipper() {
    let when_true = Scalar::new(Int32Array::from_value(42, 1));
    let when_false = Scalar::new(Int32Array::from_value(123, 1));
    let zipper = ScalarZipper::try_new(&when_true, &when_false).unwrap();

    // (mask, expected output); the second mask has a different length on purpose.
    let cases = [
        (
            BooleanArray::from(vec![false, false, true, true, false]),
            Int32Array::from(vec![Some(123), Some(123), Some(42), Some(42), Some(123)]),
        ),
        (
            BooleanArray::from(vec![true, false, true]),
            Int32Array::from(vec![Some(42), Some(123), Some(42)]),
        ),
    ];

    for (mask, want) in &cases {
        let zipped = zipper.zip(mask).unwrap();
        assert_eq!(zipped.as_primitive::<Int32Type>(), want);
    }
}
1058 | | |
// Two non-null Utf8 scalars zipped over an alternating mask.
#[test]
fn test_zip_kernel_scalar_strings() {
    let mask = BooleanArray::from(vec![true, false, true, false, true]);
    let when_true = Scalar::new(StringArray::from(vec!["hello"]));
    let when_false = Scalar::new(StringArray::from(vec!["world"]));

    let zipped = zip(&mask, &when_true, &when_false).unwrap();

    let want = StringArray::from(vec![
        Some("hello"),
        Some("world"),
        Some("hello"),
        Some("world"),
        Some("hello"),
    ]);
    assert_eq!(zipped.as_string::<i32>(), &want);
}
1076 | | |
// Two non-null Binary scalars zipped over an alternating mask; the truthy payload is
// deliberately not valid UTF-8.
#[test]
fn test_zip_kernel_scalar_binary() {
    // Non valid UTF8 bytes
    let truthy_bytes: &[u8] = b"\xFF\xFE\xFD";
    let falsy_bytes: &[u8] = b"world";

    let mask = BooleanArray::from(vec![true, false, true, false, true]);
    let when_true = Scalar::new(BinaryArray::from_iter_values(vec![truthy_bytes]));
    let when_false = Scalar::new(BinaryArray::from_iter_values(vec![falsy_bytes]));

    let zipped = zip(&mask, &when_true, &when_false).unwrap();

    let want = BinaryArray::from(vec![
        Some(truthy_bytes),
        Some(falsy_bytes),
        Some(truthy_bytes),
        Some(falsy_bytes),
        Some(truthy_bytes),
    ]);
    assert_eq!(zipped.as_binary::<i32>(), &want);
}
1099 | | |
// Two non-null LargeBinary scalars zipped over an alternating mask.
#[test]
fn test_zip_kernel_scalar_large_binary() {
    let truthy_bytes: &[u8] = b"hey";
    let falsy_bytes: &[u8] = b"world";

    let mask = BooleanArray::from(vec![true, false, true, false, true]);
    let when_true = Scalar::new(LargeBinaryArray::from_iter_values(vec![truthy_bytes]));
    let when_false = Scalar::new(LargeBinaryArray::from_iter_values(vec![falsy_bytes]));

    let zipped = zip(&mask, &when_true, &when_false).unwrap();

    let want = LargeBinaryArray::from(vec![
        Some(truthy_bytes),
        Some(falsy_bytes),
        Some(truthy_bytes),
        Some(falsy_bytes),
        Some(truthy_bytes),
    ]);
    assert_eq!(zipped.as_binary::<i64>(), &want);
}
1119 | | |
// Checks that zipping Decimal128 data keeps its precision and scale (20, 2) in the output
// data type, for every scalar/array combination covered by the shared helper.
#[test]
fn test_zip_decimal_with_custom_precision_and_scale() {
    let decimals: ArrayRef = Arc::new(
        Decimal128Array::from_iter_values([12345, 456, 7890, -123223423432432])
            .with_precision_and_scale(20, 2)
            .unwrap(),
    );

    // Single-element slices act as scalars; two-element slices act as arrays.
    let first_scalar = Scalar::new(decimals.slice(0, 1));
    let second_scalar = Scalar::new(decimals.slice(1, 1));
    let null_scalar = Scalar::new(new_null_array(decimals.data_type(), 1));
    let front_half: ArrayRef = decimals.slice(0, 2);
    let back_half: ArrayRef = decimals.slice(2, 2);

    test_zip_output_data_types_for_input(
        first_scalar,
        second_scalar,
        null_scalar,
        front_half,
        back_half,
    );
}
1137 | | |
// Checks that zipping timestamp data keeps its timezone ("+01:00") in the output data type,
// for every scalar/array combination covered by the shared helper.
#[test]
fn test_zip_timestamp_with_timezone() {
    let timestamps: ArrayRef = Arc::new(
        TimestampSecondArray::from(vec![0, 1000, 2000, 4000]).with_timezone("+01:00".to_string()),
    );

    // Single-element slices act as scalars; two-element slices act as arrays.
    let first_scalar = Scalar::new(timestamps.slice(0, 1));
    let second_scalar = Scalar::new(timestamps.slice(1, 1));
    let null_scalar = Scalar::new(new_null_array(timestamps.data_type(), 1));
    let front_half: ArrayRef = timestamps.slice(0, 2);
    let back_half: ArrayRef = timestamps.slice(2, 2);

    test_zip_output_data_types_for_input(
        first_scalar,
        second_scalar,
        null_scalar,
        front_half,
        back_half,
    );
}
1154 | | |
1155 | 2 | fn test_zip_output_data_types_for_input( |
1156 | 2 | scalar_1: Scalar<ArrayRef>, |
1157 | 2 | scalar_2: Scalar<ArrayRef>, |
1158 | 2 | null_scalar: Scalar<ArrayRef>, |
1159 | 2 | array_1: ArrayRef, |
1160 | 2 | array_2: ArrayRef, |
1161 | 2 | ) { |
1162 | | // non null Scalar vs non null Scalar |
1163 | 2 | test_zip_output_data_type(&scalar_1, &scalar_2, 10); |
1164 | | |
1165 | | // null Scalar vs non-null Scalar (and vice versa) |
1166 | 2 | test_zip_output_data_type(&null_scalar, &scalar_1, 10); |
1167 | 2 | test_zip_output_data_type(&scalar_1, &null_scalar, 10); |
1168 | | |
1169 | | // non-null Scalar and array (and vice versa) |
1170 | 2 | test_zip_output_data_type(&array_1.as_ref(), &scalar_1, array_1.len()); |
1171 | 2 | test_zip_output_data_type(&scalar_1, &array_1.as_ref(), array_1.len()); |
1172 | | |
1173 | | // Array and null scalar (and vice versa) |
1174 | 2 | test_zip_output_data_type(&array_1.as_ref(), &null_scalar, array_1.len()); |
1175 | | |
1176 | 2 | test_zip_output_data_type(&null_scalar, &array_1.as_ref(), array_1.len()); |
1177 | | |
1178 | | // Both arrays |
1179 | 2 | test_zip_output_data_type(&array_1.as_ref(), &array_2.as_ref(), array_1.len()); |
1180 | 2 | } |
1181 | | |
1182 | 16 | fn test_zip_output_data_type(truthy: &dyn Datum, falsy: &dyn Datum, mask_length: usize) { |
1183 | 16 | let expected_data_type = truthy.get().0.data_type().clone(); |
1184 | 16 | assert_eq!(&expected_data_type, falsy.get().0.data_type()); |
1185 | | |
1186 | | // Try different masks to test different paths |
1187 | 16 | let mask_all_true = BooleanArray::from(vec![true; mask_length]); |
1188 | 16 | let mask_all_false = BooleanArray::from(vec![false; mask_length]); |
1189 | 16 | let mask_some_true_and_false = |
1190 | 80 | BooleanArray::from16 ((0..mask_length)16 .map16 (|i| i % 2 == 0).collect16 ::<Vec<bool>>()); |
1191 | | |
1192 | 48 | for mask in [&mask_all_true16 , &mask_all_false16 , &mask_some_true_and_false16 ] { |
1193 | 48 | let out = zip(mask, truthy, falsy).unwrap(); |
1194 | 48 | assert_eq!(out.data_type(), &expected_data_type); |
1195 | | } |
1196 | 16 | } |
1197 | | |
// Zips two single-element List<Int32> scalars over a mixed mask.
// NOTE(review): going by the test name, list scalars presumably take `zip`'s
// non-specialised scalar path -- confirm against the kernel.
#[test]
fn zip_scalar_fallback_impl() {
    let truthy_item = Some(vec![Some(1), None, Some(3)]);
    let falsy_item = Some(vec![None, Some(2), Some(4)]);

    let truthy_scalar = Scalar::new(ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
        truthy_item.clone(),
    ]));
    let falsy_scalar = Scalar::new(ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
        falsy_item.clone(),
    ]));

    let selection = [true, false, true, false, false, true, false];
    let mask = BooleanArray::from(selection.to_vec());
    let zipped = zip(&mask, &truthy_scalar, &falsy_scalar).unwrap();

    // One list per mask slot: the truthy list where the slot is `true`, else the falsy list.
    let want = ListArray::from_iter_primitive::<Int32Type, _, _>(selection.iter().map(
        |&take_truthy| {
            if take_truthy {
                truthy_item.clone()
            } else {
                falsy_item.clone()
            }
        },
    ));
    assert_eq!(zipped.as_list::<i32>(), &want);
}
1225 | | } |