/Users/andrewlamb/Software/arrow-rs/arrow-select/src/nullif.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! Implements the `nullif` function for Arrow arrays. |
19 | | |
20 | | use arrow_array::{make_array, Array, ArrayRef, BooleanArray}; |
21 | | use arrow_buffer::buffer::{bitwise_bin_op_helper, bitwise_unary_op_helper}; |
22 | | use arrow_buffer::{BooleanBuffer, NullBuffer}; |
23 | | use arrow_schema::{ArrowError, DataType}; |
24 | | |
25 | | /// Returns a new array with the same values and the validity bit set to false where |
26 | | /// the corresponding element of `right` is true. |
27 | | /// |
28 | | /// This can be used to implement SQL `NULLIF` |
29 | | /// |
30 | | /// # Example |
31 | | /// ``` |
32 | | /// # use arrow_array::{Int32Array, BooleanArray}; |
33 | | /// # use arrow_array::cast::AsArray; |
34 | | /// # use arrow_array::types::Int32Type; |
35 | | /// # use arrow_select::nullif::nullif; |
36 | | /// // input is [null, 8, 1, 9] |
37 | | /// let a = Int32Array::from(vec![None, Some(8), Some(1), Some(9)]); |
38 | | /// // use nullif to set index 1 to null |
39 | | /// let bool_array = BooleanArray::from(vec![Some(false), Some(true), Some(false), None]); |
40 | | /// let nulled = nullif(&a, &bool_array).unwrap(); |
41 | | /// // The resulting array is [null, null, 1, 9] |
42 | | /// assert_eq!(nulled.as_primitive(), &Int32Array::from(vec![None, None, Some(1), Some(9)])); |
43 | | /// ``` |
44 | | pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result<ArrayRef, ArrowError> { |
45 | | let left_data = left.to_data(); |
46 | | |
47 | | if left_data.len() != right.len() { |
48 | | return Err(ArrowError::ComputeError( |
49 | | "Cannot perform comparison operation on arrays of different length".to_string(), |
50 | | )); |
51 | | } |
52 | | let len = left_data.len(); |
53 | | |
54 | | if len == 0 || left_data.data_type() == &DataType::Null { |
55 | | return Ok(make_array(left_data)); |
56 | | } |
57 | | |
58 | | // left=0 (null) right=null output bitmap=null |
59 | | // left=0 right=1 output bitmap=null |
60 | | // left=1 (set) right=null output bitmap=set (passthrough) |
61 | | // left=1 right=1 & comp=true output bitmap=null |
62 | | // left=1 right=1 & comp=false output bitmap=set |
63 | | // |
64 | | // Thus: result = left null bitmap & (!right_values | !right_bitmap) |
65 | | // or, equivalently (De Morgan): left null bitmap & !(right_values & right_bitmap) |
66 | | |
67 | | // Compute right_values & right_bitmap |
68 | | let right = match right.nulls() { |
69 | | Some(nulls) => right.values() & nulls.inner(), |
70 | | None => right.values().clone(), |
71 | | }; |
72 | | |
73 | | // Compute left null bitmap & !right |
74 | | |
75 | | let (combined, null_count) = match left_data.nulls() { |
76 | | Some(left) => { |
77 | | let mut valid_count = 0; |
78 | | let b = bitwise_bin_op_helper( |
79 | | left.buffer(), |
80 | | left.offset(), |
81 | | right.inner(), |
82 | | right.offset(), |
83 | | len, |
84 | 0 | |l, r| { |
85 | 0 | let t = l & !r; |
86 | 0 | valid_count += t.count_ones() as usize; |
87 | 0 | t |
88 | 0 | }, |
89 | | ); |
90 | | (b, len - valid_count) |
91 | | } |
92 | | None => { |
93 | | let mut null_count = 0; |
94 | 0 | let buffer = bitwise_unary_op_helper(right.inner(), right.offset(), len, |b| { |
95 | 0 | let t = !b; |
96 | 0 | null_count += t.count_zeros() as usize; |
97 | 0 | t |
98 | 0 | }); |
99 | | (buffer, null_count) |
100 | | } |
101 | | }; |
102 | | |
103 | | let combined = BooleanBuffer::new(combined, 0, len); |
104 | | // Safety: |
105 | | // Counted nulls whilst computing |
106 | | let nulls = unsafe { NullBuffer::new_unchecked(combined, null_count) }; |
107 | | let data = left_data.into_builder().nulls(Some(nulls)); |
108 | | |
109 | | // SAFETY: |
110 | | // Only altered null mask |
111 | | Ok(make_array(unsafe { data.build_unchecked() })) |
112 | | } |
113 | | |
114 | | #[cfg(test)] |
115 | | mod tests { |
116 | | use super::*; |
117 | | use arrow_array::builder::{BooleanBuilder, Int32Builder, StructBuilder}; |
118 | | use arrow_array::cast::AsArray; |
119 | | use arrow_array::types::Int32Type; |
120 | | use arrow_array::{Int32Array, NullArray, StringArray, StructArray}; |
121 | | use arrow_data::ArrayData; |
122 | | use arrow_schema::{Field, Fields}; |
123 | | use rand::{rng, Rng}; |
124 | | |
125 | | #[test] |
126 | | fn test_nullif_int_array() { |
127 | | let a = Int32Array::from(vec![Some(15), None, Some(8), Some(1), Some(9)]); |
128 | | let comp = BooleanArray::from(vec![Some(false), None, Some(true), Some(false), None]); |
129 | | let res = nullif(&a, &comp).unwrap(); |
130 | | |
131 | | let expected = Int32Array::from(vec![ |
132 | | Some(15), |
133 | | None, |
134 | | None, // comp true, slot 2 turned into null |
135 | | Some(1), |
136 | | // Even though the comp array (`right`) is null here, the original value should |
137 | | // still pass through: comp null, slot 4 stays Some(9) |
138 | | Some(9), |
139 | | ]); |
140 | | |
141 | | let res = res.as_primitive::<Int32Type>(); |
142 | | assert_eq!(&expected, res); |
143 | | } |
144 | | |
145 | | #[test] |
146 | | fn test_nullif_null_array() { |
147 | | assert_eq!( |
148 | | nullif(&NullArray::new(0), &BooleanArray::new_null(0)) |
149 | | .unwrap() |
150 | | .as_ref(), |
151 | | &NullArray::new(0) |
152 | | ); |
153 | | |
154 | | assert_eq!( |
155 | | nullif( |
156 | | &NullArray::new(3), |
157 | | &BooleanArray::from(vec![Some(false), Some(true), None]), |
158 | | ) |
159 | | .unwrap() |
160 | | .as_ref(), |
161 | | &NullArray::new(3) |
162 | | ); |
163 | | } |
164 | | |
165 | | #[test] |
166 | | fn test_nullif_int_array_offset() { |
167 | | let a = Int32Array::from(vec![None, Some(15), Some(8), Some(1), Some(9)]); |
168 | | let a = a.slice(1, 3); // Some(15), Some(8), Some(1) |
169 | | let a = a.as_any().downcast_ref::<Int32Array>().unwrap(); |
170 | | let comp = BooleanArray::from(vec![ |
171 | | Some(false), |
172 | | Some(false), |
173 | | Some(false), |
174 | | None, |
175 | | Some(true), |
176 | | Some(false), |
177 | | None, |
178 | | ]); |
179 | | let comp = comp.slice(2, 3); // Some(false), None, Some(true) |
180 | | let comp = comp.as_any().downcast_ref::<BooleanArray>().unwrap(); |
181 | | let res = nullif(a, comp).unwrap(); |
182 | | |
183 | | let expected = Int32Array::from(vec![ |
184 | | Some(15), // False => keep it |
185 | | Some(8), // None => keep it |
186 | | None, // true => None |
187 | | ]); |
188 | | let res = res.as_primitive::<Int32Type>(); |
189 | | assert_eq!(&expected, res) |
190 | | } |
191 | | |
192 | | #[test] |
193 | | fn test_nullif_string() { |
194 | | let s = StringArray::from_iter([ |
195 | | Some("hello"), |
196 | | None, |
197 | | Some("world"), |
198 | | Some("a"), |
199 | | Some("b"), |
200 | | None, |
201 | | None, |
202 | | ]); |
203 | | let select = BooleanArray::from_iter([ |
204 | | Some(true), |
205 | | Some(true), |
206 | | Some(false), |
207 | | Some(true), |
208 | | Some(false), |
209 | | Some(false), |
210 | | None, |
211 | | ]); |
212 | | |
213 | | let a = nullif(&s, &select).unwrap(); |
214 | | let r: Vec<_> = a.as_string::<i32>().iter().collect(); |
215 | | assert_eq!( |
216 | | r, |
217 | | vec![None, None, Some("world"), None, Some("b"), None, None] |
218 | | ); |
219 | | |
220 | | let s = s.slice(2, 3); |
221 | | let select = select.slice(1, 3); |
222 | | let a = nullif(&s, &select).unwrap(); |
223 | | let r: Vec<_> = a.as_string::<i32>().iter().collect(); |
224 | | assert_eq!(r, vec![None, Some("a"), None]); |
225 | | } |
226 | | |
227 | | #[test] |
228 | | fn test_nullif_int_large_left_offset() { |
229 | | let a = Int32Array::from(vec![ |
230 | | Some(-1), // 0 |
231 | | Some(-1), |
232 | | Some(-1), |
233 | | Some(-1), |
234 | | Some(-1), |
235 | | Some(-1), |
236 | | Some(-1), |
237 | | Some(-1), |
238 | | Some(-1), // 8 |
239 | | Some(-1), |
240 | | Some(-1), |
241 | | Some(-1), |
242 | | Some(-1), |
243 | | Some(-1), |
244 | | Some(-1), |
245 | | Some(-1), |
246 | | None, // 16 |
247 | | Some(15), // 17 |
248 | | Some(8), |
249 | | Some(1), |
250 | | Some(9), |
251 | | ]); |
252 | | let a = a.slice(17, 3); // Some(15), Some(8), Some(1) |
253 | | |
254 | | let comp = BooleanArray::from(vec![ |
255 | | Some(false), |
256 | | Some(false), |
257 | | Some(false), |
258 | | None, |
259 | | Some(true), |
260 | | Some(false), |
261 | | None, |
262 | | ]); |
263 | | let comp = comp.slice(2, 3); // Some(false), None, Some(true) |
264 | | let comp = comp.as_any().downcast_ref::<BooleanArray>().unwrap(); |
265 | | let res = nullif(&a, comp).unwrap(); |
266 | | let res = res.as_any().downcast_ref::<Int32Array>().unwrap(); |
267 | | |
268 | | let expected = Int32Array::from(vec![ |
269 | | Some(15), // False => keep it |
270 | | Some(8), // None => keep it |
271 | | None, // true => None |
272 | | ]); |
273 | | assert_eq!(&expected, res) |
274 | | } |
275 | | |
276 | | #[test] |
277 | | fn test_nullif_int_large_right_offset() { |
278 | | let a = Int32Array::from(vec![ |
279 | | None, // 0 |
280 | | Some(15), // 1 |
281 | | Some(8), |
282 | | Some(1), |
283 | | Some(9), |
284 | | ]); |
285 | | let a = a.slice(1, 3); // Some(15), Some(8), Some(1) |
286 | | |
287 | | let comp = BooleanArray::from(vec![ |
288 | | Some(false), // 0 |
289 | | Some(false), |
290 | | Some(false), |
291 | | Some(false), |
292 | | Some(false), |
293 | | Some(false), |
294 | | Some(false), |
295 | | Some(false), |
296 | | Some(false), // 8 |
297 | | Some(false), |
298 | | Some(false), |
299 | | Some(false), |
300 | | Some(false), |
301 | | Some(false), |
302 | | Some(false), |
303 | | Some(false), |
304 | | Some(false), // 16 |
305 | | Some(false), // 17 |
306 | | Some(false), // 18 |
307 | | None, |
308 | | Some(true), |
309 | | Some(false), |
310 | | None, |
311 | | ]); |
312 | | let comp = comp.slice(18, 3); // Some(false), None, Some(true) |
313 | | let comp = comp.as_any().downcast_ref::<BooleanArray>().unwrap(); |
314 | | let res = nullif(&a, comp).unwrap(); |
315 | | let res = res.as_any().downcast_ref::<Int32Array>().unwrap(); |
316 | | |
317 | | let expected = Int32Array::from(vec![ |
318 | | Some(15), // False => keep it |
319 | | Some(8), // None => keep it |
320 | | None, // true => None |
321 | | ]); |
322 | | assert_eq!(&expected, res) |
323 | | } |
324 | | |
325 | | #[test] |
326 | | fn test_nullif_boolean_offset() { |
327 | | let a = BooleanArray::from(vec![ |
328 | | None, // 0 |
329 | | Some(true), // 1 |
330 | | Some(false), |
331 | | Some(true), |
332 | | Some(true), |
333 | | ]); |
334 | | let a = a.slice(1, 3); // Some(true), Some(false), Some(true) |
335 | | |
336 | | let comp = BooleanArray::from(vec![ |
337 | | Some(false), // 0 |
338 | | Some(false), // 1 |
339 | | Some(false), // 2 |
340 | | None, |
341 | | Some(true), |
342 | | Some(false), |
343 | | None, |
344 | | ]); |
345 | | let comp = comp.slice(2, 3); // Some(false), None, Some(true) |
346 | | let comp = comp.as_any().downcast_ref::<BooleanArray>().unwrap(); |
347 | | let res = nullif(&a, comp).unwrap(); |
348 | | let res = res.as_any().downcast_ref::<BooleanArray>().unwrap(); |
349 | | |
350 | | let expected = BooleanArray::from(vec![ |
351 | | Some(true), // False => keep it |
352 | | Some(false), // None => keep it |
353 | | None, // true => None |
354 | | ]); |
355 | | assert_eq!(&expected, res) |
356 | | } |
357 | | |
358 | | struct Foo { |
359 | | a: Option<i32>, |
360 | | b: Option<bool>, |
361 | | /// Whether the entry should be valid. |
362 | | is_valid: bool, |
363 | | } |
364 | | |
365 | | impl Foo { |
366 | | fn new_valid(a: i32, b: bool) -> Foo { |
367 | | Self { |
368 | | a: Some(a), |
369 | | b: Some(b), |
370 | | is_valid: true, |
371 | | } |
372 | | } |
373 | | |
374 | | fn new_null() -> Foo { |
375 | | Self { |
376 | | a: None, |
377 | | b: None, |
378 | | is_valid: false, |
379 | | } |
380 | | } |
381 | | } |
382 | | |
383 | | /// Struct Array equality is a bit weird -- we need to have the *child values* |
384 | | /// correct even if the enclosing struct indicates it is null. But we |
385 | | /// also need the top level is_valid bits to be correct. |
386 | | fn create_foo_struct(values: Vec<Foo>) -> StructArray { |
387 | | let mut struct_array = StructBuilder::new( |
388 | | Fields::from(vec![ |
389 | | Field::new("a", DataType::Int32, true), |
390 | | Field::new("b", DataType::Boolean, true), |
391 | | ]), |
392 | | vec![ |
393 | | Box::new(Int32Builder::with_capacity(values.len())), |
394 | | Box::new(BooleanBuilder::with_capacity(values.len())), |
395 | | ], |
396 | | ); |
397 | | |
398 | | for value in values { |
399 | | struct_array |
400 | | .field_builder::<Int32Builder>(0) |
401 | | .unwrap() |
402 | | .append_option(value.a); |
403 | | struct_array |
404 | | .field_builder::<BooleanBuilder>(1) |
405 | | .unwrap() |
406 | | .append_option(value.b); |
407 | | struct_array.append(value.is_valid); |
408 | | } |
409 | | |
410 | | struct_array.finish() |
411 | | } |
412 | | |
413 | | #[test] |
414 | | fn test_nullif_struct_slices() { |
415 | | let struct_array = create_foo_struct(vec![ |
416 | | Foo::new_valid(7, true), |
417 | | Foo::new_valid(15, false), |
418 | | Foo::new_valid(8, true), |
419 | | Foo::new_valid(12, false), |
420 | | Foo::new_null(), |
421 | | Foo::new_null(), |
422 | | Foo::new_valid(42, true), |
423 | | ]); |
424 | | |
425 | | // Some({a: 15, b: false}), Some({a: 8, b: true}), Some({a: 12, b: false}), |
426 | | // None, None |
427 | | let struct_array = struct_array.slice(1, 5); |
428 | | let comp = BooleanArray::from(vec![ |
429 | | Some(false), // 0 |
430 | | Some(false), // 1 |
431 | | Some(false), // 2 |
432 | | None, |
433 | | Some(true), |
434 | | Some(false), |
435 | | None, |
436 | | ]); |
437 | | let comp = comp.slice(2, 5); // Some(false), None, Some(true), Some(false), None |
438 | | let comp = comp.as_any().downcast_ref::<BooleanArray>().unwrap(); |
439 | | let res = nullif(&struct_array, comp).unwrap(); |
440 | | let res = res.as_any().downcast_ref::<StructArray>().unwrap(); |
441 | | |
442 | | let expected = create_foo_struct(vec![ |
443 | | // Some(false) -> keep |
444 | | Foo::new_valid(15, false), |
445 | | // None -> keep |
446 | | Foo::new_valid(8, true), |
447 | | // Some(true) -> null out. But child values are still there. |
448 | | Foo { |
449 | | a: Some(12), |
450 | | b: Some(false), |
451 | | is_valid: false, |
452 | | }, |
453 | | // Some(false) -> keep, but was null |
454 | | Foo::new_null(), |
455 | | // None -> keep, but was null |
456 | | Foo::new_null(), |
457 | | ]); |
458 | | |
459 | | assert_eq!(&expected, res); |
460 | | } |
461 | | |
462 | | #[test] |
463 | | fn test_nullif_no_nulls() { |
464 | | let a = Int32Array::from(vec![Some(15), Some(7), Some(8), Some(1), Some(9)]); |
465 | | let comp = BooleanArray::from(vec![Some(false), None, Some(true), Some(false), None]); |
466 | | let res = nullif(&a, &comp).unwrap(); |
467 | | let res = res.as_primitive::<Int32Type>(); |
468 | | |
469 | | let expected = Int32Array::from(vec![Some(15), Some(7), None, Some(1), Some(9)]); |
470 | | assert_eq!(res, &expected); |
471 | | } |
472 | | |
473 | | #[test] |
474 | | fn nullif_empty() { |
475 | | let a = Int32Array::from(ArrayData::new_empty(&DataType::Int32)); |
476 | | let mask = BooleanArray::from(ArrayData::new_empty(&DataType::Boolean)); |
477 | | let res = nullif(&a, &mask).unwrap(); |
478 | | assert_eq!(res.as_ref(), &a); |
479 | | } |
480 | | |
481 | | fn test_nullif(values: &Int32Array, filter: &BooleanArray) { |
482 | | let expected: Int32Array = values |
483 | | .iter() |
484 | | .zip(filter.iter()) |
485 | | .map(|(a, b)| match b { |
486 | | Some(true) => None, |
487 | | Some(false) | None => a, |
488 | | }) |
489 | | .collect(); |
490 | | |
491 | | let r = nullif(values, filter).unwrap(); |
492 | | let r_data = r.to_data(); |
493 | | r_data.validate().unwrap(); |
494 | | |
495 | | assert_eq!(r.as_ref(), &expected); |
496 | | } |
497 | | |
498 | | #[test] |
499 | | fn nullif_fuzz() { |
500 | | let mut rng = rng(); |
501 | | |
502 | | let arrays = [ |
503 | | Int32Array::from(vec![0; 128]), |
504 | | (0..128) |
505 | | .map(|_| rng.random_bool(0.5).then_some(0)) |
506 | | .collect(), |
507 | | ]; |
508 | | |
509 | | for a in arrays { |
510 | | let a_slices = [(0, 128), (64, 64), (0, 64), (32, 32), (0, 0), (32, 0)]; |
511 | | |
512 | | for (a_offset, a_length) in a_slices { |
513 | | let a = a.slice(a_offset, a_length); |
514 | | |
515 | | for i in 1..65 { |
516 | | let b_start_offset = rng.random_range(0..i); |
517 | | let b_end_offset = rng.random_range(0..i); |
518 | | |
519 | | let b: BooleanArray = (0..a_length + b_start_offset + b_end_offset) |
520 | | .map(|_| rng.random_bool(0.5).then(|| rng.random_bool(0.5))) |
521 | | .collect(); |
522 | | let b = b.slice(b_start_offset, a_length); |
523 | | |
524 | | test_nullif(&a, &b); |
525 | | } |
526 | | } |
527 | | } |
528 | | } |
529 | | } |