/Users/andrewlamb/Software/arrow-rs/arrow-data/src/equal/list.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use crate::data::{count_nulls, ArrayData}; |
19 | | use arrow_buffer::ArrowNativeType; |
20 | | use num::Integer; |
21 | | |
22 | | use super::equal_range; |
23 | | |
24 | 48 | fn lengths_equal<T: ArrowNativeType + Integer>(lhs: &[T], rhs: &[T]) -> bool { |
25 | | // invariant from `base_equal` |
26 | 48 | debug_assert_eq!(lhs.len(), rhs.len()); |
27 | | |
28 | 48 | if lhs.is_empty() { |
29 | 0 | return true; |
30 | 48 | } |
31 | | |
32 | 48 | if lhs[0] == T::zero() && rhs[0] == T::zero()36 { |
33 | 36 | return lhs == rhs; |
34 | 12 | }; |
35 | | |
36 | | // The expensive case, e.g. |
37 | | // [0, 2, 4, 6, 9] == [4, 6, 8, 10, 13] |
38 | 12 | lhs.windows(2) |
39 | 12 | .zip(rhs.windows(2)) |
40 | 12 | .all(|(lhs_offsets, rhs_offsets)| {0 |
41 | | // length of left == length of right |
42 | 0 | (lhs_offsets[1] - lhs_offsets[0]) == (rhs_offsets[1] - rhs_offsets[0]) |
43 | 0 | }) |
44 | 48 | } |
45 | | |
46 | 79 | pub(super) fn list_equal<T: ArrowNativeType + Integer>( |
47 | 79 | lhs: &ArrayData, |
48 | 79 | rhs: &ArrayData, |
49 | 79 | lhs_start: usize, |
50 | 79 | rhs_start: usize, |
51 | 79 | len: usize, |
52 | 79 | ) -> bool { |
53 | 79 | let lhs_offsets = lhs.buffer::<T>(0); |
54 | 79 | let rhs_offsets = rhs.buffer::<T>(0); |
55 | | |
56 | | // There is an edge-case where a n-length list that has 0 children, results in panics. |
57 | | // For example; an array with offsets [0, 0, 0, 0, 0] has 4 slots, but will have |
58 | | // no valid children. |
59 | | // Under logical equality, the child null bitmap will be an empty buffer, as there are |
60 | | // no child values. This causes panics when trying to count set bits. |
61 | | // |
62 | | // We caught this by chance from an accidental test-case, but due to the nature of this |
63 | | // crash only occurring on list equality checks, we are adding a check here, instead of |
64 | | // on the buffer/bitmap utilities, as a length check would incur a penalty for almost all |
65 | | // other use-cases. |
66 | | // |
67 | | // The solution is to check the number of child values from offsets, and return `true` if |
68 | | // they = 0. Empty arrays are equal, so this is correct. |
69 | | // |
70 | | // It's unlikely that one would create a n-length list array with no values, where n > 0, |
71 | | // however, one is more likely to slice into a list array and get a region that has 0 |
72 | | // child values. |
73 | | // The test that triggered this behaviour had [4, 4] as a slice of 1 value slot. |
74 | | // For the edge case that zero length list arrays are always equal. |
75 | 79 | if len == 0 { |
76 | 2 | return true; |
77 | 77 | } |
78 | | |
79 | 77 | let lhs_child_length = lhs_offsets[lhs_start + len].to_usize().unwrap() |
80 | 77 | - lhs_offsets[lhs_start].to_usize().unwrap(); |
81 | | |
82 | 77 | let rhs_child_length = rhs_offsets[rhs_start + len].to_usize().unwrap() |
83 | 77 | - rhs_offsets[rhs_start].to_usize().unwrap(); |
84 | | |
85 | 77 | if lhs_child_length == 0 && lhs_child_length == rhs_child_length16 { |
86 | 16 | return true; |
87 | 61 | } |
88 | | |
89 | 61 | let lhs_values = &lhs.child_data()[0]; |
90 | 61 | let rhs_values = &rhs.child_data()[0]; |
91 | | |
92 | 61 | let lhs_null_count = count_nulls(lhs.nulls(), lhs_start, len); |
93 | 61 | let rhs_null_count = count_nulls(rhs.nulls(), rhs_start, len); |
94 | | |
95 | 61 | if lhs_null_count != rhs_null_count { |
96 | 0 | return false; |
97 | 61 | } |
98 | | |
99 | 61 | if lhs_null_count == 0 && rhs_null_count == 048 { |
100 | 48 | lhs_child_length == rhs_child_length |
101 | 48 | && lengths_equal( |
102 | 48 | &lhs_offsets[lhs_start..lhs_start + len], |
103 | 48 | &rhs_offsets[rhs_start..rhs_start + len], |
104 | | ) |
105 | 48 | && equal_range( |
106 | 48 | lhs_values, |
107 | 48 | rhs_values, |
108 | 48 | lhs_offsets[lhs_start].to_usize().unwrap(), |
109 | 48 | rhs_offsets[rhs_start].to_usize().unwrap(), |
110 | 48 | lhs_child_length, |
111 | | ) |
112 | | } else { |
113 | | // get a ref of the parent null buffer bytes, to use in testing for nullness |
114 | 13 | let lhs_nulls = lhs.nulls().unwrap(); |
115 | 13 | let rhs_nulls = rhs.nulls().unwrap(); |
116 | | |
117 | | // with nulls, we need to compare item by item whenever it is not null |
118 | | // TODO: Could potentially compare runs of not NULL values |
119 | 78 | (0..len)13 .all13 (|i| { |
120 | 78 | let lhs_pos = lhs_start + i; |
121 | 78 | let rhs_pos = rhs_start + i; |
122 | | |
123 | 78 | let lhs_is_null = lhs_nulls.is_null(lhs_pos); |
124 | 78 | let rhs_is_null = rhs_nulls.is_null(rhs_pos); |
125 | | |
126 | 78 | if lhs_is_null != rhs_is_null { |
127 | 0 | return false; |
128 | 78 | } |
129 | | |
130 | 78 | let lhs_offset_start = lhs_offsets[lhs_pos].to_usize().unwrap(); |
131 | 78 | let lhs_offset_end = lhs_offsets[lhs_pos + 1].to_usize().unwrap(); |
132 | 78 | let rhs_offset_start = rhs_offsets[rhs_pos].to_usize().unwrap(); |
133 | 78 | let rhs_offset_end = rhs_offsets[rhs_pos + 1].to_usize().unwrap(); |
134 | | |
135 | 78 | let lhs_len = lhs_offset_end - lhs_offset_start; |
136 | 78 | let rhs_len = rhs_offset_end - rhs_offset_start; |
137 | | |
138 | 78 | lhs_is_null |
139 | 54 | || (lhs_len == rhs_len |
140 | 54 | && equal_range( |
141 | 54 | lhs_values, |
142 | 54 | rhs_values, |
143 | 54 | lhs_offset_start, |
144 | 54 | rhs_offset_start, |
145 | 54 | lhs_len, |
146 | | )) |
147 | 78 | }) |
148 | | } |
149 | 79 | } |