/Users/andrewlamb/Software/arrow-rs/arrow-ord/src/partition.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! Defines partition kernel for `ArrayRef` |
19 | | |
20 | | use std::ops::Range; |
21 | | |
22 | | use arrow_array::{Array, ArrayRef}; |
23 | | use arrow_buffer::BooleanBuffer; |
24 | | use arrow_schema::{ArrowError, SortOptions}; |
25 | | |
26 | | use crate::cmp::distinct; |
27 | | use crate::ord::make_comparator; |
28 | | |
29 | | /// A computed set of partitions, see [`partition`] |
30 | | #[derive(Debug, Clone)] |
31 | | pub struct Partitions(Option<BooleanBuffer>); |
32 | | |
33 | | impl Partitions { |
34 | | /// Returns the range of each partition |
35 | | /// |
36 | | /// Consecutive ranges will be contiguous: i.e [`(a, b)` and `(b, c)`], and |
37 | | /// `start = 0` and `end = self.len()` for the first and last range respectively |
38 | 0 | pub fn ranges(&self) -> Vec<Range<usize>> { |
39 | 0 | let boundaries = match &self.0 { |
40 | 0 | Some(boundaries) => boundaries, |
41 | 0 | None => return vec![], |
42 | | }; |
43 | | |
44 | 0 | let mut out = vec![]; |
45 | 0 | let mut current = 0; |
46 | 0 | for idx in boundaries.set_indices() { |
47 | 0 | let t = current; |
48 | 0 | current = idx + 1; |
49 | 0 | out.push(t..current) |
50 | | } |
51 | 0 | let last = boundaries.len() + 1; |
52 | 0 | if current != last { |
53 | 0 | out.push(current..last) |
54 | 0 | } |
55 | 0 | out |
56 | 0 | } |
57 | | |
58 | | /// Returns the number of partitions |
59 | 0 | pub fn len(&self) -> usize { |
60 | 0 | match &self.0 { |
61 | 0 | Some(b) => b.count_set_bits() + 1, |
62 | 0 | None => 0, |
63 | | } |
64 | 0 | } |
65 | | |
66 | | /// Returns true if this contains no partitions |
67 | 0 | pub fn is_empty(&self) -> bool { |
68 | 0 | self.0.is_none() |
69 | 0 | } |
70 | | } |
71 | | |
72 | | /// Given a list of lexicographically sorted columns, computes the [`Partitions`], |
73 | | /// where a partition consists of the set of consecutive rows with equal values |
74 | | /// |
75 | | /// Returns an error if no columns are specified or all columns do not |
76 | | /// have the same number of rows. |
77 | | /// |
78 | | /// # Example: |
79 | | /// |
80 | | /// For example, given columns `x`, `y` and `z`, calling |
81 | | /// [`partition`]`(values, (x, y))` will divide the |
82 | | /// rows into ranges where the values of `(x, y)` are equal: |
83 | | /// |
84 | | /// ```text |
85 | | /// ┌ ─ ┬───┬ ─ ─┌───┐─ ─ ┬───┬ ─ ─ ┐ |
86 | | /// │ 1 │ │ 1 │ │ A │ Range: 0..1 (x=1, y=1) |
87 | | /// ├ ─ ┼───┼ ─ ─├───┤─ ─ ┼───┼ ─ ─ ┤ |
88 | | /// │ 1 │ │ 2 │ │ B │ |
89 | | /// │ ├───┤ ├───┤ ├───┤ │ |
90 | | /// │ 1 │ │ 2 │ │ C │ Range: 1..4 (x=1, y=2) |
91 | | /// │ ├───┤ ├───┤ ├───┤ │ |
92 | | /// │ 1 │ │ 2 │ │ D │ |
93 | | /// ├ ─ ┼───┼ ─ ─├───┤─ ─ ┼───┼ ─ ─ ┤ |
94 | | /// │ 2 │ │ 1 │ │ E │ Range: 4..5 (x=2, y=1) |
95 | | /// ├ ─ ┼───┼ ─ ─├───┤─ ─ ┼───┼ ─ ─ ┤ |
96 | | /// │ 3 │ │ 1 │ │ F │ Range: 5..6 (x=3, y=1) |
97 | | /// └ ─ ┴───┴ ─ ─└───┘─ ─ ┴───┴ ─ ─ ┘ |
98 | | /// |
99 | | /// x y z partition(&[x, y]) |
100 | | /// ``` |
101 | | /// |
102 | | /// # Example Code |
103 | | /// |
104 | | /// ``` |
105 | | /// # use std::{sync::Arc, ops::Range}; |
106 | | /// # use arrow_array::{RecordBatch, Int64Array, StringArray, ArrayRef}; |
107 | | /// # use arrow_ord::sort::{SortColumn, SortOptions}; |
108 | | /// # use arrow_ord::partition::partition; |
109 | | /// let batch = RecordBatch::try_from_iter(vec![ |
110 | | /// ("x", Arc::new(Int64Array::from(vec![1, 1, 1, 1, 2, 3])) as ArrayRef), |
111 | | /// ("y", Arc::new(Int64Array::from(vec![1, 2, 2, 2, 1, 1])) as ArrayRef), |
112 | | /// ("z", Arc::new(StringArray::from(vec!["A", "B", "C", "D", "E", "F"])) as ArrayRef), |
113 | | /// ]).unwrap(); |
114 | | /// |
115 | | /// // Partition on first two columns |
116 | | /// let ranges = partition(&batch.columns()[..2]).unwrap().ranges(); |
117 | | /// |
118 | | /// let expected = vec![ |
119 | | /// (0..1), |
120 | | /// (1..4), |
121 | | /// (4..5), |
122 | | /// (5..6), |
123 | | /// ]; |
124 | | /// |
125 | | /// assert_eq!(ranges, expected); |
126 | | /// ``` |
127 | 0 | pub fn partition(columns: &[ArrayRef]) -> Result<Partitions, ArrowError> { |
128 | 0 | if columns.is_empty() { |
129 | 0 | return Err(ArrowError::InvalidArgumentError( |
130 | 0 | "Partition requires at least one column".to_string(), |
131 | 0 | )); |
132 | 0 | } |
133 | 0 | let num_rows = columns[0].len(); |
134 | 0 | if columns.iter().any(|item| item.len() != num_rows) { |
135 | 0 | return Err(ArrowError::InvalidArgumentError( |
136 | 0 | "Partition columns have different row counts".to_string(), |
137 | 0 | )); |
138 | 0 | }; |
139 | | |
140 | 0 | match num_rows { |
141 | 0 | 0 => return Ok(Partitions(None)), |
142 | 0 | 1 => return Ok(Partitions(Some(BooleanBuffer::new_unset(0)))), |
143 | 0 | _ => {} |
144 | | } |
145 | | |
146 | 0 | let acc = find_boundaries(&columns[0])?; |
147 | 0 | let acc = columns |
148 | 0 | .iter() |
149 | 0 | .skip(1) |
150 | 0 | .try_fold(acc, |acc, c| find_boundaries(c.as_ref()).map(|b| &acc | &b))?; |
151 | | |
152 | 0 | Ok(Partitions(Some(acc))) |
153 | 0 | } |
154 | | |
155 | | /// Returns a mask with bits set whenever the value or nullability changes |
156 | 0 | fn find_boundaries(v: &dyn Array) -> Result<BooleanBuffer, ArrowError> { |
157 | 0 | let slice_len = v.len() - 1; |
158 | 0 | let v1 = v.slice(0, slice_len); |
159 | 0 | let v2 = v.slice(1, slice_len); |
160 | | |
161 | 0 | if !v.data_type().is_nested() { |
162 | 0 | return Ok(distinct(&v1, &v2)?.values().clone()); |
163 | 0 | } |
164 | | // Given that we're only comparing values, null ordering in the input or |
165 | | // sort options do not matter. |
166 | 0 | let cmp = make_comparator(&v1, &v2, SortOptions::default())?; |
167 | 0 | Ok((0..slice_len).map(|i| !cmp(i, i).is_eq()).collect()) |
168 | 0 | } |
169 | | |
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use super::*;
    use arrow_array::*;
    use arrow_schema::DataType;

    /// Partitions `columns` and returns the resulting ranges, panicking on error.
    fn ranges_of(columns: &[ArrayRef]) -> Vec<std::ops::Range<usize>> {
        partition(columns).unwrap().ranges()
    }

    #[test]
    fn test_partition_empty() {
        let message = partition(&[]).unwrap_err().to_string();
        assert_eq!(
            message,
            "Invalid argument error: Partition requires at least one column"
        );
    }

    #[test]
    fn test_partition_unaligned_rows() {
        let two_rows = Arc::new(Int64Array::from(vec![None, Some(-1)])) as ArrayRef;
        let one_row = Arc::new(StringArray::from(vec![Some("foo")])) as ArrayRef;

        let message = partition(&[two_rows, one_row]).unwrap_err().to_string();
        assert_eq!(
            message,
            "Invalid argument error: Partition columns have different row counts"
        );
    }

    #[test]
    fn test_partition_small() {
        // Zero rows: no partitions at all.
        let empty = || Arc::new(Int32Array::new(vec![].into(), None)) as ArrayRef;
        let partitions = partition(&[empty(), empty(), empty()]).unwrap();
        assert_eq!(partitions.len(), 0);
        assert!(partitions.is_empty());

        // One row: exactly one partition covering it.
        let single = || Arc::new(Int32Array::from(vec![1])) as ArrayRef;
        let ranges = ranges_of(&[single(), single()]);
        assert_eq!(ranges.len(), 1);
        assert_eq!(ranges[0], 0..1);
    }

    #[test]
    fn test_partition_single_column() {
        let column = Arc::new(Int64Array::from(vec![1, 2, 2, 2, 2, 2, 2, 2, 9])) as ArrayRef;
        assert_eq!(ranges_of(&[column]), vec![(0..1), (1..8), (8..9)]);
    }

    #[test]
    fn test_partition_all_equal_values() {
        let column = Arc::new(Int64Array::from_value(1, 1000)) as ArrayRef;
        assert_eq!(ranges_of(&[column]), vec![(0..1000)]);
    }

    #[test]
    fn test_partition_all_null_values() {
        let columns = [
            new_null_array(&DataType::Int8, 1000),
            new_null_array(&DataType::UInt16, 1000),
        ];
        assert_eq!(ranges_of(&columns), vec![(0..1000)]);
    }

    #[test]
    fn test_partition_unique_column_1() {
        let numbers = Arc::new(Int64Array::from(vec![None, Some(-1)])) as ArrayRef;
        let strings = Arc::new(StringArray::from(vec![Some("foo"), Some("bar")])) as ArrayRef;
        assert_eq!(ranges_of(&[numbers, strings]), vec![(0..1), (1..2)]);
    }

    #[test]
    fn test_partition_unique_column_2() {
        let numbers = Arc::new(Int64Array::from(vec![None, Some(-1), Some(-1)])) as ArrayRef;
        let strings = Arc::new(StringArray::from(vec![
            Some("foo"),
            Some("bar"),
            Some("apple"),
        ])) as ArrayRef;
        assert_eq!(
            ranges_of(&[numbers, strings]),
            vec![(0..1), (1..2), (2..3)],
        );
    }

    #[test]
    fn test_partition_non_unique_column_1() {
        let numbers =
            Arc::new(Int64Array::from(vec![None, Some(-1), Some(-1), Some(1)])) as ArrayRef;
        let strings = Arc::new(StringArray::from(vec![
            Some("foo"),
            Some("bar"),
            Some("bar"),
            Some("bar"),
        ])) as ArrayRef;
        assert_eq!(
            ranges_of(&[numbers, strings]),
            vec![(0..1), (1..3), (3..4)],
        );
    }

    #[test]
    fn test_partition_masked_nulls() {
        let constant = Arc::new(Int64Array::new(vec![1; 9].into(), None)) as ArrayRef;
        let masked_a = Arc::new(Int64Array::new(
            vec![1, 1, 2, 2, 2, 3, 3, 3, 3].into(),
            Some(vec![false, true, true, true, true, false, false, true, false].into()),
        )) as ArrayRef;
        let masked_b = Arc::new(Int64Array::new(
            vec![1, 1, 2, 2, 2, 2, 2, 3, 7].into(),
            Some(vec![true, true, true, true, false, true, true, true, false].into()),
        )) as ArrayRef;

        assert_eq!(
            ranges_of(&[constant, masked_a, masked_b]),
            vec![(0..1), (1..2), (2..4), (4..5), (5..7), (7..8), (8..9)],
        );
    }

    #[test]
    fn test_partition_nested() {
        let field = Arc::new(Int64Array::from(vec![
            None,
            None,
            Some(1),
            Some(2),
            Some(2),
            Some(2),
            Some(3),
            Some(4),
        ])) as ArrayRef;
        let nested = Arc::new(StructArray::try_from(vec![("f1", field)]).unwrap()) as ArrayRef;
        let flat = Arc::new(Int64Array::from(vec![1, 1, 1, 2, 3, 3, 3, 4])) as ArrayRef;

        assert_eq!(
            ranges_of(&[nested, flat]),
            vec![0..2, 2..3, 3..4, 4..6, 6..7, 7..8]
        );
    }
}