/Users/andrewlamb/Software/arrow-rs/arrow-ord/src/sort.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! Defines sort kernel for `ArrayRef` |
19 | | |
20 | | use crate::ord::{make_comparator, DynComparator}; |
21 | | use arrow_array::builder::BufferBuilder; |
22 | | use arrow_array::cast::*; |
23 | | use arrow_array::types::*; |
24 | | use arrow_array::*; |
25 | | use arrow_buffer::ArrowNativeType; |
26 | | use arrow_buffer::BooleanBufferBuilder; |
27 | | use arrow_data::{ArrayDataBuilder, ByteView, MAX_INLINE_VIEW_LEN}; |
28 | | use arrow_schema::{ArrowError, DataType}; |
29 | | use arrow_select::take::take; |
30 | | use std::cmp::Ordering; |
31 | | use std::sync::Arc; |
32 | | |
33 | | use crate::rank::{can_rank, rank}; |
34 | | pub use arrow_schema::SortOptions; |
35 | | |
36 | | /// Sort the `ArrayRef` using `SortOptions`. |
37 | | /// |
38 | | /// Performs a sort on values and indices. Nulls are ordered according |
39 | | /// to the `nulls_first` flag in `options`. Floats are sorted using |
40 | | /// IEEE 754 totalOrder |
41 | | /// |
42 | | /// Returns an `ArrowError::ComputeError(String)` if the array type is |
43 | | /// either unsupported by `sort_to_indices` or `take`. |
44 | | /// |
45 | | /// Note: this is an unstable_sort, meaning it may not preserve the |
46 | | /// order of equal elements. |
47 | | /// |
48 | | /// # Example |
49 | | /// ```rust |
50 | | /// # use std::sync::Arc; |
51 | | /// # use arrow_array::Int32Array; |
52 | | /// # use arrow_ord::sort::sort; |
53 | | /// let array = Int32Array::from(vec![5, 4, 3, 2, 1]); |
54 | | /// let sorted_array = sort(&array, None).unwrap(); |
55 | | /// assert_eq!(sorted_array.as_ref(), &Int32Array::from(vec![1, 2, 3, 4, 5])); |
56 | | /// ``` |
57 | 0 | pub fn sort(values: &dyn Array, options: Option<SortOptions>) -> Result<ArrayRef, ArrowError> { |
58 | 0 | downcast_primitive_array!( |
59 | 0 | values => sort_native_type(values, options), |
60 | 0 | DataType::RunEndEncoded(_, _) => sort_run(values, options, None), |
61 | | _ => { |
62 | 0 | let indices = sort_to_indices(values, options, None)?; |
63 | 0 | take(values, &indices, None) |
64 | | } |
65 | | ) |
66 | 0 | } |
67 | | |
68 | 0 | fn sort_native_type<T>( |
69 | 0 | primitive_values: &PrimitiveArray<T>, |
70 | 0 | options: Option<SortOptions>, |
71 | 0 | ) -> Result<ArrayRef, ArrowError> |
72 | 0 | where |
73 | 0 | T: ArrowPrimitiveType, |
74 | | { |
75 | 0 | let sort_options = options.unwrap_or_default(); |
76 | | |
77 | 0 | let mut mutable_buffer = vec![T::default_value(); primitive_values.len()]; |
78 | 0 | let mutable_slice = &mut mutable_buffer; |
79 | | |
80 | 0 | let input_values = primitive_values.values().as_ref(); |
81 | | |
82 | 0 | let nulls_count = primitive_values.null_count(); |
83 | 0 | let valid_count = primitive_values.len() - nulls_count; |
84 | | |
85 | 0 | let null_bit_buffer = match nulls_count > 0 { |
86 | | true => { |
87 | 0 | let mut validity_buffer = BooleanBufferBuilder::new(primitive_values.len()); |
88 | 0 | if sort_options.nulls_first { |
89 | 0 | validity_buffer.append_n(nulls_count, false); |
90 | 0 | validity_buffer.append_n(valid_count, true); |
91 | 0 | } else { |
92 | 0 | validity_buffer.append_n(valid_count, true); |
93 | 0 | validity_buffer.append_n(nulls_count, false); |
94 | 0 | } |
95 | 0 | Some(validity_buffer.finish().into()) |
96 | | } |
97 | 0 | false => None, |
98 | | }; |
99 | | |
100 | 0 | if let Some(nulls) = primitive_values.nulls().filter(|n| n.null_count() > 0) { |
101 | 0 | let values_slice = match sort_options.nulls_first { |
102 | 0 | true => &mut mutable_slice[nulls_count..], |
103 | 0 | false => &mut mutable_slice[..valid_count], |
104 | | }; |
105 | | |
106 | 0 | for (write_index, index) in nulls.valid_indices().enumerate() { |
107 | 0 | values_slice[write_index] = primitive_values.value(index); |
108 | 0 | } |
109 | | |
110 | 0 | values_slice.sort_unstable_by(|a, b| a.compare(*b)); |
111 | 0 | if sort_options.descending { |
112 | 0 | values_slice.reverse(); |
113 | 0 | } |
114 | | } else { |
115 | 0 | mutable_slice.copy_from_slice(input_values); |
116 | 0 | mutable_slice.sort_unstable_by(|a, b| a.compare(*b)); |
117 | 0 | if sort_options.descending { |
118 | 0 | mutable_slice.reverse(); |
119 | 0 | } |
120 | | } |
121 | | |
122 | 0 | Ok(Arc::new( |
123 | 0 | PrimitiveArray::<T>::new(mutable_buffer.into(), null_bit_buffer) |
124 | 0 | .with_data_type(primitive_values.data_type().clone()), |
125 | 0 | )) |
126 | 0 | } |
127 | | |
128 | | /// Sort the `ArrayRef` partially. |
129 | | /// |
130 | | /// If `limit` is specified, the resulting array will contain only |
131 | | /// first `limit` in the sort order. Any data data after the limit |
132 | | /// will be discarded. |
133 | | /// |
134 | | /// Note: this is an unstable_sort, meaning it may not preserve the |
135 | | /// order of equal elements. |
136 | | /// |
137 | | /// # Example |
138 | | /// ```rust |
139 | | /// # use std::sync::Arc; |
140 | | /// # use arrow_array::Int32Array; |
141 | | /// # use arrow_ord::sort::{sort_limit, SortOptions}; |
142 | | /// let array = Int32Array::from(vec![5, 4, 3, 2, 1]); |
143 | | /// |
144 | | /// // Find the the top 2 items |
145 | | /// let sorted_array = sort_limit(&array, None, Some(2)).unwrap(); |
146 | | /// assert_eq!(sorted_array.as_ref(), &Int32Array::from(vec![1, 2])); |
147 | | /// |
148 | | /// // Find the bottom top 2 items |
149 | | /// let options = Some(SortOptions { |
150 | | /// descending: true, |
151 | | /// ..Default::default() |
152 | | /// }); |
153 | | /// let sorted_array = sort_limit(&array, options, Some(2)).unwrap(); |
154 | | /// assert_eq!(sorted_array.as_ref(), &Int32Array::from(vec![5, 4])); |
155 | | /// ``` |
156 | 0 | pub fn sort_limit( |
157 | 0 | values: &dyn Array, |
158 | 0 | options: Option<SortOptions>, |
159 | 0 | limit: Option<usize>, |
160 | 0 | ) -> Result<ArrayRef, ArrowError> { |
161 | 0 | if let DataType::RunEndEncoded(_, _) = values.data_type() { |
162 | 0 | return sort_run(values, options, limit); |
163 | 0 | } |
164 | 0 | let indices = sort_to_indices(values, options, limit)?; |
165 | 0 | take(values, &indices, None) |
166 | 0 | } |
167 | | |
168 | | /// we can only do this if the T is primitive |
169 | | #[inline] |
170 | 0 | fn sort_unstable_by<T, F>(array: &mut [T], limit: usize, cmp: F) |
171 | 0 | where |
172 | 0 | F: FnMut(&T, &T) -> Ordering, |
173 | | { |
174 | 0 | if array.len() == limit { |
175 | 0 | array.sort_unstable_by(cmp); |
176 | 0 | } else { |
177 | 0 | partial_sort(array, limit, cmp); |
178 | 0 | } |
179 | 0 | } |
180 | | |
181 | | /// Partition indices of an Arrow array into two categories: |
182 | | /// - `valid`: indices of non-null elements |
183 | | /// - `nulls`: indices of null elements |
184 | | /// |
185 | | /// Optimized for performance with fast-path for all-valid arrays |
186 | | /// and bit-parallel scan for null-containing arrays. |
187 | | #[inline(always)] |
188 | 0 | pub fn partition_validity(array: &dyn Array) -> (Vec<u32>, Vec<u32>) { |
189 | 0 | let len = array.len(); |
190 | 0 | let null_count = array.null_count(); |
191 | | |
192 | | // Fast path: if there are no nulls, all elements are valid |
193 | 0 | if null_count == 0 { |
194 | | // Simply return a range of indices [0, len) |
195 | 0 | let valid = (0..len as u32).collect(); |
196 | 0 | return (valid, Vec::new()); |
197 | 0 | } |
198 | | |
199 | | // null bitmap exists and some values are null |
200 | 0 | partition_validity_scan(array, len, null_count) |
201 | 0 | } |
202 | | |
203 | | /// Scans the null bitmap and partitions valid/null indices efficiently. |
204 | | /// Uses bit-level operations to extract bit positions. |
205 | | /// This function is only called when nulls exist. |
206 | | #[inline(always)] |
207 | 0 | fn partition_validity_scan( |
208 | 0 | array: &dyn Array, |
209 | 0 | len: usize, |
210 | 0 | null_count: usize, |
211 | 0 | ) -> (Vec<u32>, Vec<u32>) { |
212 | | // SAFETY: Guaranteed by caller that null_count > 0, so bitmap must exist |
213 | 0 | let bitmap = array.nulls().unwrap(); |
214 | | |
215 | | // Preallocate result vectors with exact capacities (avoids reallocations) |
216 | 0 | let mut valid = Vec::with_capacity(len - null_count); |
217 | 0 | let mut nulls = Vec::with_capacity(null_count); |
218 | | |
219 | | unsafe { |
220 | | // 1) Write valid indices (bits == 1) |
221 | 0 | let valid_slice = valid.spare_capacity_mut(); |
222 | 0 | for (i, idx) in bitmap.inner().set_indices_u32().enumerate() { |
223 | 0 | valid_slice[i].write(idx); |
224 | 0 | } |
225 | | |
226 | | // 2) Write null indices by inverting |
227 | 0 | let inv_buf = !bitmap.inner(); |
228 | 0 | let null_slice = nulls.spare_capacity_mut(); |
229 | 0 | for (i, idx) in inv_buf.set_indices_u32().enumerate() { |
230 | 0 | null_slice[i].write(idx); |
231 | 0 | } |
232 | | |
233 | | // Finalize lengths |
234 | 0 | valid.set_len(len - null_count); |
235 | 0 | nulls.set_len(null_count); |
236 | | } |
237 | | |
238 | 0 | assert_eq!(valid.len(), len - null_count); |
239 | 0 | assert_eq!(nulls.len(), null_count); |
240 | 0 | (valid, nulls) |
241 | 0 | } |
242 | | |
243 | | /// Whether `sort_to_indices` can sort an array of given data type. |
244 | 0 | fn can_sort_to_indices(data_type: &DataType) -> bool { |
245 | 0 | data_type.is_primitive() |
246 | 0 | || matches!( |
247 | 0 | data_type, |
248 | | DataType::Boolean |
249 | | | DataType::Utf8 |
250 | | | DataType::LargeUtf8 |
251 | | | DataType::Utf8View |
252 | | | DataType::Binary |
253 | | | DataType::LargeBinary |
254 | | | DataType::BinaryView |
255 | | | DataType::FixedSizeBinary(_) |
256 | | ) |
257 | 0 | || match data_type { |
258 | 0 | DataType::List(f) if can_rank(f.data_type()) => true, |
259 | 0 | DataType::LargeList(f) if can_rank(f.data_type()) => true, |
260 | 0 | DataType::FixedSizeList(f, _) if can_rank(f.data_type()) => true, |
261 | 0 | DataType::Dictionary(_, values) if can_rank(values.as_ref()) => true, |
262 | 0 | DataType::RunEndEncoded(_, f) if can_sort_to_indices(f.data_type()) => true, |
263 | 0 | _ => false, |
264 | | } |
265 | 0 | } |
266 | | |
267 | | /// Sort elements from `ArrayRef` into an unsigned integer (`UInt32Array`) of indices. |
268 | | /// Floats are sorted using IEEE 754 totalOrder. `limit` is an option for [partial_sort]. |
269 | 0 | pub fn sort_to_indices( |
270 | 0 | array: &dyn Array, |
271 | 0 | options: Option<SortOptions>, |
272 | 0 | limit: Option<usize>, |
273 | 0 | ) -> Result<UInt32Array, ArrowError> { |
274 | 0 | let options = options.unwrap_or_default(); |
275 | | |
276 | 0 | let (v, n) = partition_validity(array); |
277 | | |
278 | 0 | Ok(downcast_primitive_array! { |
279 | 0 | array => sort_primitive(array, v, n, options, limit), |
280 | 0 | DataType::Boolean => sort_boolean(array.as_boolean(), v, n, options, limit), |
281 | 0 | DataType::Utf8 => sort_bytes(array.as_string::<i32>(), v, n, options, limit), |
282 | 0 | DataType::LargeUtf8 => sort_bytes(array.as_string::<i64>(), v, n, options, limit), |
283 | 0 | DataType::Utf8View => sort_byte_view(array.as_string_view(), v, n, options, limit), |
284 | 0 | DataType::Binary => sort_bytes(array.as_binary::<i32>(), v, n, options, limit), |
285 | 0 | DataType::LargeBinary => sort_bytes(array.as_binary::<i64>(), v, n, options, limit), |
286 | 0 | DataType::BinaryView => sort_byte_view(array.as_binary_view(), v, n, options, limit), |
287 | 0 | DataType::FixedSizeBinary(_) => sort_fixed_size_binary(array.as_fixed_size_binary(), v, n, options, limit), |
288 | 0 | DataType::List(_) => sort_list(array.as_list::<i32>(), v, n, options, limit)?, |
289 | 0 | DataType::LargeList(_) => sort_list(array.as_list::<i64>(), v, n, options, limit)?, |
290 | 0 | DataType::FixedSizeList(_, _) => sort_fixed_size_list(array.as_fixed_size_list(), v, n, options, limit)?, |
291 | 0 | DataType::Dictionary(_, _) => downcast_dictionary_array!{ |
292 | 0 | array => sort_dictionary(array, v, n, options, limit)?, |
293 | 0 | _ => unreachable!() |
294 | | } |
295 | 0 | DataType::RunEndEncoded(run_ends_field, _) => match run_ends_field.data_type() { |
296 | 0 | DataType::Int16 => sort_run_to_indices::<Int16Type>(array, options, limit), |
297 | 0 | DataType::Int32 => sort_run_to_indices::<Int32Type>(array, options, limit), |
298 | 0 | DataType::Int64 => sort_run_to_indices::<Int64Type>(array, options, limit), |
299 | 0 | dt => { |
300 | 0 | return Err(ArrowError::ComputeError(format!( |
301 | 0 | "Invalid run end data type: {dt}" |
302 | 0 | ))) |
303 | | } |
304 | | }, |
305 | 0 | t => { |
306 | 0 | return Err(ArrowError::ComputeError(format!( |
307 | 0 | "Sort not supported for data type {t:?}" |
308 | 0 | ))); |
309 | | } |
310 | | }) |
311 | 0 | } |
312 | | |
313 | 0 | fn sort_boolean( |
314 | 0 | values: &BooleanArray, |
315 | 0 | value_indices: Vec<u32>, |
316 | 0 | null_indices: Vec<u32>, |
317 | 0 | options: SortOptions, |
318 | 0 | limit: Option<usize>, |
319 | 0 | ) -> UInt32Array { |
320 | 0 | let mut valids = value_indices |
321 | 0 | .into_iter() |
322 | 0 | .map(|index| (index, values.value(index as usize))) |
323 | 0 | .collect::<Vec<(u32, bool)>>(); |
324 | 0 | sort_impl(options, &mut valids, &null_indices, limit, |a, b| a.cmp(&b)).into() |
325 | 0 | } |
326 | | |
327 | 0 | fn sort_primitive<T: ArrowPrimitiveType>( |
328 | 0 | values: &PrimitiveArray<T>, |
329 | 0 | value_indices: Vec<u32>, |
330 | 0 | nulls: Vec<u32>, |
331 | 0 | options: SortOptions, |
332 | 0 | limit: Option<usize>, |
333 | 0 | ) -> UInt32Array { |
334 | 0 | let mut valids = value_indices |
335 | 0 | .into_iter() |
336 | 0 | .map(|index| (index, values.value(index as usize))) |
337 | 0 | .collect::<Vec<(u32, T::Native)>>(); |
338 | 0 | sort_impl(options, &mut valids, &nulls, limit, T::Native::compare).into() |
339 | 0 | } |
340 | | |
341 | 0 | fn sort_bytes<T: ByteArrayType>( |
342 | 0 | values: &GenericByteArray<T>, |
343 | 0 | value_indices: Vec<u32>, |
344 | 0 | nulls: Vec<u32>, |
345 | 0 | options: SortOptions, |
346 | 0 | limit: Option<usize>, |
347 | 0 | ) -> UInt32Array { |
348 | | // Note: Why do we use 4‑byte prefix? |
349 | | // Compute the 4‑byte prefix in BE order, or left‑pad if shorter. |
350 | | // Most byte‐sequences differ in their first few bytes, so by |
351 | | // comparing up to 4 bytes as a single u32 we avoid the overhead |
352 | | // of a full lexicographical compare for the vast majority of cases. |
353 | | |
354 | | // 1. Build a vector of (index, prefix, length) tuples |
355 | 0 | let mut valids: Vec<(u32, u32, u64)> = value_indices |
356 | 0 | .into_iter() |
357 | 0 | .map(|idx| unsafe { |
358 | 0 | let slice: &[u8] = values.value_unchecked(idx as usize).as_ref(); |
359 | 0 | let len = slice.len() as u64; |
360 | | // Compute the 4‑byte prefix in BE order, or left‑pad if shorter |
361 | 0 | let prefix = if slice.len() >= 4 { |
362 | 0 | let raw = std::ptr::read_unaligned(slice.as_ptr() as *const u32); |
363 | 0 | u32::from_be(raw) |
364 | 0 | } else if slice.is_empty() { |
365 | | // Handle empty slice case to avoid shift overflow |
366 | 0 | 0u32 |
367 | | } else { |
368 | 0 | let mut v = 0u32; |
369 | 0 | for &b in slice { |
370 | 0 | v = (v << 8) | (b as u32); |
371 | 0 | } |
372 | | // Safe shift: slice.len() is in range [1, 3], so shift is in range [8, 24] |
373 | 0 | v << (8 * (4 - slice.len())) |
374 | | }; |
375 | 0 | (idx, prefix, len) |
376 | 0 | }) |
377 | 0 | .collect(); |
378 | | |
379 | | // 2. compute the number of non-null entries to partially sort |
380 | 0 | let vlimit = match (limit, options.nulls_first) { |
381 | 0 | (Some(l), true) => l.saturating_sub(nulls.len()).min(valids.len()), |
382 | 0 | _ => valids.len(), |
383 | | }; |
384 | | |
385 | | // 3. Comparator: compare prefix, then (when both slices shorter than 4) length, otherwise full slice |
386 | 0 | let cmp_bytes = |a: &(u32, u32, u64), b: &(u32, u32, u64)| unsafe { |
387 | 0 | let (ia, pa, la) = *a; |
388 | 0 | let (ib, pb, lb) = *b; |
389 | | // 3.1 prefix (first 4 bytes) |
390 | 0 | let ord = pa.cmp(&pb); |
391 | 0 | if ord != Ordering::Equal { |
392 | 0 | return ord; |
393 | 0 | } |
394 | | // 3.2 only if both slices had length < 4 (so prefix was padded) |
395 | 0 | if la < 4 || lb < 4 { |
396 | 0 | let ord = la.cmp(&lb); |
397 | 0 | if ord != Ordering::Equal { |
398 | 0 | return ord; |
399 | 0 | } |
400 | 0 | } |
401 | | // 3.3 full lexicographical compare |
402 | 0 | let a_bytes: &[u8] = values.value_unchecked(ia as usize).as_ref(); |
403 | 0 | let b_bytes: &[u8] = values.value_unchecked(ib as usize).as_ref(); |
404 | 0 | a_bytes.cmp(b_bytes) |
405 | 0 | }; |
406 | | |
407 | | // 4. Partially sort according to ascending/descending |
408 | 0 | if !options.descending { |
409 | 0 | sort_unstable_by(&mut valids, vlimit, cmp_bytes); |
410 | 0 | } else { |
411 | 0 | sort_unstable_by(&mut valids, vlimit, |x, y| cmp_bytes(x, y).reverse()); |
412 | | } |
413 | | |
414 | | // 5. Assemble nulls and sorted indices into final output |
415 | 0 | let total = valids.len() + nulls.len(); |
416 | 0 | let out_limit = limit.unwrap_or(total).min(total); |
417 | 0 | let mut out = Vec::with_capacity(out_limit); |
418 | | |
419 | 0 | if options.nulls_first { |
420 | 0 | out.extend_from_slice(&nulls[..nulls.len().min(out_limit)]); |
421 | 0 | let rem = out_limit - out.len(); |
422 | 0 | out.extend(valids.iter().map(|&(i, _, _)| i).take(rem)); |
423 | | } else { |
424 | 0 | out.extend(valids.iter().map(|&(i, _, _)| i).take(out_limit)); |
425 | 0 | let rem = out_limit - out.len(); |
426 | 0 | out.extend_from_slice(&nulls[..rem]); |
427 | | } |
428 | | |
429 | 0 | out.into() |
430 | 0 | } |
431 | | |
432 | 0 | fn sort_byte_view<T: ByteViewType>( |
433 | 0 | values: &GenericByteViewArray<T>, |
434 | 0 | value_indices: Vec<u32>, |
435 | 0 | nulls: Vec<u32>, |
436 | 0 | options: SortOptions, |
437 | 0 | limit: Option<usize>, |
438 | 0 | ) -> UInt32Array { |
439 | | // 1. Build a list of (index, raw_view, length) |
440 | | let mut valids: Vec<_>; |
441 | | // 2. Compute the number of non-null entries to partially sort |
442 | 0 | let vlimit: usize = match (limit, options.nulls_first) { |
443 | 0 | (Some(l), true) => l.saturating_sub(nulls.len()).min(value_indices.len()), |
444 | 0 | _ => value_indices.len(), |
445 | | }; |
446 | | // 3.a Check if all views are inline (no data buffers) |
447 | 0 | if values.data_buffers().is_empty() { |
448 | 0 | valids = value_indices |
449 | 0 | .into_iter() |
450 | 0 | .map(|idx| { |
451 | | // SAFETY: we know idx < values.len() |
452 | 0 | let raw = unsafe { *values.views().get_unchecked(idx as usize) }; |
453 | 0 | let inline_key = GenericByteViewArray::<T>::inline_key_fast(raw); |
454 | 0 | (idx, inline_key) |
455 | 0 | }) |
456 | 0 | .collect(); |
457 | 0 | let cmp_inline = |a: &(u32, u128), b: &(u32, u128)| a.1.cmp(&b.1); |
458 | | |
459 | | // Partially sort according to ascending/descending |
460 | 0 | if !options.descending { |
461 | 0 | sort_unstable_by(&mut valids, vlimit, cmp_inline); |
462 | 0 | } else { |
463 | 0 | sort_unstable_by(&mut valids, vlimit, |x, y| cmp_inline(x, y).reverse()); |
464 | | } |
465 | | } else { |
466 | 0 | valids = value_indices |
467 | 0 | .into_iter() |
468 | 0 | .map(|idx| { |
469 | | // SAFETY: we know idx < values.len() |
470 | 0 | let raw = unsafe { *values.views().get_unchecked(idx as usize) }; |
471 | 0 | (idx, raw) |
472 | 0 | }) |
473 | 0 | .collect(); |
474 | | // 3.b Mixed comparator: first prefix, then inline vs full comparison |
475 | 0 | let cmp_mixed = |a: &(u32, u128), b: &(u32, u128)| { |
476 | 0 | let (_, raw_a) = *a; |
477 | 0 | let (_, raw_b) = *b; |
478 | 0 | let len_a = raw_a as u32; |
479 | 0 | let len_b = raw_b as u32; |
480 | | // 3.b.1 Both inline (≤12 bytes): compare full 128-bit key including length |
481 | 0 | if len_a <= MAX_INLINE_VIEW_LEN && len_b <= MAX_INLINE_VIEW_LEN { |
482 | 0 | return GenericByteViewArray::<T>::inline_key_fast(raw_a) |
483 | 0 | .cmp(&GenericByteViewArray::<T>::inline_key_fast(raw_b)); |
484 | 0 | } |
485 | | |
486 | | // 3.b.2 Compare 4-byte prefix in big-endian order |
487 | 0 | let pref_a = ByteView::from(raw_a).prefix.swap_bytes(); |
488 | 0 | let pref_b = ByteView::from(raw_b).prefix.swap_bytes(); |
489 | 0 | if pref_a != pref_b { |
490 | 0 | return pref_a.cmp(&pref_b); |
491 | 0 | } |
492 | | |
493 | | // 3.b.3 Fallback to full byte-slice comparison |
494 | 0 | let full_a: &[u8] = unsafe { values.value_unchecked(a.0 as usize).as_ref() }; |
495 | 0 | let full_b: &[u8] = unsafe { values.value_unchecked(b.0 as usize).as_ref() }; |
496 | 0 | full_a.cmp(full_b) |
497 | 0 | }; |
498 | | |
499 | | // 3.b.4 Partially sort according to ascending/descending |
500 | 0 | if !options.descending { |
501 | 0 | sort_unstable_by(&mut valids, vlimit, cmp_mixed); |
502 | 0 | } else { |
503 | 0 | sort_unstable_by(&mut valids, vlimit, |x, y| cmp_mixed(x, y).reverse()); |
504 | | } |
505 | | } |
506 | | |
507 | | // 5. Assemble nulls and sorted indices into final output |
508 | 0 | let total = valids.len() + nulls.len(); |
509 | 0 | let out_limit = limit.unwrap_or(total).min(total); |
510 | 0 | let mut out = Vec::with_capacity(total); |
511 | | |
512 | 0 | if options.nulls_first { |
513 | | // Place null indices first |
514 | 0 | out.extend_from_slice(&nulls[..nulls.len().min(out_limit)]); |
515 | 0 | let rem = out_limit - out.len(); |
516 | 0 | out.extend(valids.iter().map(|&(i, _)| i).take(rem)); |
517 | | } else { |
518 | | // Place non-null indices first |
519 | 0 | out.extend(valids.iter().map(|&(i, _)| i).take(out_limit)); |
520 | 0 | let rem = out_limit - out.len(); |
521 | 0 | out.extend_from_slice(&nulls[..rem]); |
522 | | } |
523 | | |
524 | 0 | out.into() |
525 | 0 | } |
526 | | |
527 | 0 | fn sort_fixed_size_binary( |
528 | 0 | values: &FixedSizeBinaryArray, |
529 | 0 | value_indices: Vec<u32>, |
530 | 0 | nulls: Vec<u32>, |
531 | 0 | options: SortOptions, |
532 | 0 | limit: Option<usize>, |
533 | 0 | ) -> UInt32Array { |
534 | 0 | let mut valids = value_indices |
535 | 0 | .iter() |
536 | 0 | .copied() |
537 | 0 | .map(|index| (index, values.value(index as usize))) |
538 | 0 | .collect::<Vec<(u32, &[u8])>>(); |
539 | 0 | sort_impl(options, &mut valids, &nulls, limit, Ord::cmp).into() |
540 | 0 | } |
541 | | |
542 | 0 | fn sort_dictionary<K: ArrowDictionaryKeyType>( |
543 | 0 | dict: &DictionaryArray<K>, |
544 | 0 | value_indices: Vec<u32>, |
545 | 0 | null_indices: Vec<u32>, |
546 | 0 | options: SortOptions, |
547 | 0 | limit: Option<usize>, |
548 | 0 | ) -> Result<UInt32Array, ArrowError> { |
549 | 0 | let keys: &PrimitiveArray<K> = dict.keys(); |
550 | 0 | let rank = child_rank(dict.values().as_ref(), options)?; |
551 | | |
552 | | // create tuples that are used for sorting |
553 | 0 | let mut valids = value_indices |
554 | 0 | .into_iter() |
555 | 0 | .map(|index| { |
556 | 0 | let key: K::Native = keys.value(index as usize); |
557 | 0 | (index, rank[key.as_usize()]) |
558 | 0 | }) |
559 | 0 | .collect::<Vec<(u32, u32)>>(); |
560 | | |
561 | 0 | Ok(sort_impl(options, &mut valids, &null_indices, limit, |a, b| a.cmp(&b)).into()) |
562 | 0 | } |
563 | | |
564 | 0 | fn sort_list<O: OffsetSizeTrait>( |
565 | 0 | array: &GenericListArray<O>, |
566 | 0 | value_indices: Vec<u32>, |
567 | 0 | null_indices: Vec<u32>, |
568 | 0 | options: SortOptions, |
569 | 0 | limit: Option<usize>, |
570 | 0 | ) -> Result<UInt32Array, ArrowError> { |
571 | 0 | let rank = child_rank(array.values().as_ref(), options)?; |
572 | 0 | let offsets = array.value_offsets(); |
573 | 0 | let mut valids = value_indices |
574 | 0 | .into_iter() |
575 | 0 | .map(|index| { |
576 | 0 | let end = offsets[index as usize + 1].as_usize(); |
577 | 0 | let start = offsets[index as usize].as_usize(); |
578 | 0 | (index, &rank[start..end]) |
579 | 0 | }) |
580 | 0 | .collect::<Vec<(u32, &[u32])>>(); |
581 | 0 | Ok(sort_impl(options, &mut valids, &null_indices, limit, Ord::cmp).into()) |
582 | 0 | } |
583 | | |
584 | 0 | fn sort_fixed_size_list( |
585 | 0 | array: &FixedSizeListArray, |
586 | 0 | value_indices: Vec<u32>, |
587 | 0 | null_indices: Vec<u32>, |
588 | 0 | options: SortOptions, |
589 | 0 | limit: Option<usize>, |
590 | 0 | ) -> Result<UInt32Array, ArrowError> { |
591 | 0 | let rank = child_rank(array.values().as_ref(), options)?; |
592 | 0 | let size = array.value_length() as usize; |
593 | 0 | let mut valids = value_indices |
594 | 0 | .into_iter() |
595 | 0 | .map(|index| { |
596 | 0 | let start = index as usize * size; |
597 | 0 | (index, &rank[start..start + size]) |
598 | 0 | }) |
599 | 0 | .collect::<Vec<(u32, &[u32])>>(); |
600 | 0 | Ok(sort_impl(options, &mut valids, &null_indices, limit, Ord::cmp).into()) |
601 | 0 | } |
602 | | |
603 | | #[inline(never)] |
604 | 0 | fn sort_impl<T: Copy>( |
605 | 0 | options: SortOptions, |
606 | 0 | valids: &mut [(u32, T)], |
607 | 0 | nulls: &[u32], |
608 | 0 | limit: Option<usize>, |
609 | 0 | mut cmp: impl FnMut(T, T) -> Ordering, |
610 | 0 | ) -> Vec<u32> { |
611 | 0 | let v_limit = match (limit, options.nulls_first) { |
612 | 0 | (Some(l), true) => l.saturating_sub(nulls.len()).min(valids.len()), |
613 | 0 | _ => valids.len(), |
614 | | }; |
615 | | |
616 | 0 | match options.descending { |
617 | 0 | false => sort_unstable_by(valids, v_limit, |a, b| cmp(a.1, b.1)), |
618 | 0 | true => sort_unstable_by(valids, v_limit, |a, b| cmp(a.1, b.1).reverse()), |
619 | | } |
620 | | |
621 | 0 | let len = valids.len() + nulls.len(); |
622 | 0 | let limit = limit.unwrap_or(len).min(len); |
623 | 0 | let mut out = Vec::with_capacity(len); |
624 | 0 | match options.nulls_first { |
625 | | true => { |
626 | 0 | out.extend_from_slice(&nulls[..nulls.len().min(limit)]); |
627 | 0 | let remaining = limit - out.len(); |
628 | 0 | out.extend(valids.iter().map(|x| x.0).take(remaining)); |
629 | | } |
630 | | false => { |
631 | 0 | out.extend(valids.iter().map(|x| x.0).take(limit)); |
632 | 0 | let remaining = limit - out.len(); |
633 | 0 | out.extend_from_slice(&nulls[..remaining]) |
634 | | } |
635 | | } |
636 | 0 | out |
637 | 0 | } |
638 | | |
639 | | /// Computes the rank for a set of child values |
640 | 0 | fn child_rank(values: &dyn Array, options: SortOptions) -> Result<Vec<u32>, ArrowError> { |
641 | | // If parent sort order is descending we need to invert the value of nulls_first so that |
642 | | // when the parent is sorted based on the produced ranks, nulls are still ordered correctly |
643 | 0 | let value_options = Some(SortOptions { |
644 | 0 | descending: false, |
645 | 0 | nulls_first: options.nulls_first != options.descending, |
646 | 0 | }); |
647 | 0 | rank(values, value_options) |
648 | 0 | } |
649 | | |
650 | | // Sort run array and return sorted run array. |
651 | | // The output RunArray will be encoded at the same level as input run array. |
652 | | // For e.g. an input RunArray { run_ends = [2,4,6,8], values = [1,2,1,2] } |
653 | | // will result in output RunArray { run_ends = [2,4,6,8], values = [1,1,2,2] } |
654 | | // and not RunArray { run_ends = [4,8], values = [1,2] } |
655 | 0 | fn sort_run( |
656 | 0 | values: &dyn Array, |
657 | 0 | options: Option<SortOptions>, |
658 | 0 | limit: Option<usize>, |
659 | 0 | ) -> Result<ArrayRef, ArrowError> { |
660 | 0 | match values.data_type() { |
661 | 0 | DataType::RunEndEncoded(run_ends_field, _) => match run_ends_field.data_type() { |
662 | 0 | DataType::Int16 => sort_run_downcasted::<Int16Type>(values, options, limit), |
663 | 0 | DataType::Int32 => sort_run_downcasted::<Int32Type>(values, options, limit), |
664 | 0 | DataType::Int64 => sort_run_downcasted::<Int64Type>(values, options, limit), |
665 | 0 | dt => unreachable!("Not valid run ends data type {dt}"), |
666 | | }, |
667 | 0 | dt => Err(ArrowError::InvalidArgumentError(format!( |
668 | 0 | "Input is not a run encoded array. Input data type {dt}" |
669 | 0 | ))), |
670 | | } |
671 | 0 | } |
672 | | |
673 | 0 | fn sort_run_downcasted<R: RunEndIndexType>( |
674 | 0 | values: &dyn Array, |
675 | 0 | options: Option<SortOptions>, |
676 | 0 | limit: Option<usize>, |
677 | 0 | ) -> Result<ArrayRef, ArrowError> { |
678 | 0 | let run_array = values.as_any().downcast_ref::<RunArray<R>>().unwrap(); |
679 | | |
680 | | // Determine the length of output run array. |
681 | 0 | let output_len = if let Some(limit) = limit { |
682 | 0 | limit.min(run_array.len()) |
683 | | } else { |
684 | 0 | run_array.len() |
685 | | }; |
686 | | |
687 | 0 | let run_ends = run_array.run_ends(); |
688 | | |
689 | 0 | let mut new_run_ends_builder = BufferBuilder::<R::Native>::new(run_ends.len()); |
690 | 0 | let mut new_run_end: usize = 0; |
691 | 0 | let mut new_physical_len: usize = 0; |
692 | | |
693 | 0 | let consume_runs = |run_length, _| { |
694 | 0 | new_run_end += run_length; |
695 | 0 | new_physical_len += 1; |
696 | 0 | new_run_ends_builder.append(R::Native::from_usize(new_run_end).unwrap()); |
697 | 0 | }; |
698 | | |
699 | 0 | let (values_indices, run_values) = sort_run_inner(run_array, options, output_len, consume_runs); |
700 | | |
701 | 0 | let new_run_ends = unsafe { |
702 | | // Safety: |
703 | | // The function builds a valid run_ends array and hence need not be validated. |
704 | 0 | ArrayDataBuilder::new(R::DATA_TYPE) |
705 | 0 | .len(new_physical_len) |
706 | 0 | .add_buffer(new_run_ends_builder.finish()) |
707 | 0 | .build_unchecked() |
708 | | }; |
709 | | |
710 | | // slice the sorted value indices based on limit. |
711 | 0 | let new_values_indices: PrimitiveArray<UInt32Type> = values_indices |
712 | 0 | .slice(0, new_run_ends.len()) |
713 | 0 | .into_data() |
714 | 0 | .into(); |
715 | | |
716 | 0 | let new_values = take(&run_values, &new_values_indices, None)?; |
717 | | |
718 | | // Build sorted run array |
719 | 0 | let builder = ArrayDataBuilder::new(run_array.data_type().clone()) |
720 | 0 | .len(new_run_end) |
721 | 0 | .add_child_data(new_run_ends) |
722 | 0 | .add_child_data(new_values.into_data()); |
723 | 0 | let array_data: RunArray<R> = unsafe { |
724 | | // Safety: |
725 | | // This function builds a valid run array and hence can skip validation. |
726 | 0 | builder.build_unchecked().into() |
727 | | }; |
728 | 0 | Ok(Arc::new(array_data)) |
729 | 0 | } |
730 | | |
731 | | // Sort to indices for run encoded array. |
732 | | // This function will be slow for run array as it decodes the physical indices to |
733 | | // logical indices and to get the run array back, the logical indices has to be |
734 | | // encoded back to run array. |
735 | 0 | fn sort_run_to_indices<R: RunEndIndexType>( |
736 | 0 | values: &dyn Array, |
737 | 0 | options: SortOptions, |
738 | 0 | limit: Option<usize>, |
739 | 0 | ) -> UInt32Array { |
740 | 0 | let run_array = values.as_any().downcast_ref::<RunArray<R>>().unwrap(); |
741 | 0 | let output_len = if let Some(limit) = limit { |
742 | 0 | limit.min(run_array.len()) |
743 | | } else { |
744 | 0 | run_array.len() |
745 | | }; |
746 | 0 | let mut result: Vec<u32> = Vec::with_capacity(output_len); |
747 | | |
748 | | //Add all logical indices belonging to a physical index to the output |
749 | 0 | let consume_runs = |run_length, logical_start| { |
750 | 0 | result.extend(logical_start as u32..(logical_start + run_length) as u32); |
751 | 0 | }; |
752 | 0 | sort_run_inner(run_array, Some(options), output_len, consume_runs); |
753 | | |
754 | 0 | UInt32Array::from(result) |
755 | 0 | } |
756 | | |
757 | 0 | fn sort_run_inner<R: RunEndIndexType, F>( |
758 | 0 | run_array: &RunArray<R>, |
759 | 0 | options: Option<SortOptions>, |
760 | 0 | output_len: usize, |
761 | 0 | mut consume_runs: F, |
762 | 0 | ) -> (PrimitiveArray<UInt32Type>, ArrayRef) |
763 | 0 | where |
764 | 0 | F: FnMut(usize, usize), |
765 | | { |
766 | | // slice the run_array.values based on offset and length. |
767 | 0 | let start_physical_index = run_array.get_start_physical_index(); |
768 | 0 | let end_physical_index = run_array.get_end_physical_index(); |
769 | 0 | let physical_len = end_physical_index - start_physical_index + 1; |
770 | 0 | let run_values = run_array.values().slice(start_physical_index, physical_len); |
771 | | |
772 | | // All the values have to be sorted irrespective of input limit. |
773 | 0 | let values_indices = sort_to_indices(&run_values, options, None).unwrap(); |
774 | | |
775 | 0 | let mut remaining_len = output_len; |
776 | | |
777 | 0 | let run_ends = run_array.run_ends().values(); |
778 | | |
779 | 0 | assert_eq!( |
780 | | 0, |
781 | 0 | values_indices.null_count(), |
782 | 0 | "The output of sort_to_indices should not have null values. Its values is {}", |
783 | 0 | values_indices.null_count() |
784 | | ); |
785 | | |
786 | | // Calculate `run length` of sorted value indices. |
787 | | // Find the `logical index` at which the run starts. |
788 | | // Call the consumer using the run length and starting logical index. |
789 | 0 | for physical_index in values_indices.values() { |
790 | | // As the values were sliced with offset = start_physical_index, it has to be added back |
791 | | // before accessing `RunArray::run_ends` |
792 | 0 | let physical_index = *physical_index as usize + start_physical_index; |
793 | | |
794 | | // calculate the run length and logical index of sorted values |
795 | 0 | let (run_length, logical_index_start) = unsafe { |
796 | | // Safety: |
797 | | // The index will be within bounds as its in bounds of start_physical_index |
798 | | // and len, both of which are within bounds of run_array |
799 | 0 | if physical_index == start_physical_index { |
800 | 0 | ( |
801 | 0 | run_ends.get_unchecked(physical_index).as_usize() - run_array.offset(), |
802 | 0 | 0, |
803 | 0 | ) |
804 | 0 | } else if physical_index == end_physical_index { |
805 | 0 | let prev_run_end = run_ends.get_unchecked(physical_index - 1).as_usize(); |
806 | 0 | ( |
807 | 0 | run_array.offset() + run_array.len() - prev_run_end, |
808 | 0 | prev_run_end - run_array.offset(), |
809 | 0 | ) |
810 | | } else { |
811 | 0 | let prev_run_end = run_ends.get_unchecked(physical_index - 1).as_usize(); |
812 | 0 | ( |
813 | 0 | run_ends.get_unchecked(physical_index).as_usize() - prev_run_end, |
814 | 0 | prev_run_end - run_array.offset(), |
815 | 0 | ) |
816 | | } |
817 | | }; |
818 | 0 | let new_run_length = run_length.min(remaining_len); |
819 | 0 | consume_runs(new_run_length, logical_index_start); |
820 | 0 | remaining_len -= new_run_length; |
821 | | |
822 | 0 | if remaining_len == 0 { |
823 | 0 | break; |
824 | 0 | } |
825 | | } |
826 | | |
827 | 0 | if remaining_len > 0 { |
828 | 0 | panic!("Remaining length should be zero its values is {remaining_len}") |
829 | 0 | } |
830 | 0 | (values_indices, run_values) |
831 | 0 | } |
832 | | |
833 | | /// One column to be used in lexicographical sort |
834 | | #[derive(Clone, Debug)] |
835 | | pub struct SortColumn { |
836 | | /// The column to sort |
837 | | pub values: ArrayRef, |
838 | | /// Sort options for this column |
839 | | pub options: Option<SortOptions>, |
840 | | } |
841 | | |
842 | | /// Sort a list of `ArrayRef` using `SortOptions` provided for each array. |
843 | | /// |
844 | | /// Performs a stable lexicographical sort on values and indices. |
845 | | /// |
846 | | /// Returns an `ArrowError::ComputeError(String)` if any of the array type is either unsupported by |
847 | | /// `lexsort_to_indices` or `take`. |
848 | | /// |
849 | | /// Example: |
850 | | /// |
851 | | /// ``` |
852 | | /// # use std::convert::From; |
853 | | /// # use std::sync::Arc; |
854 | | /// # use arrow_array::{ArrayRef, StringArray, PrimitiveArray}; |
855 | | /// # use arrow_array::types::Int64Type; |
856 | | /// # use arrow_array::cast::AsArray; |
857 | | /// # use arrow_ord::sort::{SortColumn, SortOptions, lexsort}; |
858 | | /// |
859 | | /// let sorted_columns = lexsort(&vec![ |
860 | | /// SortColumn { |
861 | | /// values: Arc::new(PrimitiveArray::<Int64Type>::from(vec![ |
862 | | /// None, |
863 | | /// Some(-2), |
864 | | /// Some(89), |
865 | | /// Some(-64), |
866 | | /// Some(101), |
867 | | /// ])) as ArrayRef, |
868 | | /// options: None, |
869 | | /// }, |
870 | | /// SortColumn { |
871 | | /// values: Arc::new(StringArray::from(vec![ |
872 | | /// Some("hello"), |
873 | | /// Some("world"), |
874 | | /// Some(","), |
875 | | /// Some("foobar"), |
876 | | /// Some("!"), |
877 | | /// ])) as ArrayRef, |
878 | | /// options: Some(SortOptions { |
879 | | /// descending: true, |
880 | | /// nulls_first: false, |
881 | | /// }), |
882 | | /// }, |
883 | | /// ], None).unwrap(); |
884 | | /// |
885 | | /// assert_eq!(sorted_columns[0].as_primitive::<Int64Type>().value(1), -64); |
886 | | /// assert!(sorted_columns[0].is_null(0)); |
887 | | /// ``` |
888 | | /// |
889 | | /// Note: for multi-column sorts without a limit, using the [row format](https://docs.rs/arrow-row/latest/arrow_row/) |
890 | | /// may be significantly faster |
891 | | /// |
892 | 0 | pub fn lexsort(columns: &[SortColumn], limit: Option<usize>) -> Result<Vec<ArrayRef>, ArrowError> { |
893 | 0 | let indices = lexsort_to_indices(columns, limit)?; |
894 | 0 | columns |
895 | 0 | .iter() |
896 | 0 | .map(|c| take(c.values.as_ref(), &indices, None)) |
897 | 0 | .collect() |
898 | 0 | } |
899 | | |
900 | | /// Sort elements lexicographically from a list of `ArrayRef` into an unsigned integer |
901 | | /// (`UInt32Array`) of indices. |
902 | | /// |
903 | | /// Note: for multi-column sorts without a limit, using the [row format](https://docs.rs/arrow-row/latest/arrow_row/) |
904 | | /// may be significantly faster |
905 | 0 | pub fn lexsort_to_indices( |
906 | 0 | columns: &[SortColumn], |
907 | 0 | limit: Option<usize>, |
908 | 0 | ) -> Result<UInt32Array, ArrowError> { |
909 | 0 | if columns.is_empty() { |
910 | 0 | return Err(ArrowError::InvalidArgumentError( |
911 | 0 | "Sort requires at least one column".to_string(), |
912 | 0 | )); |
913 | 0 | } |
914 | 0 | if columns.len() == 1 && can_sort_to_indices(columns[0].values.data_type()) { |
915 | | // fallback to non-lexical sort |
916 | 0 | let column = &columns[0]; |
917 | 0 | return sort_to_indices(&column.values, column.options, limit); |
918 | 0 | } |
919 | | |
920 | 0 | let row_count = columns[0].values.len(); |
921 | 0 | if columns.iter().any(|item| item.values.len() != row_count) { |
922 | 0 | return Err(ArrowError::ComputeError( |
923 | 0 | "lexical sort columns have different row counts".to_string(), |
924 | 0 | )); |
925 | 0 | }; |
926 | | |
927 | 0 | let mut value_indices = (0..row_count).collect::<Vec<usize>>(); |
928 | 0 | let mut len = value_indices.len(); |
929 | | |
930 | 0 | if let Some(limit) = limit { |
931 | 0 | len = limit.min(len); |
932 | 0 | } |
933 | | |
934 | | // Instantiate specialized versions of comparisons for small numbers |
935 | | // of columns as it helps the compiler generate better code. |
936 | 0 | match columns.len() { |
937 | | 2 => { |
938 | 0 | sort_fixed_column::<2>(columns, &mut value_indices, len)?; |
939 | | } |
940 | | 3 => { |
941 | 0 | sort_fixed_column::<3>(columns, &mut value_indices, len)?; |
942 | | } |
943 | | 4 => { |
944 | 0 | sort_fixed_column::<4>(columns, &mut value_indices, len)?; |
945 | | } |
946 | | 5 => { |
947 | 0 | sort_fixed_column::<5>(columns, &mut value_indices, len)?; |
948 | | } |
949 | | _ => { |
950 | 0 | let lexicographical_comparator = LexicographicalComparator::try_new(columns)?; |
951 | | // uint32 can be sorted unstably |
952 | 0 | sort_unstable_by(&mut value_indices, len, |a, b| { |
953 | 0 | lexicographical_comparator.compare(*a, *b) |
954 | 0 | }); |
955 | | } |
956 | | } |
957 | 0 | Ok(UInt32Array::from( |
958 | 0 | value_indices[..len] |
959 | 0 | .iter() |
960 | 0 | .map(|i| *i as u32) |
961 | 0 | .collect::<Vec<_>>(), |
962 | | )) |
963 | 0 | } |
964 | | |
965 | | // Sort a fixed number of columns using FixedLexicographicalComparator |
966 | 0 | fn sort_fixed_column<const N: usize>( |
967 | 0 | columns: &[SortColumn], |
968 | 0 | value_indices: &mut [usize], |
969 | 0 | len: usize, |
970 | 0 | ) -> Result<(), ArrowError> { |
971 | 0 | let lexicographical_comparator = FixedLexicographicalComparator::<N>::try_new(columns)?; |
972 | 0 | sort_unstable_by(value_indices, len, |a, b| { |
973 | 0 | lexicographical_comparator.compare(*a, *b) |
974 | 0 | }); |
975 | 0 | Ok(()) |
976 | 0 | } |
977 | | |
978 | | /// It's unstable_sort, may not preserve the order of equal elements |
979 | 0 | pub fn partial_sort<T, F>(v: &mut [T], limit: usize, mut is_less: F) |
980 | 0 | where |
981 | 0 | F: FnMut(&T, &T) -> Ordering, |
982 | | { |
983 | 0 | if let Some(n) = limit.checked_sub(1) { |
984 | 0 | let (before, _mid, _after) = v.select_nth_unstable_by(n, &mut is_less); |
985 | 0 | before.sort_unstable_by(is_less); |
986 | 0 | } |
987 | 0 | } |
988 | | |
989 | | /// A lexicographical comparator that wraps given array data (columns) and can lexicographically compare data |
990 | | /// at given two indices. The lifetime is the same at the data wrapped. |
991 | | pub struct LexicographicalComparator { |
992 | | compare_items: Vec<DynComparator>, |
993 | | } |
994 | | |
995 | | impl LexicographicalComparator { |
996 | | /// lexicographically compare values at the wrapped columns with given indices. |
997 | 0 | pub fn compare(&self, a_idx: usize, b_idx: usize) -> Ordering { |
998 | 0 | for comparator in &self.compare_items { |
999 | 0 | match comparator(a_idx, b_idx) { |
1000 | 0 | Ordering::Equal => continue, |
1001 | 0 | r => return r, |
1002 | | } |
1003 | | } |
1004 | 0 | Ordering::Equal |
1005 | 0 | } |
1006 | | |
1007 | | /// Create a new lex comparator that will wrap the given sort columns and give comparison |
1008 | | /// results with two indices. |
1009 | 0 | pub fn try_new(columns: &[SortColumn]) -> Result<LexicographicalComparator, ArrowError> { |
1010 | 0 | let compare_items = columns |
1011 | 0 | .iter() |
1012 | 0 | .map(|c| { |
1013 | 0 | make_comparator( |
1014 | 0 | c.values.as_ref(), |
1015 | 0 | c.values.as_ref(), |
1016 | 0 | c.options.unwrap_or_default(), |
1017 | | ) |
1018 | 0 | }) |
1019 | 0 | .collect::<Result<Vec<_>, ArrowError>>()?; |
1020 | 0 | Ok(LexicographicalComparator { compare_items }) |
1021 | 0 | } |
1022 | | } |
1023 | | |
1024 | | /// A lexicographical comparator that wraps given array data (columns) and can lexicographically compare data |
1025 | | /// at given two indices. This version of the comparator is for compile-time constant number of columns. |
1026 | | /// The lifetime is the same at the data wrapped. |
1027 | | pub struct FixedLexicographicalComparator<const N: usize> { |
1028 | | compare_items: [DynComparator; N], |
1029 | | } |
1030 | | |
1031 | | impl<const N: usize> FixedLexicographicalComparator<N> { |
1032 | | /// lexicographically compare values at the wrapped columns with given indices. |
1033 | 0 | pub fn compare(&self, a_idx: usize, b_idx: usize) -> Ordering { |
1034 | 0 | for comparator in &self.compare_items { |
1035 | 0 | match comparator(a_idx, b_idx) { |
1036 | 0 | Ordering::Equal => continue, |
1037 | 0 | r => return r, |
1038 | | } |
1039 | | } |
1040 | 0 | Ordering::Equal |
1041 | 0 | } |
1042 | | |
1043 | | /// Create a new lex comparator that will wrap the given sort columns and give comparison |
1044 | | /// results with two indices. |
1045 | | /// The number of columns should be equal to the compile-time constant N. |
1046 | 0 | pub fn try_new( |
1047 | 0 | columns: &[SortColumn], |
1048 | 0 | ) -> Result<FixedLexicographicalComparator<N>, ArrowError> { |
1049 | 0 | let compare_items = columns |
1050 | 0 | .iter() |
1051 | 0 | .map(|c| { |
1052 | 0 | make_comparator( |
1053 | 0 | c.values.as_ref(), |
1054 | 0 | c.values.as_ref(), |
1055 | 0 | c.options.unwrap_or_default(), |
1056 | | ) |
1057 | 0 | }) |
1058 | 0 | .collect::<Result<Vec<_>, ArrowError>>()? |
1059 | 0 | .try_into(); |
1060 | 0 | let compare_items: [Box<dyn Fn(usize, usize) -> Ordering + Send + Sync + 'static>; N] = |
1061 | 0 | compare_items.map_err(|_| { |
1062 | 0 | ArrowError::ComputeError("Could not create fixed size array".to_string()) |
1063 | 0 | })?; |
1064 | 0 | Ok(FixedLexicographicalComparator { compare_items }) |
1065 | 0 | } |
1066 | | } |
1067 | | |
1068 | | #[cfg(test)] |
1069 | | mod tests { |
1070 | | use super::*; |
1071 | | use arrow_array::builder::{ |
1072 | | BooleanBuilder, FixedSizeListBuilder, GenericListBuilder, Int64Builder, ListBuilder, |
1073 | | PrimitiveRunBuilder, |
1074 | | }; |
1075 | | use arrow_buffer::{i256, NullBuffer}; |
1076 | | use arrow_schema::Field; |
1077 | | use half::f16; |
1078 | | use rand::rngs::StdRng; |
1079 | | use rand::seq::SliceRandom; |
1080 | | use rand::{Rng, RngCore, SeedableRng}; |
1081 | | |
1082 | | fn create_decimal_array<T: DecimalType>( |
1083 | | data: Vec<Option<usize>>, |
1084 | | precision: u8, |
1085 | | scale: i8, |
1086 | | ) -> PrimitiveArray<T> { |
1087 | | data.into_iter() |
1088 | | .map(|x| x.and_then(T::Native::from_usize)) |
1089 | | .collect::<PrimitiveArray<T>>() |
1090 | | .with_precision_and_scale(precision, scale) |
1091 | | .unwrap() |
1092 | | } |
1093 | | |
1094 | | fn create_decimal256_array(data: Vec<Option<i256>>) -> Decimal256Array { |
1095 | | data.into_iter() |
1096 | | .collect::<Decimal256Array>() |
1097 | | .with_precision_and_scale(53, 6) |
1098 | | .unwrap() |
1099 | | } |
1100 | | |
1101 | | fn test_sort_to_indices_decimal_array<T: DecimalType>( |
1102 | | data: Vec<Option<usize>>, |
1103 | | options: Option<SortOptions>, |
1104 | | limit: Option<usize>, |
1105 | | expected_data: Vec<u32>, |
1106 | | precision: u8, |
1107 | | scale: i8, |
1108 | | ) { |
1109 | | let output = create_decimal_array::<T>(data, precision, scale); |
1110 | | let expected = UInt32Array::from(expected_data); |
1111 | | let output = sort_to_indices(&(Arc::new(output) as ArrayRef), options, limit).unwrap(); |
1112 | | assert_eq!(output, expected) |
1113 | | } |
1114 | | |
1115 | | fn test_sort_to_indices_decimal256_array( |
1116 | | data: Vec<Option<i256>>, |
1117 | | options: Option<SortOptions>, |
1118 | | limit: Option<usize>, |
1119 | | expected_data: Vec<u32>, |
1120 | | ) { |
1121 | | let output = create_decimal256_array(data); |
1122 | | let expected = UInt32Array::from(expected_data); |
1123 | | let output = sort_to_indices(&(Arc::new(output) as ArrayRef), options, limit).unwrap(); |
1124 | | assert_eq!(output, expected) |
1125 | | } |
1126 | | |
1127 | | fn test_sort_decimal_array<T: DecimalType>( |
1128 | | data: Vec<Option<usize>>, |
1129 | | options: Option<SortOptions>, |
1130 | | limit: Option<usize>, |
1131 | | expected_data: Vec<Option<usize>>, |
1132 | | p: u8, |
1133 | | s: i8, |
1134 | | ) { |
1135 | | let output = create_decimal_array::<T>(data, p, s); |
1136 | | let expected = Arc::new(create_decimal_array::<T>(expected_data, p, s)) as ArrayRef; |
1137 | | let output = match limit { |
1138 | | Some(_) => sort_limit(&(Arc::new(output) as ArrayRef), options, limit).unwrap(), |
1139 | | _ => sort(&(Arc::new(output) as ArrayRef), options).unwrap(), |
1140 | | }; |
1141 | | assert_eq!(&output, &expected) |
1142 | | } |
1143 | | |
1144 | | fn test_sort_decimal256_array( |
1145 | | data: Vec<Option<i256>>, |
1146 | | options: Option<SortOptions>, |
1147 | | limit: Option<usize>, |
1148 | | expected_data: Vec<Option<i256>>, |
1149 | | ) { |
1150 | | let output = create_decimal256_array(data); |
1151 | | let expected = Arc::new(create_decimal256_array(expected_data)) as ArrayRef; |
1152 | | let output = match limit { |
1153 | | Some(_) => sort_limit(&(Arc::new(output) as ArrayRef), options, limit).unwrap(), |
1154 | | _ => sort(&(Arc::new(output) as ArrayRef), options).unwrap(), |
1155 | | }; |
1156 | | assert_eq!(&output, &expected) |
1157 | | } |
1158 | | |
1159 | | fn test_sort_to_indices_boolean_arrays( |
1160 | | data: Vec<Option<bool>>, |
1161 | | options: Option<SortOptions>, |
1162 | | limit: Option<usize>, |
1163 | | expected_data: Vec<u32>, |
1164 | | ) { |
1165 | | let output = BooleanArray::from(data); |
1166 | | let expected = UInt32Array::from(expected_data); |
1167 | | let output = sort_to_indices(&(Arc::new(output) as ArrayRef), options, limit).unwrap(); |
1168 | | assert_eq!(output, expected) |
1169 | | } |
1170 | | |
1171 | | fn test_sort_to_indices_primitive_arrays<T>( |
1172 | | data: Vec<Option<T::Native>>, |
1173 | | options: Option<SortOptions>, |
1174 | | limit: Option<usize>, |
1175 | | expected_data: Vec<u32>, |
1176 | | ) where |
1177 | | T: ArrowPrimitiveType, |
1178 | | PrimitiveArray<T>: From<Vec<Option<T::Native>>>, |
1179 | | { |
1180 | | let output = PrimitiveArray::<T>::from(data); |
1181 | | let expected = UInt32Array::from(expected_data); |
1182 | | let output = sort_to_indices(&(Arc::new(output) as ArrayRef), options, limit).unwrap(); |
1183 | | assert_eq!(output, expected) |
1184 | | } |
1185 | | |
1186 | | fn test_sort_primitive_arrays<T>( |
1187 | | data: Vec<Option<T::Native>>, |
1188 | | options: Option<SortOptions>, |
1189 | | limit: Option<usize>, |
1190 | | expected_data: Vec<Option<T::Native>>, |
1191 | | ) where |
1192 | | T: ArrowPrimitiveType, |
1193 | | PrimitiveArray<T>: From<Vec<Option<T::Native>>>, |
1194 | | { |
1195 | | let output = PrimitiveArray::<T>::from(data); |
1196 | | let expected = Arc::new(PrimitiveArray::<T>::from(expected_data)) as ArrayRef; |
1197 | | let output = match limit { |
1198 | | Some(_) => sort_limit(&(Arc::new(output) as ArrayRef), options, limit).unwrap(), |
1199 | | _ => sort(&(Arc::new(output) as ArrayRef), options).unwrap(), |
1200 | | }; |
1201 | | assert_eq!(&output, &expected) |
1202 | | } |
1203 | | |
1204 | | fn test_sort_to_indices_string_arrays( |
1205 | | data: Vec<Option<&str>>, |
1206 | | options: Option<SortOptions>, |
1207 | | limit: Option<usize>, |
1208 | | expected_data: Vec<u32>, |
1209 | | ) { |
1210 | | let output = StringArray::from(data); |
1211 | | let expected = UInt32Array::from(expected_data); |
1212 | | let output = sort_to_indices(&(Arc::new(output) as ArrayRef), options, limit).unwrap(); |
1213 | | assert_eq!(output, expected) |
1214 | | } |
1215 | | |
1216 | | /// Tests both Utf8 and LargeUtf8 |
1217 | | fn test_sort_string_arrays( |
1218 | | data: Vec<Option<&str>>, |
1219 | | options: Option<SortOptions>, |
1220 | | limit: Option<usize>, |
1221 | | expected_data: Vec<Option<&str>>, |
1222 | | ) { |
1223 | | let output = StringArray::from(data.clone()); |
1224 | | let expected = Arc::new(StringArray::from(expected_data.clone())) as ArrayRef; |
1225 | | let output = match limit { |
1226 | | Some(_) => sort_limit(&(Arc::new(output) as ArrayRef), options, limit).unwrap(), |
1227 | | _ => sort(&(Arc::new(output) as ArrayRef), options).unwrap(), |
1228 | | }; |
1229 | | assert_eq!(&output, &expected); |
1230 | | |
1231 | | let output = LargeStringArray::from(data.clone()); |
1232 | | let expected = Arc::new(LargeStringArray::from(expected_data.clone())) as ArrayRef; |
1233 | | let output = match limit { |
1234 | | Some(_) => sort_limit(&(Arc::new(output) as ArrayRef), options, limit).unwrap(), |
1235 | | _ => sort(&(Arc::new(output) as ArrayRef), options).unwrap(), |
1236 | | }; |
1237 | | assert_eq!(&output, &expected); |
1238 | | |
1239 | | let output = StringViewArray::from(data); |
1240 | | let expected = Arc::new(StringViewArray::from(expected_data)) as ArrayRef; |
1241 | | let output = match limit { |
1242 | | Some(_) => sort_limit(&(Arc::new(output) as ArrayRef), options, limit).unwrap(), |
1243 | | _ => sort(&(Arc::new(output) as ArrayRef), options).unwrap(), |
1244 | | }; |
1245 | | assert_eq!(&output, &expected); |
1246 | | } |
1247 | | |
1248 | | fn test_sort_string_dict_arrays<T: ArrowDictionaryKeyType>( |
1249 | | data: Vec<Option<&str>>, |
1250 | | options: Option<SortOptions>, |
1251 | | limit: Option<usize>, |
1252 | | expected_data: Vec<Option<&str>>, |
1253 | | ) { |
1254 | | let array = data.into_iter().collect::<DictionaryArray<T>>(); |
1255 | | let array_values = array.values().clone(); |
1256 | | let dict = array_values |
1257 | | .as_any() |
1258 | | .downcast_ref::<StringArray>() |
1259 | | .expect("Unable to get dictionary values"); |
1260 | | |
1261 | | let sorted = match limit { |
1262 | | Some(_) => sort_limit(&(Arc::new(array) as ArrayRef), options, limit).unwrap(), |
1263 | | _ => sort(&(Arc::new(array) as ArrayRef), options).unwrap(), |
1264 | | }; |
1265 | | let sorted = sorted |
1266 | | .as_any() |
1267 | | .downcast_ref::<DictionaryArray<T>>() |
1268 | | .unwrap(); |
1269 | | let sorted_values = sorted.values(); |
1270 | | let sorted_dict = sorted_values |
1271 | | .as_any() |
1272 | | .downcast_ref::<StringArray>() |
1273 | | .expect("Unable to get dictionary values"); |
1274 | | let sorted_keys = sorted.keys(); |
1275 | | |
1276 | | assert_eq!(sorted_dict, dict); |
1277 | | |
1278 | | let sorted_strings = StringArray::from_iter((0..sorted.len()).map(|i| { |
1279 | | if sorted.is_valid(i) { |
1280 | | Some(sorted_dict.value(sorted_keys.value(i).as_usize())) |
1281 | | } else { |
1282 | | None |
1283 | | } |
1284 | | })); |
1285 | | let expected = StringArray::from(expected_data); |
1286 | | |
1287 | | assert_eq!(sorted_strings, expected) |
1288 | | } |
1289 | | |
1290 | | fn test_sort_primitive_dict_arrays<K: ArrowDictionaryKeyType, T: ArrowPrimitiveType>( |
1291 | | keys: PrimitiveArray<K>, |
1292 | | values: PrimitiveArray<T>, |
1293 | | options: Option<SortOptions>, |
1294 | | limit: Option<usize>, |
1295 | | expected_data: Vec<Option<T::Native>>, |
1296 | | ) where |
1297 | | PrimitiveArray<T>: From<Vec<Option<T::Native>>>, |
1298 | | { |
1299 | | let array = DictionaryArray::<K>::new(keys, Arc::new(values)); |
1300 | | let array_values = array.values().clone(); |
1301 | | let dict = array_values.as_primitive::<T>(); |
1302 | | |
1303 | | let sorted = match limit { |
1304 | | Some(_) => sort_limit(&(Arc::new(array) as ArrayRef), options, limit).unwrap(), |
1305 | | _ => sort(&(Arc::new(array) as ArrayRef), options).unwrap(), |
1306 | | }; |
1307 | | let sorted = sorted |
1308 | | .as_any() |
1309 | | .downcast_ref::<DictionaryArray<K>>() |
1310 | | .unwrap(); |
1311 | | let sorted_values = sorted.values(); |
1312 | | let sorted_dict = sorted_values |
1313 | | .as_any() |
1314 | | .downcast_ref::<PrimitiveArray<T>>() |
1315 | | .expect("Unable to get dictionary values"); |
1316 | | let sorted_keys = sorted.keys(); |
1317 | | |
1318 | | assert_eq!(sorted_dict, dict); |
1319 | | |
1320 | | let sorted_values: PrimitiveArray<T> = From::<Vec<Option<T::Native>>>::from( |
1321 | | (0..sorted.len()) |
1322 | | .map(|i| { |
1323 | | let key = sorted_keys.value(i).as_usize(); |
1324 | | if sorted.is_valid(i) && sorted_dict.is_valid(key) { |
1325 | | Some(sorted_dict.value(key)) |
1326 | | } else { |
1327 | | None |
1328 | | } |
1329 | | }) |
1330 | | .collect::<Vec<Option<T::Native>>>(), |
1331 | | ); |
1332 | | let expected: PrimitiveArray<T> = From::<Vec<Option<T::Native>>>::from(expected_data); |
1333 | | |
1334 | | assert_eq!(sorted_values, expected) |
1335 | | } |
1336 | | |
1337 | | fn test_sort_list_arrays<T>( |
1338 | | data: Vec<Option<Vec<Option<T::Native>>>>, |
1339 | | options: Option<SortOptions>, |
1340 | | limit: Option<usize>, |
1341 | | expected_data: Vec<Option<Vec<Option<T::Native>>>>, |
1342 | | fixed_length: Option<i32>, |
1343 | | ) where |
1344 | | T: ArrowPrimitiveType, |
1345 | | PrimitiveArray<T>: From<Vec<Option<T::Native>>>, |
1346 | | { |
1347 | | // for FixedSizedList |
1348 | | if let Some(length) = fixed_length { |
1349 | | let input = Arc::new(FixedSizeListArray::from_iter_primitive::<T, _, _>( |
1350 | | data.clone(), |
1351 | | length, |
1352 | | )); |
1353 | | let sorted = match limit { |
1354 | | Some(_) => sort_limit(&(input as ArrayRef), options, limit).unwrap(), |
1355 | | _ => sort(&(input as ArrayRef), options).unwrap(), |
1356 | | }; |
1357 | | let expected = Arc::new(FixedSizeListArray::from_iter_primitive::<T, _, _>( |
1358 | | expected_data.clone(), |
1359 | | length, |
1360 | | )) as ArrayRef; |
1361 | | |
1362 | | assert_eq!(&sorted, &expected); |
1363 | | } |
1364 | | |
1365 | | // for List |
1366 | | let input = Arc::new(ListArray::from_iter_primitive::<T, _, _>(data.clone())); |
1367 | | let sorted = match limit { |
1368 | | Some(_) => sort_limit(&(input as ArrayRef), options, limit).unwrap(), |
1369 | | _ => sort(&(input as ArrayRef), options).unwrap(), |
1370 | | }; |
1371 | | let expected = Arc::new(ListArray::from_iter_primitive::<T, _, _>( |
1372 | | expected_data.clone(), |
1373 | | )) as ArrayRef; |
1374 | | |
1375 | | assert_eq!(&sorted, &expected); |
1376 | | |
1377 | | // for LargeList |
1378 | | let input = Arc::new(LargeListArray::from_iter_primitive::<T, _, _>(data)); |
1379 | | let sorted = match limit { |
1380 | | Some(_) => sort_limit(&(input as ArrayRef), options, limit).unwrap(), |
1381 | | _ => sort(&(input as ArrayRef), options).unwrap(), |
1382 | | }; |
1383 | | let expected = Arc::new(LargeListArray::from_iter_primitive::<T, _, _>( |
1384 | | expected_data, |
1385 | | )) as ArrayRef; |
1386 | | |
1387 | | assert_eq!(&sorted, &expected); |
1388 | | } |
1389 | | |
1390 | | fn test_lex_sort_arrays( |
1391 | | input: Vec<SortColumn>, |
1392 | | expected_output: Vec<ArrayRef>, |
1393 | | limit: Option<usize>, |
1394 | | ) { |
1395 | | let sorted = lexsort(&input, limit).unwrap(); |
1396 | | |
1397 | | for (result, expected) in sorted.iter().zip(expected_output.iter()) { |
1398 | | assert_eq!(result, expected); |
1399 | | } |
1400 | | } |
1401 | | |
1402 | | /// slice all arrays in expected_output to offset/length |
1403 | | fn slice_arrays(expected_output: Vec<ArrayRef>, offset: usize, length: usize) -> Vec<ArrayRef> { |
1404 | | expected_output |
1405 | | .into_iter() |
1406 | | .map(|array| array.slice(offset, length)) |
1407 | | .collect() |
1408 | | } |
1409 | | |
1410 | | fn test_sort_binary_arrays( |
1411 | | data: Vec<Option<Vec<u8>>>, |
1412 | | options: Option<SortOptions>, |
1413 | | limit: Option<usize>, |
1414 | | expected_data: Vec<Option<Vec<u8>>>, |
1415 | | fixed_length: Option<i32>, |
1416 | | ) { |
1417 | | // Fixed size binary array |
1418 | | if let Some(length) = fixed_length { |
1419 | | let input = Arc::new( |
1420 | | FixedSizeBinaryArray::try_from_sparse_iter_with_size(data.iter().cloned(), length) |
1421 | | .unwrap(), |
1422 | | ); |
1423 | | let sorted = match limit { |
1424 | | Some(_) => sort_limit(&(input as ArrayRef), options, limit).unwrap(), |
1425 | | None => sort(&(input as ArrayRef), options).unwrap(), |
1426 | | }; |
1427 | | let expected = Arc::new( |
1428 | | FixedSizeBinaryArray::try_from_sparse_iter_with_size( |
1429 | | expected_data.iter().cloned(), |
1430 | | length, |
1431 | | ) |
1432 | | .unwrap(), |
1433 | | ) as ArrayRef; |
1434 | | |
1435 | | assert_eq!(&sorted, &expected); |
1436 | | } |
1437 | | |
1438 | | // Generic size binary array |
1439 | | fn make_generic_binary_array<S: OffsetSizeTrait>( |
1440 | | data: &[Option<Vec<u8>>], |
1441 | | ) -> Arc<GenericBinaryArray<S>> { |
1442 | | Arc::new(GenericBinaryArray::<S>::from_opt_vec( |
1443 | | data.iter() |
1444 | | .map(|binary| binary.as_ref().map(Vec::as_slice)) |
1445 | | .collect(), |
1446 | | )) |
1447 | | } |
1448 | | |
1449 | | // BinaryArray |
1450 | | let input = make_generic_binary_array::<i32>(&data); |
1451 | | let sorted = match limit { |
1452 | | Some(_) => sort_limit(&(input as ArrayRef), options, limit).unwrap(), |
1453 | | None => sort(&(input as ArrayRef), options).unwrap(), |
1454 | | }; |
1455 | | let expected = make_generic_binary_array::<i32>(&expected_data) as ArrayRef; |
1456 | | assert_eq!(&sorted, &expected); |
1457 | | |
1458 | | // LargeBinaryArray |
1459 | | let input = make_generic_binary_array::<i64>(&data); |
1460 | | let sorted = match limit { |
1461 | | Some(_) => sort_limit(&(input as ArrayRef), options, limit).unwrap(), |
1462 | | None => sort(&(input as ArrayRef), options).unwrap(), |
1463 | | }; |
1464 | | let expected = make_generic_binary_array::<i64>(&expected_data) as ArrayRef; |
1465 | | assert_eq!(&sorted, &expected); |
1466 | | } |
1467 | | |
1468 | | #[test] |
1469 | | fn test_sort_to_indices_primitives() { |
1470 | | test_sort_to_indices_primitive_arrays::<Int8Type>( |
1471 | | vec![None, Some(0), Some(2), Some(-1), Some(0), None], |
1472 | | None, |
1473 | | None, |
1474 | | vec![0, 5, 3, 1, 4, 2], |
1475 | | ); |
1476 | | test_sort_to_indices_primitive_arrays::<Int16Type>( |
1477 | | vec![None, Some(0), Some(2), Some(-1), Some(0), None], |
1478 | | None, |
1479 | | None, |
1480 | | vec![0, 5, 3, 1, 4, 2], |
1481 | | ); |
1482 | | test_sort_to_indices_primitive_arrays::<Int32Type>( |
1483 | | vec![None, Some(0), Some(2), Some(-1), Some(0), None], |
1484 | | None, |
1485 | | None, |
1486 | | vec![0, 5, 3, 1, 4, 2], |
1487 | | ); |
1488 | | test_sort_to_indices_primitive_arrays::<Int64Type>( |
1489 | | vec![None, Some(0), Some(2), Some(-1), Some(0), None], |
1490 | | None, |
1491 | | None, |
1492 | | vec![0, 5, 3, 1, 4, 2], |
1493 | | ); |
1494 | | test_sort_to_indices_primitive_arrays::<Float16Type>( |
1495 | | vec![ |
1496 | | None, |
1497 | | Some(f16::from_f32(-0.05)), |
1498 | | Some(f16::from_f32(2.225)), |
1499 | | Some(f16::from_f32(-1.01)), |
1500 | | Some(f16::from_f32(-0.05)), |
1501 | | None, |
1502 | | ], |
1503 | | None, |
1504 | | None, |
1505 | | vec![0, 5, 3, 1, 4, 2], |
1506 | | ); |
1507 | | test_sort_to_indices_primitive_arrays::<Float32Type>( |
1508 | | vec![ |
1509 | | None, |
1510 | | Some(-0.05), |
1511 | | Some(2.225), |
1512 | | Some(-1.01), |
1513 | | Some(-0.05), |
1514 | | None, |
1515 | | ], |
1516 | | None, |
1517 | | None, |
1518 | | vec![0, 5, 3, 1, 4, 2], |
1519 | | ); |
1520 | | test_sort_to_indices_primitive_arrays::<Float64Type>( |
1521 | | vec![ |
1522 | | None, |
1523 | | Some(-0.05), |
1524 | | Some(2.225), |
1525 | | Some(-1.01), |
1526 | | Some(-0.05), |
1527 | | None, |
1528 | | ], |
1529 | | None, |
1530 | | None, |
1531 | | vec![0, 5, 3, 1, 4, 2], |
1532 | | ); |
1533 | | |
1534 | | // descending |
1535 | | test_sort_to_indices_primitive_arrays::<Int8Type>( |
1536 | | vec![None, Some(0), Some(2), Some(-1), Some(0), None], |
1537 | | Some(SortOptions { |
1538 | | descending: true, |
1539 | | nulls_first: false, |
1540 | | }), |
1541 | | None, |
1542 | | vec![2, 1, 4, 3, 0, 5], |
1543 | | ); |
1544 | | |
1545 | | test_sort_to_indices_primitive_arrays::<Int16Type>( |
1546 | | vec![None, Some(0), Some(2), Some(-1), Some(0), None], |
1547 | | Some(SortOptions { |
1548 | | descending: true, |
1549 | | nulls_first: false, |
1550 | | }), |
1551 | | None, |
1552 | | vec![2, 1, 4, 3, 0, 5], |
1553 | | ); |
1554 | | |
1555 | | test_sort_to_indices_primitive_arrays::<Int32Type>( |
1556 | | vec![None, Some(0), Some(2), Some(-1), Some(0), None], |
1557 | | Some(SortOptions { |
1558 | | descending: true, |
1559 | | nulls_first: false, |
1560 | | }), |
1561 | | None, |
1562 | | vec![2, 1, 4, 3, 0, 5], |
1563 | | ); |
1564 | | |
1565 | | test_sort_to_indices_primitive_arrays::<Int64Type>( |
1566 | | vec![None, Some(0), Some(2), Some(-1), Some(0), None], |
1567 | | Some(SortOptions { |
1568 | | descending: true, |
1569 | | nulls_first: false, |
1570 | | }), |
1571 | | None, |
1572 | | vec![2, 1, 4, 3, 0, 5], |
1573 | | ); |
1574 | | |
1575 | | test_sort_to_indices_primitive_arrays::<Float16Type>( |
1576 | | vec![ |
1577 | | None, |
1578 | | Some(f16::from_f32(0.005)), |
1579 | | Some(f16::from_f32(20.22)), |
1580 | | Some(f16::from_f32(-10.3)), |
1581 | | Some(f16::from_f32(0.005)), |
1582 | | None, |
1583 | | ], |
1584 | | Some(SortOptions { |
1585 | | descending: true, |
1586 | | nulls_first: false, |
1587 | | }), |
1588 | | None, |
1589 | | vec![2, 1, 4, 3, 0, 5], |
1590 | | ); |
1591 | | |
1592 | | test_sort_to_indices_primitive_arrays::<Float32Type>( |
1593 | | vec![ |
1594 | | None, |
1595 | | Some(0.005), |
1596 | | Some(20.22), |
1597 | | Some(-10.3), |
1598 | | Some(0.005), |
1599 | | None, |
1600 | | ], |
1601 | | Some(SortOptions { |
1602 | | descending: true, |
1603 | | nulls_first: false, |
1604 | | }), |
1605 | | None, |
1606 | | vec![2, 1, 4, 3, 0, 5], |
1607 | | ); |
1608 | | |
1609 | | test_sort_to_indices_primitive_arrays::<Float64Type>( |
1610 | | vec![None, Some(0.0), Some(2.0), Some(-1.0), Some(0.0), None], |
1611 | | Some(SortOptions { |
1612 | | descending: true, |
1613 | | nulls_first: false, |
1614 | | }), |
1615 | | None, |
1616 | | vec![2, 1, 4, 3, 0, 5], |
1617 | | ); |
1618 | | |
1619 | | // descending, nulls first |
1620 | | test_sort_to_indices_primitive_arrays::<Int8Type>( |
1621 | | vec![None, Some(0), Some(2), Some(-1), Some(0), None], |
1622 | | Some(SortOptions { |
1623 | | descending: true, |
1624 | | nulls_first: true, |
1625 | | }), |
1626 | | None, |
1627 | | vec![0, 5, 2, 1, 4, 3], // [5, 0, 2, 4, 1, 3] |
1628 | | ); |
1629 | | |
1630 | | test_sort_to_indices_primitive_arrays::<Int16Type>( |
1631 | | vec![None, Some(0), Some(2), Some(-1), Some(0), None], |
1632 | | Some(SortOptions { |
1633 | | descending: true, |
1634 | | nulls_first: true, |
1635 | | }), |
1636 | | None, |
1637 | | vec![0, 5, 2, 1, 4, 3], // [5, 0, 2, 4, 1, 3] |
1638 | | ); |
1639 | | |
1640 | | test_sort_to_indices_primitive_arrays::<Int32Type>( |
1641 | | vec![None, Some(0), Some(2), Some(-1), Some(0), None], |
1642 | | Some(SortOptions { |
1643 | | descending: true, |
1644 | | nulls_first: true, |
1645 | | }), |
1646 | | None, |
1647 | | vec![0, 5, 2, 1, 4, 3], |
1648 | | ); |
1649 | | |
1650 | | test_sort_to_indices_primitive_arrays::<Int64Type>( |
1651 | | vec![None, Some(0), Some(2), Some(-1), Some(0), None], |
1652 | | Some(SortOptions { |
1653 | | descending: true, |
1654 | | nulls_first: true, |
1655 | | }), |
1656 | | None, |
1657 | | vec![0, 5, 2, 1, 4, 3], |
1658 | | ); |
1659 | | |
1660 | | test_sort_to_indices_primitive_arrays::<Float16Type>( |
1661 | | vec![ |
1662 | | None, |
1663 | | Some(f16::from_f32(0.1)), |
1664 | | Some(f16::from_f32(0.2)), |
1665 | | Some(f16::from_f32(-1.3)), |
1666 | | Some(f16::from_f32(0.01)), |
1667 | | None, |
1668 | | ], |
1669 | | Some(SortOptions { |
1670 | | descending: true, |
1671 | | nulls_first: true, |
1672 | | }), |
1673 | | None, |
1674 | | vec![0, 5, 2, 1, 4, 3], |
1675 | | ); |
1676 | | |
1677 | | test_sort_to_indices_primitive_arrays::<Float32Type>( |
1678 | | vec![None, Some(0.1), Some(0.2), Some(-1.3), Some(0.01), None], |
1679 | | Some(SortOptions { |
1680 | | descending: true, |
1681 | | nulls_first: true, |
1682 | | }), |
1683 | | None, |
1684 | | vec![0, 5, 2, 1, 4, 3], |
1685 | | ); |
1686 | | |
1687 | | test_sort_to_indices_primitive_arrays::<Float64Type>( |
1688 | | vec![None, Some(10.1), Some(100.2), Some(-1.3), Some(10.01), None], |
1689 | | Some(SortOptions { |
1690 | | descending: true, |
1691 | | nulls_first: true, |
1692 | | }), |
1693 | | None, |
1694 | | vec![0, 5, 2, 1, 4, 3], |
1695 | | ); |
1696 | | |
1697 | | // valid values less than limit with extra nulls |
1698 | | test_sort_to_indices_primitive_arrays::<Float64Type>( |
1699 | | vec![Some(2.0), None, None, Some(1.0)], |
1700 | | Some(SortOptions { |
1701 | | descending: false, |
1702 | | nulls_first: false, |
1703 | | }), |
1704 | | Some(3), |
1705 | | vec![3, 0, 1], |
1706 | | ); |
1707 | | |
1708 | | test_sort_to_indices_primitive_arrays::<Float64Type>( |
1709 | | vec![Some(2.0), None, None, Some(1.0)], |
1710 | | Some(SortOptions { |
1711 | | descending: false, |
1712 | | nulls_first: true, |
1713 | | }), |
1714 | | Some(3), |
1715 | | vec![1, 2, 3], |
1716 | | ); |
1717 | | |
1718 | | // more nulls than limit |
1719 | | test_sort_to_indices_primitive_arrays::<Float64Type>( |
1720 | | vec![Some(1.0), None, None, None], |
1721 | | Some(SortOptions { |
1722 | | descending: false, |
1723 | | nulls_first: true, |
1724 | | }), |
1725 | | Some(2), |
1726 | | vec![1, 2], |
1727 | | ); |
1728 | | |
1729 | | test_sort_to_indices_primitive_arrays::<Float64Type>( |
1730 | | vec![Some(1.0), None, None, None], |
1731 | | Some(SortOptions { |
1732 | | descending: false, |
1733 | | nulls_first: false, |
1734 | | }), |
1735 | | Some(2), |
1736 | | vec![0, 1], |
1737 | | ); |
1738 | | } |
1739 | | |
1740 | | #[test] |
1741 | | fn test_sort_to_indices_primitive_more_nulls_than_limit() { |
1742 | | test_sort_to_indices_primitive_arrays::<Int32Type>( |
1743 | | vec![None, None, Some(3), None, Some(1), None, Some(2)], |
1744 | | Some(SortOptions { |
1745 | | descending: false, |
1746 | | nulls_first: false, |
1747 | | }), |
1748 | | Some(2), |
1749 | | vec![4, 6], |
1750 | | ); |
1751 | | } |
1752 | | |
1753 | | #[test] |
1754 | | fn test_sort_boolean() { |
1755 | | // boolean |
1756 | | test_sort_to_indices_boolean_arrays( |
1757 | | vec![None, Some(false), Some(true), Some(true), Some(false), None], |
1758 | | None, |
1759 | | None, |
1760 | | vec![0, 5, 1, 4, 2, 3], |
1761 | | ); |
1762 | | |
1763 | | // boolean, descending |
1764 | | test_sort_to_indices_boolean_arrays( |
1765 | | vec![None, Some(false), Some(true), Some(true), Some(false), None], |
1766 | | Some(SortOptions { |
1767 | | descending: true, |
1768 | | nulls_first: false, |
1769 | | }), |
1770 | | None, |
1771 | | vec![2, 3, 1, 4, 0, 5], |
1772 | | ); |
1773 | | |
1774 | | // boolean, descending, nulls first |
1775 | | test_sort_to_indices_boolean_arrays( |
1776 | | vec![None, Some(false), Some(true), Some(true), Some(false), None], |
1777 | | Some(SortOptions { |
1778 | | descending: true, |
1779 | | nulls_first: true, |
1780 | | }), |
1781 | | None, |
1782 | | vec![0, 5, 2, 3, 1, 4], |
1783 | | ); |
1784 | | |
1785 | | // boolean, descending, nulls first, limit |
1786 | | test_sort_to_indices_boolean_arrays( |
1787 | | vec![None, Some(false), Some(true), Some(true), Some(false), None], |
1788 | | Some(SortOptions { |
1789 | | descending: true, |
1790 | | nulls_first: true, |
1791 | | }), |
1792 | | Some(3), |
1793 | | vec![0, 5, 2], |
1794 | | ); |
1795 | | |
1796 | | // valid values less than limit with extra nulls |
1797 | | test_sort_to_indices_boolean_arrays( |
1798 | | vec![Some(true), None, None, Some(false)], |
1799 | | Some(SortOptions { |
1800 | | descending: false, |
1801 | | nulls_first: false, |
1802 | | }), |
1803 | | Some(3), |
1804 | | vec![3, 0, 1], |
1805 | | ); |
1806 | | |
1807 | | test_sort_to_indices_boolean_arrays( |
1808 | | vec![Some(true), None, None, Some(false)], |
1809 | | Some(SortOptions { |
1810 | | descending: false, |
1811 | | nulls_first: true, |
1812 | | }), |
1813 | | Some(3), |
1814 | | vec![1, 2, 3], |
1815 | | ); |
1816 | | |
1817 | | // more nulls than limit |
1818 | | test_sort_to_indices_boolean_arrays( |
1819 | | vec![Some(true), None, None, None], |
1820 | | Some(SortOptions { |
1821 | | descending: false, |
1822 | | nulls_first: true, |
1823 | | }), |
1824 | | Some(2), |
1825 | | vec![1, 2], |
1826 | | ); |
1827 | | |
1828 | | test_sort_to_indices_boolean_arrays( |
1829 | | vec![Some(true), None, None, None], |
1830 | | Some(SortOptions { |
1831 | | descending: false, |
1832 | | nulls_first: false, |
1833 | | }), |
1834 | | Some(2), |
1835 | | vec![0, 1], |
1836 | | ); |
1837 | | } |
1838 | | |
1839 | | /// Test sort boolean on each permutation of with/without limit and GenericListArray/FixedSizeListArray |
1840 | | /// |
1841 | | /// The input data must have the same length for all list items so that we can test FixedSizeListArray |
1842 | | /// |
1843 | | fn test_every_config_sort_boolean_list_arrays( |
1844 | | data: Vec<Option<Vec<Option<bool>>>>, |
1845 | | options: Option<SortOptions>, |
1846 | | expected_data: Vec<Option<Vec<Option<bool>>>>, |
1847 | | ) { |
1848 | | let first_length = data |
1849 | | .iter() |
1850 | | .find_map(|x| x.as_ref().map(|x| x.len())) |
1851 | | .unwrap_or(0); |
1852 | | let first_non_match_length = data |
1853 | | .iter() |
1854 | | .map(|x| x.as_ref().map(|x| x.len()).unwrap_or(first_length)) |
1855 | | .position(|x| x != first_length); |
1856 | | |
1857 | | assert_eq!( |
1858 | | first_non_match_length, None, |
1859 | | "All list items should have the same length {first_length}, input data is invalid" |
1860 | | ); |
1861 | | |
1862 | | let first_non_match_length = expected_data |
1863 | | .iter() |
1864 | | .map(|x| x.as_ref().map(|x| x.len()).unwrap_or(first_length)) |
1865 | | .position(|x| x != first_length); |
1866 | | |
1867 | | assert_eq!( |
1868 | | first_non_match_length, None, |
1869 | | "All list items should have the same length {first_length}, expected data is invalid" |
1870 | | ); |
1871 | | |
1872 | | let limit = expected_data.len().saturating_div(2); |
1873 | | |
1874 | | for &with_limit in &[false, true] { |
1875 | | let (limit, expected_data) = if with_limit { |
1876 | | ( |
1877 | | Some(limit), |
1878 | | expected_data.iter().take(limit).cloned().collect(), |
1879 | | ) |
1880 | | } else { |
1881 | | (None, expected_data.clone()) |
1882 | | }; |
1883 | | |
1884 | | for &fixed_length in &[None, Some(first_length as i32)] { |
1885 | | test_sort_boolean_list_arrays( |
1886 | | data.clone(), |
1887 | | options, |
1888 | | limit, |
1889 | | expected_data.clone(), |
1890 | | fixed_length, |
1891 | | ); |
1892 | | } |
1893 | | } |
1894 | | } |
1895 | | |
1896 | | fn test_sort_boolean_list_arrays( |
1897 | | data: Vec<Option<Vec<Option<bool>>>>, |
1898 | | options: Option<SortOptions>, |
1899 | | limit: Option<usize>, |
1900 | | expected_data: Vec<Option<Vec<Option<bool>>>>, |
1901 | | fixed_length: Option<i32>, |
1902 | | ) { |
1903 | | fn build_fixed_boolean_list_array( |
1904 | | data: Vec<Option<Vec<Option<bool>>>>, |
1905 | | fixed_length: i32, |
1906 | | ) -> ArrayRef { |
1907 | | let mut builder = FixedSizeListBuilder::new( |
1908 | | BooleanBuilder::with_capacity(fixed_length as usize), |
1909 | | fixed_length, |
1910 | | ); |
1911 | | for sublist in data { |
1912 | | match sublist { |
1913 | | Some(sublist) => { |
1914 | | builder.values().extend(sublist); |
1915 | | builder.append(true); |
1916 | | } |
1917 | | None => { |
1918 | | builder |
1919 | | .values() |
1920 | | .extend(std::iter::repeat_n(None, fixed_length as usize)); |
1921 | | builder.append(false); |
1922 | | } |
1923 | | } |
1924 | | } |
1925 | | Arc::new(builder.finish()) as ArrayRef |
1926 | | } |
1927 | | |
1928 | | fn build_generic_boolean_list_array<OffsetSize: OffsetSizeTrait>( |
1929 | | data: Vec<Option<Vec<Option<bool>>>>, |
1930 | | ) -> ArrayRef { |
1931 | | let mut builder = GenericListBuilder::<OffsetSize, _>::new(BooleanBuilder::new()); |
1932 | | builder.extend(data); |
1933 | | Arc::new(builder.finish()) as ArrayRef |
1934 | | } |
1935 | | |
1936 | | // for FixedSizedList |
1937 | | if let Some(length) = fixed_length { |
1938 | | let input = build_fixed_boolean_list_array(data.clone(), length); |
1939 | | let sorted = match limit { |
1940 | | Some(_) => sort_limit(&(input as ArrayRef), options, limit).unwrap(), |
1941 | | _ => sort(&(input as ArrayRef), options).unwrap(), |
1942 | | }; |
1943 | | let expected = build_fixed_boolean_list_array(expected_data.clone(), length); |
1944 | | |
1945 | | assert_eq!(&sorted, &expected); |
1946 | | } |
1947 | | |
1948 | | // for List |
1949 | | let input = build_generic_boolean_list_array::<i32>(data.clone()); |
1950 | | let sorted = match limit { |
1951 | | Some(_) => sort_limit(&(input as ArrayRef), options, limit).unwrap(), |
1952 | | _ => sort(&(input as ArrayRef), options).unwrap(), |
1953 | | }; |
1954 | | let expected = build_generic_boolean_list_array::<i32>(expected_data.clone()); |
1955 | | |
1956 | | assert_eq!(&sorted, &expected); |
1957 | | |
1958 | | // for LargeList |
1959 | | let input = build_generic_boolean_list_array::<i64>(data.clone()); |
1960 | | let sorted = match limit { |
1961 | | Some(_) => sort_limit(&(input as ArrayRef), options, limit).unwrap(), |
1962 | | _ => sort(&(input as ArrayRef), options).unwrap(), |
1963 | | }; |
1964 | | let expected = build_generic_boolean_list_array::<i64>(expected_data.clone()); |
1965 | | |
1966 | | assert_eq!(&sorted, &expected); |
1967 | | } |
1968 | | |
1969 | | #[test] |
1970 | | fn test_sort_list_of_booleans() { |
1971 | | // These are all the possible combinations of boolean values |
1972 | | // There are 3^3 + 1 = 28 possible combinations (3 values to permutate - [true, false, null] and 1 None value) |
1973 | | #[rustfmt::skip] |
1974 | | let mut cases = vec![ |
1975 | | Some(vec![Some(true), Some(true), Some(true)]), |
1976 | | Some(vec![Some(true), Some(true), Some(false)]), |
1977 | | Some(vec![Some(true), Some(true), None]), |
1978 | | |
1979 | | Some(vec![Some(true), Some(false), Some(true)]), |
1980 | | Some(vec![Some(true), Some(false), Some(false)]), |
1981 | | Some(vec![Some(true), Some(false), None]), |
1982 | | |
1983 | | Some(vec![Some(true), None, Some(true)]), |
1984 | | Some(vec![Some(true), None, Some(false)]), |
1985 | | Some(vec![Some(true), None, None]), |
1986 | | |
1987 | | Some(vec![Some(false), Some(true), Some(true)]), |
1988 | | Some(vec![Some(false), Some(true), Some(false)]), |
1989 | | Some(vec![Some(false), Some(true), None]), |
1990 | | |
1991 | | Some(vec![Some(false), Some(false), Some(true)]), |
1992 | | Some(vec![Some(false), Some(false), Some(false)]), |
1993 | | Some(vec![Some(false), Some(false), None]), |
1994 | | |
1995 | | Some(vec![Some(false), None, Some(true)]), |
1996 | | Some(vec![Some(false), None, Some(false)]), |
1997 | | Some(vec![Some(false), None, None]), |
1998 | | |
1999 | | Some(vec![None, Some(true), Some(true)]), |
2000 | | Some(vec![None, Some(true), Some(false)]), |
2001 | | Some(vec![None, Some(true), None]), |
2002 | | |
2003 | | Some(vec![None, Some(false), Some(true)]), |
2004 | | Some(vec![None, Some(false), Some(false)]), |
2005 | | Some(vec![None, Some(false), None]), |
2006 | | |
2007 | | Some(vec![None, None, Some(true)]), |
2008 | | Some(vec![None, None, Some(false)]), |
2009 | | Some(vec![None, None, None]), |
2010 | | None, |
2011 | | ]; |
2012 | | |
2013 | | cases.shuffle(&mut StdRng::seed_from_u64(42)); |
2014 | | |
2015 | | // The order is false, true, null |
2016 | | #[rustfmt::skip] |
2017 | | let expected_descending_false_nulls_first_false = vec![ |
2018 | | Some(vec![Some(false), Some(false), Some(false)]), |
2019 | | Some(vec![Some(false), Some(false), Some(true)]), |
2020 | | Some(vec![Some(false), Some(false), None]), |
2021 | | |
2022 | | Some(vec![Some(false), Some(true), Some(false)]), |
2023 | | Some(vec![Some(false), Some(true), Some(true)]), |
2024 | | Some(vec![Some(false), Some(true), None]), |
2025 | | |
2026 | | Some(vec![Some(false), None, Some(false)]), |
2027 | | Some(vec![Some(false), None, Some(true)]), |
2028 | | Some(vec![Some(false), None, None]), |
2029 | | |
2030 | | Some(vec![Some(true), Some(false), Some(false)]), |
2031 | | Some(vec![Some(true), Some(false), Some(true)]), |
2032 | | Some(vec![Some(true), Some(false), None]), |
2033 | | |
2034 | | Some(vec![Some(true), Some(true), Some(false)]), |
2035 | | Some(vec![Some(true), Some(true), Some(true)]), |
2036 | | Some(vec![Some(true), Some(true), None]), |
2037 | | |
2038 | | Some(vec![Some(true), None, Some(false)]), |
2039 | | Some(vec![Some(true), None, Some(true)]), |
2040 | | Some(vec![Some(true), None, None]), |
2041 | | |
2042 | | Some(vec![None, Some(false), Some(false)]), |
2043 | | Some(vec![None, Some(false), Some(true)]), |
2044 | | Some(vec![None, Some(false), None]), |
2045 | | |
2046 | | Some(vec![None, Some(true), Some(false)]), |
2047 | | Some(vec![None, Some(true), Some(true)]), |
2048 | | Some(vec![None, Some(true), None]), |
2049 | | |
2050 | | Some(vec![None, None, Some(false)]), |
2051 | | Some(vec![None, None, Some(true)]), |
2052 | | Some(vec![None, None, None]), |
2053 | | None, |
2054 | | ]; |
2055 | | test_every_config_sort_boolean_list_arrays( |
2056 | | cases.clone(), |
2057 | | Some(SortOptions { |
2058 | | descending: false, |
2059 | | nulls_first: false, |
2060 | | }), |
2061 | | expected_descending_false_nulls_first_false, |
2062 | | ); |
2063 | | |
2064 | | // The order is null, false, true |
2065 | | #[rustfmt::skip] |
2066 | | let expected_descending_false_nulls_first_true = vec![ |
2067 | | None, |
2068 | | |
2069 | | Some(vec![None, None, None]), |
2070 | | Some(vec![None, None, Some(false)]), |
2071 | | Some(vec![None, None, Some(true)]), |
2072 | | |
2073 | | Some(vec![None, Some(false), None]), |
2074 | | Some(vec![None, Some(false), Some(false)]), |
2075 | | Some(vec![None, Some(false), Some(true)]), |
2076 | | |
2077 | | Some(vec![None, Some(true), None]), |
2078 | | Some(vec![None, Some(true), Some(false)]), |
2079 | | Some(vec![None, Some(true), Some(true)]), |
2080 | | |
2081 | | Some(vec![Some(false), None, None]), |
2082 | | Some(vec![Some(false), None, Some(false)]), |
2083 | | Some(vec![Some(false), None, Some(true)]), |
2084 | | |
2085 | | Some(vec![Some(false), Some(false), None]), |
2086 | | Some(vec![Some(false), Some(false), Some(false)]), |
2087 | | Some(vec![Some(false), Some(false), Some(true)]), |
2088 | | |
2089 | | Some(vec![Some(false), Some(true), None]), |
2090 | | Some(vec![Some(false), Some(true), Some(false)]), |
2091 | | Some(vec![Some(false), Some(true), Some(true)]), |
2092 | | |
2093 | | Some(vec![Some(true), None, None]), |
2094 | | Some(vec![Some(true), None, Some(false)]), |
2095 | | Some(vec![Some(true), None, Some(true)]), |
2096 | | |
2097 | | Some(vec![Some(true), Some(false), None]), |
2098 | | Some(vec![Some(true), Some(false), Some(false)]), |
2099 | | Some(vec![Some(true), Some(false), Some(true)]), |
2100 | | |
2101 | | Some(vec![Some(true), Some(true), None]), |
2102 | | Some(vec![Some(true), Some(true), Some(false)]), |
2103 | | Some(vec![Some(true), Some(true), Some(true)]), |
2104 | | ]; |
2105 | | |
2106 | | test_every_config_sort_boolean_list_arrays( |
2107 | | cases.clone(), |
2108 | | Some(SortOptions { |
2109 | | descending: false, |
2110 | | nulls_first: true, |
2111 | | }), |
2112 | | expected_descending_false_nulls_first_true, |
2113 | | ); |
2114 | | |
2115 | | // The order is true, false, null |
2116 | | #[rustfmt::skip] |
2117 | | let expected_descending_true_nulls_first_false = vec![ |
2118 | | Some(vec![Some(true), Some(true), Some(true)]), |
2119 | | Some(vec![Some(true), Some(true), Some(false)]), |
2120 | | Some(vec![Some(true), Some(true), None]), |
2121 | | |
2122 | | Some(vec![Some(true), Some(false), Some(true)]), |
2123 | | Some(vec![Some(true), Some(false), Some(false)]), |
2124 | | Some(vec![Some(true), Some(false), None]), |
2125 | | |
2126 | | Some(vec![Some(true), None, Some(true)]), |
2127 | | Some(vec![Some(true), None, Some(false)]), |
2128 | | Some(vec![Some(true), None, None]), |
2129 | | |
2130 | | Some(vec![Some(false), Some(true), Some(true)]), |
2131 | | Some(vec![Some(false), Some(true), Some(false)]), |
2132 | | Some(vec![Some(false), Some(true), None]), |
2133 | | |
2134 | | Some(vec![Some(false), Some(false), Some(true)]), |
2135 | | Some(vec![Some(false), Some(false), Some(false)]), |
2136 | | Some(vec![Some(false), Some(false), None]), |
2137 | | |
2138 | | Some(vec![Some(false), None, Some(true)]), |
2139 | | Some(vec![Some(false), None, Some(false)]), |
2140 | | Some(vec![Some(false), None, None]), |
2141 | | |
2142 | | Some(vec![None, Some(true), Some(true)]), |
2143 | | Some(vec![None, Some(true), Some(false)]), |
2144 | | Some(vec![None, Some(true), None]), |
2145 | | |
2146 | | Some(vec![None, Some(false), Some(true)]), |
2147 | | Some(vec![None, Some(false), Some(false)]), |
2148 | | Some(vec![None, Some(false), None]), |
2149 | | |
2150 | | Some(vec![None, None, Some(true)]), |
2151 | | Some(vec![None, None, Some(false)]), |
2152 | | Some(vec![None, None, None]), |
2153 | | |
2154 | | None, |
2155 | | ]; |
2156 | | test_every_config_sort_boolean_list_arrays( |
2157 | | cases.clone(), |
2158 | | Some(SortOptions { |
2159 | | descending: true, |
2160 | | nulls_first: false, |
2161 | | }), |
2162 | | expected_descending_true_nulls_first_false, |
2163 | | ); |
2164 | | |
2165 | | // The order is null, true, false |
2166 | | #[rustfmt::skip] |
2167 | | let expected_descending_true_nulls_first_true = vec![ |
2168 | | None, |
2169 | | |
2170 | | Some(vec![None, None, None]), |
2171 | | Some(vec![None, None, Some(true)]), |
2172 | | Some(vec![None, None, Some(false)]), |
2173 | | |
2174 | | Some(vec![None, Some(true), None]), |
2175 | | Some(vec![None, Some(true), Some(true)]), |
2176 | | Some(vec![None, Some(true), Some(false)]), |
2177 | | |
2178 | | Some(vec![None, Some(false), None]), |
2179 | | Some(vec![None, Some(false), Some(true)]), |
2180 | | Some(vec![None, Some(false), Some(false)]), |
2181 | | |
2182 | | Some(vec![Some(true), None, None]), |
2183 | | Some(vec![Some(true), None, Some(true)]), |
2184 | | Some(vec![Some(true), None, Some(false)]), |
2185 | | |
2186 | | Some(vec![Some(true), Some(true), None]), |
2187 | | Some(vec![Some(true), Some(true), Some(true)]), |
2188 | | Some(vec![Some(true), Some(true), Some(false)]), |
2189 | | |
2190 | | Some(vec![Some(true), Some(false), None]), |
2191 | | Some(vec![Some(true), Some(false), Some(true)]), |
2192 | | Some(vec![Some(true), Some(false), Some(false)]), |
2193 | | |
2194 | | Some(vec![Some(false), None, None]), |
2195 | | Some(vec![Some(false), None, Some(true)]), |
2196 | | Some(vec![Some(false), None, Some(false)]), |
2197 | | |
2198 | | Some(vec![Some(false), Some(true), None]), |
2199 | | Some(vec![Some(false), Some(true), Some(true)]), |
2200 | | Some(vec![Some(false), Some(true), Some(false)]), |
2201 | | |
2202 | | Some(vec![Some(false), Some(false), None]), |
2203 | | Some(vec![Some(false), Some(false), Some(true)]), |
2204 | | Some(vec![Some(false), Some(false), Some(false)]), |
2205 | | ]; |
2206 | | // Testing with limit false and fixed_length None |
2207 | | test_every_config_sort_boolean_list_arrays( |
2208 | | cases.clone(), |
2209 | | Some(SortOptions { |
2210 | | descending: true, |
2211 | | nulls_first: true, |
2212 | | }), |
2213 | | expected_descending_true_nulls_first_true, |
2214 | | ); |
2215 | | } |
2216 | | |
2217 | | fn test_sort_indices_decimal<T: DecimalType>(precision: u8, scale: i8) { |
2218 | | // decimal default |
2219 | | test_sort_to_indices_decimal_array::<T>( |
2220 | | vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], |
2221 | | None, |
2222 | | None, |
2223 | | vec![0, 6, 4, 2, 3, 5, 1], |
2224 | | precision, |
2225 | | scale, |
2226 | | ); |
2227 | | // decimal descending |
2228 | | test_sort_to_indices_decimal_array::<T>( |
2229 | | vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], |
2230 | | Some(SortOptions { |
2231 | | descending: true, |
2232 | | nulls_first: false, |
2233 | | }), |
2234 | | None, |
2235 | | vec![1, 5, 3, 2, 4, 0, 6], |
2236 | | precision, |
2237 | | scale, |
2238 | | ); |
2239 | | // decimal null_first and descending |
2240 | | test_sort_to_indices_decimal_array::<T>( |
2241 | | vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], |
2242 | | Some(SortOptions { |
2243 | | descending: true, |
2244 | | nulls_first: true, |
2245 | | }), |
2246 | | None, |
2247 | | vec![0, 6, 1, 5, 3, 2, 4], |
2248 | | precision, |
2249 | | scale, |
2250 | | ); |
2251 | | // decimal null_first |
2252 | | test_sort_to_indices_decimal_array::<T>( |
2253 | | vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], |
2254 | | Some(SortOptions { |
2255 | | descending: false, |
2256 | | nulls_first: true, |
2257 | | }), |
2258 | | None, |
2259 | | vec![0, 6, 4, 2, 3, 5, 1], |
2260 | | precision, |
2261 | | scale, |
2262 | | ); |
2263 | | // limit |
2264 | | test_sort_to_indices_decimal_array::<T>( |
2265 | | vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], |
2266 | | None, |
2267 | | Some(3), |
2268 | | vec![0, 6, 4], |
2269 | | precision, |
2270 | | scale, |
2271 | | ); |
2272 | | // limit descending |
2273 | | test_sort_to_indices_decimal_array::<T>( |
2274 | | vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], |
2275 | | Some(SortOptions { |
2276 | | descending: true, |
2277 | | nulls_first: false, |
2278 | | }), |
2279 | | Some(3), |
2280 | | vec![1, 5, 3], |
2281 | | precision, |
2282 | | scale, |
2283 | | ); |
2284 | | // limit descending null_first |
2285 | | test_sort_to_indices_decimal_array::<T>( |
2286 | | vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], |
2287 | | Some(SortOptions { |
2288 | | descending: true, |
2289 | | nulls_first: true, |
2290 | | }), |
2291 | | Some(3), |
2292 | | vec![0, 6, 1], |
2293 | | precision, |
2294 | | scale, |
2295 | | ); |
2296 | | // limit null_first |
2297 | | test_sort_to_indices_decimal_array::<T>( |
2298 | | vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], |
2299 | | Some(SortOptions { |
2300 | | descending: false, |
2301 | | nulls_first: true, |
2302 | | }), |
2303 | | Some(3), |
2304 | | vec![0, 6, 4], |
2305 | | precision, |
2306 | | scale, |
2307 | | ); |
2308 | | } |
2309 | | |
2310 | | #[test] |
2311 | | fn test_sort_indices_decimal32() { |
2312 | | test_sort_indices_decimal::<Decimal32Type>(8, 3); |
2313 | | } |
2314 | | |
2315 | | #[test] |
2316 | | fn test_sort_indices_decimal64() { |
2317 | | test_sort_indices_decimal::<Decimal64Type>(17, 5); |
2318 | | } |
2319 | | |
2320 | | #[test] |
2321 | | fn test_sort_indices_decimal128() { |
2322 | | test_sort_indices_decimal::<Decimal128Type>(23, 6); |
2323 | | } |
2324 | | |
2325 | | #[test] |
2326 | | fn test_sort_indices_decimal256() { |
2327 | | test_sort_indices_decimal::<Decimal256Type>(53, 6); |
2328 | | } |
2329 | | |
2330 | | #[test] |
2331 | | fn test_sort_indices_decimal256_max_min() { |
2332 | | let data = vec![ |
2333 | | None, |
2334 | | Some(i256::MIN), |
2335 | | Some(i256::from_i128(1)), |
2336 | | Some(i256::MAX), |
2337 | | Some(i256::from_i128(-1)), |
2338 | | ]; |
2339 | | test_sort_to_indices_decimal256_array( |
2340 | | data.clone(), |
2341 | | Some(SortOptions { |
2342 | | descending: false, |
2343 | | nulls_first: true, |
2344 | | }), |
2345 | | None, |
2346 | | vec![0, 1, 4, 2, 3], |
2347 | | ); |
2348 | | |
2349 | | test_sort_to_indices_decimal256_array( |
2350 | | data.clone(), |
2351 | | Some(SortOptions { |
2352 | | descending: true, |
2353 | | nulls_first: true, |
2354 | | }), |
2355 | | None, |
2356 | | vec![0, 3, 2, 4, 1], |
2357 | | ); |
2358 | | |
2359 | | test_sort_to_indices_decimal256_array( |
2360 | | data.clone(), |
2361 | | Some(SortOptions { |
2362 | | descending: false, |
2363 | | nulls_first: true, |
2364 | | }), |
2365 | | Some(4), |
2366 | | vec![0, 1, 4, 2], |
2367 | | ); |
2368 | | |
2369 | | test_sort_to_indices_decimal256_array( |
2370 | | data.clone(), |
2371 | | Some(SortOptions { |
2372 | | descending: true, |
2373 | | nulls_first: true, |
2374 | | }), |
2375 | | Some(4), |
2376 | | vec![0, 3, 2, 4], |
2377 | | ); |
2378 | | } |
2379 | | |
2380 | | fn test_sort_decimal<T: DecimalType>(precision: u8, scale: i8) { |
2381 | | // decimal default |
2382 | | test_sort_decimal_array::<T>( |
2383 | | vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], |
2384 | | None, |
2385 | | None, |
2386 | | vec![None, None, Some(1), Some(2), Some(3), Some(4), Some(5)], |
2387 | | precision, |
2388 | | scale, |
2389 | | ); |
2390 | | // decimal descending |
2391 | | test_sort_decimal_array::<T>( |
2392 | | vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], |
2393 | | Some(SortOptions { |
2394 | | descending: true, |
2395 | | nulls_first: false, |
2396 | | }), |
2397 | | None, |
2398 | | vec![Some(5), Some(4), Some(3), Some(2), Some(1), None, None], |
2399 | | precision, |
2400 | | scale, |
2401 | | ); |
2402 | | // decimal null_first and descending |
2403 | | test_sort_decimal_array::<T>( |
2404 | | vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], |
2405 | | Some(SortOptions { |
2406 | | descending: true, |
2407 | | nulls_first: true, |
2408 | | }), |
2409 | | None, |
2410 | | vec![None, None, Some(5), Some(4), Some(3), Some(2), Some(1)], |
2411 | | precision, |
2412 | | scale, |
2413 | | ); |
2414 | | // decimal null_first |
2415 | | test_sort_decimal_array::<T>( |
2416 | | vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], |
2417 | | Some(SortOptions { |
2418 | | descending: false, |
2419 | | nulls_first: true, |
2420 | | }), |
2421 | | None, |
2422 | | vec![None, None, Some(1), Some(2), Some(3), Some(4), Some(5)], |
2423 | | precision, |
2424 | | scale, |
2425 | | ); |
2426 | | // limit |
2427 | | test_sort_decimal_array::<T>( |
2428 | | vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], |
2429 | | None, |
2430 | | Some(3), |
2431 | | vec![None, None, Some(1)], |
2432 | | precision, |
2433 | | scale, |
2434 | | ); |
2435 | | // limit descending |
2436 | | test_sort_decimal_array::<T>( |
2437 | | vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], |
2438 | | Some(SortOptions { |
2439 | | descending: true, |
2440 | | nulls_first: false, |
2441 | | }), |
2442 | | Some(3), |
2443 | | vec![Some(5), Some(4), Some(3)], |
2444 | | precision, |
2445 | | scale, |
2446 | | ); |
2447 | | // limit descending null_first |
2448 | | test_sort_decimal_array::<T>( |
2449 | | vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], |
2450 | | Some(SortOptions { |
2451 | | descending: true, |
2452 | | nulls_first: true, |
2453 | | }), |
2454 | | Some(3), |
2455 | | vec![None, None, Some(5)], |
2456 | | precision, |
2457 | | scale, |
2458 | | ); |
2459 | | // limit null_first |
2460 | | test_sort_decimal_array::<T>( |
2461 | | vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], |
2462 | | Some(SortOptions { |
2463 | | descending: false, |
2464 | | nulls_first: true, |
2465 | | }), |
2466 | | Some(3), |
2467 | | vec![None, None, Some(1)], |
2468 | | precision, |
2469 | | scale, |
2470 | | ); |
2471 | | } |
2472 | | |
2473 | | #[test] |
2474 | | fn test_sort_decimal32() { |
2475 | | test_sort_decimal::<Decimal32Type>(8, 3); |
2476 | | } |
2477 | | |
2478 | | #[test] |
2479 | | fn test_sort_decimal64() { |
2480 | | test_sort_decimal::<Decimal64Type>(17, 5); |
2481 | | } |
2482 | | |
2483 | | #[test] |
2484 | | fn test_sort_decimal128() { |
2485 | | test_sort_decimal::<Decimal128Type>(23, 6); |
2486 | | } |
2487 | | |
2488 | | #[test] |
2489 | | fn test_sort_decimal256() { |
2490 | | test_sort_decimal::<Decimal256Type>(53, 6); |
2491 | | } |
2492 | | |
2493 | | #[test] |
2494 | | fn test_sort_decimal256_max_min() { |
2495 | | test_sort_decimal256_array( |
2496 | | vec![ |
2497 | | None, |
2498 | | Some(i256::MIN), |
2499 | | Some(i256::from_i128(1)), |
2500 | | Some(i256::MAX), |
2501 | | Some(i256::from_i128(-1)), |
2502 | | None, |
2503 | | ], |
2504 | | Some(SortOptions { |
2505 | | descending: false, |
2506 | | nulls_first: true, |
2507 | | }), |
2508 | | None, |
2509 | | vec![ |
2510 | | None, |
2511 | | None, |
2512 | | Some(i256::MIN), |
2513 | | Some(i256::from_i128(-1)), |
2514 | | Some(i256::from_i128(1)), |
2515 | | Some(i256::MAX), |
2516 | | ], |
2517 | | ); |
2518 | | |
2519 | | test_sort_decimal256_array( |
2520 | | vec![ |
2521 | | None, |
2522 | | Some(i256::MIN), |
2523 | | Some(i256::from_i128(1)), |
2524 | | Some(i256::MAX), |
2525 | | Some(i256::from_i128(-1)), |
2526 | | None, |
2527 | | ], |
2528 | | Some(SortOptions { |
2529 | | descending: true, |
2530 | | nulls_first: true, |
2531 | | }), |
2532 | | None, |
2533 | | vec![ |
2534 | | None, |
2535 | | None, |
2536 | | Some(i256::MAX), |
2537 | | Some(i256::from_i128(1)), |
2538 | | Some(i256::from_i128(-1)), |
2539 | | Some(i256::MIN), |
2540 | | ], |
2541 | | ); |
2542 | | |
2543 | | test_sort_decimal256_array( |
2544 | | vec![ |
2545 | | None, |
2546 | | Some(i256::MIN), |
2547 | | Some(i256::from_i128(1)), |
2548 | | Some(i256::MAX), |
2549 | | Some(i256::from_i128(-1)), |
2550 | | None, |
2551 | | ], |
2552 | | Some(SortOptions { |
2553 | | descending: false, |
2554 | | nulls_first: true, |
2555 | | }), |
2556 | | Some(4), |
2557 | | vec![None, None, Some(i256::MIN), Some(i256::from_i128(-1))], |
2558 | | ); |
2559 | | |
2560 | | test_sort_decimal256_array( |
2561 | | vec![ |
2562 | | None, |
2563 | | Some(i256::MIN), |
2564 | | Some(i256::from_i128(1)), |
2565 | | Some(i256::MAX), |
2566 | | Some(i256::from_i128(-1)), |
2567 | | None, |
2568 | | ], |
2569 | | Some(SortOptions { |
2570 | | descending: true, |
2571 | | nulls_first: true, |
2572 | | }), |
2573 | | Some(4), |
2574 | | vec![None, None, Some(i256::MAX), Some(i256::from_i128(1))], |
2575 | | ); |
2576 | | } |
2577 | | |
2578 | | #[test] |
2579 | | fn test_sort_primitives() { |
2580 | | // default case |
2581 | | test_sort_primitive_arrays::<UInt8Type>( |
2582 | | vec![None, Some(3), Some(5), Some(2), Some(3), None], |
2583 | | None, |
2584 | | None, |
2585 | | vec![None, None, Some(2), Some(3), Some(3), Some(5)], |
2586 | | ); |
2587 | | test_sort_primitive_arrays::<UInt16Type>( |
2588 | | vec![None, Some(3), Some(5), Some(2), Some(3), None], |
2589 | | None, |
2590 | | None, |
2591 | | vec![None, None, Some(2), Some(3), Some(3), Some(5)], |
2592 | | ); |
2593 | | test_sort_primitive_arrays::<UInt32Type>( |
2594 | | vec![None, Some(3), Some(5), Some(2), Some(3), None], |
2595 | | None, |
2596 | | None, |
2597 | | vec![None, None, Some(2), Some(3), Some(3), Some(5)], |
2598 | | ); |
2599 | | test_sort_primitive_arrays::<UInt64Type>( |
2600 | | vec![None, Some(3), Some(5), Some(2), Some(3), None], |
2601 | | None, |
2602 | | None, |
2603 | | vec![None, None, Some(2), Some(3), Some(3), Some(5)], |
2604 | | ); |
2605 | | |
2606 | | // descending |
2607 | | test_sort_primitive_arrays::<Int8Type>( |
2608 | | vec![None, Some(0), Some(2), Some(-1), Some(0), None], |
2609 | | Some(SortOptions { |
2610 | | descending: true, |
2611 | | nulls_first: false, |
2612 | | }), |
2613 | | None, |
2614 | | vec![Some(2), Some(0), Some(0), Some(-1), None, None], |
2615 | | ); |
2616 | | test_sort_primitive_arrays::<Int16Type>( |
2617 | | vec![None, Some(0), Some(2), Some(-1), Some(0), None], |
2618 | | Some(SortOptions { |
2619 | | descending: true, |
2620 | | nulls_first: false, |
2621 | | }), |
2622 | | None, |
2623 | | vec![Some(2), Some(0), Some(0), Some(-1), None, None], |
2624 | | ); |
2625 | | test_sort_primitive_arrays::<Int32Type>( |
2626 | | vec![None, Some(0), Some(2), Some(-1), Some(0), None], |
2627 | | Some(SortOptions { |
2628 | | descending: true, |
2629 | | nulls_first: false, |
2630 | | }), |
2631 | | None, |
2632 | | vec![Some(2), Some(0), Some(0), Some(-1), None, None], |
2633 | | ); |
2634 | | test_sort_primitive_arrays::<Int16Type>( |
2635 | | vec![None, Some(0), Some(2), Some(-1), Some(0), None], |
2636 | | Some(SortOptions { |
2637 | | descending: true, |
2638 | | nulls_first: false, |
2639 | | }), |
2640 | | None, |
2641 | | vec![Some(2), Some(0), Some(0), Some(-1), None, None], |
2642 | | ); |
2643 | | |
2644 | | // descending, nulls first |
2645 | | test_sort_primitive_arrays::<Int8Type>( |
2646 | | vec![None, Some(0), Some(2), Some(-1), Some(0), None], |
2647 | | Some(SortOptions { |
2648 | | descending: true, |
2649 | | nulls_first: true, |
2650 | | }), |
2651 | | None, |
2652 | | vec![None, None, Some(2), Some(0), Some(0), Some(-1)], |
2653 | | ); |
2654 | | test_sort_primitive_arrays::<Int16Type>( |
2655 | | vec![None, Some(0), Some(2), Some(-1), Some(0), None], |
2656 | | Some(SortOptions { |
2657 | | descending: true, |
2658 | | nulls_first: true, |
2659 | | }), |
2660 | | None, |
2661 | | vec![None, None, Some(2), Some(0), Some(0), Some(-1)], |
2662 | | ); |
2663 | | test_sort_primitive_arrays::<Int32Type>( |
2664 | | vec![None, Some(0), Some(2), Some(-1), Some(0), None], |
2665 | | Some(SortOptions { |
2666 | | descending: true, |
2667 | | nulls_first: true, |
2668 | | }), |
2669 | | None, |
2670 | | vec![None, None, Some(2), Some(0), Some(0), Some(-1)], |
2671 | | ); |
2672 | | test_sort_primitive_arrays::<Int64Type>( |
2673 | | vec![None, Some(0), Some(2), Some(-1), Some(0), None], |
2674 | | Some(SortOptions { |
2675 | | descending: true, |
2676 | | nulls_first: true, |
2677 | | }), |
2678 | | None, |
2679 | | vec![None, None, Some(2), Some(0), Some(0), Some(-1)], |
2680 | | ); |
2681 | | |
2682 | | test_sort_primitive_arrays::<Int64Type>( |
2683 | | vec![None, Some(0), Some(2), Some(-1), Some(0), None], |
2684 | | Some(SortOptions { |
2685 | | descending: true, |
2686 | | nulls_first: true, |
2687 | | }), |
2688 | | Some(3), |
2689 | | vec![None, None, Some(2)], |
2690 | | ); |
2691 | | |
2692 | | test_sort_primitive_arrays::<Float16Type>( |
2693 | | vec![ |
2694 | | None, |
2695 | | Some(f16::from_f32(0.0)), |
2696 | | Some(f16::from_f32(2.0)), |
2697 | | Some(f16::from_f32(-1.0)), |
2698 | | Some(f16::from_f32(0.0)), |
2699 | | None, |
2700 | | ], |
2701 | | Some(SortOptions { |
2702 | | descending: true, |
2703 | | nulls_first: true, |
2704 | | }), |
2705 | | None, |
2706 | | vec![ |
2707 | | None, |
2708 | | None, |
2709 | | Some(f16::from_f32(2.0)), |
2710 | | Some(f16::from_f32(0.0)), |
2711 | | Some(f16::from_f32(0.0)), |
2712 | | Some(f16::from_f32(-1.0)), |
2713 | | ], |
2714 | | ); |
2715 | | |
2716 | | test_sort_primitive_arrays::<Float32Type>( |
2717 | | vec![None, Some(0.0), Some(2.0), Some(-1.0), Some(0.0), None], |
2718 | | Some(SortOptions { |
2719 | | descending: true, |
2720 | | nulls_first: true, |
2721 | | }), |
2722 | | None, |
2723 | | vec![None, None, Some(2.0), Some(0.0), Some(0.0), Some(-1.0)], |
2724 | | ); |
2725 | | test_sort_primitive_arrays::<Float64Type>( |
2726 | | vec![None, Some(0.0), Some(2.0), Some(-1.0), Some(f64::NAN), None], |
2727 | | Some(SortOptions { |
2728 | | descending: true, |
2729 | | nulls_first: true, |
2730 | | }), |
2731 | | None, |
2732 | | vec![None, None, Some(f64::NAN), Some(2.0), Some(0.0), Some(-1.0)], |
2733 | | ); |
2734 | | test_sort_primitive_arrays::<Float64Type>( |
2735 | | vec![Some(f64::NAN), Some(f64::NAN), Some(f64::NAN), Some(1.0)], |
2736 | | Some(SortOptions { |
2737 | | descending: true, |
2738 | | nulls_first: true, |
2739 | | }), |
2740 | | None, |
2741 | | vec![Some(f64::NAN), Some(f64::NAN), Some(f64::NAN), Some(1.0)], |
2742 | | ); |
2743 | | |
2744 | | // int8 nulls first |
2745 | | test_sort_primitive_arrays::<Int8Type>( |
2746 | | vec![None, Some(0), Some(2), Some(-1), Some(0), None], |
2747 | | Some(SortOptions { |
2748 | | descending: false, |
2749 | | nulls_first: true, |
2750 | | }), |
2751 | | None, |
2752 | | vec![None, None, Some(-1), Some(0), Some(0), Some(2)], |
2753 | | ); |
2754 | | test_sort_primitive_arrays::<Int16Type>( |
2755 | | vec![None, Some(0), Some(2), Some(-1), Some(0), None], |
2756 | | Some(SortOptions { |
2757 | | descending: false, |
2758 | | nulls_first: true, |
2759 | | }), |
2760 | | None, |
2761 | | vec![None, None, Some(-1), Some(0), Some(0), Some(2)], |
2762 | | ); |
2763 | | test_sort_primitive_arrays::<Int32Type>( |
2764 | | vec![None, Some(0), Some(2), Some(-1), Some(0), None], |
2765 | | Some(SortOptions { |
2766 | | descending: false, |
2767 | | nulls_first: true, |
2768 | | }), |
2769 | | None, |
2770 | | vec![None, None, Some(-1), Some(0), Some(0), Some(2)], |
2771 | | ); |
2772 | | test_sort_primitive_arrays::<Int64Type>( |
2773 | | vec![None, Some(0), Some(2), Some(-1), Some(0), None], |
2774 | | Some(SortOptions { |
2775 | | descending: false, |
2776 | | nulls_first: true, |
2777 | | }), |
2778 | | None, |
2779 | | vec![None, None, Some(-1), Some(0), Some(0), Some(2)], |
2780 | | ); |
2781 | | test_sort_primitive_arrays::<Float16Type>( |
2782 | | vec![ |
2783 | | None, |
2784 | | Some(f16::from_f32(0.0)), |
2785 | | Some(f16::from_f32(2.0)), |
2786 | | Some(f16::from_f32(-1.0)), |
2787 | | Some(f16::from_f32(0.0)), |
2788 | | None, |
2789 | | ], |
2790 | | Some(SortOptions { |
2791 | | descending: false, |
2792 | | nulls_first: true, |
2793 | | }), |
2794 | | None, |
2795 | | vec![ |
2796 | | None, |
2797 | | None, |
2798 | | Some(f16::from_f32(-1.0)), |
2799 | | Some(f16::from_f32(0.0)), |
2800 | | Some(f16::from_f32(0.0)), |
2801 | | Some(f16::from_f32(2.0)), |
2802 | | ], |
2803 | | ); |
2804 | | test_sort_primitive_arrays::<Float32Type>( |
2805 | | vec![None, Some(0.0), Some(2.0), Some(-1.0), Some(0.0), None], |
2806 | | Some(SortOptions { |
2807 | | descending: false, |
2808 | | nulls_first: true, |
2809 | | }), |
2810 | | None, |
2811 | | vec![None, None, Some(-1.0), Some(0.0), Some(0.0), Some(2.0)], |
2812 | | ); |
2813 | | test_sort_primitive_arrays::<Float64Type>( |
2814 | | vec![None, Some(0.0), Some(2.0), Some(-1.0), Some(f64::NAN), None], |
2815 | | Some(SortOptions { |
2816 | | descending: false, |
2817 | | nulls_first: true, |
2818 | | }), |
2819 | | None, |
2820 | | vec![None, None, Some(-1.0), Some(0.0), Some(2.0), Some(f64::NAN)], |
2821 | | ); |
2822 | | test_sort_primitive_arrays::<Float64Type>( |
2823 | | vec![Some(f64::NAN), Some(f64::NAN), Some(f64::NAN), Some(1.0)], |
2824 | | Some(SortOptions { |
2825 | | descending: false, |
2826 | | nulls_first: true, |
2827 | | }), |
2828 | | None, |
2829 | | vec![Some(1.0), Some(f64::NAN), Some(f64::NAN), Some(f64::NAN)], |
2830 | | ); |
2831 | | |
2832 | | // limit |
2833 | | test_sort_primitive_arrays::<Float64Type>( |
2834 | | vec![Some(f64::NAN), Some(f64::NAN), Some(f64::NAN), Some(1.0)], |
2835 | | Some(SortOptions { |
2836 | | descending: false, |
2837 | | nulls_first: true, |
2838 | | }), |
2839 | | Some(2), |
2840 | | vec![Some(1.0), Some(f64::NAN)], |
2841 | | ); |
2842 | | |
2843 | | // limit with actual value |
2844 | | test_sort_primitive_arrays::<Float64Type>( |
2845 | | vec![Some(2.0), Some(4.0), Some(3.0), Some(1.0)], |
2846 | | Some(SortOptions { |
2847 | | descending: false, |
2848 | | nulls_first: true, |
2849 | | }), |
2850 | | Some(3), |
2851 | | vec![Some(1.0), Some(2.0), Some(3.0)], |
2852 | | ); |
2853 | | |
2854 | | // valid values less than limit with extra nulls |
2855 | | test_sort_primitive_arrays::<Float64Type>( |
2856 | | vec![Some(2.0), None, None, Some(1.0)], |
2857 | | Some(SortOptions { |
2858 | | descending: false, |
2859 | | nulls_first: false, |
2860 | | }), |
2861 | | Some(3), |
2862 | | vec![Some(1.0), Some(2.0), None], |
2863 | | ); |
2864 | | |
2865 | | test_sort_primitive_arrays::<Float64Type>( |
2866 | | vec![Some(2.0), None, None, Some(1.0)], |
2867 | | Some(SortOptions { |
2868 | | descending: false, |
2869 | | nulls_first: true, |
2870 | | }), |
2871 | | Some(3), |
2872 | | vec![None, None, Some(1.0)], |
2873 | | ); |
2874 | | |
2875 | | // more nulls than limit |
2876 | | test_sort_primitive_arrays::<Float64Type>( |
2877 | | vec![Some(2.0), None, None, None], |
2878 | | Some(SortOptions { |
2879 | | descending: false, |
2880 | | nulls_first: true, |
2881 | | }), |
2882 | | Some(2), |
2883 | | vec![None, None], |
2884 | | ); |
2885 | | |
2886 | | test_sort_primitive_arrays::<Float64Type>( |
2887 | | vec![Some(2.0), None, None, None], |
2888 | | Some(SortOptions { |
2889 | | descending: false, |
2890 | | nulls_first: false, |
2891 | | }), |
2892 | | Some(2), |
2893 | | vec![Some(2.0), None], |
2894 | | ); |
2895 | | } |
2896 | | |
2897 | | #[test] |
2898 | | fn test_sort_to_indices_strings() { |
2899 | | test_sort_to_indices_string_arrays( |
2900 | | vec![ |
2901 | | None, |
2902 | | Some("bad"), |
2903 | | Some("sad"), |
2904 | | None, |
2905 | | Some("glad"), |
2906 | | Some("-ad"), |
2907 | | ], |
2908 | | None, |
2909 | | None, |
2910 | | vec![0, 3, 5, 1, 4, 2], |
2911 | | ); |
2912 | | |
2913 | | test_sort_to_indices_string_arrays( |
2914 | | vec![ |
2915 | | None, |
2916 | | Some("bad"), |
2917 | | Some("sad"), |
2918 | | None, |
2919 | | Some("glad"), |
2920 | | Some("-ad"), |
2921 | | ], |
2922 | | Some(SortOptions { |
2923 | | descending: true, |
2924 | | nulls_first: false, |
2925 | | }), |
2926 | | None, |
2927 | | vec![2, 4, 1, 5, 0, 3], |
2928 | | ); |
2929 | | |
2930 | | test_sort_to_indices_string_arrays( |
2931 | | vec![ |
2932 | | None, |
2933 | | Some("bad"), |
2934 | | Some("sad"), |
2935 | | None, |
2936 | | Some("glad"), |
2937 | | Some("-ad"), |
2938 | | ], |
2939 | | Some(SortOptions { |
2940 | | descending: false, |
2941 | | nulls_first: true, |
2942 | | }), |
2943 | | None, |
2944 | | vec![0, 3, 5, 1, 4, 2], |
2945 | | ); |
2946 | | |
2947 | | test_sort_to_indices_string_arrays( |
2948 | | vec![ |
2949 | | None, |
2950 | | Some("bad"), |
2951 | | Some("sad"), |
2952 | | None, |
2953 | | Some("glad"), |
2954 | | Some("-ad"), |
2955 | | ], |
2956 | | Some(SortOptions { |
2957 | | descending: true, |
2958 | | nulls_first: true, |
2959 | | }), |
2960 | | None, |
2961 | | vec![0, 3, 2, 4, 1, 5], |
2962 | | ); |
2963 | | |
2964 | | test_sort_to_indices_string_arrays( |
2965 | | vec![ |
2966 | | None, |
2967 | | Some("bad"), |
2968 | | Some("sad"), |
2969 | | None, |
2970 | | Some("glad"), |
2971 | | Some("-ad"), |
2972 | | ], |
2973 | | Some(SortOptions { |
2974 | | descending: true, |
2975 | | nulls_first: true, |
2976 | | }), |
2977 | | Some(3), |
2978 | | vec![0, 3, 2], |
2979 | | ); |
2980 | | |
2981 | | // valid values less than limit with extra nulls |
2982 | | test_sort_to_indices_string_arrays( |
2983 | | vec![Some("def"), None, None, Some("abc")], |
2984 | | Some(SortOptions { |
2985 | | descending: false, |
2986 | | nulls_first: false, |
2987 | | }), |
2988 | | Some(3), |
2989 | | vec![3, 0, 1], |
2990 | | ); |
2991 | | |
2992 | | test_sort_to_indices_string_arrays( |
2993 | | vec![Some("def"), None, None, Some("abc")], |
2994 | | Some(SortOptions { |
2995 | | descending: false, |
2996 | | nulls_first: true, |
2997 | | }), |
2998 | | Some(3), |
2999 | | vec![1, 2, 3], |
3000 | | ); |
3001 | | |
3002 | | // more nulls than limit |
3003 | | test_sort_to_indices_string_arrays( |
3004 | | vec![Some("def"), None, None, None], |
3005 | | Some(SortOptions { |
3006 | | descending: false, |
3007 | | nulls_first: true, |
3008 | | }), |
3009 | | Some(2), |
3010 | | vec![1, 2], |
3011 | | ); |
3012 | | |
3013 | | test_sort_to_indices_string_arrays( |
3014 | | vec![Some("def"), None, None, None], |
3015 | | Some(SortOptions { |
3016 | | descending: false, |
3017 | | nulls_first: false, |
3018 | | }), |
3019 | | Some(2), |
3020 | | vec![0, 1], |
3021 | | ); |
3022 | | } |
3023 | | |
3024 | | #[test] |
3025 | | fn test_sort_strings() { |
3026 | | test_sort_string_arrays( |
3027 | | vec![ |
3028 | | None, |
3029 | | Some("bad"), |
3030 | | Some("sad"), |
3031 | | Some("long string longer than 12 bytes"), |
3032 | | None, |
3033 | | Some("glad"), |
3034 | | Some("lang string longer than 12 bytes"), |
3035 | | Some("-ad"), |
3036 | | ], |
3037 | | None, |
3038 | | None, |
3039 | | vec![ |
3040 | | None, |
3041 | | None, |
3042 | | Some("-ad"), |
3043 | | Some("bad"), |
3044 | | Some("glad"), |
3045 | | Some("lang string longer than 12 bytes"), |
3046 | | Some("long string longer than 12 bytes"), |
3047 | | Some("sad"), |
3048 | | ], |
3049 | | ); |
3050 | | |
3051 | | test_sort_string_arrays( |
3052 | | vec![ |
3053 | | None, |
3054 | | Some("bad"), |
3055 | | Some("sad"), |
3056 | | Some("long string longer than 12 bytes"), |
3057 | | None, |
3058 | | Some("glad"), |
3059 | | Some("lang string longer than 12 bytes"), |
3060 | | Some("-ad"), |
3061 | | ], |
3062 | | Some(SortOptions { |
3063 | | descending: true, |
3064 | | nulls_first: false, |
3065 | | }), |
3066 | | None, |
3067 | | vec![ |
3068 | | Some("sad"), |
3069 | | Some("long string longer than 12 bytes"), |
3070 | | Some("lang string longer than 12 bytes"), |
3071 | | Some("glad"), |
3072 | | Some("bad"), |
3073 | | Some("-ad"), |
3074 | | None, |
3075 | | None, |
3076 | | ], |
3077 | | ); |
3078 | | |
3079 | | test_sort_string_arrays( |
3080 | | vec![ |
3081 | | None, |
3082 | | Some("bad"), |
3083 | | Some("long string longer than 12 bytes"), |
3084 | | Some("sad"), |
3085 | | None, |
3086 | | Some("glad"), |
3087 | | Some("lang string longer than 12 bytes"), |
3088 | | Some("-ad"), |
3089 | | ], |
3090 | | Some(SortOptions { |
3091 | | descending: false, |
3092 | | nulls_first: true, |
3093 | | }), |
3094 | | None, |
3095 | | vec![ |
3096 | | None, |
3097 | | None, |
3098 | | Some("-ad"), |
3099 | | Some("bad"), |
3100 | | Some("glad"), |
3101 | | Some("lang string longer than 12 bytes"), |
3102 | | Some("long string longer than 12 bytes"), |
3103 | | Some("sad"), |
3104 | | ], |
3105 | | ); |
3106 | | |
3107 | | test_sort_string_arrays( |
3108 | | vec![ |
3109 | | None, |
3110 | | Some("bad"), |
3111 | | Some("long string longer than 12 bytes"), |
3112 | | Some("sad"), |
3113 | | None, |
3114 | | Some("glad"), |
3115 | | Some("lang string longer than 12 bytes"), |
3116 | | Some("-ad"), |
3117 | | ], |
3118 | | Some(SortOptions { |
3119 | | descending: true, |
3120 | | nulls_first: true, |
3121 | | }), |
3122 | | None, |
3123 | | vec![ |
3124 | | None, |
3125 | | None, |
3126 | | Some("sad"), |
3127 | | Some("long string longer than 12 bytes"), |
3128 | | Some("lang string longer than 12 bytes"), |
3129 | | Some("glad"), |
3130 | | Some("bad"), |
3131 | | Some("-ad"), |
3132 | | ], |
3133 | | ); |
3134 | | |
3135 | | test_sort_string_arrays( |
3136 | | vec![ |
3137 | | None, |
3138 | | Some("bad"), |
3139 | | Some("long string longer than 12 bytes"), |
3140 | | Some("sad"), |
3141 | | None, |
3142 | | Some("glad"), |
3143 | | Some("lang string longer than 12 bytes"), |
3144 | | Some("-ad"), |
3145 | | ], |
3146 | | Some(SortOptions { |
3147 | | descending: true, |
3148 | | nulls_first: true, |
3149 | | }), |
3150 | | Some(3), |
3151 | | vec![None, None, Some("sad")], |
3152 | | ); |
3153 | | |
3154 | | // valid values less than limit with extra nulls |
3155 | | test_sort_string_arrays( |
3156 | | vec![ |
3157 | | Some("def long string longer than 12"), |
3158 | | None, |
3159 | | None, |
3160 | | Some("abc"), |
3161 | | ], |
3162 | | Some(SortOptions { |
3163 | | descending: false, |
3164 | | nulls_first: false, |
3165 | | }), |
3166 | | Some(3), |
3167 | | vec![Some("abc"), Some("def long string longer than 12"), None], |
3168 | | ); |
3169 | | |
3170 | | test_sort_string_arrays( |
3171 | | vec![ |
3172 | | Some("def long string longer than 12"), |
3173 | | None, |
3174 | | None, |
3175 | | Some("abc"), |
3176 | | ], |
3177 | | Some(SortOptions { |
3178 | | descending: false, |
3179 | | nulls_first: true, |
3180 | | }), |
3181 | | Some(3), |
3182 | | vec![None, None, Some("abc")], |
3183 | | ); |
3184 | | |
3185 | | // more nulls than limit |
3186 | | test_sort_string_arrays( |
3187 | | vec![Some("def long string longer than 12"), None, None, None], |
3188 | | Some(SortOptions { |
3189 | | descending: false, |
3190 | | nulls_first: true, |
3191 | | }), |
3192 | | Some(2), |
3193 | | vec![None, None], |
3194 | | ); |
3195 | | |
3196 | | test_sort_string_arrays( |
3197 | | vec![Some("def long string longer than 12"), None, None, None], |
3198 | | Some(SortOptions { |
3199 | | descending: false, |
3200 | | nulls_first: false, |
3201 | | }), |
3202 | | Some(2), |
3203 | | vec![Some("def long string longer than 12"), None], |
3204 | | ); |
3205 | | } |
3206 | | |
3207 | | #[test] |
3208 | | fn test_sort_run_to_run() { |
3209 | | test_sort_run_inner(|array, sort_options, limit| sort_run(array, sort_options, limit)); |
3210 | | } |
3211 | | |
3212 | | #[test] |
3213 | | fn test_sort_run_to_indices() { |
3214 | | test_sort_run_inner(|array, sort_options, limit| { |
3215 | | let indices = sort_to_indices(array, sort_options, limit).unwrap(); |
3216 | | take(array, &indices, None) |
3217 | | }); |
3218 | | } |
3219 | | |
3220 | | fn test_sort_run_inner<F>(sort_fn: F) |
3221 | | where |
3222 | | F: Fn(&dyn Array, Option<SortOptions>, Option<usize>) -> Result<ArrayRef, ArrowError>, |
3223 | | { |
3224 | | // Create an input array for testing |
3225 | | let total_len = 80; |
3226 | | let vals: Vec<Option<i32>> = vec![Some(1), None, Some(2), Some(3), Some(4), None, Some(5)]; |
3227 | | let repeats: Vec<usize> = vec![1, 3, 2, 4]; |
3228 | | let mut input_array: Vec<Option<i32>> = Vec::with_capacity(total_len); |
3229 | | for ix in 0_usize..32 { |
3230 | | let repeat: usize = repeats[ix % repeats.len()]; |
3231 | | let val: Option<i32> = vals[ix % vals.len()]; |
3232 | | input_array.resize(input_array.len() + repeat, val); |
3233 | | } |
3234 | | |
3235 | | // create run array using input_array |
3236 | | // Encode the input_array to run array |
3237 | | let mut builder = |
3238 | | PrimitiveRunBuilder::<Int16Type, Int32Type>::with_capacity(input_array.len()); |
3239 | | builder.extend(input_array.iter().copied()); |
3240 | | let run_array = builder.finish(); |
3241 | | |
3242 | | // slice lengths that are tested |
3243 | | let slice_lens = [ |
3244 | | 1, 2, 3, 4, 5, 6, 7, 37, 38, 39, 40, 41, 42, 43, 74, 75, 76, 77, 78, 79, 80, |
3245 | | ]; |
3246 | | for slice_len in slice_lens { |
3247 | | test_sort_run_inner2( |
3248 | | input_array.as_slice(), |
3249 | | &run_array, |
3250 | | 0, |
3251 | | slice_len, |
3252 | | None, |
3253 | | &sort_fn, |
3254 | | ); |
3255 | | test_sort_run_inner2( |
3256 | | input_array.as_slice(), |
3257 | | &run_array, |
3258 | | total_len - slice_len, |
3259 | | slice_len, |
3260 | | None, |
3261 | | &sort_fn, |
3262 | | ); |
3263 | | // Test with non zero limit |
3264 | | if slice_len > 1 { |
3265 | | test_sort_run_inner2( |
3266 | | input_array.as_slice(), |
3267 | | &run_array, |
3268 | | 0, |
3269 | | slice_len, |
3270 | | Some(slice_len / 2), |
3271 | | &sort_fn, |
3272 | | ); |
3273 | | test_sort_run_inner2( |
3274 | | input_array.as_slice(), |
3275 | | &run_array, |
3276 | | total_len - slice_len, |
3277 | | slice_len, |
3278 | | Some(slice_len / 2), |
3279 | | &sort_fn, |
3280 | | ); |
3281 | | } |
3282 | | } |
3283 | | } |
3284 | | |
3285 | | fn test_sort_run_inner2<F>( |
3286 | | input_array: &[Option<i32>], |
3287 | | run_array: &RunArray<Int16Type>, |
3288 | | offset: usize, |
3289 | | length: usize, |
3290 | | limit: Option<usize>, |
3291 | | sort_fn: &F, |
3292 | | ) where |
3293 | | F: Fn(&dyn Array, Option<SortOptions>, Option<usize>) -> Result<ArrayRef, ArrowError>, |
3294 | | { |
3295 | | // Run the sort and build actual result |
3296 | | let sliced_array = run_array.slice(offset, length); |
3297 | | let sorted_sliced_array = sort_fn(&sliced_array, None, limit).unwrap(); |
3298 | | let sorted_run_array = sorted_sliced_array |
3299 | | .as_any() |
3300 | | .downcast_ref::<RunArray<Int16Type>>() |
3301 | | .unwrap(); |
3302 | | let typed_run_array = sorted_run_array |
3303 | | .downcast::<PrimitiveArray<Int32Type>>() |
3304 | | .unwrap(); |
3305 | | let actual: Vec<Option<i32>> = typed_run_array.into_iter().collect(); |
3306 | | |
3307 | | // build expected result. |
3308 | | let mut sliced_input = input_array[offset..(offset + length)].to_owned(); |
3309 | | sliced_input.sort(); |
3310 | | let expected = if let Some(limit) = limit { |
3311 | | sliced_input.iter().take(limit).copied().collect() |
3312 | | } else { |
3313 | | sliced_input |
3314 | | }; |
3315 | | |
3316 | | assert_eq!(expected, actual) |
3317 | | } |
3318 | | |
3319 | | #[test] |
3320 | | fn test_sort_string_dicts() { |
3321 | | test_sort_string_dict_arrays::<Int8Type>( |
3322 | | vec![ |
3323 | | None, |
3324 | | Some("bad"), |
3325 | | Some("sad"), |
3326 | | None, |
3327 | | Some("glad"), |
3328 | | Some("-ad"), |
3329 | | ], |
3330 | | None, |
3331 | | None, |
3332 | | vec![ |
3333 | | None, |
3334 | | None, |
3335 | | Some("-ad"), |
3336 | | Some("bad"), |
3337 | | Some("glad"), |
3338 | | Some("sad"), |
3339 | | ], |
3340 | | ); |
3341 | | |
3342 | | test_sort_string_dict_arrays::<Int16Type>( |
3343 | | vec![ |
3344 | | None, |
3345 | | Some("bad"), |
3346 | | Some("sad"), |
3347 | | None, |
3348 | | Some("glad"), |
3349 | | Some("-ad"), |
3350 | | ], |
3351 | | Some(SortOptions { |
3352 | | descending: true, |
3353 | | nulls_first: false, |
3354 | | }), |
3355 | | None, |
3356 | | vec![ |
3357 | | Some("sad"), |
3358 | | Some("glad"), |
3359 | | Some("bad"), |
3360 | | Some("-ad"), |
3361 | | None, |
3362 | | None, |
3363 | | ], |
3364 | | ); |
3365 | | |
3366 | | test_sort_string_dict_arrays::<Int32Type>( |
3367 | | vec![ |
3368 | | None, |
3369 | | Some("bad"), |
3370 | | Some("sad"), |
3371 | | None, |
3372 | | Some("glad"), |
3373 | | Some("-ad"), |
3374 | | ], |
3375 | | Some(SortOptions { |
3376 | | descending: false, |
3377 | | nulls_first: true, |
3378 | | }), |
3379 | | None, |
3380 | | vec![ |
3381 | | None, |
3382 | | None, |
3383 | | Some("-ad"), |
3384 | | Some("bad"), |
3385 | | Some("glad"), |
3386 | | Some("sad"), |
3387 | | ], |
3388 | | ); |
3389 | | |
3390 | | test_sort_string_dict_arrays::<Int16Type>( |
3391 | | vec![ |
3392 | | None, |
3393 | | Some("bad"), |
3394 | | Some("sad"), |
3395 | | None, |
3396 | | Some("glad"), |
3397 | | Some("-ad"), |
3398 | | ], |
3399 | | Some(SortOptions { |
3400 | | descending: true, |
3401 | | nulls_first: true, |
3402 | | }), |
3403 | | None, |
3404 | | vec![ |
3405 | | None, |
3406 | | None, |
3407 | | Some("sad"), |
3408 | | Some("glad"), |
3409 | | Some("bad"), |
3410 | | Some("-ad"), |
3411 | | ], |
3412 | | ); |
3413 | | |
3414 | | test_sort_string_dict_arrays::<Int16Type>( |
3415 | | vec![ |
3416 | | None, |
3417 | | Some("bad"), |
3418 | | Some("sad"), |
3419 | | None, |
3420 | | Some("glad"), |
3421 | | Some("-ad"), |
3422 | | ], |
3423 | | Some(SortOptions { |
3424 | | descending: true, |
3425 | | nulls_first: true, |
3426 | | }), |
3427 | | Some(3), |
3428 | | vec![None, None, Some("sad")], |
3429 | | ); |
3430 | | |
3431 | | // valid values less than limit with extra nulls |
3432 | | test_sort_string_dict_arrays::<Int16Type>( |
3433 | | vec![Some("def"), None, None, Some("abc")], |
3434 | | Some(SortOptions { |
3435 | | descending: false, |
3436 | | nulls_first: false, |
3437 | | }), |
3438 | | Some(3), |
3439 | | vec![Some("abc"), Some("def"), None], |
3440 | | ); |
3441 | | |
3442 | | test_sort_string_dict_arrays::<Int16Type>( |
3443 | | vec![Some("def"), None, None, Some("abc")], |
3444 | | Some(SortOptions { |
3445 | | descending: false, |
3446 | | nulls_first: true, |
3447 | | }), |
3448 | | Some(3), |
3449 | | vec![None, None, Some("abc")], |
3450 | | ); |
3451 | | |
3452 | | // more nulls than limit |
3453 | | test_sort_string_dict_arrays::<Int16Type>( |
3454 | | vec![Some("def"), None, None, None], |
3455 | | Some(SortOptions { |
3456 | | descending: false, |
3457 | | nulls_first: true, |
3458 | | }), |
3459 | | Some(2), |
3460 | | vec![None, None], |
3461 | | ); |
3462 | | |
3463 | | test_sort_string_dict_arrays::<Int16Type>( |
3464 | | vec![Some("def"), None, None, None], |
3465 | | Some(SortOptions { |
3466 | | descending: false, |
3467 | | nulls_first: false, |
3468 | | }), |
3469 | | Some(2), |
3470 | | vec![Some("def"), None], |
3471 | | ); |
3472 | | } |
3473 | | |
3474 | | #[test] |
3475 | | fn test_sort_list() { |
3476 | | test_sort_list_arrays::<Int8Type>( |
3477 | | vec![ |
3478 | | Some(vec![Some(1)]), |
3479 | | Some(vec![Some(4)]), |
3480 | | Some(vec![Some(2)]), |
3481 | | Some(vec![Some(3)]), |
3482 | | ], |
3483 | | Some(SortOptions { |
3484 | | descending: false, |
3485 | | nulls_first: false, |
3486 | | }), |
3487 | | None, |
3488 | | vec![ |
3489 | | Some(vec![Some(1)]), |
3490 | | Some(vec![Some(2)]), |
3491 | | Some(vec![Some(3)]), |
3492 | | Some(vec![Some(4)]), |
3493 | | ], |
3494 | | Some(1), |
3495 | | ); |
3496 | | |
3497 | | test_sort_list_arrays::<Float16Type>( |
3498 | | vec![ |
3499 | | Some(vec![Some(f16::from_f32(1.0)), Some(f16::from_f32(0.0))]), |
3500 | | Some(vec![ |
3501 | | Some(f16::from_f32(4.0)), |
3502 | | Some(f16::from_f32(3.0)), |
3503 | | Some(f16::from_f32(2.0)), |
3504 | | Some(f16::from_f32(1.0)), |
3505 | | ]), |
3506 | | Some(vec![ |
3507 | | Some(f16::from_f32(2.0)), |
3508 | | Some(f16::from_f32(3.0)), |
3509 | | Some(f16::from_f32(4.0)), |
3510 | | ]), |
3511 | | Some(vec![ |
3512 | | Some(f16::from_f32(3.0)), |
3513 | | Some(f16::from_f32(3.0)), |
3514 | | Some(f16::from_f32(3.0)), |
3515 | | Some(f16::from_f32(3.0)), |
3516 | | ]), |
3517 | | Some(vec![Some(f16::from_f32(1.0)), Some(f16::from_f32(1.0))]), |
3518 | | ], |
3519 | | Some(SortOptions { |
3520 | | descending: false, |
3521 | | nulls_first: false, |
3522 | | }), |
3523 | | None, |
3524 | | vec![ |
3525 | | Some(vec![Some(f16::from_f32(1.0)), Some(f16::from_f32(0.0))]), |
3526 | | Some(vec![Some(f16::from_f32(1.0)), Some(f16::from_f32(1.0))]), |
3527 | | Some(vec![ |
3528 | | Some(f16::from_f32(2.0)), |
3529 | | Some(f16::from_f32(3.0)), |
3530 | | Some(f16::from_f32(4.0)), |
3531 | | ]), |
3532 | | Some(vec![ |
3533 | | Some(f16::from_f32(3.0)), |
3534 | | Some(f16::from_f32(3.0)), |
3535 | | Some(f16::from_f32(3.0)), |
3536 | | Some(f16::from_f32(3.0)), |
3537 | | ]), |
3538 | | Some(vec![ |
3539 | | Some(f16::from_f32(4.0)), |
3540 | | Some(f16::from_f32(3.0)), |
3541 | | Some(f16::from_f32(2.0)), |
3542 | | Some(f16::from_f32(1.0)), |
3543 | | ]), |
3544 | | ], |
3545 | | None, |
3546 | | ); |
3547 | | |
3548 | | test_sort_list_arrays::<Float32Type>( |
3549 | | vec![ |
3550 | | Some(vec![Some(1.0), Some(0.0)]), |
3551 | | Some(vec![Some(4.0), Some(3.0), Some(2.0), Some(1.0)]), |
3552 | | Some(vec![Some(2.0), Some(3.0), Some(4.0)]), |
3553 | | Some(vec![Some(3.0), Some(3.0), Some(3.0), Some(3.0)]), |
3554 | | Some(vec![Some(1.0), Some(1.0)]), |
3555 | | ], |
3556 | | Some(SortOptions { |
3557 | | descending: false, |
3558 | | nulls_first: false, |
3559 | | }), |
3560 | | None, |
3561 | | vec![ |
3562 | | Some(vec![Some(1.0), Some(0.0)]), |
3563 | | Some(vec![Some(1.0), Some(1.0)]), |
3564 | | Some(vec![Some(2.0), Some(3.0), Some(4.0)]), |
3565 | | Some(vec![Some(3.0), Some(3.0), Some(3.0), Some(3.0)]), |
3566 | | Some(vec![Some(4.0), Some(3.0), Some(2.0), Some(1.0)]), |
3567 | | ], |
3568 | | None, |
3569 | | ); |
3570 | | |
3571 | | test_sort_list_arrays::<Float64Type>( |
3572 | | vec![ |
3573 | | Some(vec![Some(1.0), Some(0.0)]), |
3574 | | Some(vec![Some(4.0), Some(3.0), Some(2.0), Some(1.0)]), |
3575 | | Some(vec![Some(2.0), Some(3.0), Some(4.0)]), |
3576 | | Some(vec![Some(3.0), Some(3.0), Some(3.0), Some(3.0)]), |
3577 | | Some(vec![Some(1.0), Some(1.0)]), |
3578 | | ], |
3579 | | Some(SortOptions { |
3580 | | descending: false, |
3581 | | nulls_first: false, |
3582 | | }), |
3583 | | None, |
3584 | | vec![ |
3585 | | Some(vec![Some(1.0), Some(0.0)]), |
3586 | | Some(vec![Some(1.0), Some(1.0)]), |
3587 | | Some(vec![Some(2.0), Some(3.0), Some(4.0)]), |
3588 | | Some(vec![Some(3.0), Some(3.0), Some(3.0), Some(3.0)]), |
3589 | | Some(vec![Some(4.0), Some(3.0), Some(2.0), Some(1.0)]), |
3590 | | ], |
3591 | | None, |
3592 | | ); |
3593 | | |
3594 | | test_sort_list_arrays::<Int32Type>( |
3595 | | vec![ |
3596 | | Some(vec![Some(1), Some(0)]), |
3597 | | Some(vec![Some(4), Some(3), Some(2), Some(1)]), |
3598 | | Some(vec![Some(2), Some(3), Some(4)]), |
3599 | | Some(vec![Some(3), Some(3), Some(3), Some(3)]), |
3600 | | Some(vec![Some(1), Some(1)]), |
3601 | | ], |
3602 | | Some(SortOptions { |
3603 | | descending: false, |
3604 | | nulls_first: false, |
3605 | | }), |
3606 | | None, |
3607 | | vec![ |
3608 | | Some(vec![Some(1), Some(0)]), |
3609 | | Some(vec![Some(1), Some(1)]), |
3610 | | Some(vec![Some(2), Some(3), Some(4)]), |
3611 | | Some(vec![Some(3), Some(3), Some(3), Some(3)]), |
3612 | | Some(vec![Some(4), Some(3), Some(2), Some(1)]), |
3613 | | ], |
3614 | | None, |
3615 | | ); |
3616 | | |
3617 | | test_sort_list_arrays::<Int32Type>( |
3618 | | vec![ |
3619 | | None, |
3620 | | Some(vec![Some(4), None, Some(2)]), |
3621 | | Some(vec![Some(2), Some(3), Some(4)]), |
3622 | | None, |
3623 | | Some(vec![Some(3), Some(3), None]), |
3624 | | ], |
3625 | | Some(SortOptions { |
3626 | | descending: false, |
3627 | | nulls_first: false, |
3628 | | }), |
3629 | | None, |
3630 | | vec![ |
3631 | | Some(vec![Some(2), Some(3), Some(4)]), |
3632 | | Some(vec![Some(3), Some(3), None]), |
3633 | | Some(vec![Some(4), None, Some(2)]), |
3634 | | None, |
3635 | | None, |
3636 | | ], |
3637 | | Some(3), |
3638 | | ); |
3639 | | |
3640 | | test_sort_list_arrays::<Int32Type>( |
3641 | | vec![ |
3642 | | Some(vec![Some(1), Some(0)]), |
3643 | | Some(vec![Some(4), Some(3), Some(2), Some(1)]), |
3644 | | Some(vec![Some(2), Some(3), Some(4)]), |
3645 | | Some(vec![Some(3), Some(3), Some(3), Some(3)]), |
3646 | | Some(vec![Some(1), Some(1)]), |
3647 | | ], |
3648 | | Some(SortOptions { |
3649 | | descending: false, |
3650 | | nulls_first: false, |
3651 | | }), |
3652 | | Some(2), |
3653 | | vec![Some(vec![Some(1), Some(0)]), Some(vec![Some(1), Some(1)])], |
3654 | | None, |
3655 | | ); |
3656 | | |
3657 | | // valid values less than limit with extra nulls |
3658 | | test_sort_list_arrays::<Int32Type>( |
3659 | | vec![Some(vec![Some(1)]), None, None, Some(vec![Some(2)])], |
3660 | | Some(SortOptions { |
3661 | | descending: false, |
3662 | | nulls_first: false, |
3663 | | }), |
3664 | | Some(3), |
3665 | | vec![Some(vec![Some(1)]), Some(vec![Some(2)]), None], |
3666 | | None, |
3667 | | ); |
3668 | | |
3669 | | test_sort_list_arrays::<Int32Type>( |
3670 | | vec![Some(vec![Some(1)]), None, None, Some(vec![Some(2)])], |
3671 | | Some(SortOptions { |
3672 | | descending: false, |
3673 | | nulls_first: true, |
3674 | | }), |
3675 | | Some(3), |
3676 | | vec![None, None, Some(vec![Some(1)])], |
3677 | | None, |
3678 | | ); |
3679 | | |
3680 | | // more nulls than limit |
3681 | | test_sort_list_arrays::<Int32Type>( |
3682 | | vec![Some(vec![Some(1)]), None, None, None], |
3683 | | Some(SortOptions { |
3684 | | descending: false, |
3685 | | nulls_first: true, |
3686 | | }), |
3687 | | Some(2), |
3688 | | vec![None, None], |
3689 | | None, |
3690 | | ); |
3691 | | |
3692 | | test_sort_list_arrays::<Int32Type>( |
3693 | | vec![Some(vec![Some(1)]), None, None, None], |
3694 | | Some(SortOptions { |
3695 | | descending: false, |
3696 | | nulls_first: false, |
3697 | | }), |
3698 | | Some(2), |
3699 | | vec![Some(vec![Some(1)]), None], |
3700 | | None, |
3701 | | ); |
3702 | | } |
3703 | | |
3704 | | #[test] |
3705 | | fn test_sort_binary() { |
3706 | | test_sort_binary_arrays( |
3707 | | vec![ |
3708 | | Some(vec![0, 0, 0]), |
3709 | | Some(vec![0, 0, 5]), |
3710 | | Some(vec![0, 0, 3]), |
3711 | | Some(vec![0, 0, 7]), |
3712 | | Some(vec![0, 0, 1]), |
3713 | | ], |
3714 | | Some(SortOptions { |
3715 | | descending: false, |
3716 | | nulls_first: false, |
3717 | | }), |
3718 | | None, |
3719 | | vec![ |
3720 | | Some(vec![0, 0, 0]), |
3721 | | Some(vec![0, 0, 1]), |
3722 | | Some(vec![0, 0, 3]), |
3723 | | Some(vec![0, 0, 5]), |
3724 | | Some(vec![0, 0, 7]), |
3725 | | ], |
3726 | | Some(3), |
3727 | | ); |
3728 | | |
3729 | | // with nulls |
3730 | | test_sort_binary_arrays( |
3731 | | vec![ |
3732 | | Some(vec![0, 0, 0]), |
3733 | | None, |
3734 | | Some(vec![0, 0, 3]), |
3735 | | Some(vec![0, 0, 7]), |
3736 | | Some(vec![0, 0, 1]), |
3737 | | None, |
3738 | | ], |
3739 | | Some(SortOptions { |
3740 | | descending: false, |
3741 | | nulls_first: false, |
3742 | | }), |
3743 | | None, |
3744 | | vec![ |
3745 | | Some(vec![0, 0, 0]), |
3746 | | Some(vec![0, 0, 1]), |
3747 | | Some(vec![0, 0, 3]), |
3748 | | Some(vec![0, 0, 7]), |
3749 | | None, |
3750 | | None, |
3751 | | ], |
3752 | | Some(3), |
3753 | | ); |
3754 | | |
3755 | | test_sort_binary_arrays( |
3756 | | vec![ |
3757 | | Some(vec![3, 5, 7]), |
3758 | | None, |
3759 | | Some(vec![1, 7, 1]), |
3760 | | Some(vec![2, 7, 3]), |
3761 | | None, |
3762 | | Some(vec![1, 4, 3]), |
3763 | | ], |
3764 | | Some(SortOptions { |
3765 | | descending: false, |
3766 | | nulls_first: false, |
3767 | | }), |
3768 | | None, |
3769 | | vec![ |
3770 | | Some(vec![1, 4, 3]), |
3771 | | Some(vec![1, 7, 1]), |
3772 | | Some(vec![2, 7, 3]), |
3773 | | Some(vec![3, 5, 7]), |
3774 | | None, |
3775 | | None, |
3776 | | ], |
3777 | | Some(3), |
3778 | | ); |
3779 | | |
3780 | | // descending |
3781 | | test_sort_binary_arrays( |
3782 | | vec![ |
3783 | | Some(vec![0, 0, 0]), |
3784 | | None, |
3785 | | Some(vec![0, 0, 3]), |
3786 | | Some(vec![0, 0, 7]), |
3787 | | Some(vec![0, 0, 1]), |
3788 | | None, |
3789 | | ], |
3790 | | Some(SortOptions { |
3791 | | descending: true, |
3792 | | nulls_first: false, |
3793 | | }), |
3794 | | None, |
3795 | | vec![ |
3796 | | Some(vec![0, 0, 7]), |
3797 | | Some(vec![0, 0, 3]), |
3798 | | Some(vec![0, 0, 1]), |
3799 | | Some(vec![0, 0, 0]), |
3800 | | None, |
3801 | | None, |
3802 | | ], |
3803 | | Some(3), |
3804 | | ); |
3805 | | |
3806 | | // nulls first |
3807 | | test_sort_binary_arrays( |
3808 | | vec![ |
3809 | | Some(vec![0, 0, 0]), |
3810 | | None, |
3811 | | Some(vec![0, 0, 3]), |
3812 | | Some(vec![0, 0, 7]), |
3813 | | Some(vec![0, 0, 1]), |
3814 | | None, |
3815 | | ], |
3816 | | Some(SortOptions { |
3817 | | descending: false, |
3818 | | nulls_first: true, |
3819 | | }), |
3820 | | None, |
3821 | | vec![ |
3822 | | None, |
3823 | | None, |
3824 | | Some(vec![0, 0, 0]), |
3825 | | Some(vec![0, 0, 1]), |
3826 | | Some(vec![0, 0, 3]), |
3827 | | Some(vec![0, 0, 7]), |
3828 | | ], |
3829 | | Some(3), |
3830 | | ); |
3831 | | |
3832 | | // limit |
3833 | | test_sort_binary_arrays( |
3834 | | vec![ |
3835 | | Some(vec![0, 0, 0]), |
3836 | | None, |
3837 | | Some(vec![0, 0, 3]), |
3838 | | Some(vec![0, 0, 7]), |
3839 | | Some(vec![0, 0, 1]), |
3840 | | None, |
3841 | | ], |
3842 | | Some(SortOptions { |
3843 | | descending: false, |
3844 | | nulls_first: true, |
3845 | | }), |
3846 | | Some(4), |
3847 | | vec![None, None, Some(vec![0, 0, 0]), Some(vec![0, 0, 1])], |
3848 | | Some(3), |
3849 | | ); |
3850 | | |
3851 | | // var length |
3852 | | test_sort_binary_arrays( |
3853 | | vec![ |
3854 | | Some(b"Hello".to_vec()), |
3855 | | None, |
3856 | | Some(b"from".to_vec()), |
3857 | | Some(b"Apache".to_vec()), |
3858 | | Some(b"Arrow-rs".to_vec()), |
3859 | | None, |
3860 | | ], |
3861 | | Some(SortOptions { |
3862 | | descending: false, |
3863 | | nulls_first: false, |
3864 | | }), |
3865 | | None, |
3866 | | vec![ |
3867 | | Some(b"Apache".to_vec()), |
3868 | | Some(b"Arrow-rs".to_vec()), |
3869 | | Some(b"Hello".to_vec()), |
3870 | | Some(b"from".to_vec()), |
3871 | | None, |
3872 | | None, |
3873 | | ], |
3874 | | None, |
3875 | | ); |
3876 | | |
3877 | | // limit |
3878 | | test_sort_binary_arrays( |
3879 | | vec![ |
3880 | | Some(b"Hello".to_vec()), |
3881 | | None, |
3882 | | Some(b"from".to_vec()), |
3883 | | Some(b"Apache".to_vec()), |
3884 | | Some(b"Arrow-rs".to_vec()), |
3885 | | None, |
3886 | | ], |
3887 | | Some(SortOptions { |
3888 | | descending: false, |
3889 | | nulls_first: true, |
3890 | | }), |
3891 | | Some(4), |
3892 | | vec![ |
3893 | | None, |
3894 | | None, |
3895 | | Some(b"Apache".to_vec()), |
3896 | | Some(b"Arrow-rs".to_vec()), |
3897 | | ], |
3898 | | None, |
3899 | | ); |
3900 | | } |
3901 | | |
3902 | | #[test] |
3903 | | fn test_lex_sort_single_column() { |
3904 | | let input = vec![SortColumn { |
3905 | | values: Arc::new(PrimitiveArray::<Int64Type>::from(vec![ |
3906 | | Some(17), |
3907 | | Some(2), |
3908 | | Some(-1), |
3909 | | Some(0), |
3910 | | ])) as ArrayRef, |
3911 | | options: None, |
3912 | | }]; |
3913 | | let expected = vec![Arc::new(PrimitiveArray::<Int64Type>::from(vec![ |
3914 | | Some(-1), |
3915 | | Some(0), |
3916 | | Some(2), |
3917 | | Some(17), |
3918 | | ])) as ArrayRef]; |
3919 | | test_lex_sort_arrays(input.clone(), expected.clone(), None); |
3920 | | test_lex_sort_arrays(input.clone(), slice_arrays(expected, 0, 2), Some(2)); |
3921 | | |
3922 | | // Explicitly test a limit on the sort as a demonstration |
3923 | | let expected = vec![Arc::new(PrimitiveArray::<Int64Type>::from(vec![ |
3924 | | Some(-1), |
3925 | | Some(0), |
3926 | | Some(2), |
3927 | | ])) as ArrayRef]; |
3928 | | test_lex_sort_arrays(input, expected, Some(3)); |
3929 | | } |
3930 | | |
3931 | | #[test] |
3932 | | fn test_lex_sort_unaligned_rows() { |
3933 | | let input = vec![ |
3934 | | SortColumn { |
3935 | | values: Arc::new(PrimitiveArray::<Int64Type>::from(vec![None, Some(-1)])) |
3936 | | as ArrayRef, |
3937 | | options: None, |
3938 | | }, |
3939 | | SortColumn { |
3940 | | values: Arc::new(StringArray::from(vec![Some("foo")])) as ArrayRef, |
3941 | | options: None, |
3942 | | }, |
3943 | | ]; |
3944 | | assert!( |
3945 | | lexsort(&input, None).is_err(), |
3946 | | "lexsort should reject columns with different row counts" |
3947 | | ); |
3948 | | } |
3949 | | |
3950 | | #[test] |
3951 | | fn test_lex_sort_mixed_types() { |
3952 | | let input = vec![ |
3953 | | SortColumn { |
3954 | | values: Arc::new(PrimitiveArray::<Int64Type>::from(vec![ |
3955 | | Some(0), |
3956 | | Some(2), |
3957 | | Some(-1), |
3958 | | Some(0), |
3959 | | ])) as ArrayRef, |
3960 | | options: None, |
3961 | | }, |
3962 | | SortColumn { |
3963 | | values: Arc::new(PrimitiveArray::<UInt32Type>::from(vec![ |
3964 | | Some(101), |
3965 | | Some(8), |
3966 | | Some(7), |
3967 | | Some(102), |
3968 | | ])) as ArrayRef, |
3969 | | options: None, |
3970 | | }, |
3971 | | SortColumn { |
3972 | | values: Arc::new(PrimitiveArray::<Int64Type>::from(vec![ |
3973 | | Some(-1), |
3974 | | Some(-2), |
3975 | | Some(-3), |
3976 | | Some(-4), |
3977 | | ])) as ArrayRef, |
3978 | | options: None, |
3979 | | }, |
3980 | | ]; |
3981 | | let expected = vec![ |
3982 | | Arc::new(PrimitiveArray::<Int64Type>::from(vec![ |
3983 | | Some(-1), |
3984 | | Some(0), |
3985 | | Some(0), |
3986 | | Some(2), |
3987 | | ])) as ArrayRef, |
3988 | | Arc::new(PrimitiveArray::<UInt32Type>::from(vec![ |
3989 | | Some(7), |
3990 | | Some(101), |
3991 | | Some(102), |
3992 | | Some(8), |
3993 | | ])) as ArrayRef, |
3994 | | Arc::new(PrimitiveArray::<Int64Type>::from(vec![ |
3995 | | Some(-3), |
3996 | | Some(-1), |
3997 | | Some(-4), |
3998 | | Some(-2), |
3999 | | ])) as ArrayRef, |
4000 | | ]; |
4001 | | test_lex_sort_arrays(input.clone(), expected.clone(), None); |
4002 | | test_lex_sort_arrays(input, slice_arrays(expected, 0, 2), Some(2)); |
4003 | | |
4004 | | // test mix of string and in64 with option |
4005 | | let input = vec![ |
4006 | | SortColumn { |
4007 | | values: Arc::new(PrimitiveArray::<Int64Type>::from(vec![ |
4008 | | Some(0), |
4009 | | Some(2), |
4010 | | Some(-1), |
4011 | | Some(0), |
4012 | | ])) as ArrayRef, |
4013 | | options: Some(SortOptions { |
4014 | | descending: true, |
4015 | | nulls_first: true, |
4016 | | }), |
4017 | | }, |
4018 | | SortColumn { |
4019 | | values: Arc::new(StringArray::from(vec![ |
4020 | | Some("foo"), |
4021 | | Some("9"), |
4022 | | Some("7"), |
4023 | | Some("bar"), |
4024 | | ])) as ArrayRef, |
4025 | | options: Some(SortOptions { |
4026 | | descending: true, |
4027 | | nulls_first: true, |
4028 | | }), |
4029 | | }, |
4030 | | ]; |
4031 | | let expected = vec![ |
4032 | | Arc::new(PrimitiveArray::<Int64Type>::from(vec![ |
4033 | | Some(2), |
4034 | | Some(0), |
4035 | | Some(0), |
4036 | | Some(-1), |
4037 | | ])) as ArrayRef, |
4038 | | Arc::new(StringArray::from(vec![ |
4039 | | Some("9"), |
4040 | | Some("foo"), |
4041 | | Some("bar"), |
4042 | | Some("7"), |
4043 | | ])) as ArrayRef, |
4044 | | ]; |
4045 | | test_lex_sort_arrays(input.clone(), expected.clone(), None); |
4046 | | test_lex_sort_arrays(input, slice_arrays(expected, 0, 3), Some(3)); |
4047 | | |
4048 | | // test sort with nulls first |
4049 | | let input = vec![ |
4050 | | SortColumn { |
4051 | | values: Arc::new(PrimitiveArray::<Int64Type>::from(vec![ |
4052 | | None, |
4053 | | Some(-1), |
4054 | | Some(2), |
4055 | | None, |
4056 | | ])) as ArrayRef, |
4057 | | options: Some(SortOptions { |
4058 | | descending: true, |
4059 | | nulls_first: true, |
4060 | | }), |
4061 | | }, |
4062 | | SortColumn { |
4063 | | values: Arc::new(StringArray::from(vec![ |
4064 | | Some("foo"), |
4065 | | Some("world"), |
4066 | | Some("hello"), |
4067 | | None, |
4068 | | ])) as ArrayRef, |
4069 | | options: Some(SortOptions { |
4070 | | descending: true, |
4071 | | nulls_first: true, |
4072 | | }), |
4073 | | }, |
4074 | | ]; |
4075 | | let expected = vec![ |
4076 | | Arc::new(PrimitiveArray::<Int64Type>::from(vec![ |
4077 | | None, |
4078 | | None, |
4079 | | Some(2), |
4080 | | Some(-1), |
4081 | | ])) as ArrayRef, |
4082 | | Arc::new(StringArray::from(vec![ |
4083 | | None, |
4084 | | Some("foo"), |
4085 | | Some("hello"), |
4086 | | Some("world"), |
4087 | | ])) as ArrayRef, |
4088 | | ]; |
4089 | | test_lex_sort_arrays(input.clone(), expected.clone(), None); |
4090 | | test_lex_sort_arrays(input, slice_arrays(expected, 0, 1), Some(1)); |
4091 | | |
4092 | | // test sort with nulls last |
4093 | | let input = vec![ |
4094 | | SortColumn { |
4095 | | values: Arc::new(PrimitiveArray::<Int64Type>::from(vec![ |
4096 | | None, |
4097 | | Some(-1), |
4098 | | Some(2), |
4099 | | None, |
4100 | | ])) as ArrayRef, |
4101 | | options: Some(SortOptions { |
4102 | | descending: true, |
4103 | | nulls_first: false, |
4104 | | }), |
4105 | | }, |
4106 | | SortColumn { |
4107 | | values: Arc::new(StringArray::from(vec![ |
4108 | | Some("foo"), |
4109 | | Some("world"), |
4110 | | Some("hello"), |
4111 | | None, |
4112 | | ])) as ArrayRef, |
4113 | | options: Some(SortOptions { |
4114 | | descending: true, |
4115 | | nulls_first: false, |
4116 | | }), |
4117 | | }, |
4118 | | ]; |
4119 | | let expected = vec![ |
4120 | | Arc::new(PrimitiveArray::<Int64Type>::from(vec![ |
4121 | | Some(2), |
4122 | | Some(-1), |
4123 | | None, |
4124 | | None, |
4125 | | ])) as ArrayRef, |
4126 | | Arc::new(StringArray::from(vec![ |
4127 | | Some("hello"), |
4128 | | Some("world"), |
4129 | | Some("foo"), |
4130 | | None, |
4131 | | ])) as ArrayRef, |
4132 | | ]; |
4133 | | test_lex_sort_arrays(input.clone(), expected.clone(), None); |
4134 | | test_lex_sort_arrays(input, slice_arrays(expected, 0, 2), Some(2)); |
4135 | | |
4136 | | // test sort with opposite options |
4137 | | let input = vec![ |
4138 | | SortColumn { |
4139 | | values: Arc::new(PrimitiveArray::<Int64Type>::from(vec![ |
4140 | | None, |
4141 | | Some(-1), |
4142 | | Some(2), |
4143 | | Some(-1), |
4144 | | None, |
4145 | | ])) as ArrayRef, |
4146 | | options: Some(SortOptions { |
4147 | | descending: false, |
4148 | | nulls_first: false, |
4149 | | }), |
4150 | | }, |
4151 | | SortColumn { |
4152 | | values: Arc::new(StringArray::from(vec![ |
4153 | | Some("foo"), |
4154 | | Some("bar"), |
4155 | | Some("world"), |
4156 | | Some("hello"), |
4157 | | None, |
4158 | | ])) as ArrayRef, |
4159 | | options: Some(SortOptions { |
4160 | | descending: true, |
4161 | | nulls_first: true, |
4162 | | }), |
4163 | | }, |
4164 | | ]; |
4165 | | let expected = vec![ |
4166 | | Arc::new(PrimitiveArray::<Int64Type>::from(vec![ |
4167 | | Some(-1), |
4168 | | Some(-1), |
4169 | | Some(2), |
4170 | | None, |
4171 | | None, |
4172 | | ])) as ArrayRef, |
4173 | | Arc::new(StringArray::from(vec![ |
4174 | | Some("hello"), |
4175 | | Some("bar"), |
4176 | | Some("world"), |
4177 | | None, |
4178 | | Some("foo"), |
4179 | | ])) as ArrayRef, |
4180 | | ]; |
4181 | | test_lex_sort_arrays(input.clone(), expected.clone(), None); |
4182 | | test_lex_sort_arrays(input.clone(), slice_arrays(expected.clone(), 0, 5), Some(5)); |
4183 | | |
4184 | | // Limiting by more rows than present is ok |
4185 | | test_lex_sort_arrays(input, slice_arrays(expected, 0, 5), Some(10)); |
4186 | | |
4187 | | // test with FixedSizeListArray, arrays order: [UInt32, FixedSizeList(UInt32, 1)] |
4188 | | |
4189 | | // case1 |
4190 | | let primitive_array_data = vec![ |
4191 | | Some(2), |
4192 | | Some(3), |
4193 | | Some(2), |
4194 | | Some(0), |
4195 | | None, |
4196 | | Some(2), |
4197 | | Some(1), |
4198 | | Some(2), |
4199 | | ]; |
4200 | | let list_array_data = vec![ |
4201 | | None, |
4202 | | Some(vec![Some(4)]), |
4203 | | Some(vec![Some(3)]), |
4204 | | Some(vec![Some(1)]), |
4205 | | Some(vec![Some(5)]), |
4206 | | Some(vec![Some(0)]), |
4207 | | Some(vec![Some(2)]), |
4208 | | Some(vec![None]), |
4209 | | ]; |
4210 | | |
4211 | | let expected_primitive_array_data = vec![ |
4212 | | None, |
4213 | | Some(0), |
4214 | | Some(1), |
4215 | | Some(2), |
4216 | | Some(2), |
4217 | | Some(2), |
4218 | | Some(2), |
4219 | | Some(3), |
4220 | | ]; |
4221 | | let expected_list_array_data = vec![ |
4222 | | Some(vec![Some(5)]), |
4223 | | Some(vec![Some(1)]), |
4224 | | Some(vec![Some(2)]), |
4225 | | None, // <- |
4226 | | Some(vec![None]), |
4227 | | Some(vec![Some(0)]), |
4228 | | Some(vec![Some(3)]), // <- |
4229 | | Some(vec![Some(4)]), |
4230 | | ]; |
4231 | | test_lex_sort_mixed_types_with_fixed_size_list::<Int32Type>( |
4232 | | primitive_array_data.clone(), |
4233 | | list_array_data.clone(), |
4234 | | expected_primitive_array_data.clone(), |
4235 | | expected_list_array_data, |
4236 | | None, |
4237 | | None, |
4238 | | ); |
4239 | | |
4240 | | // case2 |
4241 | | let primitive_array_options = SortOptions { |
4242 | | descending: false, |
4243 | | nulls_first: true, |
4244 | | }; |
4245 | | let list_array_options = SortOptions { |
4246 | | descending: false, |
4247 | | nulls_first: false, // has been modified |
4248 | | }; |
4249 | | let expected_list_array_data = vec![ |
4250 | | Some(vec![Some(5)]), |
4251 | | Some(vec![Some(1)]), |
4252 | | Some(vec![Some(2)]), |
4253 | | Some(vec![Some(0)]), // <- |
4254 | | Some(vec![Some(3)]), |
4255 | | Some(vec![None]), |
4256 | | None, // <- |
4257 | | Some(vec![Some(4)]), |
4258 | | ]; |
4259 | | test_lex_sort_mixed_types_with_fixed_size_list::<Int32Type>( |
4260 | | primitive_array_data.clone(), |
4261 | | list_array_data.clone(), |
4262 | | expected_primitive_array_data.clone(), |
4263 | | expected_list_array_data, |
4264 | | Some(primitive_array_options), |
4265 | | Some(list_array_options), |
4266 | | ); |
4267 | | |
4268 | | // case3 |
4269 | | let primitive_array_options = SortOptions { |
4270 | | descending: false, |
4271 | | nulls_first: true, |
4272 | | }; |
4273 | | let list_array_options = SortOptions { |
4274 | | descending: true, // has been modified |
4275 | | nulls_first: true, |
4276 | | }; |
4277 | | let expected_list_array_data = vec![ |
4278 | | Some(vec![Some(5)]), |
4279 | | Some(vec![Some(1)]), |
4280 | | Some(vec![Some(2)]), |
4281 | | None, // <- |
4282 | | Some(vec![None]), |
4283 | | Some(vec![Some(3)]), |
4284 | | Some(vec![Some(0)]), // <- |
4285 | | Some(vec![Some(4)]), |
4286 | | ]; |
4287 | | test_lex_sort_mixed_types_with_fixed_size_list::<Int32Type>( |
4288 | | primitive_array_data.clone(), |
4289 | | list_array_data.clone(), |
4290 | | expected_primitive_array_data, |
4291 | | expected_list_array_data, |
4292 | | Some(primitive_array_options), |
4293 | | Some(list_array_options), |
4294 | | ); |
4295 | | |
4296 | | // test with ListArray/LargeListArray, arrays order: [List<UInt32>/LargeList<UInt32>, UInt32] |
4297 | | |
4298 | | let list_array_data = vec![ |
4299 | | Some(vec![Some(2), Some(1)]), // 0 |
4300 | | None, // 10 |
4301 | | Some(vec![Some(3)]), // 1 |
4302 | | Some(vec![Some(2), Some(0)]), // 2 |
4303 | | Some(vec![None, Some(2)]), // 3 |
4304 | | Some(vec![Some(0)]), // none |
4305 | | None, // 11 |
4306 | | Some(vec![Some(2), None]), // 4 |
4307 | | Some(vec![None]), // 5 |
4308 | | Some(vec![Some(2), Some(1)]), // 6 |
4309 | | ]; |
4310 | | let primitive_array_data = vec![ |
4311 | | Some(0), |
4312 | | Some(10), |
4313 | | Some(1), |
4314 | | Some(2), |
4315 | | Some(3), |
4316 | | None, |
4317 | | Some(11), |
4318 | | Some(4), |
4319 | | Some(5), |
4320 | | Some(6), |
4321 | | ]; |
4322 | | let expected_list_array_data = vec![ |
4323 | | None, |
4324 | | None, |
4325 | | Some(vec![None]), |
4326 | | Some(vec![None, Some(2)]), |
4327 | | Some(vec![Some(0)]), |
4328 | | Some(vec![Some(2), None]), |
4329 | | Some(vec![Some(2), Some(0)]), |
4330 | | Some(vec![Some(2), Some(1)]), |
4331 | | Some(vec![Some(2), Some(1)]), |
4332 | | Some(vec![Some(3)]), |
4333 | | ]; |
4334 | | let expected_primitive_array_data = vec![ |
4335 | | Some(10), |
4336 | | Some(11), |
4337 | | Some(5), |
4338 | | Some(3), |
4339 | | None, |
4340 | | Some(4), |
4341 | | Some(2), |
4342 | | Some(0), |
4343 | | Some(6), |
4344 | | Some(1), |
4345 | | ]; |
4346 | | test_lex_sort_mixed_types_with_list::<Int32Type>( |
4347 | | list_array_data.clone(), |
4348 | | primitive_array_data.clone(), |
4349 | | expected_list_array_data, |
4350 | | expected_primitive_array_data, |
4351 | | None, |
4352 | | None, |
4353 | | ); |
4354 | | } |
4355 | | |
4356 | | fn test_lex_sort_mixed_types_with_fixed_size_list<T>( |
4357 | | primitive_array_data: Vec<Option<T::Native>>, |
4358 | | list_array_data: Vec<Option<Vec<Option<T::Native>>>>, |
4359 | | expected_primitive_array_data: Vec<Option<T::Native>>, |
4360 | | expected_list_array_data: Vec<Option<Vec<Option<T::Native>>>>, |
4361 | | primitive_array_options: Option<SortOptions>, |
4362 | | list_array_options: Option<SortOptions>, |
4363 | | ) where |
4364 | | T: ArrowPrimitiveType, |
4365 | | PrimitiveArray<T>: From<Vec<Option<T::Native>>>, |
4366 | | { |
4367 | | let input = vec![ |
4368 | | SortColumn { |
4369 | | values: Arc::new(PrimitiveArray::<T>::from(primitive_array_data.clone())) |
4370 | | as ArrayRef, |
4371 | | options: primitive_array_options, |
4372 | | }, |
4373 | | SortColumn { |
4374 | | values: Arc::new(FixedSizeListArray::from_iter_primitive::<T, _, _>( |
4375 | | list_array_data.clone(), |
4376 | | 1, |
4377 | | )) as ArrayRef, |
4378 | | options: list_array_options, |
4379 | | }, |
4380 | | ]; |
4381 | | |
4382 | | let expected = vec![ |
4383 | | Arc::new(PrimitiveArray::<T>::from( |
4384 | | expected_primitive_array_data.clone(), |
4385 | | )) as ArrayRef, |
4386 | | Arc::new(FixedSizeListArray::from_iter_primitive::<T, _, _>( |
4387 | | expected_list_array_data.clone(), |
4388 | | 1, |
4389 | | )) as ArrayRef, |
4390 | | ]; |
4391 | | |
4392 | | test_lex_sort_arrays(input.clone(), expected.clone(), None); |
4393 | | test_lex_sort_arrays(input.clone(), slice_arrays(expected.clone(), 0, 5), Some(5)); |
4394 | | } |
4395 | | |
4396 | | fn test_lex_sort_mixed_types_with_list<T>( |
4397 | | list_array_data: Vec<Option<Vec<Option<T::Native>>>>, |
4398 | | primitive_array_data: Vec<Option<T::Native>>, |
4399 | | expected_list_array_data: Vec<Option<Vec<Option<T::Native>>>>, |
4400 | | expected_primitive_array_data: Vec<Option<T::Native>>, |
4401 | | list_array_options: Option<SortOptions>, |
4402 | | primitive_array_options: Option<SortOptions>, |
4403 | | ) where |
4404 | | T: ArrowPrimitiveType, |
4405 | | PrimitiveArray<T>: From<Vec<Option<T::Native>>>, |
4406 | | { |
4407 | | macro_rules! run_test { |
4408 | | ($ARRAY_TYPE:ident) => { |
4409 | | let input = vec![ |
4410 | | SortColumn { |
4411 | | values: Arc::new(<$ARRAY_TYPE>::from_iter_primitive::<T, _, _>( |
4412 | | list_array_data.clone(), |
4413 | | )) as ArrayRef, |
4414 | | options: list_array_options.clone(), |
4415 | | }, |
4416 | | SortColumn { |
4417 | | values: Arc::new(PrimitiveArray::<T>::from(primitive_array_data.clone())) |
4418 | | as ArrayRef, |
4419 | | options: primitive_array_options.clone(), |
4420 | | }, |
4421 | | ]; |
4422 | | |
4423 | | let expected = vec![ |
4424 | | Arc::new(<$ARRAY_TYPE>::from_iter_primitive::<T, _, _>( |
4425 | | expected_list_array_data.clone(), |
4426 | | )) as ArrayRef, |
4427 | | Arc::new(PrimitiveArray::<T>::from( |
4428 | | expected_primitive_array_data.clone(), |
4429 | | )) as ArrayRef, |
4430 | | ]; |
4431 | | |
4432 | | test_lex_sort_arrays(input.clone(), expected.clone(), None); |
4433 | | test_lex_sort_arrays(input.clone(), slice_arrays(expected.clone(), 0, 5), Some(5)); |
4434 | | }; |
4435 | | } |
4436 | | run_test!(ListArray); |
4437 | | run_test!(LargeListArray); |
4438 | | } |
4439 | | |
4440 | | #[test] |
4441 | | fn test_partial_sort() { |
4442 | | let mut before: Vec<&str> = vec![ |
4443 | | "a", "cat", "mat", "on", "sat", "the", "xxx", "xxxx", "fdadfdsf", |
4444 | | ]; |
4445 | | let mut d = before.clone(); |
4446 | | d.sort_unstable(); |
4447 | | |
4448 | | for last in 0..before.len() { |
4449 | | partial_sort(&mut before, last, |a, b| a.cmp(b)); |
4450 | | assert_eq!(&d[0..last], &before.as_slice()[0..last]); |
4451 | | } |
4452 | | } |
4453 | | |
4454 | | #[test] |
4455 | | fn test_partial_rand_sort() { |
4456 | | let size = 1000u32; |
4457 | | let mut rng = StdRng::seed_from_u64(42); |
4458 | | let mut before: Vec<u32> = (0..size).map(|_| rng.random::<u32>()).collect(); |
4459 | | let mut d = before.clone(); |
4460 | | let last = (rng.next_u32() % size) as usize; |
4461 | | d.sort_unstable(); |
4462 | | |
4463 | | partial_sort(&mut before, last, |a, b| a.cmp(b)); |
4464 | | assert_eq!(&d[0..last], &before[0..last]); |
4465 | | } |
4466 | | |
4467 | | #[test] |
4468 | | fn test_sort_int8_dicts() { |
4469 | | let keys = Int8Array::from(vec![Some(1_i8), None, Some(2), None, Some(2), Some(0)]); |
4470 | | let values = Int8Array::from(vec![1, 3, 5]); |
4471 | | test_sort_primitive_dict_arrays::<Int8Type, Int8Type>( |
4472 | | keys, |
4473 | | values, |
4474 | | None, |
4475 | | None, |
4476 | | vec![None, None, Some(1), Some(3), Some(5), Some(5)], |
4477 | | ); |
4478 | | |
4479 | | let keys = Int8Array::from(vec![Some(1_i8), None, Some(2), None, Some(2), Some(0)]); |
4480 | | let values = Int8Array::from(vec![1, 3, 5]); |
4481 | | test_sort_primitive_dict_arrays::<Int8Type, Int8Type>( |
4482 | | keys, |
4483 | | values, |
4484 | | Some(SortOptions { |
4485 | | descending: true, |
4486 | | nulls_first: false, |
4487 | | }), |
4488 | | None, |
4489 | | vec![Some(5), Some(5), Some(3), Some(1), None, None], |
4490 | | ); |
4491 | | |
4492 | | let keys = Int8Array::from(vec![Some(1_i8), None, Some(2), None, Some(2), Some(0)]); |
4493 | | let values = Int8Array::from(vec![1, 3, 5]); |
4494 | | test_sort_primitive_dict_arrays::<Int8Type, Int8Type>( |
4495 | | keys, |
4496 | | values, |
4497 | | Some(SortOptions { |
4498 | | descending: false, |
4499 | | nulls_first: false, |
4500 | | }), |
4501 | | None, |
4502 | | vec![Some(1), Some(3), Some(5), Some(5), None, None], |
4503 | | ); |
4504 | | |
4505 | | let keys = Int8Array::from(vec![Some(1_i8), None, Some(2), None, Some(2), Some(0)]); |
4506 | | let values = Int8Array::from(vec![1, 3, 5]); |
4507 | | test_sort_primitive_dict_arrays::<Int8Type, Int8Type>( |
4508 | | keys, |
4509 | | values, |
4510 | | Some(SortOptions { |
4511 | | descending: true, |
4512 | | nulls_first: true, |
4513 | | }), |
4514 | | Some(3), |
4515 | | vec![None, None, Some(5)], |
4516 | | ); |
4517 | | |
4518 | | // Values have `None`. |
4519 | | let keys = Int8Array::from(vec![ |
4520 | | Some(1_i8), |
4521 | | None, |
4522 | | Some(3), |
4523 | | None, |
4524 | | Some(2), |
4525 | | Some(3), |
4526 | | Some(0), |
4527 | | ]); |
4528 | | let values = Int8Array::from(vec![Some(1), Some(3), None, Some(5)]); |
4529 | | test_sort_primitive_dict_arrays::<Int8Type, Int8Type>( |
4530 | | keys, |
4531 | | values, |
4532 | | None, |
4533 | | None, |
4534 | | vec![None, None, None, Some(1), Some(3), Some(5), Some(5)], |
4535 | | ); |
4536 | | |
4537 | | let keys = Int8Array::from(vec![ |
4538 | | Some(1_i8), |
4539 | | None, |
4540 | | Some(3), |
4541 | | None, |
4542 | | Some(2), |
4543 | | Some(3), |
4544 | | Some(0), |
4545 | | ]); |
4546 | | let values = Int8Array::from(vec![Some(1), Some(3), None, Some(5)]); |
4547 | | test_sort_primitive_dict_arrays::<Int8Type, Int8Type>( |
4548 | | keys, |
4549 | | values, |
4550 | | Some(SortOptions { |
4551 | | descending: false, |
4552 | | nulls_first: false, |
4553 | | }), |
4554 | | None, |
4555 | | vec![Some(1), Some(3), Some(5), Some(5), None, None, None], |
4556 | | ); |
4557 | | |
4558 | | let keys = Int8Array::from(vec![ |
4559 | | Some(1_i8), |
4560 | | None, |
4561 | | Some(3), |
4562 | | None, |
4563 | | Some(2), |
4564 | | Some(3), |
4565 | | Some(0), |
4566 | | ]); |
4567 | | let values = Int8Array::from(vec![Some(1), Some(3), None, Some(5)]); |
4568 | | test_sort_primitive_dict_arrays::<Int8Type, Int8Type>( |
4569 | | keys, |
4570 | | values, |
4571 | | Some(SortOptions { |
4572 | | descending: true, |
4573 | | nulls_first: false, |
4574 | | }), |
4575 | | None, |
4576 | | vec![Some(5), Some(5), Some(3), Some(1), None, None, None], |
4577 | | ); |
4578 | | |
4579 | | let keys = Int8Array::from(vec![ |
4580 | | Some(1_i8), |
4581 | | None, |
4582 | | Some(3), |
4583 | | None, |
4584 | | Some(2), |
4585 | | Some(3), |
4586 | | Some(0), |
4587 | | ]); |
4588 | | let values = Int8Array::from(vec![Some(1), Some(3), None, Some(5)]); |
4589 | | test_sort_primitive_dict_arrays::<Int8Type, Int8Type>( |
4590 | | keys, |
4591 | | values, |
4592 | | Some(SortOptions { |
4593 | | descending: true, |
4594 | | nulls_first: true, |
4595 | | }), |
4596 | | None, |
4597 | | vec![None, None, None, Some(5), Some(5), Some(3), Some(1)], |
4598 | | ); |
4599 | | } |
4600 | | |
4601 | | #[test] |
4602 | | fn test_sort_f32_dicts() { |
4603 | | let keys = Int8Array::from(vec![Some(1_i8), None, Some(2), None, Some(2), Some(0)]); |
4604 | | let values = Float32Array::from(vec![1.2, 3.0, 5.1]); |
4605 | | test_sort_primitive_dict_arrays::<Int8Type, Float32Type>( |
4606 | | keys, |
4607 | | values, |
4608 | | None, |
4609 | | None, |
4610 | | vec![None, None, Some(1.2), Some(3.0), Some(5.1), Some(5.1)], |
4611 | | ); |
4612 | | |
4613 | | let keys = Int8Array::from(vec![Some(1_i8), None, Some(2), None, Some(2), Some(0)]); |
4614 | | let values = Float32Array::from(vec![1.2, 3.0, 5.1]); |
4615 | | test_sort_primitive_dict_arrays::<Int8Type, Float32Type>( |
4616 | | keys, |
4617 | | values, |
4618 | | Some(SortOptions { |
4619 | | descending: true, |
4620 | | nulls_first: false, |
4621 | | }), |
4622 | | None, |
4623 | | vec![Some(5.1), Some(5.1), Some(3.0), Some(1.2), None, None], |
4624 | | ); |
4625 | | |
4626 | | let keys = Int8Array::from(vec![Some(1_i8), None, Some(2), None, Some(2), Some(0)]); |
4627 | | let values = Float32Array::from(vec![1.2, 3.0, 5.1]); |
4628 | | test_sort_primitive_dict_arrays::<Int8Type, Float32Type>( |
4629 | | keys, |
4630 | | values, |
4631 | | Some(SortOptions { |
4632 | | descending: false, |
4633 | | nulls_first: false, |
4634 | | }), |
4635 | | None, |
4636 | | vec![Some(1.2), Some(3.0), Some(5.1), Some(5.1), None, None], |
4637 | | ); |
4638 | | |
4639 | | let keys = Int8Array::from(vec![Some(1_i8), None, Some(2), None, Some(2), Some(0)]); |
4640 | | let values = Float32Array::from(vec![1.2, 3.0, 5.1]); |
4641 | | test_sort_primitive_dict_arrays::<Int8Type, Float32Type>( |
4642 | | keys, |
4643 | | values, |
4644 | | Some(SortOptions { |
4645 | | descending: true, |
4646 | | nulls_first: true, |
4647 | | }), |
4648 | | Some(3), |
4649 | | vec![None, None, Some(5.1)], |
4650 | | ); |
4651 | | |
4652 | | // Values have `None`. |
4653 | | let keys = Int8Array::from(vec![ |
4654 | | Some(1_i8), |
4655 | | None, |
4656 | | Some(3), |
4657 | | None, |
4658 | | Some(2), |
4659 | | Some(3), |
4660 | | Some(0), |
4661 | | ]); |
4662 | | let values = Float32Array::from(vec![Some(1.2), Some(3.0), None, Some(5.1)]); |
4663 | | test_sort_primitive_dict_arrays::<Int8Type, Float32Type>( |
4664 | | keys, |
4665 | | values, |
4666 | | None, |
4667 | | None, |
4668 | | vec![None, None, None, Some(1.2), Some(3.0), Some(5.1), Some(5.1)], |
4669 | | ); |
4670 | | |
4671 | | let keys = Int8Array::from(vec![ |
4672 | | Some(1_i8), |
4673 | | None, |
4674 | | Some(3), |
4675 | | None, |
4676 | | Some(2), |
4677 | | Some(3), |
4678 | | Some(0), |
4679 | | ]); |
4680 | | let values = Float32Array::from(vec![Some(1.2), Some(3.0), None, Some(5.1)]); |
4681 | | test_sort_primitive_dict_arrays::<Int8Type, Float32Type>( |
4682 | | keys, |
4683 | | values, |
4684 | | Some(SortOptions { |
4685 | | descending: false, |
4686 | | nulls_first: false, |
4687 | | }), |
4688 | | None, |
4689 | | vec![Some(1.2), Some(3.0), Some(5.1), Some(5.1), None, None, None], |
4690 | | ); |
4691 | | |
4692 | | let keys = Int8Array::from(vec![ |
4693 | | Some(1_i8), |
4694 | | None, |
4695 | | Some(3), |
4696 | | None, |
4697 | | Some(2), |
4698 | | Some(3), |
4699 | | Some(0), |
4700 | | ]); |
4701 | | let values = Float32Array::from(vec![Some(1.2), Some(3.0), None, Some(5.1)]); |
4702 | | test_sort_primitive_dict_arrays::<Int8Type, Float32Type>( |
4703 | | keys, |
4704 | | values, |
4705 | | Some(SortOptions { |
4706 | | descending: true, |
4707 | | nulls_first: false, |
4708 | | }), |
4709 | | None, |
4710 | | vec![Some(5.1), Some(5.1), Some(3.0), Some(1.2), None, None, None], |
4711 | | ); |
4712 | | |
4713 | | let keys = Int8Array::from(vec![ |
4714 | | Some(1_i8), |
4715 | | None, |
4716 | | Some(3), |
4717 | | None, |
4718 | | Some(2), |
4719 | | Some(3), |
4720 | | Some(0), |
4721 | | ]); |
4722 | | let values = Float32Array::from(vec![Some(1.2), Some(3.0), None, Some(5.1)]); |
4723 | | test_sort_primitive_dict_arrays::<Int8Type, Float32Type>( |
4724 | | keys, |
4725 | | values, |
4726 | | Some(SortOptions { |
4727 | | descending: true, |
4728 | | nulls_first: true, |
4729 | | }), |
4730 | | None, |
4731 | | vec![None, None, None, Some(5.1), Some(5.1), Some(3.0), Some(1.2)], |
4732 | | ); |
4733 | | } |
4734 | | |
4735 | | #[test] |
4736 | | fn test_lexicographic_comparator_null_dict_values() { |
4737 | | let values = Int32Array::new( |
4738 | | vec![1, 2, 3, 4].into(), |
4739 | | Some(NullBuffer::from(vec![true, false, false, true])), |
4740 | | ); |
4741 | | let keys = Int32Array::new( |
4742 | | vec![0, 1, 53, 3].into(), |
4743 | | Some(NullBuffer::from(vec![true, true, false, true])), |
4744 | | ); |
4745 | | // [1, NULL, NULL, 4] |
4746 | | let dict = DictionaryArray::new(keys, Arc::new(values)); |
4747 | | |
4748 | | let comparator = LexicographicalComparator::try_new(&[SortColumn { |
4749 | | values: Arc::new(dict), |
4750 | | options: None, |
4751 | | }]) |
4752 | | .unwrap(); |
4753 | | // 1.cmp(NULL) |
4754 | | assert_eq!(comparator.compare(0, 1), Ordering::Greater); |
4755 | | // NULL.cmp(NULL) |
4756 | | assert_eq!(comparator.compare(2, 1), Ordering::Equal); |
4757 | | // NULL.cmp(4) |
4758 | | assert_eq!(comparator.compare(2, 3), Ordering::Less); |
4759 | | } |
4760 | | |
4761 | | #[test] |
4762 | | fn sort_list_equal() { |
4763 | | let a = { |
4764 | | let mut builder = FixedSizeListBuilder::new(Int64Builder::new(), 2); |
4765 | | for value in [[1, 5], [0, 3], [1, 3]] { |
4766 | | builder.values().append_slice(&value); |
4767 | | builder.append(true); |
4768 | | } |
4769 | | builder.finish() |
4770 | | }; |
4771 | | |
4772 | | let sort_indices = sort_to_indices(&a, None, None).unwrap(); |
4773 | | assert_eq!(sort_indices.values(), &[1, 2, 0]); |
4774 | | |
4775 | | let a = { |
4776 | | let mut builder = ListBuilder::new(Int64Builder::new()); |
4777 | | for value in [[1, 5], [0, 3], [1, 3]] { |
4778 | | builder.values().append_slice(&value); |
4779 | | builder.append(true); |
4780 | | } |
4781 | | builder.finish() |
4782 | | }; |
4783 | | |
4784 | | let sort_indices = sort_to_indices(&a, None, None).unwrap(); |
4785 | | assert_eq!(sort_indices.values(), &[1, 2, 0]); |
4786 | | } |
4787 | | |
4788 | | #[test] |
4789 | | fn sort_struct_fallback_to_lexsort() { |
4790 | | let float = Arc::new(Float32Array::from(vec![1.0, -0.1, 3.5, 1.0])); |
4791 | | let int = Arc::new(Int32Array::from(vec![42, 28, 19, 31])); |
4792 | | |
4793 | | let struct_array = StructArray::from(vec![ |
4794 | | ( |
4795 | | Arc::new(Field::new("b", DataType::Float32, false)), |
4796 | | float.clone() as ArrayRef, |
4797 | | ), |
4798 | | ( |
4799 | | Arc::new(Field::new("c", DataType::Int32, false)), |
4800 | | int.clone() as ArrayRef, |
4801 | | ), |
4802 | | ]); |
4803 | | |
4804 | | assert!(!can_sort_to_indices(struct_array.data_type())); |
4805 | | assert!(sort_to_indices(&struct_array, None, None) |
4806 | | .err() |
4807 | | .unwrap() |
4808 | | .to_string() |
4809 | | .contains("Sort not supported for data type")); |
4810 | | |
4811 | | let sort_columns = vec![SortColumn { |
4812 | | values: Arc::new(struct_array.clone()) as ArrayRef, |
4813 | | options: None, |
4814 | | }]; |
4815 | | let sorted = lexsort(&sort_columns, None).unwrap(); |
4816 | | |
4817 | | let expected_struct_array = Arc::new(StructArray::from(vec![ |
4818 | | ( |
4819 | | Arc::new(Field::new("b", DataType::Float32, false)), |
4820 | | Arc::new(Float32Array::from(vec![-0.1, 1.0, 1.0, 3.5])) as ArrayRef, |
4821 | | ), |
4822 | | ( |
4823 | | Arc::new(Field::new("c", DataType::Int32, false)), |
4824 | | Arc::new(Int32Array::from(vec![28, 31, 42, 19])) as ArrayRef, |
4825 | | ), |
4826 | | ])) as ArrayRef; |
4827 | | |
4828 | | assert_eq!(&sorted[0], &expected_struct_array); |
4829 | | } |
4830 | | |
4831 | | /// A simple, correct but slower reference implementation. |
4832 | | fn naive_partition(array: &BooleanArray) -> (Vec<u32>, Vec<u32>) { |
4833 | | let len = array.len(); |
4834 | | let mut valid = Vec::with_capacity(len); |
4835 | | let mut nulls = Vec::with_capacity(len); |
4836 | | for i in 0..len { |
4837 | | if array.is_valid(i) { |
4838 | | valid.push(i as u32); |
4839 | | } else { |
4840 | | nulls.push(i as u32); |
4841 | | } |
4842 | | } |
4843 | | (valid, nulls) |
4844 | | } |
4845 | | |
4846 | | #[test] |
4847 | | fn fuzz_partition_validity() { |
4848 | | let mut rng = StdRng::seed_from_u64(0xF00D_CAFE); |
4849 | | for _ in 0..1_000 { |
4850 | | // build a random BooleanArray with some nulls |
4851 | | let len = rng.random_range(0..512); |
4852 | | let mut builder = BooleanBuilder::new(); |
4853 | | for _ in 0..len { |
4854 | | if rng.random_bool(0.2) { |
4855 | | builder.append_null(); |
4856 | | } else { |
4857 | | builder.append_value(rng.random_bool(0.5)); |
4858 | | } |
4859 | | } |
4860 | | let array = builder.finish(); |
4861 | | |
4862 | | // Test both implementations on the full array |
4863 | | let (v1, n1) = partition_validity(&array); |
4864 | | let (v2, n2) = naive_partition(&array); |
4865 | | assert_eq!(v1, v2, "valid mismatch on full array"); |
4866 | | assert_eq!(n1, n2, "null mismatch on full array"); |
4867 | | |
4868 | | if len >= 8 { |
4869 | | // 1) Random slice within the array |
4870 | | let max_offset = len - 4; |
4871 | | let offset = rng.random_range(0..=max_offset); |
4872 | | let max_slice_len = len - offset; |
4873 | | let slice_len = rng.random_range(1..=max_slice_len); |
4874 | | |
4875 | | // Bind the sliced ArrayRef to keep it alive |
4876 | | let sliced = array.slice(offset, slice_len); |
4877 | | let slice = sliced |
4878 | | .as_any() |
4879 | | .downcast_ref::<BooleanArray>() |
4880 | | .expect("slice should be a BooleanArray"); |
4881 | | |
4882 | | let (sv1, sn1) = partition_validity(slice); |
4883 | | let (sv2, sn2) = naive_partition(slice); |
4884 | | assert_eq!( |
4885 | | sv1, sv2, |
4886 | | "valid mismatch on random slice at offset {offset} length {slice_len}", |
4887 | | ); |
4888 | | assert_eq!( |
4889 | | sn1, sn2, |
4890 | | "null mismatch on random slice at offset {offset} length {slice_len}", |
4891 | | ); |
4892 | | |
4893 | | // 2) Ensure we test slices that start beyond one 64-bit chunk boundary |
4894 | | if len > 68 { |
4895 | | let offset2 = rng.random_range(65..(len - 3)); |
4896 | | let len2 = rng.random_range(1..=(len - offset2)); |
4897 | | |
4898 | | let sliced2 = array.slice(offset2, len2); |
4899 | | let slice2 = sliced2 |
4900 | | .as_any() |
4901 | | .downcast_ref::<BooleanArray>() |
4902 | | .expect("slice2 should be a BooleanArray"); |
4903 | | |
4904 | | let (sv3, sn3) = partition_validity(slice2); |
4905 | | let (sv4, sn4) = naive_partition(slice2); |
4906 | | assert_eq!( |
4907 | | sv3, sv4, |
4908 | | "valid mismatch on chunk-crossing slice at offset {offset2} length {len2}", |
4909 | | ); |
4910 | | assert_eq!( |
4911 | | sn3, sn4, |
4912 | | "null mismatch on chunk-crossing slice at offset {offset2} length {len2}", |
4913 | | ); |
4914 | | } |
4915 | | } |
4916 | | } |
4917 | | } |
4918 | | |
4919 | | // A few small deterministic checks |
4920 | | #[test] |
4921 | | fn test_partition_edge_cases() { |
4922 | | // all valid |
4923 | | let array = BooleanArray::from(vec![Some(true), Some(false), Some(true)]); |
4924 | | let (valid, nulls) = partition_validity(&array); |
4925 | | assert_eq!(valid, vec![0, 1, 2]); |
4926 | | assert!(nulls.is_empty()); |
4927 | | |
4928 | | // all null |
4929 | | let array = BooleanArray::from(vec![None, None, None]); |
4930 | | let (valid, nulls) = partition_validity(&array); |
4931 | | assert!(valid.is_empty()); |
4932 | | assert_eq!(nulls, vec![0, 1, 2]); |
4933 | | |
4934 | | // alternating |
4935 | | let array = BooleanArray::from(vec![Some(true), None, Some(true), None]); |
4936 | | let (valid, nulls) = partition_validity(&array); |
4937 | | assert_eq!(valid, vec![0, 2]); |
4938 | | assert_eq!(nulls, vec![1, 3]); |
4939 | | } |
4940 | | |
4941 | | // Test specific edge case strings that exercise the 4-byte prefix logic |
4942 | | #[test] |
4943 | | fn test_specific_edge_cases() { |
4944 | | let test_cases = vec![ |
4945 | | // Key test cases for lengths 1-4 that test prefix padding |
4946 | | "a", "ab", "ba", "baa", "abba", "abbc", "abc", "cda", |
4947 | | // Test cases where first 4 bytes are same but subsequent bytes differ |
4948 | | "abcd", "abcde", "abcdf", "abcdaaa", "abcdbbb", |
4949 | | // Test cases with length < 4 that require padding |
4950 | | "z", "za", "zaa", "zaaa", "zaaab", // Empty string |
4951 | | "", // Test various length combinations with same prefix |
4952 | | "test", "test1", "test12", "test123", "test1234", |
4953 | | ]; |
4954 | | |
4955 | | // Use standard library sort as reference |
4956 | | let mut expected = test_cases.clone(); |
4957 | | expected.sort(); |
4958 | | |
4959 | | // Use our sorting algorithm |
4960 | | let string_array = StringArray::from(test_cases.clone()); |
4961 | | let indices: Vec<u32> = (0..test_cases.len() as u32).collect(); |
4962 | | let result = sort_bytes( |
4963 | | &string_array, |
4964 | | indices, |
4965 | | vec![], // no nulls |
4966 | | SortOptions::default(), |
4967 | | None, |
4968 | | ); |
4969 | | |
4970 | | // Verify results |
4971 | | let sorted_strings: Vec<&str> = result |
4972 | | .values() |
4973 | | .iter() |
4974 | | .map(|&idx| test_cases[idx as usize]) |
4975 | | .collect(); |
4976 | | |
4977 | | assert_eq!(sorted_strings, expected); |
4978 | | } |
4979 | | |
4980 | | // Test sorting correctness for different length combinations |
4981 | | #[test] |
4982 | | fn test_length_combinations() { |
4983 | | let test_cases = vec![ |
4984 | | // Focus on testing strings of length 1-4, as these affect padding logic |
4985 | | ("", 0), |
4986 | | ("a", 1), |
4987 | | ("ab", 2), |
4988 | | ("abc", 3), |
4989 | | ("abcd", 4), |
4990 | | ("abcde", 5), |
4991 | | ("b", 1), |
4992 | | ("ba", 2), |
4993 | | ("bab", 3), |
4994 | | ("babc", 4), |
4995 | | ("babcd", 5), |
4996 | | // Test same prefix with different lengths |
4997 | | ("test", 4), |
4998 | | ("test1", 5), |
4999 | | ("test12", 6), |
5000 | | ("test123", 7), |
5001 | | ]; |
5002 | | |
5003 | | let strings: Vec<&str> = test_cases.iter().map(|(s, _)| *s).collect(); |
5004 | | let mut expected = strings.clone(); |
5005 | | expected.sort(); |
5006 | | |
5007 | | let string_array = StringArray::from(strings.clone()); |
5008 | | let indices: Vec<u32> = (0..strings.len() as u32).collect(); |
5009 | | let result = sort_bytes(&string_array, indices, vec![], SortOptions::default(), None); |
5010 | | |
5011 | | let sorted_strings: Vec<&str> = result |
5012 | | .values() |
5013 | | .iter() |
5014 | | .map(|&idx| strings[idx as usize]) |
5015 | | .collect(); |
5016 | | |
5017 | | assert_eq!(sorted_strings, expected); |
5018 | | } |
5019 | | |
5020 | | // Test UTF-8 string handling |
5021 | | #[test] |
5022 | | fn test_utf8_strings() { |
5023 | | let test_cases = vec![ |
5024 | | "a", |
5025 | | "你", // 3-byte UTF-8 character |
5026 | | "你好", // 6 bytes |
5027 | | "你好世界", // 12 bytes |
5028 | | "🎉", // 4-byte emoji |
5029 | | "🎉🎊", // 8 bytes |
5030 | | "café", // Contains accent character |
5031 | | "naïve", |
5032 | | "Москва", // Cyrillic script |
5033 | | "東京", // Japanese kanji |
5034 | | "한국", // Korean |
5035 | | ]; |
5036 | | |
5037 | | let mut expected = test_cases.clone(); |
5038 | | expected.sort(); |
5039 | | |
5040 | | let string_array = StringArray::from(test_cases.clone()); |
5041 | | let indices: Vec<u32> = (0..test_cases.len() as u32).collect(); |
5042 | | let result = sort_bytes(&string_array, indices, vec![], SortOptions::default(), None); |
5043 | | |
5044 | | let sorted_strings: Vec<&str> = result |
5045 | | .values() |
5046 | | .iter() |
5047 | | .map(|&idx| test_cases[idx as usize]) |
5048 | | .collect(); |
5049 | | |
5050 | | assert_eq!(sorted_strings, expected); |
5051 | | } |
5052 | | |
5053 | | // Fuzz testing: generate random UTF-8 strings and verify sort correctness |
5054 | | #[test] |
5055 | | fn test_fuzz_random_strings() { |
5056 | | let mut rng = StdRng::seed_from_u64(42); // Fixed seed for reproducibility |
5057 | | |
5058 | | for _ in 0..100 { |
5059 | | // Run 100 rounds of fuzz testing |
5060 | | let mut test_strings = Vec::new(); |
5061 | | |
5062 | | // Generate 20-50 random strings |
5063 | | let num_strings = rng.random_range(20..=50); |
5064 | | |
5065 | | for _ in 0..num_strings { |
5066 | | let string = generate_random_string(&mut rng); |
5067 | | test_strings.push(string); |
5068 | | } |
5069 | | |
5070 | | // Use standard library sort as reference |
5071 | | let mut expected = test_strings.clone(); |
5072 | | expected.sort(); |
5073 | | |
5074 | | // Use our sorting algorithm |
5075 | | let string_array = StringArray::from(test_strings.clone()); |
5076 | | let indices: Vec<u32> = (0..test_strings.len() as u32).collect(); |
5077 | | let result = sort_bytes(&string_array, indices, vec![], SortOptions::default(), None); |
5078 | | |
5079 | | let sorted_strings: Vec<String> = result |
5080 | | .values() |
5081 | | .iter() |
5082 | | .map(|&idx| test_strings[idx as usize].clone()) |
5083 | | .collect(); |
5084 | | |
5085 | | assert_eq!( |
5086 | | sorted_strings, expected, |
5087 | | "Fuzz test failed with input: {test_strings:?}" |
5088 | | ); |
5089 | | } |
5090 | | } |
5091 | | |
5092 | | // Helper function to generate random UTF-8 strings |
5093 | | fn generate_random_string(rng: &mut StdRng) -> String { |
5094 | | // Bias towards generating short strings, especially length 1-4 |
5095 | | let length = if rng.random_bool(0.6) { |
5096 | | rng.random_range(0..=4) // 60% probability for 0-4 length strings |
5097 | | } else { |
5098 | | rng.random_range(5..=20) // 40% probability for longer strings |
5099 | | }; |
5100 | | |
5101 | | if length == 0 { |
5102 | | return String::new(); |
5103 | | } |
5104 | | |
5105 | | let mut result = String::new(); |
5106 | | let mut current_len = 0; |
5107 | | |
5108 | | while current_len < length { |
5109 | | let c = generate_random_char(rng); |
5110 | | let char_len = c.len_utf8(); |
5111 | | |
5112 | | // Ensure we don't exceed target length |
5113 | | if current_len + char_len <= length { |
5114 | | result.push(c); |
5115 | | current_len += char_len; |
5116 | | } else { |
5117 | | // If adding this character would exceed length, fill with ASCII |
5118 | | let remaining = length - current_len; |
5119 | | for _ in 0..remaining { |
5120 | | result.push(rng.random_range('a'..='z')); |
5121 | | current_len += 1; |
5122 | | } |
5123 | | break; |
5124 | | } |
5125 | | } |
5126 | | |
5127 | | result |
5128 | | } |
5129 | | |
5130 | | // Generate random characters (including various UTF-8 characters) |
5131 | | fn generate_random_char(rng: &mut StdRng) -> char { |
5132 | | match rng.random_range(0..10) { |
5133 | | 0..=5 => rng.random_range('a'..='z'), // 60% ASCII lowercase |
5134 | | 6 => rng.random_range('A'..='Z'), // 10% ASCII uppercase |
5135 | | 7 => rng.random_range('0'..='9'), // 10% digits |
5136 | | 8 => { |
5137 | | // 10% Chinese characters |
5138 | | let chinese_chars = ['你', '好', '世', '界', '测', '试', '中', '文']; |
5139 | | chinese_chars[rng.random_range(0..chinese_chars.len())] |
5140 | | } |
5141 | | 9 => { |
5142 | | // 10% other Unicode characters (single `char`s) |
5143 | | let special_chars = ['é', 'ï', '🎉', '🎊', 'α', 'β', 'γ']; |
5144 | | special_chars[rng.random_range(0..special_chars.len())] |
5145 | | } |
5146 | | _ => unreachable!(), |
5147 | | } |
5148 | | } |
5149 | | |
5150 | | // Test descending sort order |
5151 | | #[test] |
5152 | | fn test_descending_sort() { |
5153 | | let test_cases = vec!["a", "ab", "ba", "baa", "abba", "abbc", "abc", "cda"]; |
5154 | | |
5155 | | let mut expected = test_cases.clone(); |
5156 | | expected.sort(); |
5157 | | expected.reverse(); // Descending order |
5158 | | |
5159 | | let string_array = StringArray::from(test_cases.clone()); |
5160 | | let indices: Vec<u32> = (0..test_cases.len() as u32).collect(); |
5161 | | let result = sort_bytes( |
5162 | | &string_array, |
5163 | | indices, |
5164 | | vec![], |
5165 | | SortOptions { |
5166 | | descending: true, |
5167 | | nulls_first: false, |
5168 | | }, |
5169 | | None, |
5170 | | ); |
5171 | | |
5172 | | let sorted_strings: Vec<&str> = result |
5173 | | .values() |
5174 | | .iter() |
5175 | | .map(|&idx| test_cases[idx as usize]) |
5176 | | .collect(); |
5177 | | |
5178 | | assert_eq!(sorted_strings, expected); |
5179 | | } |
5180 | | |
5181 | | // Stress test: large number of strings with same prefix |
5182 | | #[test] |
5183 | | fn test_same_prefix_stress() { |
5184 | | let mut test_cases = Vec::new(); |
5185 | | let prefix = "same"; |
5186 | | |
5187 | | // Generate many strings with the same prefix |
5188 | | for i in 0..1000 { |
5189 | | test_cases.push(format!("{prefix}{i:04}")); |
5190 | | } |
5191 | | |
5192 | | let mut expected = test_cases.clone(); |
5193 | | expected.sort(); |
5194 | | |
5195 | | let string_array = StringArray::from(test_cases.clone()); |
5196 | | let indices: Vec<u32> = (0..test_cases.len() as u32).collect(); |
5197 | | let result = sort_bytes(&string_array, indices, vec![], SortOptions::default(), None); |
5198 | | |
5199 | | let sorted_strings: Vec<String> = result |
5200 | | .values() |
5201 | | .iter() |
5202 | | .map(|&idx| test_cases[idx as usize].clone()) |
5203 | | .collect(); |
5204 | | |
5205 | | assert_eq!(sorted_strings, expected); |
5206 | | } |
5207 | | |
5208 | | // Test limit parameter |
5209 | | #[test] |
5210 | | fn test_with_limit() { |
5211 | | let test_cases = vec!["z", "y", "x", "w", "v", "u", "t", "s"]; |
5212 | | let limit = 3; |
5213 | | |
5214 | | let mut expected = test_cases.clone(); |
5215 | | expected.sort(); |
5216 | | expected.truncate(limit); |
5217 | | |
5218 | | let string_array = StringArray::from(test_cases.clone()); |
5219 | | let indices: Vec<u32> = (0..test_cases.len() as u32).collect(); |
5220 | | let result = sort_bytes( |
5221 | | &string_array, |
5222 | | indices, |
5223 | | vec![], |
5224 | | SortOptions::default(), |
5225 | | Some(limit), |
5226 | | ); |
5227 | | |
5228 | | let sorted_strings: Vec<&str> = result |
5229 | | .values() |
5230 | | .iter() |
5231 | | .map(|&idx| test_cases[idx as usize]) |
5232 | | .collect(); |
5233 | | |
5234 | | assert_eq!(sorted_strings, expected); |
5235 | | assert_eq!(sorted_strings.len(), limit); |
5236 | | } |
5237 | | } |