/Users/andrewlamb/Software/arrow-rs/arrow-row/src/run.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use crate::{variable, RowConverter, Rows, SortField}; |
19 | | use arrow_array::types::RunEndIndexType; |
20 | | use arrow_array::{PrimitiveArray, RunArray}; |
21 | | use arrow_buffer::{ArrowNativeType, ScalarBuffer}; |
22 | | use arrow_schema::{ArrowError, SortOptions}; |
23 | | |
24 | | /// Computes the lengths of each row for a RunEndEncodedArray |
25 | 0 | pub fn compute_lengths<R: RunEndIndexType>( |
26 | 0 | lengths: &mut [usize], |
27 | 0 | rows: &Rows, |
28 | 0 | array: &RunArray<R>, |
29 | 0 | ) { |
30 | 0 | let run_ends = array.run_ends().values(); |
31 | 0 | let mut logical_start = 0; |
32 | | |
33 | | // Iterate over each run and apply the same length to all logical positions in the run |
34 | 0 | for (physical_idx, &run_end) in run_ends.iter().enumerate() { |
35 | 0 | let logical_end = run_end.as_usize(); |
36 | 0 | let row = rows.row(physical_idx); |
37 | 0 | let encoded_len = variable::encoded_len(Some(row.data)); |
38 | | |
39 | | // Add the same length for all logical positions in this run |
40 | 0 | for length in &mut lengths[logical_start..logical_end] { |
41 | 0 | *length += encoded_len; |
42 | 0 | } |
43 | | |
44 | 0 | logical_start = logical_end; |
45 | | } |
46 | 0 | } |
47 | | |
48 | | /// Encodes the provided `RunEndEncodedArray` to `out` with the provided `SortOptions` |
49 | | /// |
50 | | /// `rows` should contain the encoded values |
51 | 0 | pub fn encode<R: RunEndIndexType>( |
52 | 0 | data: &mut [u8], |
53 | 0 | offsets: &mut [usize], |
54 | 0 | rows: &Rows, |
55 | 0 | opts: SortOptions, |
56 | 0 | array: &RunArray<R>, |
57 | 0 | ) { |
58 | 0 | let run_ends = array.run_ends(); |
59 | | |
60 | 0 | let mut logical_idx = 0; |
61 | 0 | let mut offset_idx = 1; // Skip first offset |
62 | | |
63 | | // Iterate over each run |
64 | 0 | for physical_idx in 0..run_ends.values().len() { |
65 | 0 | let run_end = run_ends.values()[physical_idx].as_usize(); |
66 | | |
67 | | // Process all elements in this run |
68 | 0 | while logical_idx < run_end && offset_idx < offsets.len() { |
69 | 0 | let offset = &mut offsets[offset_idx]; |
70 | 0 | let out = &mut data[*offset..]; |
71 | 0 |
|
72 | 0 | // Use variable-length encoding to make the data self-describing |
73 | 0 | let row = rows.row(physical_idx); |
74 | 0 | let bytes_written = variable::encode_one(out, Some(row.data), opts); |
75 | 0 | *offset += bytes_written; |
76 | 0 |
|
77 | 0 | logical_idx += 1; |
78 | 0 | offset_idx += 1; |
79 | 0 | } |
80 | | |
81 | | // Break if we've processed all offsets |
82 | 0 | if offset_idx >= offsets.len() { |
83 | 0 | break; |
84 | 0 | } |
85 | | } |
86 | 0 | } |
87 | | |
88 | | /// Decodes a RunEndEncodedArray from `rows` with the provided `options` |
89 | | /// |
90 | | /// # Safety |
91 | | /// |
92 | | /// `rows` must contain valid data for the provided `converter` |
93 | 0 | pub unsafe fn decode<R: RunEndIndexType>( |
94 | 0 | converter: &RowConverter, |
95 | 0 | rows: &mut [&[u8]], |
96 | 0 | field: &SortField, |
97 | 0 | validate_utf8: bool, |
98 | 0 | ) -> Result<RunArray<R>, ArrowError> { |
99 | 0 | if rows.is_empty() { |
100 | 0 | let values = converter.convert_raw(&mut [], validate_utf8)?; |
101 | 0 | let run_ends_array = PrimitiveArray::<R>::new(ScalarBuffer::from(vec![]), None); |
102 | 0 | return RunArray::<R>::try_new(&run_ends_array, &values[0]); |
103 | 0 | } |
104 | | |
105 | | // Decode each row's REE data and collect the decoded values |
106 | 0 | let mut decoded_values = Vec::new(); |
107 | 0 | let mut run_ends = Vec::new(); |
108 | 0 | let mut unique_row_indices = Vec::new(); |
109 | | |
110 | | // Process each row to extract its REE data (following decode_binary pattern) |
111 | 0 | let mut decoded_data = Vec::new(); |
112 | 0 | for (idx, row) in rows.iter_mut().enumerate() { |
113 | 0 | decoded_data.clear(); |
114 | | // Extract the decoded value data from this row |
115 | 0 | let consumed = variable::decode_blocks(row, field.options, |block| { |
116 | 0 | decoded_data.extend_from_slice(block); |
117 | 0 | }); |
118 | | |
119 | | // Handle bit inversion for descending sort (following decode_binary pattern) |
120 | 0 | if field.options.descending { |
121 | 0 | decoded_data.iter_mut().for_each(|b| *b = !*b); |
122 | 0 | } |
123 | | |
124 | | // Update the row to point past the consumed REE data |
125 | 0 | *row = &row[consumed..]; |
126 | | |
127 | | // Check if this decoded value is the same as the previous one to identify runs |
128 | 0 | let is_new_run = |
129 | 0 | idx == 0 || decoded_data != decoded_values[*unique_row_indices.last().unwrap()]; |
130 | | |
131 | 0 | if is_new_run { |
132 | | // This is a new unique value - end the previous run if any |
133 | 0 | if idx > 0 { |
134 | 0 | run_ends.push(R::Native::usize_as(idx)); |
135 | 0 | } |
136 | 0 | unique_row_indices.push(decoded_values.len()); |
137 | 0 | decoded_values.push(decoded_data.clone()); |
138 | 0 | } |
139 | | } |
140 | | // Add the final run end |
141 | 0 | run_ends.push(R::Native::usize_as(rows.len())); |
142 | | |
143 | | // Convert the unique decoded values using the row converter |
144 | 0 | let mut unique_rows: Vec<&[u8]> = decoded_values.iter().map(|v| v.as_slice()).collect(); |
145 | 0 | let values = if unique_rows.is_empty() { |
146 | 0 | converter.convert_raw(&mut [], validate_utf8)? |
147 | | } else { |
148 | 0 | converter.convert_raw(&mut unique_rows, validate_utf8)? |
149 | | }; |
150 | | |
151 | | // Create run ends array |
152 | 0 | let run_ends_array = PrimitiveArray::<R>::new(ScalarBuffer::from(run_ends), None); |
153 | | |
154 | | // Create the RunEndEncodedArray |
155 | 0 | RunArray::<R>::try_new(&run_ends_array, &values[0]) |
156 | 0 | } |
157 | | |
158 | | #[cfg(test)] |
159 | | mod tests { |
160 | | use crate::{RowConverter, SortField}; |
161 | | use arrow_array::cast::AsArray; |
162 | | use arrow_array::types::{Int16Type, Int32Type, Int64Type, RunEndIndexType}; |
163 | | use arrow_array::{Array, Int64Array, PrimitiveArray, RunArray, StringArray}; |
164 | | use arrow_schema::{DataType, SortOptions}; |
165 | | use std::sync::Arc; |
166 | | |
167 | | fn assert_roundtrip<R: RunEndIndexType>( |
168 | | array: &RunArray<R>, |
169 | | run_end_type: DataType, |
170 | | values_type: DataType, |
171 | | sort_options: Option<SortOptions>, |
172 | | ) { |
173 | | let sort_field = if let Some(options) = sort_options { |
174 | | SortField::new_with_options( |
175 | | DataType::RunEndEncoded( |
176 | | Arc::new(arrow_schema::Field::new("run_ends", run_end_type, false)), |
177 | | Arc::new(arrow_schema::Field::new("values", values_type, true)), |
178 | | ), |
179 | | options, |
180 | | ) |
181 | | } else { |
182 | | SortField::new(DataType::RunEndEncoded( |
183 | | Arc::new(arrow_schema::Field::new("run_ends", run_end_type, false)), |
184 | | Arc::new(arrow_schema::Field::new("values", values_type, true)), |
185 | | )) |
186 | | }; |
187 | | |
188 | | let converter = RowConverter::new(vec![sort_field]).unwrap(); |
189 | | |
190 | | let rows = converter |
191 | | .convert_columns(&[Arc::new(array.clone())]) |
192 | | .unwrap(); |
193 | | |
194 | | let arrays = converter.convert_rows(&rows).unwrap(); |
195 | | let result = arrays[0].as_any().downcast_ref::<RunArray<R>>().unwrap(); |
196 | | |
197 | | assert_eq!(array, result); |
198 | | } |
199 | | |
200 | | #[test] |
201 | | fn test_run_end_encoded_supports_datatype() { |
202 | | // Test that the RowConverter correctly supports run-end encoded arrays |
203 | | assert!(RowConverter::supports_datatype(&DataType::RunEndEncoded( |
204 | | Arc::new(arrow_schema::Field::new("run_ends", DataType::Int32, false)), |
205 | | Arc::new(arrow_schema::Field::new("values", DataType::Utf8, true)), |
206 | | ))); |
207 | | } |
208 | | |
209 | | #[test] |
210 | | fn test_run_end_encoded_round_trip_int16_int64s() { |
211 | | // Test round-trip correctness for RunEndEncodedArray with Int64 values making sure it |
212 | | // doesn't just work with eg. strings (which are all the other tests). |
213 | | |
214 | | let values = Int64Array::from(vec![100, 200, 100, 300]); |
215 | | let run_ends = vec![2, 3, 5, 6]; |
216 | | let array: RunArray<Int16Type> = |
217 | | RunArray::try_new(&PrimitiveArray::from(run_ends), &values).unwrap(); |
218 | | |
219 | | assert_roundtrip(&array, DataType::Int16, DataType::Int64, None); |
220 | | } |
221 | | |
222 | | #[test] |
223 | | fn test_run_end_encoded_round_trip_int32_int64s() { |
224 | | // Test round-trip correctness for RunEndEncodedArray with Int64 values making sure it |
225 | | // doesn't just work with eg. strings (which are all the other tests). |
226 | | |
227 | | let values = Int64Array::from(vec![100, 200, 100, 300]); |
228 | | let run_ends = vec![2, 3, 5, 6]; |
229 | | let array: RunArray<Int32Type> = |
230 | | RunArray::try_new(&PrimitiveArray::from(run_ends), &values).unwrap(); |
231 | | |
232 | | assert_roundtrip(&array, DataType::Int32, DataType::Int64, None); |
233 | | } |
234 | | |
235 | | #[test] |
236 | | fn test_run_end_encoded_round_trip_int64_int64s() { |
237 | | // Test round-trip correctness for RunEndEncodedArray with Int64 values making sure it |
238 | | // doesn't just work with eg. strings (which are all the other tests). |
239 | | |
240 | | let values = Int64Array::from(vec![100, 200, 100, 300]); |
241 | | let run_ends = vec![2, 3, 5, 6]; |
242 | | let array: RunArray<Int64Type> = |
243 | | RunArray::try_new(&PrimitiveArray::from(run_ends), &values).unwrap(); |
244 | | |
245 | | assert_roundtrip(&array, DataType::Int64, DataType::Int64, None); |
246 | | } |
247 | | |
248 | | #[test] |
249 | | fn test_run_end_encoded_round_trip_strings() { |
250 | | // Test round-trip correctness for RunEndEncodedArray with strings |
251 | | |
252 | | let array: RunArray<Int32Type> = vec!["b", "b", "a"].into_iter().collect(); |
253 | | |
254 | | assert_roundtrip(&array, DataType::Int32, DataType::Utf8, None); |
255 | | } |
256 | | |
257 | | #[test] |
258 | | fn test_run_end_encoded_round_trip_strings_with_nulls() { |
259 | | // Test round-trip correctness for RunEndEncodedArray with nulls |
260 | | |
261 | | let array: RunArray<Int32Type> = vec![Some("b"), Some("b"), None, Some("a")] |
262 | | .into_iter() |
263 | | .collect(); |
264 | | |
265 | | assert_roundtrip(&array, DataType::Int32, DataType::Utf8, None); |
266 | | } |
267 | | |
268 | | #[test] |
269 | | fn test_run_end_encoded_ascending_descending_round_trip() { |
270 | | // Test round-trip correctness for ascending vs descending sort options |
271 | | |
272 | | let values_asc = |
273 | | arrow_array::StringArray::from(vec![Some("apple"), Some("banana"), Some("cherry")]); |
274 | | let run_ends_asc = vec![2, 4, 6]; |
275 | | let run_array_asc: RunArray<Int32Type> = RunArray::try_new( |
276 | | &arrow_array::PrimitiveArray::from(run_ends_asc), |
277 | | &values_asc, |
278 | | ) |
279 | | .unwrap(); |
280 | | |
281 | | // Test ascending order |
282 | | assert_roundtrip( |
283 | | &run_array_asc, |
284 | | DataType::Int32, |
285 | | DataType::Utf8, |
286 | | Some(SortOptions { |
287 | | descending: false, |
288 | | nulls_first: true, |
289 | | }), |
290 | | ); |
291 | | |
292 | | // Test descending order |
293 | | assert_roundtrip( |
294 | | &run_array_asc, |
295 | | DataType::Int32, |
296 | | DataType::Utf8, |
297 | | Some(SortOptions { |
298 | | descending: true, |
299 | | nulls_first: true, |
300 | | }), |
301 | | ); |
302 | | } |
303 | | |
304 | | #[test] |
305 | | fn test_run_end_encoded_sort_configurations_basic() { |
306 | | // Test that different sort configurations work and can round-trip successfully |
307 | | |
308 | | let test_array: RunArray<Int32Type> = vec!["test"].into_iter().collect(); |
309 | | |
310 | | // Test ascending order |
311 | | assert_roundtrip( |
312 | | &test_array, |
313 | | DataType::Int32, |
314 | | DataType::Utf8, |
315 | | Some(SortOptions { |
316 | | descending: false, |
317 | | nulls_first: true, |
318 | | }), |
319 | | ); |
320 | | |
321 | | // Test descending order |
322 | | assert_roundtrip( |
323 | | &test_array, |
324 | | DataType::Int32, |
325 | | DataType::Utf8, |
326 | | Some(SortOptions { |
327 | | descending: true, |
328 | | nulls_first: true, |
329 | | }), |
330 | | ); |
331 | | } |
332 | | |
333 | | #[test] |
334 | | fn test_run_end_encoded_nulls_first_last_configurations() { |
335 | | // Test that nulls_first vs nulls_last configurations work |
336 | | |
337 | | let simple_array: RunArray<Int32Type> = vec!["simple"].into_iter().collect(); |
338 | | |
339 | | let converter_nulls_first = RowConverter::new(vec![SortField::new_with_options( |
340 | | DataType::RunEndEncoded( |
341 | | Arc::new(arrow_schema::Field::new("run_ends", DataType::Int32, false)), |
342 | | Arc::new(arrow_schema::Field::new("values", DataType::Utf8, true)), |
343 | | ), |
344 | | SortOptions { |
345 | | descending: false, |
346 | | nulls_first: true, |
347 | | }, |
348 | | )]) |
349 | | .unwrap(); |
350 | | |
351 | | let converter_nulls_last = RowConverter::new(vec![SortField::new_with_options( |
352 | | DataType::RunEndEncoded( |
353 | | Arc::new(arrow_schema::Field::new("run_ends", DataType::Int32, false)), |
354 | | Arc::new(arrow_schema::Field::new("values", DataType::Utf8, true)), |
355 | | ), |
356 | | SortOptions { |
357 | | descending: false, |
358 | | nulls_first: false, |
359 | | }, |
360 | | )]) |
361 | | .unwrap(); |
362 | | |
363 | | // Test that both configurations can handle simple arrays |
364 | | let rows_nulls_first = converter_nulls_first |
365 | | .convert_columns(&[Arc::new(simple_array.clone())]) |
366 | | .unwrap(); |
367 | | let arrays_nulls_first = converter_nulls_first |
368 | | .convert_rows(&rows_nulls_first) |
369 | | .unwrap(); |
370 | | let result_nulls_first = arrays_nulls_first[0] |
371 | | .as_any() |
372 | | .downcast_ref::<RunArray<Int32Type>>() |
373 | | .unwrap(); |
374 | | |
375 | | let rows_nulls_last = converter_nulls_last |
376 | | .convert_columns(&[Arc::new(simple_array.clone())]) |
377 | | .unwrap(); |
378 | | let arrays_nulls_last = converter_nulls_last.convert_rows(&rows_nulls_last).unwrap(); |
379 | | let result_nulls_last = arrays_nulls_last[0] |
380 | | .as_any() |
381 | | .downcast_ref::<RunArray<Int32Type>>() |
382 | | .unwrap(); |
383 | | |
384 | | // Both should successfully convert the simple array |
385 | | assert_eq!(simple_array.len(), result_nulls_first.len()); |
386 | | assert_eq!(simple_array.len(), result_nulls_last.len()); |
387 | | } |
388 | | |
389 | | #[test] |
390 | | fn test_run_end_encoded_row_consumption() { |
391 | | // This test verifies that ALL rows are properly consumed during decoding, |
392 | | // not just the unique values. We test this by ensuring multi-column conversion |
393 | | // works correctly - if rows aren't consumed properly, the second column would fail. |
394 | | |
395 | | // Create a REE array with multiple runs |
396 | | let array: RunArray<Int32Type> = vec!["a", "a", "b", "b", "b", "c"].into_iter().collect(); |
397 | | let string_array = StringArray::from(vec!["x", "y", "z", "w", "u", "v"]); |
398 | | |
399 | | let multi_converter = RowConverter::new(vec![ |
400 | | SortField::new(DataType::RunEndEncoded( |
401 | | Arc::new(arrow_schema::Field::new("run_ends", DataType::Int32, false)), |
402 | | Arc::new(arrow_schema::Field::new("values", DataType::Utf8, true)), |
403 | | )), |
404 | | SortField::new(DataType::Utf8), |
405 | | ]) |
406 | | .unwrap(); |
407 | | |
408 | | let multi_rows = multi_converter |
409 | | .convert_columns(&[Arc::new(array.clone()), Arc::new(string_array.clone())]) |
410 | | .unwrap(); |
411 | | |
412 | | // Convert back - this will test that all rows are consumed properly |
413 | | let arrays = multi_converter.convert_rows(&multi_rows).unwrap(); |
414 | | |
415 | | // Verify both columns round-trip correctly |
416 | | let result_ree = arrays[0] |
417 | | .as_any() |
418 | | .downcast_ref::<RunArray<Int32Type>>() |
419 | | .unwrap(); |
420 | | |
421 | | let result_string = arrays[1].as_any().downcast_ref::<StringArray>().unwrap(); |
422 | | |
423 | | // This should pass - both arrays should be identical to originals |
424 | | assert_eq!(result_ree.values().as_ref(), array.values().as_ref()); |
425 | | assert_eq!(result_ree.run_ends().values(), array.run_ends().values()); |
426 | | assert_eq!(*result_string, string_array); |
427 | | } |
428 | | |
429 | | #[test] |
430 | | fn test_run_end_encoded_sorting_behavior() { |
431 | | // Test that the binary row encoding actually produces the correct sort order |
432 | | |
433 | | // Create REE arrays with different values to test sorting |
434 | | let array1: RunArray<Int32Type> = vec!["apple", "apple"].into_iter().collect(); |
435 | | let array2: RunArray<Int32Type> = vec!["banana", "banana"].into_iter().collect(); |
436 | | let array3: RunArray<Int32Type> = vec!["cherry", "cherry"].into_iter().collect(); |
437 | | |
438 | | // Test ascending sort |
439 | | let converter_asc = RowConverter::new(vec![SortField::new(DataType::RunEndEncoded( |
440 | | Arc::new(arrow_schema::Field::new("run_ends", DataType::Int32, false)), |
441 | | Arc::new(arrow_schema::Field::new("values", DataType::Utf8, true)), |
442 | | ))]) |
443 | | .unwrap(); |
444 | | |
445 | | let rows1_asc = converter_asc |
446 | | .convert_columns(&[Arc::new(array1.clone())]) |
447 | | .unwrap(); |
448 | | let rows2_asc = converter_asc |
449 | | .convert_columns(&[Arc::new(array2.clone())]) |
450 | | .unwrap(); |
451 | | let rows3_asc = converter_asc |
452 | | .convert_columns(&[Arc::new(array3.clone())]) |
453 | | .unwrap(); |
454 | | |
455 | | // For ascending: apple < banana < cherry |
456 | | // So row bytes should sort: rows1 < rows2 < rows3 |
457 | | assert!( |
458 | | rows1_asc.row(0) < rows2_asc.row(0), |
459 | | "apple should come before banana in ascending order" |
460 | | ); |
461 | | assert!( |
462 | | rows2_asc.row(0) < rows3_asc.row(0), |
463 | | "banana should come before cherry in ascending order" |
464 | | ); |
465 | | assert!( |
466 | | rows1_asc.row(0) < rows3_asc.row(0), |
467 | | "apple should come before cherry in ascending order" |
468 | | ); |
469 | | |
470 | | // Test descending sort |
471 | | let converter_desc = RowConverter::new(vec![SortField::new_with_options( |
472 | | DataType::RunEndEncoded( |
473 | | Arc::new(arrow_schema::Field::new("run_ends", DataType::Int32, false)), |
474 | | Arc::new(arrow_schema::Field::new("values", DataType::Utf8, true)), |
475 | | ), |
476 | | arrow_schema::SortOptions { |
477 | | descending: true, |
478 | | nulls_first: true, |
479 | | }, |
480 | | )]) |
481 | | .unwrap(); |
482 | | |
483 | | let rows1_desc = converter_desc |
484 | | .convert_columns(&[Arc::new(array1.clone())]) |
485 | | .unwrap(); |
486 | | let rows2_desc = converter_desc |
487 | | .convert_columns(&[Arc::new(array2.clone())]) |
488 | | .unwrap(); |
489 | | let rows3_desc = converter_desc |
490 | | .convert_columns(&[Arc::new(array3.clone())]) |
491 | | .unwrap(); |
492 | | |
493 | | // For descending: cherry > banana > apple |
494 | | // So row bytes should sort: rows3 < rows2 < rows1 (because byte comparison is ascending) |
495 | | assert!( |
496 | | rows3_desc.row(0) < rows2_desc.row(0), |
497 | | "cherry should come before banana in descending order (byte-wise)" |
498 | | ); |
499 | | assert!( |
500 | | rows2_desc.row(0) < rows1_desc.row(0), |
501 | | "banana should come before apple in descending order (byte-wise)" |
502 | | ); |
503 | | assert!( |
504 | | rows3_desc.row(0) < rows1_desc.row(0), |
505 | | "cherry should come before apple in descending order (byte-wise)" |
506 | | ); |
507 | | } |
508 | | |
509 | | #[test] |
510 | | fn test_run_end_encoded_null_sorting() { |
511 | | // Test null handling in sort order |
512 | | |
513 | | let array_with_nulls: RunArray<Int32Type> = vec![None, None].into_iter().collect(); |
514 | | let array_with_values: RunArray<Int32Type> = vec!["apple", "apple"].into_iter().collect(); |
515 | | |
516 | | // Test nulls_first = true |
517 | | let converter_nulls_first = RowConverter::new(vec![SortField::new_with_options( |
518 | | DataType::RunEndEncoded( |
519 | | Arc::new(arrow_schema::Field::new("run_ends", DataType::Int32, false)), |
520 | | Arc::new(arrow_schema::Field::new("values", DataType::Utf8, true)), |
521 | | ), |
522 | | arrow_schema::SortOptions { |
523 | | descending: false, |
524 | | nulls_first: true, |
525 | | }, |
526 | | )]) |
527 | | .unwrap(); |
528 | | |
529 | | let rows_nulls = converter_nulls_first |
530 | | .convert_columns(&[Arc::new(array_with_nulls.clone())]) |
531 | | .unwrap(); |
532 | | let rows_values = converter_nulls_first |
533 | | .convert_columns(&[Arc::new(array_with_values.clone())]) |
534 | | .unwrap(); |
535 | | |
536 | | // nulls should come before values when nulls_first = true |
537 | | assert!( |
538 | | rows_nulls.row(0) < rows_values.row(0), |
539 | | "nulls should come before values when nulls_first=true" |
540 | | ); |
541 | | |
542 | | // Test nulls_first = false |
543 | | let converter_nulls_last = RowConverter::new(vec![SortField::new_with_options( |
544 | | DataType::RunEndEncoded( |
545 | | Arc::new(arrow_schema::Field::new("run_ends", DataType::Int32, false)), |
546 | | Arc::new(arrow_schema::Field::new("values", DataType::Utf8, true)), |
547 | | ), |
548 | | arrow_schema::SortOptions { |
549 | | descending: false, |
550 | | nulls_first: false, |
551 | | }, |
552 | | )]) |
553 | | .unwrap(); |
554 | | |
555 | | let rows_nulls_last = converter_nulls_last |
556 | | .convert_columns(&[Arc::new(array_with_nulls.clone())]) |
557 | | .unwrap(); |
558 | | let rows_values_last = converter_nulls_last |
559 | | .convert_columns(&[Arc::new(array_with_values.clone())]) |
560 | | .unwrap(); |
561 | | |
562 | | // values should come before nulls when nulls_first = false |
563 | | assert!( |
564 | | rows_values_last.row(0) < rows_nulls_last.row(0), |
565 | | "values should come before nulls when nulls_first=false" |
566 | | ); |
567 | | } |
568 | | |
569 | | #[test] |
570 | | fn test_run_end_encoded_mixed_sorting() { |
571 | | // Test sorting with mixed values and nulls to ensure complex scenarios work |
572 | | |
573 | | let array1: RunArray<Int32Type> = vec![Some("apple"), None].into_iter().collect(); |
574 | | let array2: RunArray<Int32Type> = vec![None, Some("banana")].into_iter().collect(); |
575 | | let array3: RunArray<Int32Type> = |
576 | | vec![Some("cherry"), Some("cherry")].into_iter().collect(); |
577 | | |
578 | | let converter = RowConverter::new(vec![SortField::new_with_options( |
579 | | DataType::RunEndEncoded( |
580 | | Arc::new(arrow_schema::Field::new("run_ends", DataType::Int32, false)), |
581 | | Arc::new(arrow_schema::Field::new("values", DataType::Utf8, true)), |
582 | | ), |
583 | | arrow_schema::SortOptions { |
584 | | descending: false, |
585 | | nulls_first: true, |
586 | | }, |
587 | | )]) |
588 | | .unwrap(); |
589 | | |
590 | | let rows1 = converter.convert_columns(&[Arc::new(array1)]).unwrap(); |
591 | | let rows2 = converter.convert_columns(&[Arc::new(array2)]).unwrap(); |
592 | | let rows3 = converter.convert_columns(&[Arc::new(array3)]).unwrap(); |
593 | | |
594 | | // With nulls_first=true, ascending: |
595 | | // Row 0: array1[0]="apple", array2[0]=null, array3[0]="cherry" -> null < apple < cherry |
596 | | // Row 1: array1[1]=null, array2[1]="banana", array3[1]="cherry" -> null < banana < cherry |
597 | | |
598 | | // Compare first rows: null < apple < cherry |
599 | | assert!(rows2.row(0) < rows1.row(0), "null should come before apple"); |
600 | | assert!( |
601 | | rows1.row(0) < rows3.row(0), |
602 | | "apple should come before cherry" |
603 | | ); |
604 | | |
605 | | // Compare second rows: null < banana < cherry |
606 | | assert!( |
607 | | rows1.row(1) < rows2.row(1), |
608 | | "null should come before banana" |
609 | | ); |
610 | | assert!( |
611 | | rows2.row(1) < rows3.row(1), |
612 | | "banana should come before cherry" |
613 | | ); |
614 | | } |
615 | | |
616 | | #[test] |
617 | | fn test_run_end_encoded_empty() { |
618 | | // Test converting / decoding an empty RunEndEncodedArray |
619 | | let values: Vec<&str> = vec![]; |
620 | | let array: RunArray<Int32Type> = values.into_iter().collect(); |
621 | | |
622 | | let converter = RowConverter::new(vec![SortField::new(DataType::RunEndEncoded( |
623 | | Arc::new(arrow_schema::Field::new("run_ends", DataType::Int32, false)), |
624 | | Arc::new(arrow_schema::Field::new("values", DataType::Utf8, true)), |
625 | | ))]) |
626 | | .unwrap(); |
627 | | |
628 | | let rows = converter.convert_columns(&[Arc::new(array)]).unwrap(); |
629 | | assert_eq!(rows.num_rows(), 0); |
630 | | |
631 | | // Likewise converting empty rows should yield an empty RunEndEncodedArray |
632 | | let arrays = converter.convert_rows(&rows).unwrap(); |
633 | | assert_eq!(arrays.len(), 1); |
634 | | // Verify both columns round-trip correctly |
635 | | let result_ree = arrays[0].as_run::<Int32Type>(); |
636 | | assert_eq!(result_ree.len(), 0); |
637 | | } |
638 | | } |