/Users/andrewlamb/Software/arrow-rs/arrow-array/src/array/run_array.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use std::any::Any; |
19 | | use std::sync::Arc; |
20 | | |
21 | | use arrow_buffer::{ArrowNativeType, BooleanBufferBuilder, NullBuffer, RunEndBuffer}; |
22 | | use arrow_data::{ArrayData, ArrayDataBuilder}; |
23 | | use arrow_schema::{ArrowError, DataType, Field}; |
24 | | |
25 | | use crate::{ |
26 | | builder::StringRunBuilder, |
27 | | make_array, |
28 | | run_iterator::RunArrayIter, |
29 | | types::{Int16Type, Int32Type, Int64Type, RunEndIndexType}, |
30 | | Array, ArrayAccessor, ArrayRef, PrimitiveArray, |
31 | | }; |
32 | | |
/// An array of [run-end encoded values](https://arrow.apache.org/docs/format/Columnar.html#run-end-encoded-layout)
///
/// This encoding is a variation on [run-length encoding (RLE)](https://en.wikipedia.org/wiki/Run-length_encoding)
/// and is good for representing data containing the same values repeated consecutively.
///
/// [`RunArray`] contains a `run_ends` array and a `values` array of the same length.
/// The `run_ends` array stores the indexes at which each run ends. The `values` array
/// stores the value of each run. The example below illustrates how a logical array is
/// represented in a [`RunArray`]
///
///
/// ```text
/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─┐
///   ┌─────────────────┐  ┌─────────┐       ┌─────────────────┐
/// │ │        A        │  │    2    │ │     │        A        │
///   ├─────────────────┤  ├─────────┤       ├─────────────────┤
/// │ │        D        │  │    3    │ │     │        A        │    run length of 'A' = run_ends[0] - 0 = 2
///   ├─────────────────┤  ├─────────┤       ├─────────────────┤
/// │ │        B        │  │    6    │ │     │        D        │    run length of 'D' = run_ends[1] - run_ends[0] = 1
///   └─────────────────┘  └─────────┘       ├─────────────────┤
/// │        values          run_ends  │     │        B        │
///                                          ├─────────────────┤
/// └ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─┘     │        B        │
///                                          ├─────────────────┤
///                RunArray                  │        B        │    run length of 'B' = run_ends[2] - run_ends[1] = 3
///               length = 3                 └─────────────────┘
///
///                                             Logical array
///                                                Contents
/// ```
pub struct RunArray<R: RunEndIndexType> {
    // Data type reported by `Array::data_type`; the `From<ArrayData>` conversion
    // only accepts `DataType::RunEndEncoded`.
    data_type: DataType,
    // Run ends together with the logical offset/length of this (possibly sliced) array.
    run_ends: RunEndBuffer<R::Native>,
    // One value per run; slicing the `RunArray` does not slice this array.
    values: ArrayRef,
}
68 | | |
69 | | impl<R: RunEndIndexType> Clone for RunArray<R> { |
70 | 0 | fn clone(&self) -> Self { |
71 | 0 | Self { |
72 | 0 | data_type: self.data_type.clone(), |
73 | 0 | run_ends: self.run_ends.clone(), |
74 | 0 | values: self.values.clone(), |
75 | 0 | } |
76 | 0 | } |
77 | | } |
78 | | |
79 | | impl<R: RunEndIndexType> RunArray<R> { |
80 | | /// Calculates the logical length of the array encoded |
81 | | /// by the given run_ends array. |
82 | 0 | pub fn logical_len(run_ends: &PrimitiveArray<R>) -> usize { |
83 | 0 | let len = run_ends.len(); |
84 | 0 | if len == 0 { |
85 | 0 | return 0; |
86 | 0 | } |
87 | 0 | run_ends.value(len - 1).as_usize() |
88 | 0 | } |
89 | | |
90 | | /// Attempts to create RunArray using given run_ends (index where a run ends) |
91 | | /// and the values (value of the run). Returns an error if the given data is not compatible |
92 | | /// with RunEndEncoded specification. |
93 | 0 | pub fn try_new(run_ends: &PrimitiveArray<R>, values: &dyn Array) -> Result<Self, ArrowError> { |
94 | 0 | let run_ends_type = run_ends.data_type().clone(); |
95 | 0 | let values_type = values.data_type().clone(); |
96 | 0 | let ree_array_type = DataType::RunEndEncoded( |
97 | 0 | Arc::new(Field::new("run_ends", run_ends_type, false)), |
98 | 0 | Arc::new(Field::new("values", values_type, true)), |
99 | 0 | ); |
100 | 0 | let len = RunArray::logical_len(run_ends); |
101 | 0 | let builder = ArrayDataBuilder::new(ree_array_type) |
102 | 0 | .len(len) |
103 | 0 | .add_child_data(run_ends.to_data()) |
104 | 0 | .add_child_data(values.to_data()); |
105 | | |
106 | | // `build_unchecked` is used to avoid recursive validation of child arrays. |
107 | 0 | let array_data = unsafe { builder.build_unchecked() }; |
108 | | |
109 | | // Safety: `validate_data` checks below |
110 | | // 1. The given array data has exactly two child arrays. |
111 | | // 2. The first child array (run_ends) has valid data type. |
112 | | // 3. run_ends array does not have null values |
113 | | // 4. run_ends array has non-zero and strictly increasing values. |
114 | | // 5. The length of run_ends array and values array are the same. |
115 | 0 | array_data.validate_data()?; |
116 | | |
117 | 0 | Ok(array_data.into()) |
118 | 0 | } |
119 | | |
120 | | /// Returns a reference to [`RunEndBuffer`] |
121 | 0 | pub fn run_ends(&self) -> &RunEndBuffer<R::Native> { |
122 | 0 | &self.run_ends |
123 | 0 | } |
124 | | |
125 | | /// Returns a reference to values array |
126 | | /// |
127 | | /// Note: any slicing of this [`RunArray`] array is not applied to the returned array |
128 | | /// and must be handled separately |
129 | 0 | pub fn values(&self) -> &ArrayRef { |
130 | 0 | &self.values |
131 | 0 | } |
132 | | |
133 | | /// Returns the physical index at which the array slice starts. |
134 | 0 | pub fn get_start_physical_index(&self) -> usize { |
135 | 0 | self.run_ends.get_start_physical_index() |
136 | 0 | } |
137 | | |
138 | | /// Returns the physical index at which the array slice ends. |
139 | 0 | pub fn get_end_physical_index(&self) -> usize { |
140 | 0 | self.run_ends.get_end_physical_index() |
141 | 0 | } |
142 | | |
143 | | /// Downcast this [`RunArray`] to a [`TypedRunArray`] |
144 | | /// |
145 | | /// ``` |
146 | | /// use arrow_array::{Array, ArrayAccessor, RunArray, StringArray, types::Int32Type}; |
147 | | /// |
148 | | /// let orig = [Some("a"), Some("b"), None]; |
149 | | /// let run_array = RunArray::<Int32Type>::from_iter(orig); |
150 | | /// let typed = run_array.downcast::<StringArray>().unwrap(); |
151 | | /// assert_eq!(typed.value(0), "a"); |
152 | | /// assert_eq!(typed.value(1), "b"); |
153 | | /// assert!(typed.values().is_null(2)); |
154 | | /// ``` |
155 | | /// |
156 | | pub fn downcast<V: 'static>(&self) -> Option<TypedRunArray<'_, R, V>> { |
157 | | let values = self.values.as_any().downcast_ref()?; |
158 | | Some(TypedRunArray { |
159 | | run_array: self, |
160 | | values, |
161 | | }) |
162 | | } |
163 | | |
164 | | /// Returns index to the physical array for the given index to the logical array. |
165 | | /// This function adjusts the input logical index based on `ArrayData::offset` |
166 | | /// Performs a binary search on the run_ends array for the input index. |
167 | | /// |
168 | | /// The result is arbitrary if `logical_index >= self.len()` |
169 | 0 | pub fn get_physical_index(&self, logical_index: usize) -> usize { |
170 | 0 | self.run_ends.get_physical_index(logical_index) |
171 | 0 | } |
172 | | |
173 | | /// Returns the physical indices of the input logical indices. Returns error if any of the logical |
174 | | /// index cannot be converted to physical index. The logical indices are sorted and iterated along |
175 | | /// with run_ends array to find matching physical index. The approach used here was chosen over |
176 | | /// finding physical index for each logical index using binary search using the function |
177 | | /// `get_physical_index`. Running benchmarks on both approaches showed that the approach used here |
178 | | /// scaled well for larger inputs. |
179 | | /// See <https://github.com/apache/arrow-rs/pull/3622#issuecomment-1407753727> for more details. |
180 | | #[inline] |
181 | 0 | pub fn get_physical_indices<I>(&self, logical_indices: &[I]) -> Result<Vec<usize>, ArrowError> |
182 | 0 | where |
183 | 0 | I: ArrowNativeType, |
184 | | { |
185 | 0 | let len = self.run_ends().len(); |
186 | 0 | let offset = self.run_ends().offset(); |
187 | | |
188 | 0 | let indices_len = logical_indices.len(); |
189 | | |
190 | 0 | if indices_len == 0 { |
191 | 0 | return Ok(vec![]); |
192 | 0 | } |
193 | | |
194 | | // `ordered_indices` store index into `logical_indices` and can be used |
195 | | // to iterate `logical_indices` in sorted order. |
196 | 0 | let mut ordered_indices: Vec<usize> = (0..indices_len).collect(); |
197 | | |
198 | | // Instead of sorting `logical_indices` directly, sort the `ordered_indices` |
199 | | // whose values are index of `logical_indices` |
200 | 0 | ordered_indices.sort_unstable_by(|lhs, rhs| { |
201 | 0 | logical_indices[*lhs] |
202 | 0 | .partial_cmp(&logical_indices[*rhs]) |
203 | 0 | .unwrap() |
204 | 0 | }); |
205 | | |
206 | | // Return early if all the logical indices cannot be converted to physical indices. |
207 | 0 | let largest_logical_index = logical_indices[*ordered_indices.last().unwrap()].as_usize(); |
208 | 0 | if largest_logical_index >= len { |
209 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
210 | 0 | "Cannot convert all logical indices to physical indices. The logical index cannot be converted is {largest_logical_index}.", |
211 | 0 | ))); |
212 | 0 | } |
213 | | |
214 | | // Skip some physical indices based on offset. |
215 | 0 | let skip_value = self.get_start_physical_index(); |
216 | | |
217 | 0 | let mut physical_indices = vec![0; indices_len]; |
218 | | |
219 | 0 | let mut ordered_index = 0_usize; |
220 | 0 | for (physical_index, run_end) in self.run_ends.values().iter().enumerate().skip(skip_value) |
221 | | { |
222 | | // Get the run end index (relative to offset) of current physical index |
223 | 0 | let run_end_value = run_end.as_usize() - offset; |
224 | | |
225 | | // All the `logical_indices` that are less than current run end index |
226 | | // belongs to current physical index. |
227 | 0 | while ordered_index < indices_len |
228 | 0 | && logical_indices[ordered_indices[ordered_index]].as_usize() < run_end_value |
229 | 0 | { |
230 | 0 | physical_indices[ordered_indices[ordered_index]] = physical_index; |
231 | 0 | ordered_index += 1; |
232 | 0 | } |
233 | | } |
234 | | |
235 | | // If there are input values >= run_ends.last_value then we'll not be able to convert |
236 | | // all logical indices to physical indices. |
237 | 0 | if ordered_index < logical_indices.len() { |
238 | 0 | let logical_index = logical_indices[ordered_indices[ordered_index]].as_usize(); |
239 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
240 | 0 | "Cannot convert all logical indices to physical indices. The logical index cannot be converted is {logical_index}.", |
241 | 0 | ))); |
242 | 0 | } |
243 | 0 | Ok(physical_indices) |
244 | 0 | } |
245 | | |
246 | | /// Returns a zero-copy slice of this array with the indicated offset and length. |
247 | 0 | pub fn slice(&self, offset: usize, length: usize) -> Self { |
248 | 0 | Self { |
249 | 0 | data_type: self.data_type.clone(), |
250 | 0 | run_ends: self.run_ends.slice(offset, length), |
251 | 0 | values: self.values.clone(), |
252 | 0 | } |
253 | 0 | } |
254 | | } |
255 | | |
256 | | impl<R: RunEndIndexType> From<ArrayData> for RunArray<R> { |
257 | | // The method assumes the caller already validated the data using `ArrayData::validate_data()` |
258 | 0 | fn from(data: ArrayData) -> Self { |
259 | 0 | match data.data_type() { |
260 | 0 | DataType::RunEndEncoded(_, _) => {} |
261 | | _ => { |
262 | 0 | panic!("Invalid data type for RunArray. The data type should be DataType::RunEndEncoded"); |
263 | | } |
264 | | } |
265 | | |
266 | | // Safety |
267 | | // ArrayData is valid |
268 | 0 | let child = &data.child_data()[0]; |
269 | 0 | assert_eq!(child.data_type(), &R::DATA_TYPE, "Incorrect run ends type"); |
270 | 0 | let run_ends = unsafe { |
271 | 0 | let scalar = child.buffers()[0].clone().into(); |
272 | 0 | RunEndBuffer::new_unchecked(scalar, data.offset(), data.len()) |
273 | | }; |
274 | | |
275 | 0 | let values = make_array(data.child_data()[1].clone()); |
276 | 0 | Self { |
277 | 0 | data_type: data.data_type().clone(), |
278 | 0 | run_ends, |
279 | 0 | values, |
280 | 0 | } |
281 | 0 | } |
282 | | } |
283 | | |
284 | | impl<R: RunEndIndexType> From<RunArray<R>> for ArrayData { |
285 | 0 | fn from(array: RunArray<R>) -> Self { |
286 | 0 | let len = array.run_ends.len(); |
287 | 0 | let offset = array.run_ends.offset(); |
288 | | |
289 | 0 | let run_ends = ArrayDataBuilder::new(R::DATA_TYPE) |
290 | 0 | .len(array.run_ends.values().len()) |
291 | 0 | .buffers(vec![array.run_ends.into_inner().into_inner()]); |
292 | | |
293 | 0 | let run_ends = unsafe { run_ends.build_unchecked() }; |
294 | | |
295 | 0 | let builder = ArrayDataBuilder::new(array.data_type) |
296 | 0 | .len(len) |
297 | 0 | .offset(offset) |
298 | 0 | .child_data(vec![run_ends, array.values.to_data()]); |
299 | | |
300 | 0 | unsafe { builder.build_unchecked() } |
301 | 0 | } |
302 | | } |
303 | | |
304 | | impl<T: RunEndIndexType> Array for RunArray<T> { |
305 | 0 | fn as_any(&self) -> &dyn Any { |
306 | 0 | self |
307 | 0 | } |
308 | | |
309 | 0 | fn to_data(&self) -> ArrayData { |
310 | 0 | self.clone().into() |
311 | 0 | } |
312 | | |
313 | 0 | fn into_data(self) -> ArrayData { |
314 | 0 | self.into() |
315 | 0 | } |
316 | | |
317 | 0 | fn data_type(&self) -> &DataType { |
318 | 0 | &self.data_type |
319 | 0 | } |
320 | | |
321 | 0 | fn slice(&self, offset: usize, length: usize) -> ArrayRef { |
322 | 0 | Arc::new(self.slice(offset, length)) |
323 | 0 | } |
324 | | |
325 | 0 | fn len(&self) -> usize { |
326 | 0 | self.run_ends.len() |
327 | 0 | } |
328 | | |
329 | 0 | fn is_empty(&self) -> bool { |
330 | 0 | self.run_ends.is_empty() |
331 | 0 | } |
332 | | |
333 | 0 | fn shrink_to_fit(&mut self) { |
334 | 0 | self.run_ends.shrink_to_fit(); |
335 | 0 | self.values.shrink_to_fit(); |
336 | 0 | } |
337 | | |
338 | 0 | fn offset(&self) -> usize { |
339 | 0 | self.run_ends.offset() |
340 | 0 | } |
341 | | |
342 | 0 | fn nulls(&self) -> Option<&NullBuffer> { |
343 | 0 | None |
344 | 0 | } |
345 | | |
346 | 0 | fn logical_nulls(&self) -> Option<NullBuffer> { |
347 | 0 | let len = self.len(); |
348 | 0 | let nulls = self.values.logical_nulls()?; |
349 | 0 | let mut out = BooleanBufferBuilder::new(len); |
350 | 0 | let offset = self.run_ends.offset(); |
351 | 0 | let mut valid_start = 0; |
352 | 0 | let mut last_end = 0; |
353 | 0 | for (idx, end) in self.run_ends.values().iter().enumerate() { |
354 | 0 | let end = end.as_usize(); |
355 | 0 | if end < offset { |
356 | 0 | continue; |
357 | 0 | } |
358 | 0 | let end = (end - offset).min(len); |
359 | 0 | if nulls.is_null(idx) { |
360 | 0 | if valid_start < last_end { |
361 | 0 | out.append_n(last_end - valid_start, true); |
362 | 0 | } |
363 | 0 | out.append_n(end - last_end, false); |
364 | 0 | valid_start = end; |
365 | 0 | } |
366 | 0 | last_end = end; |
367 | 0 | if end == len { |
368 | 0 | break; |
369 | 0 | } |
370 | | } |
371 | 0 | if valid_start < len { |
372 | 0 | out.append_n(len - valid_start, true) |
373 | 0 | } |
374 | | // Sanity check |
375 | 0 | assert_eq!(out.len(), len); |
376 | 0 | Some(out.finish().into()) |
377 | 0 | } |
378 | | |
379 | 0 | fn is_nullable(&self) -> bool { |
380 | 0 | !self.is_empty() && self.values.is_nullable() |
381 | 0 | } |
382 | | |
383 | 0 | fn get_buffer_memory_size(&self) -> usize { |
384 | 0 | self.run_ends.inner().inner().capacity() + self.values.get_buffer_memory_size() |
385 | 0 | } |
386 | | |
387 | 0 | fn get_array_memory_size(&self) -> usize { |
388 | 0 | std::mem::size_of::<Self>() |
389 | 0 | + self.run_ends.inner().inner().capacity() |
390 | 0 | + self.values.get_array_memory_size() |
391 | 0 | } |
392 | | } |
393 | | |
394 | | impl<R: RunEndIndexType> std::fmt::Debug for RunArray<R> { |
395 | 0 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { |
396 | 0 | writeln!( |
397 | 0 | f, |
398 | 0 | "RunArray {{run_ends: {:?}, values: {:?}}}", |
399 | 0 | self.run_ends.values(), |
400 | | self.values |
401 | | ) |
402 | 0 | } |
403 | | } |
404 | | |
405 | | /// Constructs a `RunArray` from an iterator of optional strings. |
406 | | /// |
407 | | /// # Example: |
408 | | /// ``` |
409 | | /// use arrow_array::{RunArray, PrimitiveArray, StringArray, types::Int16Type}; |
410 | | /// |
411 | | /// let test = vec!["a", "a", "b", "c", "c"]; |
412 | | /// let array: RunArray<Int16Type> = test |
413 | | /// .iter() |
414 | | /// .map(|&x| if x == "b" { None } else { Some(x) }) |
415 | | /// .collect(); |
416 | | /// assert_eq!( |
417 | | /// "RunArray {run_ends: [2, 3, 5], values: StringArray\n[\n \"a\",\n null,\n \"c\",\n]}\n", |
418 | | /// format!("{:?}", array) |
419 | | /// ); |
420 | | /// ``` |
421 | | impl<'a, T: RunEndIndexType> FromIterator<Option<&'a str>> for RunArray<T> { |
422 | | fn from_iter<I: IntoIterator<Item = Option<&'a str>>>(iter: I) -> Self { |
423 | | let it = iter.into_iter(); |
424 | | let (lower, _) = it.size_hint(); |
425 | | let mut builder = StringRunBuilder::with_capacity(lower, 256); |
426 | | it.for_each(|i| { |
427 | | builder.append_option(i); |
428 | | }); |
429 | | |
430 | | builder.finish() |
431 | | } |
432 | | } |
433 | | |
434 | | /// Constructs a `RunArray` from an iterator of strings. |
435 | | /// |
436 | | /// # Example: |
437 | | /// |
438 | | /// ``` |
439 | | /// use arrow_array::{RunArray, PrimitiveArray, StringArray, types::Int16Type}; |
440 | | /// |
441 | | /// let test = vec!["a", "a", "b", "c"]; |
442 | | /// let array: RunArray<Int16Type> = test.into_iter().collect(); |
443 | | /// assert_eq!( |
444 | | /// "RunArray {run_ends: [2, 3, 4], values: StringArray\n[\n \"a\",\n \"b\",\n \"c\",\n]}\n", |
445 | | /// format!("{:?}", array) |
446 | | /// ); |
447 | | /// ``` |
448 | | impl<'a, T: RunEndIndexType> FromIterator<&'a str> for RunArray<T> { |
449 | | fn from_iter<I: IntoIterator<Item = &'a str>>(iter: I) -> Self { |
450 | | let it = iter.into_iter(); |
451 | | let (lower, _) = it.size_hint(); |
452 | | let mut builder = StringRunBuilder::with_capacity(lower, 256); |
453 | | it.for_each(|i| { |
454 | | builder.append_value(i); |
455 | | }); |
456 | | |
457 | | builder.finish() |
458 | | } |
459 | | } |
460 | | |
/// A [`RunArray`] with `i16` run ends
///
/// # Example: Using `collect`
/// ```
/// # use arrow_array::{Array, Int16RunArray, Int16Array, StringArray};
/// # use std::sync::Arc;
///
/// let array: Int16RunArray = vec!["a", "a", "b", "c", "c"].into_iter().collect();
/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
/// assert_eq!(array.run_ends().values(), &[2, 3, 5]);
/// assert_eq!(array.values(), &values);
/// ```
pub type Int16RunArray = RunArray<Int16Type>;
475 | | |
/// A [`RunArray`] with `i32` run ends
///
/// # Example: Using `collect`
/// ```
/// # use arrow_array::{Array, Int32RunArray, Int32Array, StringArray};
/// # use std::sync::Arc;
///
/// let array: Int32RunArray = vec!["a", "a", "b", "c", "c"].into_iter().collect();
/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
/// assert_eq!(array.run_ends().values(), &[2, 3, 5]);
/// assert_eq!(array.values(), &values);
/// ```
pub type Int32RunArray = RunArray<Int32Type>;
490 | | |
/// A [`RunArray`] with `i64` run ends
///
/// # Example: Using `collect`
/// ```
/// # use arrow_array::{Array, Int64RunArray, Int64Array, StringArray};
/// # use std::sync::Arc;
///
/// let array: Int64RunArray = vec!["a", "a", "b", "c", "c"].into_iter().collect();
/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
/// assert_eq!(array.run_ends().values(), &[2, 3, 5]);
/// assert_eq!(array.values(), &values);
/// ```
pub type Int64RunArray = RunArray<Int64Type>;
505 | | |
/// A [`RunArray`] typed on its child values array
///
/// Implements [`ArrayAccessor`] and [`IntoIterator`] allowing fast access to its elements
///
/// ```
/// use arrow_array::{RunArray, StringArray, types::Int32Type};
///
/// let orig = ["a", "b", "a", "b"];
/// let ree_array = RunArray::<Int32Type>::from_iter(orig);
///
/// // `TypedRunArray` allows you to access the values directly
/// let typed = ree_array.downcast::<StringArray>().unwrap();
///
/// for (maybe_val, orig) in typed.into_iter().zip(orig) {
///     assert_eq!(maybe_val.unwrap(), orig)
/// }
/// ```
pub struct TypedRunArray<'a, R: RunEndIndexType, V> {
    /// The run array
    run_array: &'a RunArray<R>,

    /// The values of the run_array, downcast to the concrete type `V`
    values: &'a V,
}
530 | | |
// Manually implement `Clone` to avoid `V: Clone` type constraint.
// The struct only holds two shared references, so it is also `Copy` (see below)
// and cloning is a plain bitwise copy of `*self`.
impl<R: RunEndIndexType, V> Clone for TypedRunArray<'_, R, V> {
    fn clone(&self) -> Self {
        *self
    }
}

// Implemented by hand for the same reason as `Clone`: no bound on `V` is wanted.
impl<R: RunEndIndexType, V> Copy for TypedRunArray<'_, R, V> {}
539 | | |
540 | | impl<R: RunEndIndexType, V> std::fmt::Debug for TypedRunArray<'_, R, V> { |
541 | | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { |
542 | | writeln!(f, "TypedRunArray({:?})", self.run_array) |
543 | | } |
544 | | } |
545 | | |
546 | | impl<'a, R: RunEndIndexType, V> TypedRunArray<'a, R, V> { |
547 | | /// Returns the run_ends of this [`TypedRunArray`] |
548 | | pub fn run_ends(&self) -> &'a RunEndBuffer<R::Native> { |
549 | | self.run_array.run_ends() |
550 | | } |
551 | | |
552 | | /// Returns the values of this [`TypedRunArray`] |
553 | | pub fn values(&self) -> &'a V { |
554 | | self.values |
555 | | } |
556 | | |
557 | | /// Returns the run array of this [`TypedRunArray`] |
558 | | pub fn run_array(&self) -> &'a RunArray<R> { |
559 | | self.run_array |
560 | | } |
561 | | } |
562 | | |
// Every method forwards to the wrapped `RunArray`; the typed `values` reference
// is not consulted here.
impl<R: RunEndIndexType, V: Sync> Array for TypedRunArray<'_, R, V> {
    // Returns the wrapped `RunArray` (not the `TypedRunArray` wrapper itself),
    // so downcasting the result yields a `RunArray<R>`.
    fn as_any(&self) -> &dyn Any {
        self.run_array
    }

    fn to_data(&self) -> ArrayData {
        self.run_array.to_data()
    }

    // `run_array` is a shared reference, so the wrapped array itself is not consumed.
    fn into_data(self) -> ArrayData {
        self.run_array.into_data()
    }

    fn data_type(&self) -> &DataType {
        self.run_array.data_type()
    }

    // The result is a plain (untyped) sliced `RunArray` behind an `ArrayRef`.
    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
        Arc::new(self.run_array.slice(offset, length))
    }

    fn len(&self) -> usize {
        self.run_array.len()
    }

    fn is_empty(&self) -> bool {
        self.run_array.is_empty()
    }

    fn offset(&self) -> usize {
        self.run_array.offset()
    }

    fn nulls(&self) -> Option<&NullBuffer> {
        self.run_array.nulls()
    }

    fn logical_nulls(&self) -> Option<NullBuffer> {
        self.run_array.logical_nulls()
    }

    fn logical_null_count(&self) -> usize {
        self.run_array.logical_null_count()
    }

    fn is_nullable(&self) -> bool {
        self.run_array.is_nullable()
    }

    fn get_buffer_memory_size(&self) -> usize {
        self.run_array.get_buffer_memory_size()
    }

    fn get_array_memory_size(&self) -> usize {
        self.run_array.get_array_memory_size()
    }
}
620 | | |
621 | | // Array accessor converts the index of logical array to the index of the physical array |
622 | | // using binary search. The time complexity is O(log N) where N is number of runs. |
623 | | impl<'a, R, V> ArrayAccessor for TypedRunArray<'a, R, V> |
624 | | where |
625 | | R: RunEndIndexType, |
626 | | V: Sync + Send, |
627 | | &'a V: ArrayAccessor, |
628 | | <&'a V as ArrayAccessor>::Item: Default, |
629 | | { |
630 | | type Item = <&'a V as ArrayAccessor>::Item; |
631 | | |
632 | | fn value(&self, logical_index: usize) -> Self::Item { |
633 | | assert!( |
634 | | logical_index < self.len(), |
635 | | "Trying to access an element at index {} from a TypedRunArray of length {}", |
636 | | logical_index, |
637 | | self.len() |
638 | | ); |
639 | | unsafe { self.value_unchecked(logical_index) } |
640 | | } |
641 | | |
642 | | unsafe fn value_unchecked(&self, logical_index: usize) -> Self::Item { |
643 | | let physical_index = self.run_array.get_physical_index(logical_index); |
644 | | self.values().value_unchecked(physical_index) |
645 | | } |
646 | | } |
647 | | |
648 | | impl<'a, R, V> IntoIterator for TypedRunArray<'a, R, V> |
649 | | where |
650 | | R: RunEndIndexType, |
651 | | V: Sync + Send, |
652 | | &'a V: ArrayAccessor, |
653 | | <&'a V as ArrayAccessor>::Item: Default, |
654 | | { |
655 | | type Item = Option<<&'a V as ArrayAccessor>::Item>; |
656 | | type IntoIter = RunArrayIter<'a, R, V>; |
657 | | |
658 | | fn into_iter(self) -> Self::IntoIter { |
659 | | RunArrayIter::new(self) |
660 | | } |
661 | | } |
662 | | |
663 | | #[cfg(test)] |
664 | | mod tests { |
665 | | use rand::rng; |
666 | | use rand::seq::SliceRandom; |
667 | | use rand::Rng; |
668 | | |
669 | | use super::*; |
670 | | use crate::builder::PrimitiveRunBuilder; |
671 | | use crate::cast::AsArray; |
672 | | use crate::types::{Int8Type, UInt32Type}; |
673 | | use crate::{Int16Array, Int32Array, StringArray}; |
674 | | |
675 | | fn build_input_array(size: usize) -> Vec<Option<i32>> { |
676 | | // The input array is created by shuffling and repeating |
677 | | // the seed values random number of times. |
678 | | let mut seed: Vec<Option<i32>> = vec![ |
679 | | None, |
680 | | None, |
681 | | None, |
682 | | Some(1), |
683 | | Some(2), |
684 | | Some(3), |
685 | | Some(4), |
686 | | Some(5), |
687 | | Some(6), |
688 | | Some(7), |
689 | | Some(8), |
690 | | Some(9), |
691 | | ]; |
692 | | let mut result: Vec<Option<i32>> = Vec::with_capacity(size); |
693 | | let mut ix = 0; |
694 | | let mut rng = rng(); |
695 | | // run length can go up to 8. Cap the max run length for smaller arrays to size / 2. |
696 | | let max_run_length = 8_usize.min(1_usize.max(size / 2)); |
697 | | while result.len() < size { |
698 | | // shuffle the seed array if all the values are iterated. |
699 | | if ix == 0 { |
700 | | seed.shuffle(&mut rng); |
701 | | } |
702 | | // repeat the items between 1 and 8 times. Cap the length for smaller sized arrays |
703 | | let num = max_run_length.min(rng.random_range(1..=max_run_length)); |
704 | | for _ in 0..num { |
705 | | result.push(seed[ix]); |
706 | | } |
707 | | ix += 1; |
708 | | if ix == seed.len() { |
709 | | ix = 0 |
710 | | } |
711 | | } |
712 | | result.resize(size, None); |
713 | | result |
714 | | } |
715 | | |
716 | | // Asserts that `logical_array[logical_indices[*]] == physical_array[physical_indices[*]]` |
717 | | fn compare_logical_and_physical_indices( |
718 | | logical_indices: &[u32], |
719 | | logical_array: &[Option<i32>], |
720 | | physical_indices: &[usize], |
721 | | physical_array: &PrimitiveArray<Int32Type>, |
722 | | ) { |
723 | | assert_eq!(logical_indices.len(), physical_indices.len()); |
724 | | |
725 | | // check value in logical index in the logical_array matches physical index in physical_array |
726 | | logical_indices |
727 | | .iter() |
728 | | .map(|f| f.as_usize()) |
729 | | .zip(physical_indices.iter()) |
730 | | .for_each(|(logical_ix, physical_ix)| { |
731 | | let expected = logical_array[logical_ix]; |
732 | | match expected { |
733 | | Some(val) => { |
734 | | assert!(physical_array.is_valid(*physical_ix)); |
735 | | let actual = physical_array.value(*physical_ix); |
736 | | assert_eq!(val, actual); |
737 | | } |
738 | | None => { |
739 | | assert!(physical_array.is_null(*physical_ix)) |
740 | | } |
741 | | }; |
742 | | }); |
743 | | } |
744 | | #[test] |
745 | | fn test_run_array() { |
746 | | // Construct a value array |
747 | | let value_data = |
748 | | PrimitiveArray::<Int8Type>::from_iter_values([10_i8, 11, 12, 13, 14, 15, 16, 17]); |
749 | | |
750 | | // Construct a run_ends array: |
751 | | let run_ends_values = [4_i16, 6, 7, 9, 13, 18, 20, 22]; |
752 | | let run_ends_data = |
753 | | PrimitiveArray::<Int16Type>::from_iter_values(run_ends_values.iter().copied()); |
754 | | |
755 | | // Construct a run ends encoded array from the above two |
756 | | let ree_array = RunArray::<Int16Type>::try_new(&run_ends_data, &value_data).unwrap(); |
757 | | |
758 | | assert_eq!(ree_array.len(), 22); |
759 | | assert_eq!(ree_array.null_count(), 0); |
760 | | |
761 | | let values = ree_array.values(); |
762 | | assert_eq!(value_data.into_data(), values.to_data()); |
763 | | assert_eq!(&DataType::Int8, values.data_type()); |
764 | | |
765 | | let run_ends = ree_array.run_ends(); |
766 | | assert_eq!(run_ends.values(), &run_ends_values); |
767 | | } |
768 | | |
769 | | #[test] |
770 | | fn test_run_array_fmt_debug() { |
771 | | let mut builder = PrimitiveRunBuilder::<Int16Type, UInt32Type>::with_capacity(3); |
772 | | builder.append_value(12345678); |
773 | | builder.append_null(); |
774 | | builder.append_value(22345678); |
775 | | let array = builder.finish(); |
776 | | assert_eq!( |
777 | | "RunArray {run_ends: [1, 2, 3], values: PrimitiveArray<UInt32>\n[\n 12345678,\n null,\n 22345678,\n]}\n", |
778 | | format!("{array:?}") |
779 | | ); |
780 | | |
781 | | let mut builder = PrimitiveRunBuilder::<Int16Type, UInt32Type>::with_capacity(20); |
782 | | for _ in 0..20 { |
783 | | builder.append_value(1); |
784 | | } |
785 | | let array = builder.finish(); |
786 | | |
787 | | assert_eq!(array.len(), 20); |
788 | | assert_eq!(array.null_count(), 0); |
789 | | assert_eq!(array.logical_null_count(), 0); |
790 | | |
791 | | assert_eq!( |
792 | | "RunArray {run_ends: [20], values: PrimitiveArray<UInt32>\n[\n 1,\n]}\n", |
793 | | format!("{array:?}") |
794 | | ); |
795 | | } |
796 | | |
797 | | #[test] |
798 | | fn test_run_array_from_iter() { |
799 | | let test = vec!["a", "a", "b", "c"]; |
800 | | let array: RunArray<Int16Type> = test |
801 | | .iter() |
802 | | .map(|&x| if x == "b" { None } else { Some(x) }) |
803 | | .collect(); |
804 | | assert_eq!( |
805 | | "RunArray {run_ends: [2, 3, 4], values: StringArray\n[\n \"a\",\n null,\n \"c\",\n]}\n", |
806 | | format!("{array:?}") |
807 | | ); |
808 | | |
809 | | assert_eq!(array.len(), 4); |
810 | | assert_eq!(array.null_count(), 0); |
811 | | assert_eq!(array.logical_null_count(), 1); |
812 | | |
813 | | let array: RunArray<Int16Type> = test.into_iter().collect(); |
814 | | assert_eq!( |
815 | | "RunArray {run_ends: [2, 3, 4], values: StringArray\n[\n \"a\",\n \"b\",\n \"c\",\n]}\n", |
816 | | format!("{array:?}") |
817 | | ); |
818 | | } |
819 | | |
820 | | #[test] |
821 | | fn test_run_array_run_ends_as_primitive_array() { |
822 | | let test = vec!["a", "b", "c", "a"]; |
823 | | let array: RunArray<Int16Type> = test.into_iter().collect(); |
824 | | |
825 | | assert_eq!(array.len(), 4); |
826 | | assert_eq!(array.null_count(), 0); |
827 | | assert_eq!(array.logical_null_count(), 0); |
828 | | |
829 | | let run_ends = array.run_ends(); |
830 | | assert_eq!(&[1, 2, 3, 4], run_ends.values()); |
831 | | } |
832 | | |
833 | | #[test] |
834 | | fn test_run_array_as_primitive_array_with_null() { |
835 | | let test = vec![Some("a"), None, Some("b"), None, None, Some("a")]; |
836 | | let array: RunArray<Int32Type> = test.into_iter().collect(); |
837 | | |
838 | | assert_eq!(array.len(), 6); |
839 | | assert_eq!(array.null_count(), 0); |
840 | | assert_eq!(array.logical_null_count(), 3); |
841 | | |
842 | | let run_ends = array.run_ends(); |
843 | | assert_eq!(&[1, 2, 3, 5, 6], run_ends.values()); |
844 | | |
845 | | let values_data = array.values(); |
846 | | assert_eq!(2, values_data.null_count()); |
847 | | assert_eq!(5, values_data.len()); |
848 | | } |
849 | | |
850 | | #[test] |
851 | | fn test_run_array_all_nulls() { |
852 | | let test = vec![None, None, None]; |
853 | | let array: RunArray<Int32Type> = test.into_iter().collect(); |
854 | | |
855 | | assert_eq!(array.len(), 3); |
856 | | assert_eq!(array.null_count(), 0); |
857 | | assert_eq!(array.logical_null_count(), 3); |
858 | | |
859 | | let run_ends = array.run_ends(); |
860 | | assert_eq!(3, run_ends.len()); |
861 | | assert_eq!(&[3], run_ends.values()); |
862 | | |
863 | | let values_data = array.values(); |
864 | | assert_eq!(1, values_data.null_count()); |
865 | | } |
866 | | |
867 | | #[test] |
868 | | fn test_run_array_try_new() { |
869 | | let values: StringArray = [Some("foo"), Some("bar"), None, Some("baz")] |
870 | | .into_iter() |
871 | | .collect(); |
872 | | let run_ends: Int32Array = [Some(1), Some(2), Some(3), Some(4)].into_iter().collect(); |
873 | | |
874 | | let array = RunArray::<Int32Type>::try_new(&run_ends, &values).unwrap(); |
875 | | assert_eq!(array.values().data_type(), &DataType::Utf8); |
876 | | |
877 | | assert_eq!(array.null_count(), 0); |
878 | | assert_eq!(array.logical_null_count(), 1); |
879 | | assert_eq!(array.len(), 4); |
880 | | assert_eq!(array.values().null_count(), 1); |
881 | | |
882 | | assert_eq!( |
883 | | "RunArray {run_ends: [1, 2, 3, 4], values: StringArray\n[\n \"foo\",\n \"bar\",\n null,\n \"baz\",\n]}\n", |
884 | | format!("{array:?}") |
885 | | ); |
886 | | } |
887 | | |
888 | | #[test] |
889 | | fn test_run_array_int16_type_definition() { |
890 | | let array: Int16RunArray = vec!["a", "a", "b", "c", "c"].into_iter().collect(); |
891 | | let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"])); |
892 | | assert_eq!(array.run_ends().values(), &[2, 3, 5]); |
893 | | assert_eq!(array.values(), &values); |
894 | | } |
895 | | |
896 | | #[test] |
897 | | fn test_run_array_empty_string() { |
898 | | let array: Int16RunArray = vec!["a", "a", "", "", "c"].into_iter().collect(); |
899 | | let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "", "c"])); |
900 | | assert_eq!(array.run_ends().values(), &[2, 4, 5]); |
901 | | assert_eq!(array.values(), &values); |
902 | | } |
903 | | |
904 | | #[test] |
905 | | fn test_run_array_length_mismatch() { |
906 | | let values: StringArray = [Some("foo"), Some("bar"), None, Some("baz")] |
907 | | .into_iter() |
908 | | .collect(); |
909 | | let run_ends: Int32Array = [Some(1), Some(2), Some(3)].into_iter().collect(); |
910 | | |
911 | | let actual = RunArray::<Int32Type>::try_new(&run_ends, &values); |
912 | | let expected = ArrowError::InvalidArgumentError("The run_ends array length should be the same as values array length. Run_ends array length is 3, values array length is 4".to_string()); |
913 | | assert_eq!(expected.to_string(), actual.err().unwrap().to_string()); |
914 | | } |
915 | | |
916 | | #[test] |
917 | | fn test_run_array_run_ends_with_null() { |
918 | | let values: StringArray = [Some("foo"), Some("bar"), Some("baz")] |
919 | | .into_iter() |
920 | | .collect(); |
921 | | let run_ends: Int32Array = [Some(1), None, Some(3)].into_iter().collect(); |
922 | | |
923 | | let actual = RunArray::<Int32Type>::try_new(&run_ends, &values); |
924 | | let expected = ArrowError::InvalidArgumentError( |
925 | | "Found null values in run_ends array. The run_ends array should not have null values." |
926 | | .to_string(), |
927 | | ); |
928 | | assert_eq!(expected.to_string(), actual.err().unwrap().to_string()); |
929 | | } |
930 | | |
931 | | #[test] |
932 | | fn test_run_array_run_ends_with_zeroes() { |
933 | | let values: StringArray = [Some("foo"), Some("bar"), Some("baz")] |
934 | | .into_iter() |
935 | | .collect(); |
936 | | let run_ends: Int32Array = [Some(0), Some(1), Some(3)].into_iter().collect(); |
937 | | |
938 | | let actual = RunArray::<Int32Type>::try_new(&run_ends, &values); |
939 | | let expected = ArrowError::InvalidArgumentError("The values in run_ends array should be strictly positive. Found value 0 at index 0 that does not match the criteria.".to_string()); |
940 | | assert_eq!(expected.to_string(), actual.err().unwrap().to_string()); |
941 | | } |
942 | | |
943 | | #[test] |
944 | | fn test_run_array_run_ends_non_increasing() { |
945 | | let values: StringArray = [Some("foo"), Some("bar"), Some("baz")] |
946 | | .into_iter() |
947 | | .collect(); |
948 | | let run_ends: Int32Array = [Some(1), Some(4), Some(4)].into_iter().collect(); |
949 | | |
950 | | let actual = RunArray::<Int32Type>::try_new(&run_ends, &values); |
951 | | let expected = ArrowError::InvalidArgumentError("The values in run_ends array should be strictly increasing. Found value 4 at index 2 with previous value 4 that does not match the criteria.".to_string()); |
952 | | assert_eq!(expected.to_string(), actual.err().unwrap().to_string()); |
953 | | } |
954 | | |
955 | | #[test] |
956 | | #[should_panic(expected = "Incorrect run ends type")] |
957 | | fn test_run_array_run_ends_data_type_mismatch() { |
958 | | let a = RunArray::<Int32Type>::from_iter(["32"]); |
959 | | let _ = RunArray::<Int64Type>::from(a.into_data()); |
960 | | } |
961 | | |
962 | | #[test] |
963 | | fn test_ree_array_accessor() { |
964 | | let input_array = build_input_array(256); |
965 | | |
966 | | // Encode the input_array as a run-end encoded (REE) array |
967 | | let mut builder = |
968 | | PrimitiveRunBuilder::<Int16Type, Int32Type>::with_capacity(input_array.len()); |
969 | | builder.extend(input_array.iter().copied()); |
970 | | let run_array = builder.finish(); |
971 | | let typed = run_array.downcast::<PrimitiveArray<Int32Type>>().unwrap(); |
972 | | |
973 | | // Access every index and check that the input value matches the value returned by the run array, or that the run array holds a null where the input is None. |
974 | | for (i, inp_val) in input_array.iter().enumerate() { |
975 | | if let Some(val) = inp_val { |
976 | | let actual = typed.value(i); |
977 | | assert_eq!(*val, actual) |
978 | | } else { |
979 | | let physical_ix = run_array.get_physical_index(i); |
980 | | assert!(typed.values().is_null(physical_ix)); |
981 | | }; |
982 | | } |
983 | | } |
984 | | |
985 | | #[test] |
986 | | #[cfg_attr(miri, ignore)] // Takes too long |
987 | | fn test_get_physical_indices() { |
988 | | // Test for logical lengths from 0 to 240 in steps of 10 |
989 | | for logical_len in (0..250).step_by(10) { |
990 | | let input_array = build_input_array(logical_len); |
991 | | |
992 | | // create run array using input_array |
993 | | let mut builder = PrimitiveRunBuilder::<Int32Type, Int32Type>::new(); |
994 | | builder.extend(input_array.clone().into_iter()); |
995 | | |
996 | | let run_array = builder.finish(); |
997 | | let physical_values_array = run_array.values().as_primitive::<Int32Type>(); |
998 | | |
999 | | // create an array consisting of all the indices repeated twice and shuffled. |
1000 | | let mut logical_indices: Vec<u32> = (0_u32..(logical_len as u32)).collect(); |
1001 | | // add same indices once more |
1002 | | logical_indices.append(&mut logical_indices.clone()); |
1003 | | let mut rng = rng(); |
1004 | | logical_indices.shuffle(&mut rng); |
1005 | | |
1006 | | let physical_indices = run_array.get_physical_indices(&logical_indices).unwrap(); |
1007 | | |
1008 | | assert_eq!(logical_indices.len(), physical_indices.len()); |
1009 | | |
1010 | | // check that the value at each logical index in input_array matches the value at the corresponding physical index in physical_values_array |
1011 | | compare_logical_and_physical_indices( |
1012 | | &logical_indices, |
1013 | | &input_array, |
1014 | | &physical_indices, |
1015 | | physical_values_array, |
1016 | | ); |
1017 | | } |
1018 | | } |
1019 | | |
1020 | | #[test] |
1021 | | #[cfg_attr(miri, ignore)] // Takes too long |
1022 | | fn test_get_physical_indices_sliced() { |
1023 | | let total_len = 80; |
1024 | | let input_array = build_input_array(total_len); |
1025 | | |
1026 | | // Encode the input_array to run array |
1027 | | let mut builder = |
1028 | | PrimitiveRunBuilder::<Int16Type, Int32Type>::with_capacity(input_array.len()); |
1029 | | builder.extend(input_array.iter().copied()); |
1030 | | let run_array = builder.finish(); |
1031 | | let physical_values_array = run_array.values().as_primitive::<Int32Type>(); |
1032 | | |
1033 | | // test for all slice lengths. |
1034 | | for slice_len in 1..=total_len { |
1035 | | // create an array consisting of all the indices repeated twice and shuffled. |
1036 | | let mut logical_indices: Vec<u32> = (0_u32..(slice_len as u32)).collect(); |
1037 | | // add same indices once more |
1038 | | logical_indices.append(&mut logical_indices.clone()); |
1039 | | let mut rng = rng(); |
1040 | | logical_indices.shuffle(&mut rng); |
1041 | | |
1042 | | // test for offset = 0 and slice length = slice_len |
1043 | | // slice the input array from which the run array was built. |
1044 | | let sliced_input_array = &input_array[0..slice_len]; |
1045 | | |
1046 | | // slice the run array |
1047 | | let sliced_run_array: RunArray<Int16Type> = |
1048 | | run_array.slice(0, slice_len).into_data().into(); |
1049 | | |
1050 | | // Get physical indices. |
1051 | | let physical_indices = sliced_run_array |
1052 | | .get_physical_indices(&logical_indices) |
1053 | | .unwrap(); |
1054 | | |
1055 | | compare_logical_and_physical_indices( |
1056 | | &logical_indices, |
1057 | | sliced_input_array, |
1058 | | &physical_indices, |
1059 | | physical_values_array, |
1060 | | ); |
1061 | | |
1062 | | // test for offset = total_len - slice_len and slice length = slice_len |
1063 | | // slice the input array from which the run array was built. |
1064 | | let sliced_input_array = &input_array[total_len - slice_len..total_len]; |
1065 | | |
1066 | | // slice the run array |
1067 | | let sliced_run_array: RunArray<Int16Type> = run_array |
1068 | | .slice(total_len - slice_len, slice_len) |
1069 | | .into_data() |
1070 | | .into(); |
1071 | | |
1072 | | // Get physical indices |
1073 | | let physical_indices = sliced_run_array |
1074 | | .get_physical_indices(&logical_indices) |
1075 | | .unwrap(); |
1076 | | |
1077 | | compare_logical_and_physical_indices( |
1078 | | &logical_indices, |
1079 | | sliced_input_array, |
1080 | | &physical_indices, |
1081 | | physical_values_array, |
1082 | | ); |
1083 | | } |
1084 | | } |
1085 | | |
1086 | | #[test] |
1087 | | fn test_logical_nulls() { |
1088 | | let run = Int32Array::from(vec![3, 6, 9, 12]); |
1089 | | let values = Int32Array::from(vec![Some(0), None, Some(1), None]); |
1090 | | let array = RunArray::try_new(&run, &values).unwrap(); |
1091 | | |
1092 | | let expected = [ |
1093 | | true, true, true, false, false, false, true, true, true, false, false, false, |
1094 | | ]; |
1095 | | |
1096 | | let n = array.logical_nulls().unwrap(); |
1097 | | assert_eq!(n.null_count(), 6); |
1098 | | |
1099 | | let slices = [(0, 12), (0, 2), (2, 5), (3, 0), (3, 3), (3, 4), (4, 8)]; |
1100 | | for (offset, length) in slices { |
1101 | | let a = array.slice(offset, length); |
1102 | | let n = a.logical_nulls().unwrap(); |
1103 | | let n = n.into_iter().collect::<Vec<_>>(); |
1104 | | assert_eq!(&n, &expected[offset..offset + length], "{offset} {length}"); |
1105 | | } |
1106 | | } |
1107 | | |
1108 | | #[test] |
1109 | | fn test_run_array_eq_identical() { |
1110 | | let run_ends1 = Int32Array::from(vec![2, 4, 6]); |
1111 | | let values1 = StringArray::from(vec!["a", "b", "c"]); |
1112 | | let array1 = RunArray::<Int32Type>::try_new(&run_ends1, &values1).unwrap(); |
1113 | | |
1114 | | let run_ends2 = Int32Array::from(vec![2, 4, 6]); |
1115 | | let values2 = StringArray::from(vec!["a", "b", "c"]); |
1116 | | let array2 = RunArray::<Int32Type>::try_new(&run_ends2, &values2).unwrap(); |
1117 | | |
1118 | | assert_eq!(array1, array2); |
1119 | | } |
1120 | | |
1121 | | #[test] |
1122 | | fn test_run_array_ne_different_run_ends() { |
1123 | | let run_ends1 = Int32Array::from(vec![2, 4, 6]); |
1124 | | let values1 = StringArray::from(vec!["a", "b", "c"]); |
1125 | | let array1 = RunArray::<Int32Type>::try_new(&run_ends1, &values1).unwrap(); |
1126 | | |
1127 | | let run_ends2 = Int32Array::from(vec![1, 4, 6]); |
1128 | | let values2 = StringArray::from(vec!["a", "b", "c"]); |
1129 | | let array2 = RunArray::<Int32Type>::try_new(&run_ends2, &values2).unwrap(); |
1130 | | |
1131 | | assert_ne!(array1, array2); |
1132 | | } |
1133 | | |
1134 | | #[test] |
1135 | | fn test_run_array_ne_different_values() { |
1136 | | let run_ends1 = Int32Array::from(vec![2, 4, 6]); |
1137 | | let values1 = StringArray::from(vec!["a", "b", "c"]); |
1138 | | let array1 = RunArray::<Int32Type>::try_new(&run_ends1, &values1).unwrap(); |
1139 | | |
1140 | | let run_ends2 = Int32Array::from(vec![2, 4, 6]); |
1141 | | let values2 = StringArray::from(vec!["a", "b", "d"]); |
1142 | | let array2 = RunArray::<Int32Type>::try_new(&run_ends2, &values2).unwrap(); |
1143 | | |
1144 | | assert_ne!(array1, array2); |
1145 | | } |
1146 | | |
1147 | | #[test] |
1148 | | fn test_run_array_eq_with_nulls() { |
1149 | | let run_ends1 = Int32Array::from(vec![2, 4, 6]); |
1150 | | let values1 = StringArray::from(vec![Some("a"), None, Some("c")]); |
1151 | | let array1 = RunArray::<Int32Type>::try_new(&run_ends1, &values1).unwrap(); |
1152 | | |
1153 | | let run_ends2 = Int32Array::from(vec![2, 4, 6]); |
1154 | | let values2 = StringArray::from(vec![Some("a"), None, Some("c")]); |
1155 | | let array2 = RunArray::<Int32Type>::try_new(&run_ends2, &values2).unwrap(); |
1156 | | |
1157 | | assert_eq!(array1, array2); |
1158 | | } |
1159 | | |
1160 | | #[test] |
1161 | | fn test_run_array_eq_different_run_end_types() { |
1162 | | let run_ends_i16_1 = Int16Array::from(vec![2_i16, 4, 6]); |
1163 | | let values_i16_1 = StringArray::from(vec!["a", "b", "c"]); |
1164 | | let array_i16_1 = RunArray::<Int16Type>::try_new(&run_ends_i16_1, &values_i16_1).unwrap(); |
1165 | | |
1166 | | let run_ends_i16_2 = Int16Array::from(vec![2_i16, 4, 6]); |
1167 | | let values_i16_2 = StringArray::from(vec!["a", "b", "c"]); |
1168 | | let array_i16_2 = RunArray::<Int16Type>::try_new(&run_ends_i16_2, &values_i16_2).unwrap(); |
1169 | | |
1170 | | assert_eq!(array_i16_1, array_i16_2); |
1171 | | } |
1172 | | } |