/Users/andrewlamb/Software/arrow-rs/arrow-array/src/builder/generic_byte_run_builder.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use crate::types::bytes::ByteArrayNativeType; |
19 | | use std::{any::Any, sync::Arc}; |
20 | | |
21 | | use crate::{ |
22 | | ArrayRef, ArrowPrimitiveType, RunArray, |
23 | | types::{BinaryType, ByteArrayType, LargeBinaryType, LargeUtf8Type, RunEndIndexType, Utf8Type}, |
24 | | }; |
25 | | |
26 | | use super::{ArrayBuilder, GenericByteBuilder, PrimitiveBuilder}; |
27 | | |
28 | | use arrow_buffer::ArrowNativeType; |
29 | | |
/// Builder for [`RunArray`] of [`GenericByteArray`](crate::array::GenericByteArray)
///
/// Consecutive equal values (and consecutive nulls) are collapsed into a single
/// run: one entry in the run-ends child array plus one entry in the values
/// child array.
///
/// # Example:
///
/// ```
///
/// # use arrow_array::builder::GenericByteRunBuilder;
/// # use arrow_array::{GenericByteArray, BinaryArray};
/// # use arrow_array::types::{BinaryType, Int16Type};
/// # use arrow_array::{Array, Int16Array};
/// # use arrow_array::cast::AsArray;
///
/// let mut builder =
/// GenericByteRunBuilder::<Int16Type, BinaryType>::new();
/// builder.extend([Some(b"abc"), Some(b"abc"), None, Some(b"def")].into_iter());
/// builder.append_value(b"def");
/// builder.append_null();
/// let array = builder.finish();
///
/// assert_eq!(array.run_ends().values(), &[2, 3, 5, 6]);
///
/// let av = array.values();
///
/// assert!(!av.is_null(0));
/// assert!(av.is_null(1));
/// assert!(!av.is_null(2));
/// assert!(av.is_null(3));
///
/// // Values are polymorphic and so require a downcast.
/// let ava: &BinaryArray = av.as_binary();
///
/// assert_eq!(ava.value(0), b"abc");
/// assert_eq!(ava.value(2), b"def");
/// ```
#[derive(Debug)]
pub struct GenericByteRunBuilder<R, V>
where
    R: ArrowPrimitiveType,
    V: ByteArrayType,
{
    /// Builder for the run-ends child array; receives one entry per completed run.
    run_ends_builder: PrimitiveBuilder<R>,
    /// Builder for the values child array; receives one value (or null) per completed run.
    values_builder: GenericByteBuilder<V>,
    /// Raw bytes of the value of the run currently being accumulated.
    /// Cleared whenever the current run is a null run or the builder is reset.
    current_value: Vec<u8>,
    /// `true` while the run being accumulated holds a non-null value. This flag,
    /// rather than `current_value.is_empty()`, is what distinguishes an empty
    /// value from a null.
    has_current_value: bool,
    /// Number of logical elements appended so far; also the run end of the
    /// run currently being accumulated.
    current_run_end_index: usize,
    /// Value of `current_run_end_index` when the last run was written to the
    /// child builders. Equal to `current_run_end_index` when no run is pending.
    prev_run_end_index: usize,
}
77 | | |
78 | | impl<R, V> Default for GenericByteRunBuilder<R, V> |
79 | | where |
80 | | R: ArrowPrimitiveType, |
81 | | V: ByteArrayType, |
82 | | { |
83 | | fn default() -> Self { |
84 | | Self::new() |
85 | | } |
86 | | } |
87 | | |
88 | | impl<R, V> GenericByteRunBuilder<R, V> |
89 | | where |
90 | | R: ArrowPrimitiveType, |
91 | | V: ByteArrayType, |
92 | | { |
93 | | /// Creates a new `GenericByteRunBuilder` |
94 | | pub fn new() -> Self { |
95 | | Self { |
96 | | run_ends_builder: PrimitiveBuilder::new(), |
97 | | values_builder: GenericByteBuilder::<V>::new(), |
98 | | current_value: Vec::new(), |
99 | | has_current_value: false, |
100 | | current_run_end_index: 0, |
101 | | prev_run_end_index: 0, |
102 | | } |
103 | | } |
104 | | |
105 | | /// Creates a new `GenericByteRunBuilder` with the provided capacity |
106 | | /// |
107 | | /// `capacity`: the expected number of run-end encoded values. |
108 | | /// `data_capacity`: the expected number of bytes of run end encoded values |
109 | 4 | pub fn with_capacity(capacity: usize, data_capacity: usize) -> Self { |
110 | 4 | Self { |
111 | 4 | run_ends_builder: PrimitiveBuilder::with_capacity(capacity), |
112 | 4 | values_builder: GenericByteBuilder::<V>::with_capacity(capacity, data_capacity), |
113 | 4 | current_value: Vec::new(), |
114 | 4 | has_current_value: false, |
115 | 4 | current_run_end_index: 0, |
116 | 4 | prev_run_end_index: 0, |
117 | 4 | } |
118 | 4 | } |
119 | | } |
120 | | |
121 | | impl<R, V> ArrayBuilder for GenericByteRunBuilder<R, V> |
122 | | where |
123 | | R: RunEndIndexType, |
124 | | V: ByteArrayType, |
125 | | { |
126 | | /// Returns the builder as a non-mutable `Any` reference. |
127 | | fn as_any(&self) -> &dyn Any { |
128 | | self |
129 | | } |
130 | | |
131 | | /// Returns the builder as a mutable `Any` reference. |
132 | | fn as_any_mut(&mut self) -> &mut dyn Any { |
133 | | self |
134 | | } |
135 | | |
136 | | /// Returns the boxed builder as a box of `Any`. |
137 | | fn into_box_any(self: Box<Self>) -> Box<dyn Any> { |
138 | | self |
139 | | } |
140 | | |
141 | | /// Returns the length of logical array encoded by |
142 | | /// the eventual runs array. |
143 | | fn len(&self) -> usize { |
144 | | self.current_run_end_index |
145 | | } |
146 | | |
147 | | /// Builds the array and reset this builder. |
148 | | fn finish(&mut self) -> ArrayRef { |
149 | | Arc::new(self.finish()) |
150 | | } |
151 | | |
152 | | /// Builds the array without resetting the builder. |
153 | | fn finish_cloned(&self) -> ArrayRef { |
154 | | Arc::new(self.finish_cloned()) |
155 | | } |
156 | | } |
157 | | |
158 | | impl<R, V> GenericByteRunBuilder<R, V> |
159 | | where |
160 | | R: RunEndIndexType, |
161 | | V: ByteArrayType, |
162 | | { |
163 | | /// Appends optional value to the logical array encoded by the RunArray. |
164 | 9 | pub fn append_option(&mut self, input_value: Option<impl AsRef<V::Native>>) { |
165 | 9 | match input_value { |
166 | 6 | Some(value) => self.append_value(value), |
167 | 3 | None => self.append_null(), |
168 | | } |
169 | 9 | } |
170 | | |
171 | | /// Appends value to the logical array encoded by the RunArray. |
172 | 15 | pub fn append_value(&mut self, input_value: impl AsRef<V::Native>) { |
173 | 15 | let value: &[u8] = input_value.as_ref().as_ref(); |
174 | 15 | if !self.has_current_value { |
175 | 5 | self.append_run_end(); |
176 | 5 | self.current_value.extend_from_slice(value); |
177 | 5 | self.has_current_value = true; |
178 | 10 | } else if self.current_value.as_slice() != value { |
179 | 5 | self.append_run_end(); |
180 | 5 | self.current_value.clear(); |
181 | 5 | self.current_value.extend_from_slice(value); |
182 | 5 | } |
183 | 15 | self.current_run_end_index += 1; |
184 | 15 | } |
185 | | |
186 | | /// Appends null to the logical array encoded by the RunArray. |
187 | 3 | pub fn append_null(&mut self) { |
188 | 3 | if self.has_current_value { |
189 | 1 | self.append_run_end(); |
190 | 1 | self.current_value.clear(); |
191 | 1 | self.has_current_value = false; |
192 | 2 | } |
193 | 3 | self.current_run_end_index += 1; |
194 | 3 | } |
195 | | |
196 | | /// Creates the RunArray and resets the builder. |
197 | | /// Panics if RunArray cannot be built. |
198 | 4 | pub fn finish(&mut self) -> RunArray<R> { |
199 | | // write the last run end to the array. |
200 | 4 | self.append_run_end(); |
201 | | |
202 | | // reset the run end index to zero. |
203 | 4 | self.current_value.clear(); |
204 | 4 | self.has_current_value = false; |
205 | 4 | self.current_run_end_index = 0; |
206 | 4 | self.prev_run_end_index = 0; |
207 | | |
208 | | // build the run encoded array by adding run_ends and values array as its children. |
209 | 4 | let run_ends_array = self.run_ends_builder.finish(); |
210 | 4 | let values_array = self.values_builder.finish(); |
211 | 4 | RunArray::<R>::try_new(&run_ends_array, &values_array).unwrap() |
212 | 4 | } |
213 | | |
214 | | /// Creates the RunArray and without resetting the builder. |
215 | | /// Panics if RunArray cannot be built. |
216 | | pub fn finish_cloned(&self) -> RunArray<R> { |
217 | | let mut run_ends_array = self.run_ends_builder.finish_cloned(); |
218 | | let mut values_array = self.values_builder.finish_cloned(); |
219 | | |
220 | | // Add current run if one exists |
221 | | if self.prev_run_end_index != self.current_run_end_index { |
222 | | let mut run_end_builder = run_ends_array.into_builder().unwrap(); |
223 | | let mut values_builder = values_array.into_builder().unwrap(); |
224 | | self.append_run_end_with_builders(&mut run_end_builder, &mut values_builder); |
225 | | run_ends_array = run_end_builder.finish(); |
226 | | values_array = values_builder.finish(); |
227 | | } |
228 | | |
229 | | RunArray::<R>::try_new(&run_ends_array, &values_array).unwrap() |
230 | | } |
231 | | |
232 | | // Appends the current run to the array. |
233 | 15 | fn append_run_end(&mut self) { |
234 | | // empty array or the function called without appending any value. |
235 | 15 | if self.prev_run_end_index == self.current_run_end_index { |
236 | 3 | return; |
237 | 12 | } |
238 | 12 | let run_end_index = self.run_end_index_as_native(); |
239 | 12 | self.run_ends_builder.append_value(run_end_index); |
240 | 12 | if self.has_current_value { |
241 | 10 | let slice = self.current_value.as_slice(); |
242 | 10 | let native = unsafe { |
243 | 10 | // Safety: |
244 | 10 | // As self.current_value is created from V::Native. The value V::Native can be |
245 | 10 | // built back from the bytes without validations |
246 | 10 | V::Native::from_bytes_unchecked(slice) |
247 | 10 | }; |
248 | 10 | self.values_builder.append_value(native); |
249 | 10 | } else { |
250 | 2 | self.values_builder.append_null(); |
251 | 2 | } |
252 | 12 | self.prev_run_end_index = self.current_run_end_index; |
253 | 15 | } |
254 | | |
255 | | // Similar to `append_run_end` but on custom builders. |
256 | | // Used in `finish_cloned` which is not suppose to mutate `self`. |
257 | | fn append_run_end_with_builders( |
258 | | &self, |
259 | | run_ends_builder: &mut PrimitiveBuilder<R>, |
260 | | values_builder: &mut GenericByteBuilder<V>, |
261 | | ) { |
262 | | let run_end_index = self.run_end_index_as_native(); |
263 | | run_ends_builder.append_value(run_end_index); |
264 | | if self.has_current_value { |
265 | | let slice = self.current_value.as_slice(); |
266 | | let native = unsafe { |
267 | | // Safety: |
268 | | // As self.current_value is created from V::Native. The value V::Native can be |
269 | | // built back from the bytes without validations |
270 | | V::Native::from_bytes_unchecked(slice) |
271 | | }; |
272 | | values_builder.append_value(native); |
273 | | } else { |
274 | | values_builder.append_null(); |
275 | | } |
276 | | } |
277 | | |
278 | 12 | fn run_end_index_as_native(&self) -> R::Native { |
279 | 12 | R::Native::from_usize(self.current_run_end_index).unwrap_or_else(|| {0 |
280 | 0 | panic!( |
281 | 0 | "Cannot convert the value {} from `usize` to native form of arrow datatype {}", |
282 | | self.current_run_end_index, |
283 | 0 | R::DATA_TYPE |
284 | | ) |
285 | | }) |
286 | 12 | } |
287 | | } |
288 | | |
289 | | impl<R, V, S> Extend<Option<S>> for GenericByteRunBuilder<R, V> |
290 | | where |
291 | | R: RunEndIndexType, |
292 | | V: ByteArrayType, |
293 | | S: AsRef<V::Native>, |
294 | | { |
295 | | fn extend<T: IntoIterator<Item = Option<S>>>(&mut self, iter: T) { |
296 | | for elem in iter { |
297 | | self.append_option(elem); |
298 | | } |
299 | | } |
300 | | } |
301 | | |
/// Builder for [`RunArray`] of [`StringArray`](crate::array::StringArray)
///
/// ```
/// // Create a run-end encoded array with run-end indexes data type as `i16`.
/// // The encoded values are Strings.
///
/// # use arrow_array::builder::StringRunBuilder;
/// # use arrow_array::{Int16Array, StringArray};
/// # use arrow_array::types::Int16Type;
/// # use arrow_array::cast::AsArray;
/// #
/// let mut builder = StringRunBuilder::<Int16Type>::new();
///
/// // Values are appended one at a time; consecutive equal values are
/// // merged into a single run.
/// builder.append_value("abc");
/// builder.append_null();
/// builder.extend([Some("def"), Some("def"), Some("abc")]);
/// let array = builder.finish();
///
/// assert_eq!(array.run_ends().values(), &[1, 2, 4, 5]);
///
/// // Values are polymorphic and so require a downcast.
/// let av = array.values();
/// let ava: &StringArray = av.as_string::<i32>();
///
/// assert_eq!(ava.value(0), "abc");
/// assert!(av.is_null(1));
/// assert_eq!(ava.value(2), "def");
/// assert_eq!(ava.value(3), "abc");
///
/// ```
pub type StringRunBuilder<K> = GenericByteRunBuilder<K, Utf8Type>;
334 | | |
/// Builder for [`RunArray`] of [`LargeStringArray`](crate::array::LargeStringArray)
///
/// This is [`GenericByteRunBuilder`] with [`LargeUtf8Type`] values; `K` is the
/// run-end index type.
pub type LargeStringRunBuilder<K> = GenericByteRunBuilder<K, LargeUtf8Type>;
337 | | |
/// Builder for [`RunArray`] of [`BinaryArray`](crate::array::BinaryArray)
///
/// ```
/// // Create a run-end encoded array with run-end indexes data type as `i16`.
/// // The encoded data is binary values.
///
/// # use arrow_array::builder::BinaryRunBuilder;
/// # use arrow_array::{BinaryArray, Int16Array};
/// # use arrow_array::cast::AsArray;
/// # use arrow_array::types::Int16Type;
///
/// let mut builder = BinaryRunBuilder::<Int16Type>::new();
///
/// // Values are appended one at a time; consecutive equal values are
/// // merged into a single run.
/// builder.append_value(b"abc");
/// builder.append_null();
/// builder.extend([Some(b"def"), Some(b"def"), Some(b"abc")]);
/// let array = builder.finish();
///
/// assert_eq!(array.run_ends().values(), &[1, 2, 4, 5]);
///
/// // Values are polymorphic and so require a downcast.
/// let av = array.values();
/// let ava: &BinaryArray = av.as_binary();
///
/// assert_eq!(ava.value(0), b"abc");
/// assert!(av.is_null(1));
/// assert_eq!(ava.value(2), b"def");
/// assert_eq!(ava.value(3), b"abc");
///
/// ```
pub type BinaryRunBuilder<K> = GenericByteRunBuilder<K, BinaryType>;
370 | | |
/// Builder for [`RunArray`] of [`LargeBinaryArray`](crate::array::LargeBinaryArray)
///
/// This is [`GenericByteRunBuilder`] with [`LargeBinaryType`] values; `K` is the
/// run-end index type.
pub type LargeBinaryRunBuilder<K> = GenericByteRunBuilder<K, LargeBinaryType>;
373 | | |
#[cfg(test)]
mod tests {
    use super::*;

    use crate::GenericByteArray;
    use crate::Int16RunArray;
    use crate::array::Array;
    use crate::cast::AsArray;
    use crate::types::{Int16Type, Int32Type};

    /// Appends runs of `values[0]`, nulls, `values[1]` and `values[2]` and
    /// checks the run ends and run values produced by `finish`.
    fn test_bytes_run_builder<T>(values: Vec<&T::Native>)
    where
        T: ByteArrayType,
        <T as ByteArrayType>::Native: PartialEq,
        <T as ByteArrayType>::Native: AsRef<<T as ByteArrayType>::Native>,
    {
        let (first, second, third) = (values[0], values[1], values[2]);

        let mut builder = GenericByteRunBuilder::<Int16Type, T>::new();
        for _ in 0..3 {
            builder.append_value(first);
        }
        for _ in 0..2 {
            builder.append_null();
        }
        for _ in 0..2 {
            builder.append_value(second);
        }
        for _ in 0..4 {
            builder.append_value(third);
        }
        let run_array = builder.finish();

        assert_eq!(run_array.len(), 11);
        assert_eq!(run_array.null_count(), 0);
        assert_eq!(run_array.logical_null_count(), 2);

        assert_eq!(run_array.run_ends().values(), &[3, 5, 7, 11]);

        // The values child is type-erased, so downcast it to the concrete byte array.
        let run_values = run_array.values();
        let typed_values: &GenericByteArray<T> = run_values
            .as_any()
            .downcast_ref::<GenericByteArray<T>>()
            .unwrap();

        assert_eq!(*typed_values.value(0), *first);
        assert!(typed_values.is_null(1));
        assert_eq!(*typed_values.value(2), *second);
        assert_eq!(*typed_values.value(3), *third);
    }

    #[test]
    fn test_string_run_builder() {
        test_bytes_run_builder::<Utf8Type>(vec!["abc", "def", "ghi"]);
    }

    #[test]
    fn test_string_run_builder_with_empty_strings() {
        test_bytes_run_builder::<Utf8Type>(vec!["abc", "", "ghi"]);
    }

    #[test]
    fn test_binary_run_builder() {
        test_bytes_run_builder::<BinaryType>(vec![b"abc", b"def", b"ghi"]);
    }

    /// Checks that `finish_cloned` includes the pending run without disturbing
    /// the builder, and that appending afterwards continues that same run.
    fn test_bytes_run_builder_finish_cloned<T>(values: Vec<&T::Native>)
    where
        T: ByteArrayType,
        <T as ByteArrayType>::Native: PartialEq,
        <T as ByteArrayType>::Native: AsRef<<T as ByteArrayType>::Native>,
    {
        let (first, second) = (values[0], values[1]);

        let mut builder = GenericByteRunBuilder::<Int16Type, T>::new();

        builder.append_value(first);
        builder.append_null();
        for _ in 0..2 {
            builder.append_value(second);
        }
        builder.append_value(first);
        let snapshot: Int16RunArray = builder.finish_cloned();

        assert_eq!(snapshot.len(), 5);
        assert_eq!(snapshot.null_count(), 0);
        assert_eq!(snapshot.logical_null_count(), 1);

        assert_eq!(snapshot.run_ends().values(), &[1, 2, 4, 5]);

        // The values child is type-erased, so downcast it to the concrete byte array.
        let snapshot_values = snapshot.values();
        let typed_snapshot: &GenericByteArray<T> = snapshot_values
            .as_any()
            .downcast_ref::<GenericByteArray<T>>()
            .unwrap();

        assert_eq!(typed_snapshot.value(0), first);
        assert!(typed_snapshot.is_null(1));
        assert_eq!(typed_snapshot.value(2), second);
        assert_eq!(typed_snapshot.value(3), first);

        // Append the value that was pending at `finish_cloned` time (`values[0]`)
        // again and make sure it still ends up as a single run in the final output.
        for _ in 0..2 {
            builder.append_value(first);
        }
        builder.append_value(second);
        let finished: Int16RunArray = builder.finish();

        assert_eq!(finished.len(), 8);
        assert_eq!(finished.null_count(), 0);
        assert_eq!(finished.logical_null_count(), 1);

        assert_eq!(finished.run_ends().values(), &[1, 2, 4, 7, 8]);

        // The values child is type-erased, so downcast it to the concrete byte array.
        let finished_values = finished.values();
        let typed_finished: &GenericByteArray<T> = finished_values
            .as_any()
            .downcast_ref::<GenericByteArray<T>>()
            .unwrap();

        assert_eq!(typed_finished.value(0), first);
        assert!(typed_finished.is_null(1));
        assert_eq!(typed_finished.value(2), second);
        // The value appended both before and after `finish_cloned` has only one entry.
        assert_eq!(typed_finished.value(3), first);
        assert_eq!(typed_finished.value(4), second);
    }

    #[test]
    fn test_string_run_builder_finish_cloned() {
        test_bytes_run_builder_finish_cloned::<Utf8Type>(vec!["abc", "def", "ghi"]);
    }

    #[test]
    fn test_binary_run_builder_finish_cloned() {
        test_bytes_run_builder_finish_cloned::<BinaryType>(vec![b"abc", b"def", b"ghi"]);
    }

    #[test]
    fn test_extend() {
        let first_batch = ["a", "a", "a", "", "", "b", "b"];
        let second_batch = ["b", "cupcakes", "cupcakes"];

        let mut builder = StringRunBuilder::<Int32Type>::new();
        builder.extend(first_batch.into_iter().map(Some));
        // The leading "b" of the second batch continues the run left pending by the first.
        builder.extend(second_batch.into_iter().map(Some));
        let run_array = builder.finish();

        assert_eq!(run_array.len(), 10);
        assert_eq!(run_array.run_ends().values(), &[3, 5, 8, 10]);

        let str_array = run_array.values().as_string::<i32>();
        for (idx, expected) in ["a", "", "b", "cupcakes"].into_iter().enumerate() {
            assert_eq!(str_array.value(idx), expected);
        }
    }
}