/Users/andrewlamb/Software/arrow-rs/arrow-array/src/builder/generic_bytes_builder.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use crate::builder::ArrayBuilder; |
19 | | use crate::types::{ByteArrayType, GenericBinaryType, GenericStringType}; |
20 | | use crate::{Array, ArrayRef, GenericByteArray, OffsetSizeTrait}; |
21 | | use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer, NullBufferBuilder, ScalarBuffer}; |
22 | | use arrow_data::ArrayDataBuilder; |
23 | | use std::any::Any; |
24 | | use std::sync::Arc; |
25 | | |
26 | | /// Builder for [`GenericByteArray`] |
27 | | /// |
28 | | /// For building strings, see docs on [`GenericStringBuilder`]. |
29 | | /// For building binary, see docs on [`GenericBinaryBuilder`]. |
30 | | pub struct GenericByteBuilder<T: ByteArrayType> { |
31 | | value_builder: Vec<u8>, |
32 | | offsets_builder: Vec<T::Offset>, |
33 | | null_buffer_builder: NullBufferBuilder, |
34 | | } |
35 | | |
36 | | impl<T: ByteArrayType> GenericByteBuilder<T> { |
37 | | /// Creates a new [`GenericByteBuilder`]. |
38 | 6 | pub fn new() -> Self { |
39 | 6 | Self::with_capacity(1024, 1024) |
40 | 6 | } |
41 | | |
42 | | /// Creates a new [`GenericByteBuilder`]. |
43 | | /// |
44 | | /// - `item_capacity` is the number of items to pre-allocate. |
45 | | /// The size of the preallocated buffer of offsets is the number of items plus one. |
46 | | /// - `data_capacity` is the total number of bytes of data to pre-allocate |
47 | | /// (for all items, not per item). |
48 | 24 | pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self { |
49 | 24 | let mut offsets_builder = Vec::with_capacity(item_capacity + 1); |
50 | 24 | offsets_builder.push(T::Offset::from_usize(0).unwrap()); |
51 | 24 | Self { |
52 | 24 | value_builder: Vec::with_capacity(data_capacity), |
53 | 24 | offsets_builder, |
54 | 24 | null_buffer_builder: NullBufferBuilder::new(item_capacity), |
55 | 24 | } |
56 | 24 | } |
57 | | |
58 | | /// Creates a new [`GenericByteBuilder`] from buffers. |
59 | | /// |
60 | | /// # Safety |
61 | | /// |
62 | | /// This doesn't verify buffer contents as it assumes the buffers are from |
63 | | /// an existing and valid [`GenericByteArray`]. |
64 | | pub unsafe fn new_from_buffer( |
65 | | offsets_buffer: MutableBuffer, |
66 | | value_buffer: MutableBuffer, |
67 | | null_buffer: Option<MutableBuffer>, |
68 | | ) -> Self { |
69 | | let offsets_builder: Vec<T::Offset> = |
70 | | ScalarBuffer::<T::Offset>::from(offsets_buffer).into(); |
71 | | let value_builder: Vec<u8> = ScalarBuffer::<u8>::from(value_buffer).into(); |
72 | | |
73 | | let null_buffer_builder = null_buffer |
74 | | .map(|buffer| NullBufferBuilder::new_from_buffer(buffer, offsets_builder.len() - 1)) |
75 | | .unwrap_or_else(|| NullBufferBuilder::new_with_len(offsets_builder.len() - 1)); |
76 | | |
77 | | Self { |
78 | | offsets_builder, |
79 | | value_builder, |
80 | | null_buffer_builder, |
81 | | } |
82 | | } |
83 | | |
84 | | #[inline] |
85 | 120 | fn next_offset(&self) -> T::Offset { |
86 | 120 | T::Offset::from_usize(self.value_builder.len()).expect("byte array offset overflow") |
87 | 120 | } |
88 | | |
89 | | /// Appends a value into the builder. |
90 | | /// |
91 | | /// See the [GenericStringBuilder] documentation for examples of |
92 | | /// incrementally building string values with multiple `write!` calls. |
93 | | /// |
94 | | /// # Panics |
95 | | /// |
96 | | /// Panics if the resulting length of [`Self::values_slice`] would exceed |
97 | | /// `T::Offset::MAX` bytes. |
98 | | /// |
99 | | /// For example, this can happen with [`StringArray`] or [`BinaryArray`] |
100 | | /// where the total length of all values exceeds 2GB |
101 | | /// |
102 | | /// [`StringArray`]: crate::StringArray |
103 | | /// [`BinaryArray`]: crate::BinaryArray |
104 | | #[inline] |
105 | 32 | pub fn append_value(&mut self, value: impl AsRef<T::Native>) { |
106 | 32 | self.value_builder |
107 | 32 | .extend_from_slice(value.as_ref().as_ref()); |
108 | 32 | self.null_buffer_builder.append(true); |
109 | 32 | self.offsets_builder.push(self.next_offset()); |
110 | 32 | } |
111 | | |
112 | | /// Append an `Option` value into the builder. |
113 | | /// |
114 | | /// - A `None` value will append a null value. |
115 | | /// - A `Some` value will append the value. |
116 | | /// |
117 | | /// See [`Self::append_value`] for more panic information. |
118 | | #[inline] |
119 | 27 | pub fn append_option(&mut self, value: Option<impl AsRef<T::Native>>) { |
120 | 27 | match value { |
121 | 5 | None => self.append_null(), |
122 | 22 | Some(v) => self.append_value(v), |
123 | | }; |
124 | 27 | } |
125 | | |
126 | | /// Append a null value into the builder. |
127 | | #[inline] |
128 | 7 | pub fn append_null(&mut self) { |
129 | 7 | self.null_buffer_builder.append(false); |
130 | 7 | self.offsets_builder.push(self.next_offset()); |
131 | 7 | } |
132 | | |
133 | | /// Appends `n` `null`s into the builder. |
134 | | #[inline] |
135 | | pub fn append_nulls(&mut self, n: usize) { |
136 | | self.null_buffer_builder.append_n_nulls(n); |
137 | | let next_offset = self.next_offset(); |
138 | | self.offsets_builder |
139 | | .extend(std::iter::repeat_n(next_offset, n)); |
140 | | } |
141 | | |
142 | | /// Appends the array's values and nulls to this builder as is |
143 | | /// (this means that underlying null values are copied as is). |
144 | | #[inline] |
145 | 37 | pub fn append_array(&mut self, array: &GenericByteArray<T>) { |
146 | 37 | if array.len() == 0 { |
147 | 2 | return; |
148 | 35 | } |
149 | | |
150 | 35 | let offsets = array.offsets(); |
151 | | |
152 | | // If the offsets are contiguous, we can append them directly, avoiding the need to shift them |
153 | | // (for example, when the first appended array is not sliced and so starts at offset 0). |
154 | 35 | if self.next_offset() == offsets[0] { |
155 | 13 | self.offsets_builder.extend_from_slice(&offsets[1..]); |
156 | 13 | } else { |
157 | | // Shifting all the offsets |
158 | 22 | let shift: T::Offset = self.next_offset() - offsets[0]; |
159 | | |
160 | | // Creating intermediate offsets instead of pushing each offset is faster |
161 | | // (even if we make MutableBuffer to avoid updating length on each push |
162 | | // and reserve the necessary capacity, it's still slower) |
163 | 22 | let mut intermediate = Vec::with_capacity(offsets.len() - 1); |
164 | | |
165 | 55 | for &offset in &offsets[1..] { |
166 | 55 | intermediate.push(offset + shift) |
167 | | } |
168 | | |
169 | 22 | self.offsets_builder.extend_from_slice(&intermediate); |
170 | | } |
171 | | |
172 | | // Append underlying values, starting from the first offset and ending at the last offset |
173 | 35 | self.value_builder.extend_from_slice( |
174 | 35 | &array.values().as_slice()[offsets[0].as_usize()..offsets[array.len()].as_usize()], |
175 | | ); |
176 | | |
177 | 35 | if let Some(null_buffer) = array.nulls() { |
178 | 3 | self.null_buffer_builder.append_buffer(null_buffer); |
179 | 32 | } else { |
180 | 32 | self.null_buffer_builder.append_n_non_nulls(array.len()); |
181 | 32 | } |
182 | 37 | } |
183 | | |
184 | | /// Builds the [`GenericByteArray`] and resets this builder. |
185 | 24 | pub fn finish(&mut self) -> GenericByteArray<T> { |
186 | 24 | let array_type = T::DATA_TYPE; |
187 | 24 | let array_builder = ArrayDataBuilder::new(array_type) |
188 | 24 | .len(self.len()) |
189 | 24 | .add_buffer(std::mem::take(&mut self.offsets_builder).into()) |
190 | 24 | .add_buffer(std::mem::take(&mut self.value_builder).into()) |
191 | 24 | .nulls(self.null_buffer_builder.finish()); |
192 | | |
193 | 24 | self.offsets_builder.push(self.next_offset()); |
194 | 24 | let array_data = unsafe { array_builder.build_unchecked() }; |
195 | 24 | GenericByteArray::from(array_data) |
196 | 24 | } |
197 | | |
198 | | /// Builds the [`GenericByteArray`] without resetting the builder. |
199 | 0 | pub fn finish_cloned(&self) -> GenericByteArray<T> { |
200 | 0 | let array_type = T::DATA_TYPE; |
201 | 0 | let offset_buffer = Buffer::from_slice_ref(self.offsets_builder.as_slice()); |
202 | 0 | let value_buffer = Buffer::from_slice_ref(self.value_builder.as_slice()); |
203 | 0 | let array_builder = ArrayDataBuilder::new(array_type) |
204 | 0 | .len(self.len()) |
205 | 0 | .add_buffer(offset_buffer) |
206 | 0 | .add_buffer(value_buffer) |
207 | 0 | .nulls(self.null_buffer_builder.finish_cloned()); |
208 | | |
209 | 0 | let array_data = unsafe { array_builder.build_unchecked() }; |
210 | 0 | GenericByteArray::from(array_data) |
211 | 0 | } |
212 | | |
213 | | /// Returns the current values buffer as a slice |
214 | 0 | pub fn values_slice(&self) -> &[u8] { |
215 | 0 | self.value_builder.as_slice() |
216 | 0 | } |
217 | | |
218 | | /// Returns the current offsets buffer as a slice |
219 | 0 | pub fn offsets_slice(&self) -> &[T::Offset] { |
220 | 0 | self.offsets_builder.as_slice() |
221 | 0 | } |
222 | | |
223 | | /// Returns the current null buffer as a slice |
224 | | pub fn validity_slice(&self) -> Option<&[u8]> { |
225 | | self.null_buffer_builder.as_slice() |
226 | | } |
227 | | |
228 | | /// Returns the current null buffer as a mutable slice |
229 | | pub fn validity_slice_mut(&mut self) -> Option<&mut [u8]> { |
230 | | self.null_buffer_builder.as_slice_mut() |
231 | | } |
232 | | } |
233 | | |
234 | | impl<T: ByteArrayType> std::fmt::Debug for GenericByteBuilder<T> { |
235 | | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { |
236 | | write!(f, "{}{}Builder", T::Offset::PREFIX, T::PREFIX)?; |
237 | | f.debug_struct("") |
238 | | .field("value_builder", &self.value_builder) |
239 | | .field("offsets_builder", &self.offsets_builder) |
240 | | .field("null_buffer_builder", &self.null_buffer_builder) |
241 | | .finish() |
242 | | } |
243 | | } |
244 | | |
245 | | impl<T: ByteArrayType> Default for GenericByteBuilder<T> { |
246 | | fn default() -> Self { |
247 | | Self::new() |
248 | | } |
249 | | } |
250 | | |
251 | | impl<T: ByteArrayType> ArrayBuilder for GenericByteBuilder<T> { |
252 | | /// Returns the number of binary slots in the builder |
253 | 45 | fn len(&self) -> usize { |
254 | 45 | self.null_buffer_builder.len() |
255 | 45 | } |
256 | | |
257 | | /// Builds the array and resets this builder. |
258 | 6 | fn finish(&mut self) -> ArrayRef { |
259 | 6 | Arc::new(self.finish()) |
260 | 6 | } |
261 | | |
262 | | /// Builds the array without resetting the builder. |
263 | 0 | fn finish_cloned(&self) -> ArrayRef { |
264 | 0 | Arc::new(self.finish_cloned()) |
265 | 0 | } |
266 | | |
267 | | /// Returns the builder as a non-mutable `Any` reference. |
268 | 0 | fn as_any(&self) -> &dyn Any { |
269 | 0 | self |
270 | 0 | } |
271 | | |
272 | | /// Returns the builder as a mutable `Any` reference. |
273 | 3 | fn as_any_mut(&mut self) -> &mut dyn Any { |
274 | 3 | self |
275 | 3 | } |
276 | | |
277 | | /// Returns the boxed builder as a box of `Any`. |
278 | 0 | fn into_box_any(self: Box<Self>) -> Box<dyn Any> { |
279 | 0 | self |
280 | 0 | } |
281 | | } |
282 | | |
283 | | impl<T: ByteArrayType, V: AsRef<T::Native>> Extend<Option<V>> for GenericByteBuilder<T> { |
284 | | #[inline] |
285 | 5 | fn extend<I: IntoIterator<Item = Option<V>>>(&mut self, iter: I) { |
286 | 32 | for v in iter { |
287 | 27 | self.append_option(v) |
288 | | } |
289 | 5 | } |
290 | | } |
291 | | |
292 | | /// Array builder for [`GenericStringArray`][crate::GenericStringArray] |
293 | | /// |
294 | | /// Values can be appended using [`GenericByteBuilder::append_value`], and nulls with |
295 | | /// [`GenericByteBuilder::append_null`]. |
296 | | /// |
297 | | /// This builder also implements [`std::fmt::Write`] with any written data |
298 | | /// included in the next appended value. This allows using [`std::fmt::Display`] |
299 | | /// with standard Rust idioms like `write!` and `writeln!` to write data |
300 | | /// directly to the builder without intermediate allocations. |
301 | | /// |
302 | | /// # Example writing strings with `append_value` |
303 | | /// ``` |
304 | | /// # use arrow_array::builder::GenericStringBuilder; |
305 | | /// let mut builder = GenericStringBuilder::<i32>::new(); |
306 | | /// |
307 | | /// // Write one string value |
308 | | /// builder.append_value("foobarbaz"); |
309 | | /// |
310 | | /// // Write a second string |
311 | | /// builder.append_value("v2"); |
312 | | /// |
313 | | /// let array = builder.finish(); |
314 | | /// assert_eq!(array.value(0), "foobarbaz"); |
315 | | /// assert_eq!(array.value(1), "v2"); |
316 | | /// ``` |
317 | | /// |
318 | | /// # Example incrementally writing strings with `std::fmt::Write` |
319 | | /// |
320 | | /// ``` |
321 | | /// # use std::fmt::Write; |
322 | | /// # use arrow_array::builder::GenericStringBuilder; |
323 | | /// let mut builder = GenericStringBuilder::<i32>::new(); |
324 | | /// |
325 | | /// // Write data in multiple `write!` calls |
326 | | /// write!(builder, "foo").unwrap(); |
327 | | /// write!(builder, "bar").unwrap(); |
328 | | /// // The next call to append_value finishes the current string |
329 | | /// // including all previously written strings. |
330 | | /// builder.append_value("baz"); |
331 | | /// |
332 | | /// // Write second value with a single write call |
333 | | /// write!(builder, "v2").unwrap(); |
334 | | /// // finish the value by calling append_value with an empty string |
335 | | /// builder.append_value(""); |
336 | | /// |
337 | | /// let array = builder.finish(); |
338 | | /// assert_eq!(array.value(0), "foobarbaz"); |
339 | | /// assert_eq!(array.value(1), "v2"); |
340 | | /// ``` |
341 | | pub type GenericStringBuilder<O> = GenericByteBuilder<GenericStringType<O>>; |
342 | | |
343 | | impl<O: OffsetSizeTrait> std::fmt::Write for GenericStringBuilder<O> { |
344 | 0 | fn write_str(&mut self, s: &str) -> std::fmt::Result { |
345 | 0 | self.value_builder.extend_from_slice(s.as_bytes()); |
346 | 0 | Ok(()) |
347 | 0 | } |
348 | | } |
349 | | |
350 | | /// Array builder for [`GenericBinaryArray`][crate::GenericBinaryArray] |
351 | | /// |
352 | | /// Values can be appended using [`GenericByteBuilder::append_value`], and nulls with |
353 | | /// [`GenericByteBuilder::append_null`]. |
354 | | /// |
355 | | /// # Example |
356 | | /// ``` |
357 | | /// # use arrow_array::builder::GenericBinaryBuilder; |
358 | | /// let mut builder = GenericBinaryBuilder::<i32>::new(); |
359 | | /// |
360 | | /// // Write data |
361 | | /// builder.append_value("foo"); |
362 | | /// |
363 | | /// // Write second value |
364 | | /// builder.append_value(&[0,1,2]); |
365 | | /// |
366 | | /// let array = builder.finish(); |
367 | | /// // binary values |
368 | | /// assert_eq!(array.value(0), b"foo"); |
369 | | /// assert_eq!(array.value(1), b"\x00\x01\x02"); |
370 | | /// ``` |
371 | | /// |
372 | | /// # Example incrementally writing bytes with `std::io::Write` |
373 | | /// |
374 | | /// ``` |
375 | | /// # use std::io::Write; |
376 | | /// # use arrow_array::builder::GenericBinaryBuilder; |
377 | | /// let mut builder = GenericBinaryBuilder::<i32>::new(); |
378 | | /// |
379 | | /// // Write data in multiple `write!` calls |
380 | | /// write!(builder, "foo").unwrap(); |
381 | | /// write!(builder, "bar").unwrap(); |
382 | | /// // The next call to append_value finishes the current value |
383 | | /// // including all previously written bytes. |
384 | | /// builder.append_value("baz"); |
385 | | /// |
386 | | /// // Write second value with a single write call |
387 | | /// write!(builder, "v2").unwrap(); |
388 | | /// // finish the value by calling append_value with an empty string |
389 | | /// builder.append_value(""); |
390 | | /// |
391 | | /// let array = builder.finish(); |
392 | | /// assert_eq!(array.value(0), "foobarbaz".as_bytes()); |
393 | | /// assert_eq!(array.value(1), "v2".as_bytes()); |
394 | | /// ``` |
395 | | pub type GenericBinaryBuilder<O> = GenericByteBuilder<GenericBinaryType<O>>; |
396 | | |
397 | | impl<O: OffsetSizeTrait> std::io::Write for GenericBinaryBuilder<O> { |
398 | | fn write(&mut self, bs: &[u8]) -> std::io::Result<usize> { |
399 | | self.value_builder.extend_from_slice(bs); |
400 | | Ok(bs.len()) |
401 | | } |
402 | | |
403 | | fn flush(&mut self) -> std::io::Result<()> { |
404 | | Ok(()) |
405 | | } |
406 | | } |
407 | | |
408 | | #[cfg(test)] |
409 | | mod tests { |
410 | | use super::*; |
411 | | use crate::array::Array; |
412 | | use crate::GenericStringArray; |
413 | | use arrow_buffer::NullBuffer; |
414 | | use std::fmt::Write as _; |
415 | | use std::io::Write as _; |
416 | | |
417 | | fn _test_generic_binary_builder<O: OffsetSizeTrait>() { |
418 | | let mut builder = GenericBinaryBuilder::<O>::new(); |
419 | | |
420 | | builder.append_value(b"hello"); |
421 | | builder.append_value(b""); |
422 | | builder.append_null(); |
423 | | builder.append_value(b"rust"); |
424 | | |
425 | | let array = builder.finish(); |
426 | | |
427 | | assert_eq!(4, array.len()); |
428 | | assert_eq!(1, array.null_count()); |
429 | | assert_eq!(b"hello", array.value(0)); |
430 | | assert_eq!([] as [u8; 0], array.value(1)); |
431 | | assert!(array.is_null(2)); |
432 | | assert_eq!(b"rust", array.value(3)); |
433 | | assert_eq!(O::from_usize(5).unwrap(), array.value_offsets()[2]); |
434 | | assert_eq!(O::from_usize(4).unwrap(), array.value_length(3)); |
435 | | } |
436 | | |
437 | | #[test] |
438 | | fn test_binary_builder() { |
439 | | _test_generic_binary_builder::<i32>() |
440 | | } |
441 | | |
442 | | #[test] |
443 | | fn test_large_binary_builder() { |
444 | | _test_generic_binary_builder::<i64>() |
445 | | } |
446 | | |
447 | | fn _test_generic_binary_builder_all_nulls<O: OffsetSizeTrait>() { |
448 | | let mut builder = GenericBinaryBuilder::<O>::new(); |
449 | | builder.append_null(); |
450 | | builder.append_null(); |
451 | | builder.append_null(); |
452 | | builder.append_nulls(2); |
453 | | assert_eq!(5, builder.len()); |
454 | | assert!(!builder.is_empty()); |
455 | | |
456 | | let array = builder.finish(); |
457 | | assert_eq!(5, array.null_count()); |
458 | | assert_eq!(5, array.len()); |
459 | | assert!(array.is_null(0)); |
460 | | assert!(array.is_null(1)); |
461 | | assert!(array.is_null(2)); |
462 | | assert!(array.is_null(3)); |
463 | | assert!(array.is_null(4)); |
464 | | } |
465 | | |
466 | | #[test] |
467 | | fn test_binary_builder_all_nulls() { |
468 | | _test_generic_binary_builder_all_nulls::<i32>() |
469 | | } |
470 | | |
471 | | #[test] |
472 | | fn test_large_binary_builder_all_nulls() { |
473 | | _test_generic_binary_builder_all_nulls::<i64>() |
474 | | } |
475 | | |
476 | | fn _test_generic_binary_builder_reset<O: OffsetSizeTrait>() { |
477 | | let mut builder = GenericBinaryBuilder::<O>::new(); |
478 | | |
479 | | builder.append_value(b"hello"); |
480 | | builder.append_value(b""); |
481 | | builder.append_null(); |
482 | | builder.append_value(b"rust"); |
483 | | builder.finish(); |
484 | | |
485 | | assert!(builder.is_empty()); |
486 | | |
487 | | builder.append_value(b"parquet"); |
488 | | builder.append_null(); |
489 | | builder.append_value(b"arrow"); |
490 | | builder.append_value(b""); |
491 | | builder.append_nulls(2); |
492 | | builder.append_value(b"hi"); |
493 | | let array = builder.finish(); |
494 | | |
495 | | assert_eq!(7, array.len()); |
496 | | assert_eq!(3, array.null_count()); |
497 | | assert_eq!(b"parquet", array.value(0)); |
498 | | assert!(array.is_null(1)); |
499 | | assert!(array.is_null(4)); |
500 | | assert!(array.is_null(5)); |
501 | | assert_eq!(b"arrow", array.value(2)); |
502 | | assert_eq!(b"", array.value(1)); |
503 | | assert_eq!(b"hi", array.value(6)); |
504 | | |
505 | | assert_eq!(O::zero(), array.value_offsets()[0]); |
506 | | assert_eq!(O::from_usize(7).unwrap(), array.value_offsets()[2]); |
507 | | assert_eq!(O::from_usize(14).unwrap(), array.value_offsets()[7]); |
508 | | assert_eq!(O::from_usize(5).unwrap(), array.value_length(2)); |
509 | | } |
510 | | |
511 | | #[test] |
512 | | fn test_binary_builder_reset() { |
513 | | _test_generic_binary_builder_reset::<i32>() |
514 | | } |
515 | | |
516 | | #[test] |
517 | | fn test_large_binary_builder_reset() { |
518 | | _test_generic_binary_builder_reset::<i64>() |
519 | | } |
520 | | |
521 | | fn _test_generic_string_array_builder<O: OffsetSizeTrait>() { |
522 | | let mut builder = GenericStringBuilder::<O>::new(); |
523 | | let owned = "arrow".to_owned(); |
524 | | |
525 | | builder.append_value("hello"); |
526 | | builder.append_value(""); |
527 | | builder.append_value(&owned); |
528 | | builder.append_null(); |
529 | | builder.append_option(Some("rust")); |
530 | | builder.append_option(None::<&str>); |
531 | | builder.append_option(None::<String>); |
532 | | builder.append_nulls(2); |
533 | | builder.append_value("parquet"); |
534 | | assert_eq!(10, builder.len()); |
535 | | |
536 | | assert_eq!( |
537 | | GenericStringArray::<O>::from(vec![ |
538 | | Some("hello"), |
539 | | Some(""), |
540 | | Some("arrow"), |
541 | | None, |
542 | | Some("rust"), |
543 | | None, |
544 | | None, |
545 | | None, |
546 | | None, |
547 | | Some("parquet") |
548 | | ]), |
549 | | builder.finish() |
550 | | ); |
551 | | } |
552 | | |
553 | | #[test] |
554 | | fn test_string_array_builder() { |
555 | | _test_generic_string_array_builder::<i32>() |
556 | | } |
557 | | |
558 | | #[test] |
559 | | fn test_large_string_array_builder() { |
560 | | _test_generic_string_array_builder::<i64>() |
561 | | } |
562 | | |
563 | | fn _test_generic_string_array_builder_finish<O: OffsetSizeTrait>() { |
564 | | let mut builder = GenericStringBuilder::<O>::with_capacity(3, 11); |
565 | | |
566 | | builder.append_value("hello"); |
567 | | builder.append_value("rust"); |
568 | | builder.append_null(); |
569 | | |
570 | | builder.finish(); |
571 | | assert!(builder.is_empty()); |
572 | | assert_eq!(&[O::zero()], builder.offsets_slice()); |
573 | | |
574 | | builder.append_value("arrow"); |
575 | | builder.append_value("parquet"); |
576 | | let arr = builder.finish(); |
577 | | // array should not have a null buffer because there is no `null` value. |
578 | | assert!(arr.nulls().is_none()); |
579 | | assert_eq!(GenericStringArray::<O>::from(vec!["arrow", "parquet"]), arr,) |
580 | | } |
581 | | |
582 | | #[test] |
583 | | fn test_string_array_builder_finish() { |
584 | | _test_generic_string_array_builder_finish::<i32>() |
585 | | } |
586 | | |
587 | | #[test] |
588 | | fn test_large_string_array_builder_finish() { |
589 | | _test_generic_string_array_builder_finish::<i64>() |
590 | | } |
591 | | |
592 | | fn _test_generic_string_array_builder_finish_cloned<O: OffsetSizeTrait>() { |
593 | | let mut builder = GenericStringBuilder::<O>::with_capacity(3, 11); |
594 | | |
595 | | builder.append_value("hello"); |
596 | | builder.append_value("rust"); |
597 | | builder.append_null(); |
598 | | |
599 | | let mut arr = builder.finish_cloned(); |
600 | | assert!(!builder.is_empty()); |
601 | | assert_eq!(3, arr.len()); |
602 | | |
603 | | builder.append_value("arrow"); |
604 | | builder.append_value("parquet"); |
605 | | arr = builder.finish(); |
606 | | |
607 | | assert!(arr.nulls().is_some()); |
608 | | assert_eq!(&[O::zero()], builder.offsets_slice()); |
609 | | assert_eq!(5, arr.len()); |
610 | | } |
611 | | |
612 | | #[test] |
613 | | fn test_string_array_builder_finish_cloned() { |
614 | | _test_generic_string_array_builder_finish_cloned::<i32>() |
615 | | } |
616 | | |
617 | | #[test] |
618 | | fn test_large_string_array_builder_finish_cloned() { |
619 | | _test_generic_string_array_builder_finish_cloned::<i64>() |
620 | | } |
621 | | |
622 | | #[test] |
623 | | fn test_extend() { |
624 | | let mut builder = GenericStringBuilder::<i32>::new(); |
625 | | builder.extend(["a", "b", "c", "", "a", "b", "c"].into_iter().map(Some)); |
626 | | builder.extend(["d", "cupcakes", "hello"].into_iter().map(Some)); |
627 | | let array = builder.finish(); |
628 | | assert_eq!(array.value_offsets(), &[0, 1, 2, 3, 3, 4, 5, 6, 7, 15, 20]); |
629 | | assert_eq!(array.value_data(), b"abcabcdcupcakeshello"); |
630 | | } |
631 | | |
632 | | #[test] |
633 | | fn test_write_str() { |
634 | | let mut builder = GenericStringBuilder::<i32>::new(); |
635 | | write!(builder, "foo").unwrap(); |
636 | | builder.append_value(""); |
637 | | writeln!(builder, "bar").unwrap(); |
638 | | builder.append_value(""); |
639 | | write!(builder, "fiz").unwrap(); |
640 | | write!(builder, "buz").unwrap(); |
641 | | builder.append_value(""); |
642 | | let a = builder.finish(); |
643 | | let r: Vec<_> = a.iter().flatten().collect(); |
644 | | assert_eq!(r, &["foo", "bar\n", "fizbuz"]) |
645 | | } |
646 | | |
647 | | #[test] |
648 | | fn test_write_bytes() { |
649 | | let mut builder = GenericBinaryBuilder::<i32>::new(); |
650 | | write!(builder, "foo").unwrap(); |
651 | | builder.append_value(""); |
652 | | writeln!(builder, "bar").unwrap(); |
653 | | builder.append_value(""); |
654 | | write!(builder, "fiz").unwrap(); |
655 | | write!(builder, "buz").unwrap(); |
656 | | builder.append_value(""); |
657 | | let a = builder.finish(); |
658 | | let r: Vec<_> = a.iter().flatten().collect(); |
659 | | assert_eq!( |
660 | | r, |
661 | | &["foo".as_bytes(), "bar\n".as_bytes(), "fizbuz".as_bytes()] |
662 | | ) |
663 | | } |
664 | | |
665 | | #[test] |
666 | | fn test_append_array_without_nulls() { |
667 | | let input = vec![ |
668 | | "hello", "world", "how", "are", "you", "doing", "today", "I", "am", "doing", "well", |
669 | | "thank", "you", "for", "asking", |
670 | | ]; |
671 | | let arr1 = GenericStringArray::<i32>::from(input[..3].to_vec()); |
672 | | let arr2 = GenericStringArray::<i32>::from(input[3..7].to_vec()); |
673 | | let arr3 = GenericStringArray::<i32>::from(input[7..].to_vec()); |
674 | | |
675 | | let mut builder = GenericStringBuilder::<i32>::new(); |
676 | | builder.append_array(&arr1); |
677 | | builder.append_array(&arr2); |
678 | | builder.append_array(&arr3); |
679 | | |
680 | | let actual = builder.finish(); |
681 | | let expected = GenericStringArray::<i32>::from(input); |
682 | | |
683 | | assert_eq!(actual, expected); |
684 | | } |
685 | | |
686 | | #[test] |
687 | | fn test_append_array_with_nulls() { |
688 | | let input = vec![ |
689 | | Some("hello"), |
690 | | None, |
691 | | Some("how"), |
692 | | None, |
693 | | None, |
694 | | None, |
695 | | None, |
696 | | Some("I"), |
697 | | Some("am"), |
698 | | Some("doing"), |
699 | | Some("well"), |
700 | | ]; |
701 | | let arr1 = GenericStringArray::<i32>::from(input[..3].to_vec()); |
702 | | let arr2 = GenericStringArray::<i32>::from(input[3..7].to_vec()); |
703 | | let arr3 = GenericStringArray::<i32>::from(input[7..].to_vec()); |
704 | | |
705 | | let mut builder = GenericStringBuilder::<i32>::new(); |
706 | | builder.append_array(&arr1); |
707 | | builder.append_array(&arr2); |
708 | | builder.append_array(&arr3); |
709 | | |
710 | | let actual = builder.finish(); |
711 | | let expected = GenericStringArray::<i32>::from(input); |
712 | | |
713 | | assert_eq!(actual, expected); |
714 | | } |
715 | | |
716 | | #[test] |
717 | | fn test_append_empty_array() { |
718 | | let arr = GenericStringArray::<i32>::from(Vec::<&str>::new()); |
719 | | let mut builder = GenericStringBuilder::<i32>::new(); |
720 | | builder.append_array(&arr); |
721 | | let result = builder.finish(); |
722 | | assert_eq!(result.len(), 0); |
723 | | } |
724 | | |
725 | | #[test] |
726 | | fn test_append_array_with_offset_not_starting_at_0() { |
727 | | let input = vec![ |
728 | | Some("hello"), |
729 | | None, |
730 | | Some("how"), |
731 | | None, |
732 | | None, |
733 | | None, |
734 | | None, |
735 | | Some("I"), |
736 | | Some("am"), |
737 | | Some("doing"), |
738 | | Some("well"), |
739 | | ]; |
740 | | let full_array = GenericStringArray::<i32>::from(input); |
741 | | let sliced = full_array.slice(1, 4); |
742 | | |
743 | | assert_ne!(sliced.offsets()[0].as_usize(), 0); |
744 | | assert_ne!(sliced.offsets().last(), full_array.offsets().last()); |
745 | | |
746 | | let mut builder = GenericStringBuilder::<i32>::new(); |
747 | | builder.append_array(&sliced); |
748 | | let actual = builder.finish(); |
749 | | |
750 | | let expected = GenericStringArray::<i32>::from(vec![None, Some("how"), None, None]); |
751 | | |
752 | | assert_eq!(actual, expected); |
753 | | } |
754 | | |
755 | | #[test] |
756 | | fn test_append_underlying_null_values_added_as_is() { |
757 | | let input_1_array_with_nulls = { |
758 | | let input = vec![ |
759 | | "hello", "world", "how", "are", "you", "doing", "today", "I", "am", |
760 | | ]; |
761 | | let (offsets, buffer, _) = GenericStringArray::<i32>::from(input).into_parts(); |
762 | | |
763 | | GenericStringArray::<i32>::new( |
764 | | offsets, |
765 | | buffer, |
766 | | Some(NullBuffer::from(&[ |
767 | | true, false, true, false, false, true, true, true, false, |
768 | | ])), |
769 | | ) |
770 | | }; |
771 | | let input_2_array_with_nulls = { |
772 | | let input = vec!["doing", "well", "thank", "you", "for", "asking"]; |
773 | | let (offsets, buffer, _) = GenericStringArray::<i32>::from(input).into_parts(); |
774 | | |
775 | | GenericStringArray::<i32>::new( |
776 | | offsets, |
777 | | buffer, |
778 | | Some(NullBuffer::from(&[false, false, true, false, true, true])), |
779 | | ) |
780 | | }; |
781 | | |
782 | | let mut builder = GenericStringBuilder::<i32>::new(); |
783 | | builder.append_array(&input_1_array_with_nulls); |
784 | | builder.append_array(&input_2_array_with_nulls); |
785 | | |
786 | | let actual = builder.finish(); |
787 | | let expected = GenericStringArray::<i32>::from(vec![ |
788 | | Some("hello"), |
789 | | None, // world |
790 | | Some("how"), |
791 | | None, // are |
792 | | None, // you |
793 | | Some("doing"), |
794 | | Some("today"), |
795 | | Some("I"), |
796 | | None, // am |
797 | | None, // doing |
798 | | None, // well |
799 | | Some("thank"), |
800 | | None, // "you", |
801 | | Some("for"), |
802 | | Some("asking"), |
803 | | ]); |
804 | | |
805 | | assert_eq!(actual, expected); |
806 | | |
807 | | let expected_underlying_buffer = Buffer::from( |
808 | | [ |
809 | | "hello", "world", "how", "are", "you", "doing", "today", "I", "am", "doing", |
810 | | "well", "thank", "you", "for", "asking", |
811 | | ] |
812 | | .join("") |
813 | | .as_bytes(), |
814 | | ); |
815 | | assert_eq!(actual.values(), &expected_underlying_buffer); |
816 | | } |
817 | | |
818 | | #[test] |
819 | | fn append_array_with_continues_indices() { |
820 | | let input = vec![ |
821 | | "hello", "world", "how", "are", "you", "doing", "today", "I", "am", "doing", "well", |
822 | | "thank", "you", "for", "asking", |
823 | | ]; |
824 | | let full_array = GenericStringArray::<i32>::from(input); |
825 | | let slice1 = full_array.slice(0, 3); |
826 | | let slice2 = full_array.slice(3, 4); |
827 | | let slice3 = full_array.slice(7, full_array.len() - 7); |
828 | | |
829 | | let mut builder = GenericStringBuilder::<i32>::new(); |
830 | | builder.append_array(&slice1); |
831 | | builder.append_array(&slice2); |
832 | | builder.append_array(&slice3); |
833 | | |
834 | | let actual = builder.finish(); |
835 | | |
836 | | assert_eq!(actual, full_array); |
837 | | } |
838 | | } |