/Users/andrewlamb/Software/arrow-rs/arrow-array/src/builder/generic_bytes_builder.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use crate::builder::ArrayBuilder; |
19 | | use crate::types::{ByteArrayType, GenericBinaryType, GenericStringType}; |
20 | | use crate::{Array, ArrayRef, GenericByteArray, OffsetSizeTrait}; |
21 | | use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer, NullBufferBuilder, ScalarBuffer}; |
22 | | use arrow_data::ArrayDataBuilder; |
23 | | use arrow_schema::ArrowError; |
24 | | use std::any::Any; |
25 | | use std::sync::Arc; |
26 | | |
27 | | /// Builder for [`GenericByteArray`] |
28 | | /// |
29 | | /// For building strings, see docs on [`GenericStringBuilder`]. |
30 | | /// For building binary, see docs on [`GenericBinaryBuilder`]. |
31 | | pub struct GenericByteBuilder<T: ByteArrayType> { |
32 | | value_builder: Vec<u8>, |
33 | | offsets_builder: Vec<T::Offset>, |
34 | | null_buffer_builder: NullBufferBuilder, |
35 | | } |
36 | | |
37 | | impl<T: ByteArrayType> GenericByteBuilder<T> { |
38 | | /// Creates a new [`GenericByteBuilder`]. |
39 | 80.0k | pub fn new() -> Self { |
40 | 80.0k | Self::with_capacity(1024, 1024) |
41 | 80.0k | } |
42 | | |
43 | | /// Creates a new [`GenericByteBuilder`]. |
44 | | /// |
45 | | /// - `item_capacity` is the number of items to pre-allocate. |
46 | | /// The size of the preallocated buffer of offsets is the number of items plus one. |
47 | | /// - `data_capacity` is the total number of bytes of data to pre-allocate |
48 | | /// (for all items, not per item). |
49 | 80.3k | pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self { |
50 | 80.3k | let mut offsets_builder = Vec::with_capacity(item_capacity + 1); |
51 | 80.3k | offsets_builder.push(T::Offset::from_usize(0).unwrap()); |
52 | 80.3k | Self { |
53 | 80.3k | value_builder: Vec::with_capacity(data_capacity), |
54 | 80.3k | offsets_builder, |
55 | 80.3k | null_buffer_builder: NullBufferBuilder::new(item_capacity), |
56 | 80.3k | } |
57 | 80.3k | } |
58 | | |
59 | | /// Creates a new [`GenericByteBuilder`] from buffers. |
60 | | /// |
61 | | /// # Safety |
62 | | /// |
63 | | /// This doesn't verify buffer contents as it assumes the buffers are from |
64 | | /// an existing and valid [`GenericByteArray`]. |
65 | | pub unsafe fn new_from_buffer( |
66 | | offsets_buffer: MutableBuffer, |
67 | | value_buffer: MutableBuffer, |
68 | | null_buffer: Option<MutableBuffer>, |
69 | | ) -> Self { |
70 | | let offsets_builder: Vec<T::Offset> = |
71 | | ScalarBuffer::<T::Offset>::from(offsets_buffer).into(); |
72 | | let value_builder: Vec<u8> = ScalarBuffer::<u8>::from(value_buffer).into(); |
73 | | |
74 | | let null_buffer_builder = null_buffer |
75 | | .map(|buffer| NullBufferBuilder::new_from_buffer(buffer, offsets_builder.len() - 1)) |
76 | | .unwrap_or_else(|| NullBufferBuilder::new_with_len(offsets_builder.len() - 1)); |
77 | | |
78 | | Self { |
79 | | offsets_builder, |
80 | | value_builder, |
81 | | null_buffer_builder, |
82 | | } |
83 | | } |
84 | | |
85 | | #[inline] |
86 | 436k | fn next_offset(&self) -> T::Offset { |
87 | 436k | T::Offset::from_usize(self.value_builder.len()).expect("byte array offset overflow") |
88 | 436k | } |
89 | | |
90 | | /// Appends a value into the builder. |
91 | | /// |
92 | | /// See the [GenericStringBuilder] documentation for examples of |
93 | | /// incrementally building string values with multiple `write!` calls. |
94 | | /// |
95 | | /// # Panics |
96 | | /// |
97 | | /// Panics if the resulting length of [`Self::values_slice`] would exceed |
98 | | /// `T::Offset::MAX` bytes. |
99 | | /// |
100 | | /// For example, this can happen with [`StringArray`] or [`BinaryArray`] |
101 | | /// where the total length of all values exceeds 2GB |
102 | | /// |
103 | | /// [`StringArray`]: crate::StringArray |
104 | | /// [`BinaryArray`]: crate::BinaryArray |
105 | | #[inline] |
106 | 324k | pub fn append_value(&mut self, value: impl AsRef<T::Native>) { |
107 | 324k | self.value_builder |
108 | 324k | .extend_from_slice(value.as_ref().as_ref()); |
109 | 324k | self.null_buffer_builder.append(true); |
110 | 324k | self.offsets_builder.push(self.next_offset()); |
111 | 324k | } |
112 | | |
113 | | /// Append an `Option` value into the builder. |
114 | | /// |
115 | | /// - A `None` value will append a null value. |
116 | | /// - A `Some` value will append the value. |
117 | | /// |
118 | | /// See [`Self::append_value`] for more panic information. |
119 | | #[inline] |
120 | 267k | pub fn append_option(&mut self, value: Option<impl AsRef<T::Native>>) { |
121 | 267k | match value { |
122 | 30.9k | None => self.append_null(), |
123 | 236k | Some(v) => self.append_value(v), |
124 | | }; |
125 | 267k | } |
126 | | |
127 | | /// Append a null value into the builder. |
128 | | #[inline] |
129 | 30.9k | pub fn append_null(&mut self) { |
130 | 30.9k | self.null_buffer_builder.append(false); |
131 | 30.9k | self.offsets_builder.push(self.next_offset()); |
132 | 30.9k | } |
133 | | |
134 | | /// Appends `n` `null`s into the builder. |
135 | | #[inline] |
136 | | pub fn append_nulls(&mut self, n: usize) { |
137 | | self.null_buffer_builder.append_n_nulls(n); |
138 | | let next_offset = self.next_offset(); |
139 | | self.offsets_builder |
140 | | .extend(std::iter::repeat_n(next_offset, n)); |
141 | | } |
142 | | |
143 | | /// Appends the values and nulls of `array` to this builder as is |
144 | | /// (the value bytes underlying null slots are copied unchanged). |
145 | | #[inline] |
146 | 129 | pub fn append_array(&mut self, array: &GenericByteArray<T>) -> Result<(), ArrowError> { |
147 | | use num_traits::CheckedAdd; |
148 | 129 | if array.len() == 0 { |
149 | 0 | return Ok(()); |
150 | 129 | } |
151 | | |
152 | 129 | let offsets = array.offsets(); |
153 | | |
154 | | // If the offsets are contiguous, we can append them directly avoiding the need to align |
155 | | // for example, when the first appended array is not sliced (starts at offset 0) |
156 | 129 | if self.next_offset() == offsets[0] { |
157 | 16 | self.offsets_builder.extend_from_slice(&offsets[1..]); |
158 | 16 | } else { |
159 | | // Shifting all the offsets |
160 | 113 | let shift: T::Offset = self.next_offset() - offsets[0]; |
161 | | |
162 | 113 | if shift.checked_add(&offsets[offsets.len() - 1]).is_none() { |
163 | 0 | return Err(ArrowError::OffsetOverflowError( |
164 | 0 | shift.as_usize() + offsets[offsets.len() - 1].as_usize(), |
165 | 0 | )); |
166 | 113 | } |
167 | | |
168 | 113 | self.offsets_builder |
169 | 31.2k | .extend(offsets[1..].iter().map(|&offset| offset + shift)); |
170 | | } |
171 | | |
172 | | // Append underlying values, starting from the first offset and ending at the last offset |
173 | 129 | self.value_builder.extend_from_slice( |
174 | 129 | &array.values().as_slice()[offsets[0].as_usize()..offsets[array.len()].as_usize()], |
175 | | ); |
176 | | |
177 | 129 | if let Some(null_buffer) = array.nulls() { |
178 | 93 | self.null_buffer_builder.append_buffer(null_buffer); |
179 | 93 | } else { |
180 | 36 | self.null_buffer_builder.append_n_non_nulls(array.len()); |
181 | 36 | } |
182 | 129 | Ok(()) |
183 | 129 | } |
184 | | |
185 | | /// Builds the [`GenericByteArray`] and reset this builder. |
186 | 80.3k | pub fn finish(&mut self) -> GenericByteArray<T> { |
187 | 80.3k | let array_type = T::DATA_TYPE; |
188 | 80.3k | let array_builder = ArrayDataBuilder::new(array_type) |
189 | 80.3k | .len(self.len()) |
190 | 80.3k | .add_buffer(std::mem::take(&mut self.offsets_builder).into()) |
191 | 80.3k | .add_buffer(std::mem::take(&mut self.value_builder).into()) |
192 | 80.3k | .nulls(self.null_buffer_builder.finish()); |
193 | | |
194 | 80.3k | self.offsets_builder.push(self.next_offset()); |
195 | 80.3k | let array_data = unsafe { array_builder.build_unchecked() }; |
196 | 80.3k | GenericByteArray::from(array_data) |
197 | 80.3k | } |
198 | | |
199 | | /// Builds the [`GenericByteArray`] without resetting the builder. |
200 | 0 | pub fn finish_cloned(&self) -> GenericByteArray<T> { |
201 | 0 | let array_type = T::DATA_TYPE; |
202 | 0 | let offset_buffer = Buffer::from_slice_ref(self.offsets_builder.as_slice()); |
203 | 0 | let value_buffer = Buffer::from_slice_ref(self.value_builder.as_slice()); |
204 | 0 | let array_builder = ArrayDataBuilder::new(array_type) |
205 | 0 | .len(self.len()) |
206 | 0 | .add_buffer(offset_buffer) |
207 | 0 | .add_buffer(value_buffer) |
208 | 0 | .nulls(self.null_buffer_builder.finish_cloned()); |
209 | | |
210 | 0 | let array_data = unsafe { array_builder.build_unchecked() }; |
211 | 0 | GenericByteArray::from(array_data) |
212 | 0 | } |
213 | | |
214 | | /// Returns the current values buffer as a slice |
215 | 91.7k | pub fn values_slice(&self) -> &[u8] { |
216 | 91.7k | self.value_builder.as_slice() |
217 | 91.7k | } |
218 | | |
219 | | /// Returns the current offsets buffer as a slice |
220 | 91.7k | pub fn offsets_slice(&self) -> &[T::Offset] { |
221 | 91.7k | self.offsets_builder.as_slice() |
222 | 91.7k | } |
223 | | |
224 | | /// Returns the current null buffer as a slice |
225 | | pub fn validity_slice(&self) -> Option<&[u8]> { |
226 | | self.null_buffer_builder.as_slice() |
227 | | } |
228 | | |
229 | | /// Returns the current null buffer as a mutable slice |
230 | | pub fn validity_slice_mut(&mut self) -> Option<&mut [u8]> { |
231 | | self.null_buffer_builder.as_slice_mut() |
232 | | } |
233 | | } |
234 | | |
235 | | impl<T: ByteArrayType> std::fmt::Debug for GenericByteBuilder<T> { |
236 | | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { |
237 | | write!(f, "{}{}Builder", T::Offset::PREFIX, T::PREFIX)?; |
238 | | f.debug_struct("") |
239 | | .field("value_builder", &self.value_builder) |
240 | | .field("offsets_builder", &self.offsets_builder) |
241 | | .field("null_buffer_builder", &self.null_buffer_builder) |
242 | | .finish() |
243 | | } |
244 | | } |
245 | | |
246 | | impl<T: ByteArrayType> Default for GenericByteBuilder<T> { |
247 | | fn default() -> Self { |
248 | | Self::new() |
249 | | } |
250 | | } |
251 | | |
252 | | impl<T: ByteArrayType> ArrayBuilder for GenericByteBuilder<T> { |
253 | | /// Returns the number of binary slots in the builder |
254 | 168k | fn len(&self) -> usize { |
255 | 168k | self.null_buffer_builder.len() |
256 | 168k | } |
257 | | |
258 | | /// Builds the array and reset this builder. |
259 | 2 | fn finish(&mut self) -> ArrayRef { |
260 | 2 | Arc::new(self.finish()) |
261 | 2 | } |
262 | | |
263 | | /// Builds the array without resetting the builder. |
264 | 0 | fn finish_cloned(&self) -> ArrayRef { |
265 | 0 | Arc::new(self.finish_cloned()) |
266 | 0 | } |
267 | | |
268 | | /// Returns the builder as a non-mutable `Any` reference. |
269 | 0 | fn as_any(&self) -> &dyn Any { |
270 | 0 | self |
271 | 0 | } |
272 | | |
273 | | /// Returns the builder as a mutable `Any` reference. |
274 | 0 | fn as_any_mut(&mut self) -> &mut dyn Any { |
275 | 0 | self |
276 | 0 | } |
277 | | |
278 | | /// Returns the boxed builder as a box of `Any`. |
279 | 0 | fn into_box_any(self: Box<Self>) -> Box<dyn Any> { |
280 | 0 | self |
281 | 0 | } |
282 | | } |
283 | | |
284 | | impl<T: ByteArrayType, V: AsRef<T::Native>> Extend<Option<V>> for GenericByteBuilder<T> { |
285 | | #[inline] |
286 | 204 | fn extend<I: IntoIterator<Item = Option<V>>>(&mut self, iter: I) { |
287 | 267k | for v in iter { |
288 | 267k | self.append_option(v) |
289 | | } |
290 | 204 | } |
291 | | } |
292 | | |
293 | | /// Array builder for [`GenericStringArray`][crate::GenericStringArray] |
294 | | /// |
295 | | /// Values can be appended using [`GenericByteBuilder::append_value`], and nulls with |
296 | | /// [`GenericByteBuilder::append_null`]. |
297 | | /// |
298 | | /// This builder also implements [`std::fmt::Write`] with any written data |
299 | | /// included in the next appended value. This allows using [`std::fmt::Display`] |
300 | | /// with standard Rust idioms like `write!` and `writeln!` to write data |
301 | | /// directly to the builder without intermediate allocations. |
302 | | /// |
303 | | /// # Example writing strings with `append_value` |
304 | | /// ``` |
305 | | /// # use arrow_array::builder::GenericStringBuilder; |
306 | | /// let mut builder = GenericStringBuilder::<i32>::new(); |
307 | | /// |
308 | | /// // Write one string value |
309 | | /// builder.append_value("foobarbaz"); |
310 | | /// |
311 | | /// // Write a second string |
312 | | /// builder.append_value("v2"); |
313 | | /// |
314 | | /// let array = builder.finish(); |
315 | | /// assert_eq!(array.value(0), "foobarbaz"); |
316 | | /// assert_eq!(array.value(1), "v2"); |
317 | | /// ``` |
318 | | /// |
319 | | /// # Example incrementally writing strings with `std::fmt::Write` |
320 | | /// |
321 | | /// ``` |
322 | | /// # use std::fmt::Write; |
323 | | /// # use arrow_array::builder::GenericStringBuilder; |
324 | | /// let mut builder = GenericStringBuilder::<i32>::new(); |
325 | | /// |
326 | | /// // Write data in multiple `write!` calls |
327 | | /// write!(builder, "foo").unwrap(); |
328 | | /// write!(builder, "bar").unwrap(); |
329 | | /// // The next call to append_value finishes the current string |
330 | | /// // including all previously written strings. |
331 | | /// builder.append_value("baz"); |
332 | | /// |
333 | | /// // Write second value with a single write call |
334 | | /// write!(builder, "v2").unwrap(); |
335 | | /// // finish the value by calling append_value with an empty string |
336 | | /// builder.append_value(""); |
337 | | /// |
338 | | /// let array = builder.finish(); |
339 | | /// assert_eq!(array.value(0), "foobarbaz"); |
340 | | /// assert_eq!(array.value(1), "v2"); |
341 | | /// ``` |
342 | | pub type GenericStringBuilder<O> = GenericByteBuilder<GenericStringType<O>>; |
343 | | |
344 | | impl<O: OffsetSizeTrait> std::fmt::Write for GenericStringBuilder<O> { |
345 | | fn write_str(&mut self, s: &str) -> std::fmt::Result { |
346 | | self.value_builder.extend_from_slice(s.as_bytes()); |
347 | | Ok(()) |
348 | | } |
349 | | } |
350 | | |
351 | | /// A byte size value representing the number of bytes to allocate per string in [`GenericStringBuilder`] |
352 | | /// |
353 | | /// To create a [`GenericStringBuilder`] using `.with_capacity` we are required to provide: \ |
354 | | /// - `item_capacity` - the row count \ |
355 | | /// - `data_capacity` - total string byte count \ |
356 | | /// |
357 | | /// We will use the `AVERAGE_STRING_LENGTH` * row_count for `data_capacity`. \ |
358 | | /// |
359 | | /// These capacities are preallocation hints used to improve performance, |
360 | | /// but consequences of passing a hint too large or too small should be negligible. |
361 | | const AVERAGE_STRING_LENGTH: usize = 16; |
362 | | /// Trait for string-like array builders |
363 | | /// |
364 | | /// This trait provides unified interface for builders that append string-like data |
365 | | /// such as [`GenericStringBuilder<O>`] and [`crate::builder::StringViewBuilder`] |
366 | | pub trait StringLikeArrayBuilder: ArrayBuilder { |
367 | | /// Returns a human-readable type name for the builder. |
368 | | fn type_name() -> &'static str; |
369 | | |
370 | | /// Creates a new builder with the given row capacity. |
371 | | fn with_capacity(capacity: usize) -> Self; |
372 | | |
373 | | /// Appends a non-null string value to the builder. |
374 | | fn append_value(&mut self, value: &str); |
375 | | |
376 | | /// Appends a null value to the builder. |
377 | | fn append_null(&mut self); |
378 | | } |
379 | | |
380 | | impl<O: OffsetSizeTrait> StringLikeArrayBuilder for GenericStringBuilder<O> { |
381 | | fn type_name() -> &'static str { |
382 | | std::any::type_name::<Self>() |
383 | | } |
384 | | fn with_capacity(capacity: usize) -> Self { |
385 | | Self::with_capacity(capacity, capacity * AVERAGE_STRING_LENGTH) |
386 | | } |
387 | | fn append_value(&mut self, value: &str) { |
388 | | Self::append_value(self, value); |
389 | | } |
390 | | fn append_null(&mut self) { |
391 | | Self::append_null(self); |
392 | | } |
393 | | } |
394 | | |
395 | | /// A byte size value representing the number of bytes to allocate per binary in [`GenericBinaryBuilder`] |
396 | | /// |
397 | | /// To create a [`GenericBinaryBuilder`] using `.with_capacity` we are required to provide: \ |
398 | | /// - `item_capacity` - the row count \ |
399 | | /// - `data_capacity` - total binary byte count \ |
400 | | /// |
401 | | /// We will use the `AVERAGE_BINARY_LENGTH` * row_count for `data_capacity`. \ |
402 | | /// |
403 | | /// These capacities are preallocation hints used to improve performance, |
404 | | /// but consequences of passing a hint too large or too small should be negligible. |
405 | | const AVERAGE_BINARY_LENGTH: usize = 128; |
406 | | /// Trait for binary-like array builders |
407 | | /// |
408 | | /// This trait provides unified interface for builders that append binary-like data |
409 | | /// such as [`GenericBinaryBuilder<O>`] and [`crate::builder::BinaryViewBuilder`] |
410 | | pub trait BinaryLikeArrayBuilder: ArrayBuilder { |
411 | | /// Returns a human-readable type name for the builder. |
412 | | fn type_name() -> &'static str; |
413 | | |
414 | | /// Creates a new builder with the given row capacity. |
415 | | fn with_capacity(capacity: usize) -> Self; |
416 | | |
417 | | /// Appends a non-null binary value to the builder. |
418 | | fn append_value(&mut self, value: &[u8]); |
419 | | |
420 | | /// Appends a null value to the builder. |
421 | | fn append_null(&mut self); |
422 | | } |
423 | | |
424 | | impl<O: OffsetSizeTrait> BinaryLikeArrayBuilder for GenericBinaryBuilder<O> { |
425 | | fn type_name() -> &'static str { |
426 | | std::any::type_name::<Self>() |
427 | | } |
428 | | fn with_capacity(capacity: usize) -> Self { |
429 | | Self::with_capacity(capacity, capacity * AVERAGE_BINARY_LENGTH) |
430 | | } |
431 | | fn append_value(&mut self, value: &[u8]) { |
432 | | Self::append_value(self, value); |
433 | | } |
434 | | fn append_null(&mut self) { |
435 | | Self::append_null(self); |
436 | | } |
437 | | } |
438 | | |
439 | | /// Array builder for [`GenericBinaryArray`][crate::GenericBinaryArray] |
440 | | /// |
441 | | /// Values can be appended using [`GenericByteBuilder::append_value`], and nulls with |
442 | | /// [`GenericByteBuilder::append_null`]. |
443 | | /// |
444 | | /// # Example |
445 | | /// ``` |
446 | | /// # use arrow_array::builder::GenericBinaryBuilder; |
447 | | /// let mut builder = GenericBinaryBuilder::<i32>::new(); |
448 | | /// |
449 | | /// // Write data |
450 | | /// builder.append_value("foo"); |
451 | | /// |
452 | | /// // Write second value |
453 | | /// builder.append_value(&[0,1,2]); |
454 | | /// |
455 | | /// let array = builder.finish(); |
456 | | /// // binary values |
457 | | /// assert_eq!(array.value(0), b"foo"); |
458 | | /// assert_eq!(array.value(1), b"\x00\x01\x02"); |
459 | | /// ``` |
460 | | /// |
461 | | /// # Example incrementally writing bytes with `std::io::Write` |
462 | | /// |
463 | | /// ``` |
464 | | /// # use std::io::Write; |
465 | | /// # use arrow_array::builder::GenericBinaryBuilder; |
466 | | /// let mut builder = GenericBinaryBuilder::<i32>::new(); |
467 | | /// |
468 | | /// // Write data in multiple `write!` calls |
469 | | /// write!(builder, "foo").unwrap(); |
470 | | /// write!(builder, "bar").unwrap(); |
471 | | /// // The next call to append_value finishes the current value, |
472 | | /// // including all previously written bytes. |
473 | | /// builder.append_value("baz"); |
474 | | /// |
475 | | /// // Write second value with a single write call |
476 | | /// write!(builder, "v2").unwrap(); |
477 | | /// // finish the value by calling append_value with an empty string |
478 | | /// builder.append_value(""); |
479 | | /// |
480 | | /// let array = builder.finish(); |
481 | | /// assert_eq!(array.value(0), "foobarbaz".as_bytes()); |
482 | | /// assert_eq!(array.value(1), "v2".as_bytes()); |
483 | | /// ``` |
484 | | pub type GenericBinaryBuilder<O> = GenericByteBuilder<GenericBinaryType<O>>; |
485 | | |
486 | | impl<O: OffsetSizeTrait> std::io::Write for GenericBinaryBuilder<O> { |
487 | | fn write(&mut self, bs: &[u8]) -> std::io::Result<usize> { |
488 | | self.value_builder.extend_from_slice(bs); |
489 | | Ok(bs.len()) |
490 | | } |
491 | | |
492 | | fn flush(&mut self) -> std::io::Result<()> { |
493 | | Ok(()) |
494 | | } |
495 | | } |
496 | | |
497 | | #[cfg(test)] |
498 | | mod tests { |
499 | | use super::*; |
500 | | use crate::GenericStringArray; |
501 | | use crate::array::Array; |
502 | | use arrow_buffer::NullBuffer; |
503 | | use std::fmt::Write as _; |
504 | | use std::io::Write as _; |
505 | | |
506 | | fn _test_generic_binary_builder<O: OffsetSizeTrait>() { |
507 | | let mut builder = GenericBinaryBuilder::<O>::new(); |
508 | | |
509 | | builder.append_value(b"hello"); |
510 | | builder.append_value(b""); |
511 | | builder.append_null(); |
512 | | builder.append_value(b"rust"); |
513 | | |
514 | | let array = builder.finish(); |
515 | | |
516 | | assert_eq!(4, array.len()); |
517 | | assert_eq!(1, array.null_count()); |
518 | | assert_eq!(b"hello", array.value(0)); |
519 | | assert_eq!([] as [u8; 0], array.value(1)); |
520 | | assert!(array.is_null(2)); |
521 | | assert_eq!(b"rust", array.value(3)); |
522 | | assert_eq!(O::from_usize(5).unwrap(), array.value_offsets()[2]); |
523 | | assert_eq!(O::from_usize(4).unwrap(), array.value_length(3)); |
524 | | } |
525 | | |
526 | | #[test] |
527 | | fn test_binary_builder() { |
528 | | _test_generic_binary_builder::<i32>() |
529 | | } |
530 | | |
531 | | #[test] |
532 | | fn test_large_binary_builder() { |
533 | | _test_generic_binary_builder::<i64>() |
534 | | } |
535 | | |
536 | | fn _test_generic_binary_builder_all_nulls<O: OffsetSizeTrait>() { |
537 | | let mut builder = GenericBinaryBuilder::<O>::new(); |
538 | | builder.append_null(); |
539 | | builder.append_null(); |
540 | | builder.append_null(); |
541 | | builder.append_nulls(2); |
542 | | assert_eq!(5, builder.len()); |
543 | | assert!(!builder.is_empty()); |
544 | | |
545 | | let array = builder.finish(); |
546 | | assert_eq!(5, array.null_count()); |
547 | | assert_eq!(5, array.len()); |
548 | | assert!(array.is_null(0)); |
549 | | assert!(array.is_null(1)); |
550 | | assert!(array.is_null(2)); |
551 | | assert!(array.is_null(3)); |
552 | | assert!(array.is_null(4)); |
553 | | } |
554 | | |
555 | | #[test] |
556 | | fn test_binary_builder_all_nulls() { |
557 | | _test_generic_binary_builder_all_nulls::<i32>() |
558 | | } |
559 | | |
560 | | #[test] |
561 | | fn test_large_binary_builder_all_nulls() { |
562 | | _test_generic_binary_builder_all_nulls::<i64>() |
563 | | } |
564 | | |
565 | | fn _test_generic_binary_builder_reset<O: OffsetSizeTrait>() { |
566 | | let mut builder = GenericBinaryBuilder::<O>::new(); |
567 | | |
568 | | builder.append_value(b"hello"); |
569 | | builder.append_value(b""); |
570 | | builder.append_null(); |
571 | | builder.append_value(b"rust"); |
572 | | builder.finish(); |
573 | | |
574 | | assert!(builder.is_empty()); |
575 | | |
576 | | builder.append_value(b"parquet"); |
577 | | builder.append_null(); |
578 | | builder.append_value(b"arrow"); |
579 | | builder.append_value(b""); |
580 | | builder.append_nulls(2); |
581 | | builder.append_value(b"hi"); |
582 | | let array = builder.finish(); |
583 | | |
584 | | assert_eq!(7, array.len()); |
585 | | assert_eq!(3, array.null_count()); |
586 | | assert_eq!(b"parquet", array.value(0)); |
587 | | assert!(array.is_null(1)); |
588 | | assert!(array.is_null(4)); |
589 | | assert!(array.is_null(5)); |
590 | | assert_eq!(b"arrow", array.value(2)); |
591 | | assert_eq!(b"", array.value(1)); |
592 | | assert_eq!(b"hi", array.value(6)); |
593 | | |
594 | | assert_eq!(O::zero(), array.value_offsets()[0]); |
595 | | assert_eq!(O::from_usize(7).unwrap(), array.value_offsets()[2]); |
596 | | assert_eq!(O::from_usize(14).unwrap(), array.value_offsets()[7]); |
597 | | assert_eq!(O::from_usize(5).unwrap(), array.value_length(2)); |
598 | | } |
599 | | |
600 | | #[test] |
601 | | fn test_binary_builder_reset() { |
602 | | _test_generic_binary_builder_reset::<i32>() |
603 | | } |
604 | | |
605 | | #[test] |
606 | | fn test_large_binary_builder_reset() { |
607 | | _test_generic_binary_builder_reset::<i64>() |
608 | | } |
609 | | |
610 | | fn _test_generic_string_array_builder<O: OffsetSizeTrait>() { |
611 | | let mut builder = GenericStringBuilder::<O>::new(); |
612 | | let owned = "arrow".to_owned(); |
613 | | |
614 | | builder.append_value("hello"); |
615 | | builder.append_value(""); |
616 | | builder.append_value(&owned); |
617 | | builder.append_null(); |
618 | | builder.append_option(Some("rust")); |
619 | | builder.append_option(None::<&str>); |
620 | | builder.append_option(None::<String>); |
621 | | builder.append_nulls(2); |
622 | | builder.append_value("parquet"); |
623 | | assert_eq!(10, builder.len()); |
624 | | |
625 | | assert_eq!( |
626 | | GenericStringArray::<O>::from(vec![ |
627 | | Some("hello"), |
628 | | Some(""), |
629 | | Some("arrow"), |
630 | | None, |
631 | | Some("rust"), |
632 | | None, |
633 | | None, |
634 | | None, |
635 | | None, |
636 | | Some("parquet") |
637 | | ]), |
638 | | builder.finish() |
639 | | ); |
640 | | } |
641 | | |
642 | | #[test] |
643 | | fn test_string_array_builder() { |
644 | | _test_generic_string_array_builder::<i32>() |
645 | | } |
646 | | |
647 | | #[test] |
648 | | fn test_large_string_array_builder() { |
649 | | _test_generic_string_array_builder::<i64>() |
650 | | } |
651 | | |
652 | | fn _test_generic_string_array_builder_finish<O: OffsetSizeTrait>() { |
653 | | let mut builder = GenericStringBuilder::<O>::with_capacity(3, 11); |
654 | | |
655 | | builder.append_value("hello"); |
656 | | builder.append_value("rust"); |
657 | | builder.append_null(); |
658 | | |
659 | | builder.finish(); |
660 | | assert!(builder.is_empty()); |
661 | | assert_eq!(&[O::zero()], builder.offsets_slice()); |
662 | | |
663 | | builder.append_value("arrow"); |
664 | | builder.append_value("parquet"); |
665 | | let arr = builder.finish(); |
666 | | // array should not have a null buffer because there is no `null` value. |
667 | | assert!(arr.nulls().is_none()); |
668 | | assert_eq!(GenericStringArray::<O>::from(vec!["arrow", "parquet"]), arr,) |
669 | | } |
670 | | |
671 | | #[test] |
672 | | fn test_string_array_builder_finish() { |
673 | | _test_generic_string_array_builder_finish::<i32>() |
674 | | } |
675 | | |
676 | | #[test] |
677 | | fn test_large_string_array_builder_finish() { |
678 | | _test_generic_string_array_builder_finish::<i64>() |
679 | | } |
680 | | |
681 | | fn _test_generic_string_array_builder_finish_cloned<O: OffsetSizeTrait>() { |
682 | | let mut builder = GenericStringBuilder::<O>::with_capacity(3, 11); |
683 | | |
684 | | builder.append_value("hello"); |
685 | | builder.append_value("rust"); |
686 | | builder.append_null(); |
687 | | |
688 | | let mut arr = builder.finish_cloned(); |
689 | | assert!(!builder.is_empty()); |
690 | | assert_eq!(3, arr.len()); |
691 | | |
692 | | builder.append_value("arrow"); |
693 | | builder.append_value("parquet"); |
694 | | arr = builder.finish(); |
695 | | |
696 | | assert!(arr.nulls().is_some()); |
697 | | assert_eq!(&[O::zero()], builder.offsets_slice()); |
698 | | assert_eq!(5, arr.len()); |
699 | | } |
700 | | |
701 | | #[test] |
702 | | fn test_string_array_builder_finish_cloned() { |
703 | | _test_generic_string_array_builder_finish_cloned::<i32>() |
704 | | } |
705 | | |
706 | | #[test] |
707 | | fn test_large_string_array_builder_finish_cloned() { |
708 | | _test_generic_string_array_builder_finish_cloned::<i64>() |
709 | | } |
710 | | |
711 | | #[test] |
712 | | fn test_extend() { |
713 | | let mut builder = GenericStringBuilder::<i32>::new(); |
714 | | builder.extend(["a", "b", "c", "", "a", "b", "c"].into_iter().map(Some)); |
715 | | builder.extend(["d", "cupcakes", "hello"].into_iter().map(Some)); |
716 | | let array = builder.finish(); |
717 | | assert_eq!(array.value_offsets(), &[0, 1, 2, 3, 3, 4, 5, 6, 7, 15, 20]); |
718 | | assert_eq!(array.value_data(), b"abcabcdcupcakeshello"); |
719 | | } |
720 | | |
721 | | #[test] |
722 | | fn test_write_str() { |
723 | | let mut builder = GenericStringBuilder::<i32>::new(); |
724 | | write!(builder, "foo").unwrap(); |
725 | | builder.append_value(""); |
726 | | writeln!(builder, "bar").unwrap(); |
727 | | builder.append_value(""); |
728 | | write!(builder, "fiz").unwrap(); |
729 | | write!(builder, "buz").unwrap(); |
730 | | builder.append_value(""); |
731 | | let a = builder.finish(); |
732 | | let r: Vec<_> = a.iter().flatten().collect(); |
733 | | assert_eq!(r, &["foo", "bar\n", "fizbuz"]) |
734 | | } |
735 | | |
736 | | #[test] |
737 | | fn test_write_bytes() { |
738 | | let mut builder = GenericBinaryBuilder::<i32>::new(); |
739 | | write!(builder, "foo").unwrap(); |
740 | | builder.append_value(""); |
741 | | writeln!(builder, "bar").unwrap(); |
742 | | builder.append_value(""); |
743 | | write!(builder, "fiz").unwrap(); |
744 | | write!(builder, "buz").unwrap(); |
745 | | builder.append_value(""); |
746 | | let a = builder.finish(); |
747 | | let r: Vec<_> = a.iter().flatten().collect(); |
748 | | assert_eq!( |
749 | | r, |
750 | | &["foo".as_bytes(), "bar\n".as_bytes(), "fizbuz".as_bytes()] |
751 | | ) |
752 | | } |
753 | | |
754 | | #[test] |
755 | | fn test_append_array_without_nulls() { |
756 | | let input = vec![ |
757 | | "hello", "world", "how", "are", "you", "doing", "today", "I", "am", "doing", "well", |
758 | | "thank", "you", "for", "asking", |
759 | | ]; |
760 | | let arr1 = GenericStringArray::<i32>::from(input[..3].to_vec()); |
761 | | let arr2 = GenericStringArray::<i32>::from(input[3..7].to_vec()); |
762 | | let arr3 = GenericStringArray::<i32>::from(input[7..].to_vec()); |
763 | | |
764 | | let mut builder = GenericStringBuilder::<i32>::new(); |
765 | | builder.append_array(&arr1).unwrap(); |
766 | | builder.append_array(&arr2).unwrap(); |
767 | | builder.append_array(&arr3).unwrap(); |
768 | | |
769 | | let actual = builder.finish(); |
770 | | let expected = GenericStringArray::<i32>::from(input); |
771 | | |
772 | | assert_eq!(actual, expected); |
773 | | } |
774 | | |
/// Appending several arrays that contain nulls must preserve both the values
/// and the null positions of the concatenated input.
#[test]
fn test_append_array_with_nulls() {
    let values = vec![
        Some("hello"),
        None,
        Some("how"),
        None,
        None,
        None,
        None,
        Some("I"),
        Some("am"),
        Some("doing"),
        Some("well"),
    ];

    // The middle chunk (indices 3..7) consists solely of nulls.
    let chunks = [
        GenericStringArray::<i32>::from(values[..3].to_vec()),
        GenericStringArray::<i32>::from(values[3..7].to_vec()),
        GenericStringArray::<i32>::from(values[7..].to_vec()),
    ];

    let mut builder = GenericStringBuilder::<i32>::new();
    for chunk in &chunks {
        builder.append_array(chunk).unwrap();
    }

    assert_eq!(builder.finish(), GenericStringArray::<i32>::from(values));
}
804 | | |
/// Appending an empty array succeeds and leaves the builder empty.
#[test]
fn test_append_empty_array() {
    let mut builder = GenericStringBuilder::<i32>::new();
    let empty = GenericStringArray::<i32>::from(Vec::<&str>::new());

    builder.append_array(&empty).unwrap();

    assert_eq!(builder.finish().len(), 0);
}
813 | | |
/// Appending a slice taken from the middle of a larger array must copy only
/// the elements covered by that slice.
#[test]
fn test_append_array_with_offset_not_starting_at_0() {
    let full_array = GenericStringArray::<i32>::from(vec![
        Some("hello"),
        None,
        Some("how"),
        None,
        None,
        None,
        None,
        Some("I"),
        Some("am"),
        Some("doing"),
        Some("well"),
    ]);

    // Four elements starting at index 1: [None, "how", None, None].
    let sliced = full_array.slice(1, 4);

    // Sanity-check the fixture: the slice's first offset is non-zero and its
    // last offset differs from the last offset of the full array.
    assert_ne!(sliced.offsets()[0].as_usize(), 0);
    assert_ne!(sliced.offsets().last(), full_array.offsets().last());

    let mut builder = GenericStringBuilder::<i32>::new();
    builder.append_array(&sliced).unwrap();

    let expected = GenericStringArray::<i32>::from(vec![None, Some("how"), None, None]);
    assert_eq!(builder.finish(), expected);
}
843 | | |
/// When the source arrays carry real bytes underneath their null slots, those
/// bytes must be copied into the builder's value buffer unchanged.
#[test]
fn test_append_underlying_null_values_added_as_is() {
    // First input: nine non-empty strings with a validity mask laid on top, so
    // the masked-out slots still have bytes in the value buffer.
    let first = {
        let validity = NullBuffer::from(&[
            true, false, true, false, false, true, true, true, false,
        ]);
        let (offsets, values, _) = GenericStringArray::<i32>::from(vec![
            "hello", "world", "how", "are", "you", "doing", "today", "I", "am",
        ])
        .into_parts();

        GenericStringArray::<i32>::new(offsets, values, Some(validity))
    };

    // Second input: built the same way from six more strings.
    let second = {
        let validity = NullBuffer::from(&[false, false, true, false, true, true]);
        let (offsets, values, _) = GenericStringArray::<i32>::from(vec![
            "doing", "well", "thank", "you", "for", "asking",
        ])
        .into_parts();

        GenericStringArray::<i32>::new(offsets, values, Some(validity))
    };

    let mut builder = GenericStringBuilder::<i32>::new();
    builder.append_array(&first).unwrap();
    builder.append_array(&second).unwrap();
    let actual = builder.finish();

    // Logical view: masked-out entries read back as null.
    let expected = GenericStringArray::<i32>::from(vec![
        Some("hello"),
        None, // world
        Some("how"),
        None, // are
        None, // you
        Some("doing"),
        Some("today"),
        Some("I"),
        None, // am
        None, // doing
        None, // well
        Some("thank"),
        None, // "you",
        Some("for"),
        Some("asking"),
    ]);
    assert_eq!(actual, expected);

    // Physical view: the value buffer holds every input string back to back,
    // including the ones hidden behind null slots.
    let all_bytes = [
        "hello", "world", "how", "are", "you", "doing", "today", "I", "am", "doing", "well",
        "thank", "you", "for", "asking",
    ]
    .concat();
    let expected_underlying_buffer = Buffer::from(all_bytes.as_bytes());
    assert_eq!(actual.values(), &expected_underlying_buffer);
}
906 | | |
/// Appending adjacent slices of one array, in order, must reconstruct that
/// array exactly.
#[test]
fn append_array_with_continues_indices() {
    let full_array = GenericStringArray::<i32>::from(vec![
        "hello", "world", "how", "are", "you", "doing", "today", "I", "am", "doing", "well",
        "thank", "you", "for", "asking",
    ]);
    let total = full_array.len();

    // (start, length) pairs that tile the whole array without gaps or overlap.
    let ranges = [(0, 3), (3, 4), (7, total - 7)];

    let mut builder = GenericStringBuilder::<i32>::new();
    for (start, length) in ranges {
        builder.append_array(&full_array.slice(start, length)).unwrap();
    }

    assert_eq!(builder.finish(), full_array);
}
927 | | |
/// `append_array` must report an offset overflow error when the appended
/// bytes would push the total length past `i32::MAX`.
#[test]
fn test_append_array_offset_overflow_precise() {
    // Fill the builder to 100 bytes short of the largest i32 offset.
    let nearly_full = "x".repeat(i32::MAX as usize - 100);
    let mut builder = GenericStringBuilder::<i32>::new();
    builder.append_value(&nearly_full);

    // A single 200-byte value no longer fits in the remaining offset range.
    let extra = "y".repeat(200);
    let overflow_array = GenericStringArray::<i32>::from(vec![extra.as_str()]);

    assert!(matches!(
        builder.append_array(&overflow_array),
        Err(ArrowError::OffsetOverflowError(_))
    ));
}
942 | | } |