/Users/andrewlamb/Software/arrow-rs/arrow-array/src/builder/generic_bytes_view_builder.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use std::any::Any; |
19 | | use std::marker::PhantomData; |
20 | | use std::sync::Arc; |
21 | | |
22 | | use arrow_buffer::{Buffer, NullBufferBuilder, ScalarBuffer}; |
23 | | use arrow_data::{ByteView, MAX_INLINE_VIEW_LEN}; |
24 | | use arrow_schema::ArrowError; |
25 | | use hashbrown::hash_table::Entry; |
26 | | use hashbrown::HashTable; |
27 | | |
28 | | use crate::builder::ArrayBuilder; |
29 | | use crate::types::bytes::ByteArrayNativeType; |
30 | | use crate::types::{BinaryViewType, ByteViewType, StringViewType}; |
31 | | use crate::{Array, ArrayRef, GenericByteViewArray}; |
32 | | |
/// Block size the exponential growth strategy starts from.
const STARTING_BLOCK_SIZE: u32 = 8 * 1024; // 8KiB
/// Largest block size the exponential growth strategy will hand out.
const MAX_BLOCK_SIZE: u32 = 2 * 1024 * 1024; // 2MiB

/// Policy deciding how large the next data block should be.
enum BlockSizeGrowthStrategy {
    /// Every block has the same, caller-chosen size.
    Fixed { size: u32 },
    /// The block size doubles on each request until it reaches [`MAX_BLOCK_SIZE`].
    Exponential { current_size: u32 },
}

impl BlockSizeGrowthStrategy {
    /// Returns the size to use for the next block, advancing the internal state.
    ///
    /// * `Fixed` always returns its configured size.
    /// * `Exponential` doubles the stored size and returns the doubled value, so
    ///   the result is twice what was stored on entry. Once the stored size is no
    ///   longer below [`MAX_BLOCK_SIZE`] it is left untouched and
    ///   [`MAX_BLOCK_SIZE`] is returned.
    fn next_size(&mut self) -> u32 {
        let current_size = match self {
            Self::Fixed { size } => return *size,
            Self::Exponential { current_size } => current_size,
        };

        if *current_size >= MAX_BLOCK_SIZE {
            return MAX_BLOCK_SIZE;
        }

        // Start and cap are fixed constants, so the doubling cannot overflow;
        // `saturating_mul` merely keeps the arithmetic well-defined.
        *current_size = current_size.saturating_mul(2);
        *current_size
    }
}
57 | | |
/// A builder for [`GenericByteViewArray`]
///
/// A [`GenericByteViewArray`] consists of a list of data blocks containing string data,
/// and a list of views into those buffers.
///
/// See examples on [`StringViewBuilder`] and [`BinaryViewBuilder`]
///
/// This builder can be used in two ways
///
/// # Append Values
///
/// To avoid bump allocating, this builder allocates data in fixed size blocks, configurable
/// using [`GenericByteViewBuilder::with_fixed_block_size`]. [`GenericByteViewBuilder::append_value`]
/// writes values larger than [`MAX_INLINE_VIEW_LEN`] bytes to the current in-progress block, with values smaller
/// than [`MAX_INLINE_VIEW_LEN`] bytes inlined into the views. If a value is appended that will not fit in the
/// in-progress block, it will be closed, and a new block of sufficient size allocated
///
/// # Append Views
///
/// Some use-cases may wish to reuse an existing allocation containing string data, for example,
/// when parsing data from a parquet data page. In such a case entire blocks can be appended
/// using [`GenericByteViewBuilder::append_block`] and then views into this block appended
/// using [`GenericByteViewBuilder::try_append_view`]
pub struct GenericByteViewBuilder<T: ByteViewType + ?Sized> {
    /// One 128-bit view per appended element; null elements are stored as `0`
    views_buffer: Vec<u128>,
    /// Validity tracking, one entry per appended element
    null_buffer_builder: NullBufferBuilder,
    /// Finished data blocks, indexed by the `buffer_index` stored in non-inline views
    completed: Vec<Buffer>,
    /// Block currently being filled by `append_value`; it becomes the next
    /// entry of `completed` when flushed
    in_progress: Vec<u8>,
    /// Strategy used to size the next `in_progress` block
    block_size: BlockSizeGrowthStrategy,
    /// Some if deduplicating strings
    /// map `<string hash> -> <index to the views>`
    string_tracker: Option<(HashTable<usize>, ahash::RandomState)>,
    /// Ties the builder to the byte view type `T` without storing a value of it
    phantom: PhantomData<T>,
}
92 | | |
93 | | impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> { |
94 | | /// Creates a new [`GenericByteViewBuilder`]. |
95 | | pub fn new() -> Self { |
96 | | Self::with_capacity(1024) |
97 | | } |
98 | | |
99 | | /// Creates a new [`GenericByteViewBuilder`] with space for `capacity` string values. |
100 | 1 | pub fn with_capacity(capacity: usize) -> Self { |
101 | 1 | Self { |
102 | 1 | views_buffer: Vec::with_capacity(capacity), |
103 | 1 | null_buffer_builder: NullBufferBuilder::new(capacity), |
104 | 1 | completed: vec![], |
105 | 1 | in_progress: vec![], |
106 | 1 | block_size: BlockSizeGrowthStrategy::Exponential { |
107 | 1 | current_size: STARTING_BLOCK_SIZE, |
108 | 1 | }, |
109 | 1 | string_tracker: None, |
110 | 1 | phantom: Default::default(), |
111 | 1 | } |
112 | 1 | } |
113 | | |
114 | | /// Set a fixed buffer size for variable length strings |
115 | | /// |
116 | | /// The block size is the size of the buffer used to store values greater |
117 | | /// than [`MAX_INLINE_VIEW_LEN`] bytes. The builder allocates new buffers when the current |
118 | | /// buffer is full. |
119 | | /// |
120 | | /// By default the builder balances buffer size and buffer count by |
121 | | /// growing buffer size exponentially from 8KB up to 2MB. The |
122 | | /// first buffer allocated is 8KB, then 16KB, then 32KB, etc up to 2MB. |
123 | | /// |
124 | | /// If this method is used, any new buffers allocated are |
125 | | /// exactly this size. This can be useful for advanced users |
126 | | /// that want to control the memory usage and buffer count. |
127 | | /// |
128 | | /// See <https://github.com/apache/arrow-rs/issues/6094> for more details on the implications. |
129 | | pub fn with_fixed_block_size(self, block_size: u32) -> Self { |
130 | | debug_assert!(block_size > 0, "Block size must be greater than 0"); |
131 | | Self { |
132 | | block_size: BlockSizeGrowthStrategy::Fixed { size: block_size }, |
133 | | ..self |
134 | | } |
135 | | } |
136 | | |
137 | | /// Deduplicate strings while building the array |
138 | | /// |
139 | | /// This will potentially decrease the memory usage if the array have repeated strings |
140 | | /// It will also increase the time to build the array as it needs to hash the strings |
141 | | pub fn with_deduplicate_strings(self) -> Self { |
142 | | Self { |
143 | | string_tracker: Some(( |
144 | | HashTable::with_capacity(self.views_buffer.capacity()), |
145 | | Default::default(), |
146 | | )), |
147 | | ..self |
148 | | } |
149 | | } |
150 | | |
151 | | /// Append a new data block returning the new block offset |
152 | | /// |
153 | | /// Note: this will first flush any in-progress block |
154 | | /// |
155 | | /// This allows appending views from blocks added using [`Self::append_block`]. See |
156 | | /// [`Self::append_value`] for appending individual values |
157 | | /// |
158 | | /// ``` |
159 | | /// # use arrow_array::builder::StringViewBuilder; |
160 | | /// let mut builder = StringViewBuilder::new(); |
161 | | /// |
162 | | /// let block = builder.append_block(b"helloworldbingobongo".into()); |
163 | | /// |
164 | | /// builder.try_append_view(block, 0, 5).unwrap(); |
165 | | /// builder.try_append_view(block, 5, 5).unwrap(); |
166 | | /// builder.try_append_view(block, 10, 5).unwrap(); |
167 | | /// builder.try_append_view(block, 15, 5).unwrap(); |
168 | | /// builder.try_append_view(block, 0, 15).unwrap(); |
169 | | /// let array = builder.finish(); |
170 | | /// |
171 | | /// let actual: Vec<_> = array.iter().flatten().collect(); |
172 | | /// let expected = &["hello", "world", "bingo", "bongo", "helloworldbingo"]; |
173 | | /// assert_eq!(actual, expected); |
174 | | /// ``` |
175 | 0 | pub fn append_block(&mut self, buffer: Buffer) -> u32 { |
176 | 0 | assert!(buffer.len() < u32::MAX as usize); |
177 | | |
178 | 0 | self.flush_in_progress(); |
179 | 0 | let offset = self.completed.len(); |
180 | 0 | self.push_completed(buffer); |
181 | 0 | offset as u32 |
182 | 0 | } |
183 | | |
184 | | /// Append a view of the given `block`, `offset` and `length` |
185 | | /// |
186 | | /// # Safety |
187 | | /// (1) The block must have been added using [`Self::append_block`] |
188 | | /// (2) The range `offset..offset+length` must be within the bounds of the block |
189 | | /// (3) The data in the block must be valid of type `T` |
190 | 0 | pub unsafe fn append_view_unchecked(&mut self, block: u32, offset: u32, len: u32) { |
191 | 0 | let b = self.completed.get_unchecked(block as usize); |
192 | 0 | let start = offset as usize; |
193 | 0 | let end = start.saturating_add(len as usize); |
194 | 0 | let b = b.get_unchecked(start..end); |
195 | | |
196 | 0 | let view = make_view(b, block, offset); |
197 | 0 | self.views_buffer.push(view); |
198 | 0 | self.null_buffer_builder.append_non_null(); |
199 | 0 | } |
200 | | |
201 | | /// Appends an array to the builder. |
202 | | /// This will flush any in-progress block and append the data buffers |
203 | | /// and add the (adapted) views. |
204 | 0 | pub fn append_array(&mut self, array: &GenericByteViewArray<T>) { |
205 | 0 | self.flush_in_progress(); |
206 | | // keep original views if this array is the first to be added or if there are no data buffers (all inline views) |
207 | 0 | let keep_views = self.completed.is_empty() || array.data_buffers().is_empty(); |
208 | 0 | let starting_buffer = self.completed.len() as u32; |
209 | | |
210 | 0 | self.completed.extend(array.data_buffers().iter().cloned()); |
211 | | |
212 | 0 | if keep_views { |
213 | 0 | self.views_buffer.extend_from_slice(array.views()); |
214 | 0 | } else { |
215 | 0 | self.views_buffer.extend(array.views().iter().map(|v| { |
216 | 0 | let mut byte_view = ByteView::from(*v); |
217 | 0 | if byte_view.length > MAX_INLINE_VIEW_LEN { |
218 | 0 | // Small views (<=12 bytes) are inlined, so only need to update large views |
219 | 0 | byte_view.buffer_index += starting_buffer; |
220 | 0 | }; |
221 | | |
222 | 0 | byte_view.as_u128() |
223 | 0 | })); |
224 | | } |
225 | | |
226 | 0 | if let Some(null_buffer) = array.nulls() { |
227 | 0 | self.null_buffer_builder.append_buffer(null_buffer); |
228 | 0 | } else { |
229 | 0 | self.null_buffer_builder.append_n_non_nulls(array.len()); |
230 | 0 | } |
231 | 0 | } |
232 | | |
233 | | /// Try to append a view of the given `block`, `offset` and `length` |
234 | | /// |
235 | | /// See [`Self::append_block`] |
236 | | pub fn try_append_view(&mut self, block: u32, offset: u32, len: u32) -> Result<(), ArrowError> { |
237 | | let b = self.completed.get(block as usize).ok_or_else(|| { |
238 | | ArrowError::InvalidArgumentError(format!("No block found with index {block}")) |
239 | | })?; |
240 | | let start = offset as usize; |
241 | | let end = start.saturating_add(len as usize); |
242 | | |
243 | | let b = b.get(start..end).ok_or_else(|| { |
244 | | ArrowError::InvalidArgumentError(format!( |
245 | | "Range {start}..{end} out of bounds for block of length {}", |
246 | | b.len() |
247 | | )) |
248 | | })?; |
249 | | |
250 | | if T::Native::from_bytes_checked(b).is_none() { |
251 | | return Err(ArrowError::InvalidArgumentError( |
252 | | "Invalid view data".to_string(), |
253 | | )); |
254 | | } |
255 | | |
256 | | unsafe { |
257 | | self.append_view_unchecked(block, offset, len); |
258 | | } |
259 | | Ok(()) |
260 | | } |
261 | | |
262 | | /// Flushes the in progress block if any |
263 | | #[inline] |
264 | 1 | fn flush_in_progress(&mut self) { |
265 | 1 | if !self.in_progress.is_empty() { |
266 | 0 | let f = Buffer::from_vec(std::mem::take(&mut self.in_progress)); |
267 | 0 | self.push_completed(f) |
268 | 1 | } |
269 | 1 | } |
270 | | |
271 | | /// Append a block to `self.completed`, checking for overflow |
272 | | #[inline] |
273 | 0 | fn push_completed(&mut self, block: Buffer) { |
274 | 0 | assert!(block.len() < u32::MAX as usize, "Block too large"); |
275 | 0 | assert!(self.completed.len() < u32::MAX as usize, "Too many blocks"); |
276 | 0 | self.completed.push(block); |
277 | 0 | } |
278 | | |
279 | | /// Returns the value at the given index |
280 | | /// Useful if we want to know what value has been inserted to the builder |
281 | | /// The index has to be smaller than `self.len()`, otherwise it will panic |
282 | 0 | pub fn get_value(&self, index: usize) -> &[u8] { |
283 | 0 | let view = self.views_buffer.as_slice().get(index).unwrap(); |
284 | 0 | let len = *view as u32; |
285 | 0 | if len <= MAX_INLINE_VIEW_LEN { |
286 | | // # Safety |
287 | | // The view is valid from the builder |
288 | 0 | unsafe { GenericByteViewArray::<T>::inline_value(view, len as usize) } |
289 | | } else { |
290 | 0 | let view = ByteView::from(*view); |
291 | 0 | if view.buffer_index < self.completed.len() as u32 { |
292 | 0 | let block = &self.completed[view.buffer_index as usize]; |
293 | 0 | &block[view.offset as usize..view.offset as usize + view.length as usize] |
294 | | } else { |
295 | 0 | &self.in_progress[view.offset as usize..view.offset as usize + view.length as usize] |
296 | | } |
297 | | } |
298 | 0 | } |
299 | | |
300 | | /// Appends a value into the builder |
301 | | /// |
302 | | /// # Panics |
303 | | /// |
304 | | /// Panics if |
305 | | /// - String buffer count exceeds `u32::MAX` |
306 | | /// - String length exceeds `u32::MAX` |
307 | | #[inline] |
308 | 2 | pub fn append_value(&mut self, value: impl AsRef<T::Native>) { |
309 | 2 | let v: &[u8] = value.as_ref().as_ref(); |
310 | 2 | let length: u32 = v.len().try_into().unwrap(); |
311 | 2 | if length <= MAX_INLINE_VIEW_LEN { |
312 | 2 | let mut view_buffer = [0; 16]; |
313 | 2 | view_buffer[0..4].copy_from_slice(&length.to_le_bytes()); |
314 | 2 | view_buffer[4..4 + v.len()].copy_from_slice(v); |
315 | 2 | self.views_buffer.push(u128::from_le_bytes(view_buffer)); |
316 | 2 | self.null_buffer_builder.append_non_null(); |
317 | 2 | return; |
318 | 0 | } |
319 | | |
320 | | // Deduplication if: |
321 | | // (1) deduplication is enabled. |
322 | | // (2) len > 12 |
323 | 0 | if let Some((mut ht, hasher)) = self.string_tracker.take() { |
324 | 0 | let hash_val = hasher.hash_one(v); |
325 | 0 | let hasher_fn = |v: &_| hasher.hash_one(v); |
326 | | |
327 | 0 | let entry = ht.entry( |
328 | 0 | hash_val, |
329 | 0 | |idx| { |
330 | 0 | let stored_value = self.get_value(*idx); |
331 | 0 | v == stored_value |
332 | 0 | }, |
333 | 0 | hasher_fn, |
334 | | ); |
335 | 0 | match entry { |
336 | 0 | Entry::Occupied(occupied) => { |
337 | | // If the string already exists, we will directly use the view |
338 | 0 | let idx = occupied.get(); |
339 | 0 | self.views_buffer.push(self.views_buffer[*idx]); |
340 | 0 | self.null_buffer_builder.append_non_null(); |
341 | 0 | self.string_tracker = Some((ht, hasher)); |
342 | 0 | return; |
343 | | } |
344 | 0 | Entry::Vacant(vacant) => { |
345 | 0 | // o.w. we insert the (string hash -> view index) |
346 | 0 | // the idx is current length of views_builder, as we are inserting a new view |
347 | 0 | vacant.insert(self.views_buffer.len()); |
348 | 0 | } |
349 | | } |
350 | 0 | self.string_tracker = Some((ht, hasher)); |
351 | 0 | } |
352 | | |
353 | 0 | let required_cap = self.in_progress.len() + v.len(); |
354 | 0 | if self.in_progress.capacity() < required_cap { |
355 | 0 | self.flush_in_progress(); |
356 | 0 | let to_reserve = v.len().max(self.block_size.next_size() as usize); |
357 | 0 | self.in_progress.reserve(to_reserve); |
358 | 0 | }; |
359 | 0 | let offset = self.in_progress.len() as u32; |
360 | 0 | self.in_progress.extend_from_slice(v); |
361 | | |
362 | 0 | let view = ByteView { |
363 | 0 | length, |
364 | 0 | prefix: u32::from_le_bytes(v[0..4].try_into().unwrap()), |
365 | 0 | buffer_index: self.completed.len() as u32, |
366 | 0 | offset, |
367 | 0 | }; |
368 | 0 | self.views_buffer.push(view.into()); |
369 | 0 | self.null_buffer_builder.append_non_null(); |
370 | 2 | } |
371 | | |
372 | | /// Append an `Option` value into the builder |
373 | | #[inline] |
374 | 0 | pub fn append_option(&mut self, value: Option<impl AsRef<T::Native>>) { |
375 | 0 | match value { |
376 | 0 | None => self.append_null(), |
377 | 0 | Some(v) => self.append_value(v), |
378 | | }; |
379 | 0 | } |
380 | | |
381 | | /// Append a null value into the builder |
382 | | #[inline] |
383 | 0 | pub fn append_null(&mut self) { |
384 | 0 | self.null_buffer_builder.append_null(); |
385 | 0 | self.views_buffer.push(0); |
386 | 0 | } |
387 | | |
388 | | /// Builds the [`GenericByteViewArray`] and reset this builder |
389 | 1 | pub fn finish(&mut self) -> GenericByteViewArray<T> { |
390 | 1 | self.flush_in_progress(); |
391 | 1 | let completed = std::mem::take(&mut self.completed); |
392 | 1 | let nulls = self.null_buffer_builder.finish(); |
393 | 1 | if let Some((ref mut ht0 , _)) = self.string_tracker.as_mut() { |
394 | 0 | ht.clear(); |
395 | 1 | } |
396 | 1 | let views = std::mem::take(&mut self.views_buffer); |
397 | | // SAFETY: valid by construction |
398 | 1 | unsafe { GenericByteViewArray::new_unchecked(views.into(), completed, nulls) } |
399 | 1 | } |
400 | | |
401 | | /// Builds the [`GenericByteViewArray`] without resetting the builder |
402 | 0 | pub fn finish_cloned(&self) -> GenericByteViewArray<T> { |
403 | 0 | let mut completed = self.completed.clone(); |
404 | 0 | if !self.in_progress.is_empty() { |
405 | 0 | completed.push(Buffer::from_slice_ref(&self.in_progress)); |
406 | 0 | } |
407 | 0 | let len = self.views_buffer.len(); |
408 | 0 | let views = Buffer::from_slice_ref(self.views_buffer.as_slice()); |
409 | 0 | let views = ScalarBuffer::new(views, 0, len); |
410 | 0 | let nulls = self.null_buffer_builder.finish_cloned(); |
411 | | // SAFETY: valid by construction |
412 | 0 | unsafe { GenericByteViewArray::new_unchecked(views, completed, nulls) } |
413 | 0 | } |
414 | | |
415 | | /// Returns the current null buffer as a slice |
416 | | pub fn validity_slice(&self) -> Option<&[u8]> { |
417 | | self.null_buffer_builder.as_slice() |
418 | | } |
419 | | |
420 | | /// Return the allocated size of this builder in bytes, useful for memory accounting. |
421 | | pub fn allocated_size(&self) -> usize { |
422 | | let views = self.views_buffer.capacity() * std::mem::size_of::<u128>(); |
423 | | let null = self.null_buffer_builder.allocated_size(); |
424 | | let buffer_size = self.completed.iter().map(|b| b.capacity()).sum::<usize>(); |
425 | | let in_progress = self.in_progress.capacity(); |
426 | | let tracker = match &self.string_tracker { |
427 | | Some((ht, _)) => ht.capacity() * std::mem::size_of::<usize>(), |
428 | | None => 0, |
429 | | }; |
430 | | buffer_size + in_progress + tracker + views + null |
431 | | } |
432 | | } |
433 | | |
434 | | impl<T: ByteViewType + ?Sized> Default for GenericByteViewBuilder<T> { |
435 | | fn default() -> Self { |
436 | | Self::new() |
437 | | } |
438 | | } |
439 | | |
440 | | impl<T: ByteViewType + ?Sized> std::fmt::Debug for GenericByteViewBuilder<T> { |
441 | | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { |
442 | | write!(f, "{}ViewBuilder", T::PREFIX)?; |
443 | | f.debug_struct("") |
444 | | .field("views_buffer", &self.views_buffer) |
445 | | .field("in_progress", &self.in_progress) |
446 | | .field("completed", &self.completed) |
447 | | .field("null_buffer_builder", &self.null_buffer_builder) |
448 | | .finish() |
449 | | } |
450 | | } |
451 | | |
452 | | impl<T: ByteViewType + ?Sized> ArrayBuilder for GenericByteViewBuilder<T> { |
453 | 0 | fn len(&self) -> usize { |
454 | 0 | self.null_buffer_builder.len() |
455 | 0 | } |
456 | | |
457 | 0 | fn finish(&mut self) -> ArrayRef { |
458 | 0 | Arc::new(self.finish()) |
459 | 0 | } |
460 | | |
461 | 0 | fn finish_cloned(&self) -> ArrayRef { |
462 | 0 | Arc::new(self.finish_cloned()) |
463 | 0 | } |
464 | | |
465 | 0 | fn as_any(&self) -> &dyn Any { |
466 | 0 | self |
467 | 0 | } |
468 | | |
469 | 0 | fn as_any_mut(&mut self) -> &mut dyn Any { |
470 | 0 | self |
471 | 0 | } |
472 | | |
473 | 0 | fn into_box_any(self: Box<Self>) -> Box<dyn Any> { |
474 | 0 | self |
475 | 0 | } |
476 | | } |
477 | | |
478 | | impl<T: ByteViewType + ?Sized, V: AsRef<T::Native>> Extend<Option<V>> |
479 | | for GenericByteViewBuilder<T> |
480 | | { |
481 | | #[inline] |
482 | 0 | fn extend<I: IntoIterator<Item = Option<V>>>(&mut self, iter: I) { |
483 | 0 | for v in iter { |
484 | 0 | self.append_option(v) |
485 | | } |
486 | 0 | } |
487 | | } |
488 | | |
/// Array builder for [`StringViewArray`][crate::StringViewArray]
///
/// This is [`GenericByteViewBuilder`] specialized to [`StringViewType`].
///
/// Values can be appended using [`GenericByteViewBuilder::append_value`], and nulls with
/// [`GenericByteViewBuilder::append_null`] as normal.
///
/// # Example
/// ```
/// # use arrow_array::builder::StringViewBuilder;
/// # use arrow_array::StringViewArray;
/// let mut builder = StringViewBuilder::new();
/// builder.append_value("hello");
/// builder.append_null();
/// builder.append_value("world");
/// let array = builder.finish();
///
/// let expected = vec![Some("hello"), None, Some("world")];
/// let actual: Vec<_> = array.iter().collect();
/// assert_eq!(expected, actual);
/// ```
pub type StringViewBuilder = GenericByteViewBuilder<StringViewType>;
509 | | |
/// Array builder for [`BinaryViewArray`][crate::BinaryViewArray]
///
/// This is [`GenericByteViewBuilder`] specialized to [`BinaryViewType`].
///
/// Values can be appended using [`GenericByteViewBuilder::append_value`], and nulls with
/// [`GenericByteViewBuilder::append_null`] as normal.
///
/// # Example
/// ```
/// # use arrow_array::builder::BinaryViewBuilder;
/// # use arrow_array::BinaryViewArray;
/// let mut builder = BinaryViewBuilder::new();
/// builder.append_value("hello");
/// builder.append_null();
/// builder.append_value("world");
/// let array = builder.finish();
///
/// let expected: Vec<Option<&[u8]>> = vec![Some(b"hello"), None, Some(b"world")];
/// let actual: Vec<_> = array.iter().collect();
/// assert_eq!(expected, actual);
/// ```
pub type BinaryViewBuilder = GenericByteViewBuilder<BinaryViewType>;
531 | | |
/// Builds an inline view from the first `LEN` bytes of `data`.
///
/// `LEN` is a compile-time constant so the compiler can generate specialized
/// code for each length. The result is the little-endian `u128` whose first
/// 4 bytes are `LEN` and whose following `LEN` bytes are the data; the
/// remaining bytes are zero.
///
/// # Panics
///
/// Panics if `LEN` is greater than 12 or `data` holds fewer than `LEN` bytes.
fn make_inlined_view<const LEN: usize>(data: &[u8]) -> u128 {
    let mut bytes = [0u8; 16];
    let (len_field, payload) = bytes.split_at_mut(4);
    len_field.copy_from_slice(&(LEN as u32).to_le_bytes());
    payload[..LEN].copy_from_slice(&data[..LEN]);
    u128::from_le_bytes(bytes)
}
540 | | |
/// Create a view based on the given data, block id and offset.
///
/// Data of at most 12 bytes is inlined into the view, in which case `block_id`
/// and `offset` are not used. Longer data produces a [`ByteView`] holding the
/// length, the first 4 bytes of the data as prefix, `block_id` and `offset`.
///
/// Note that the code below is carefully examined with x86_64 assembly code: <https://godbolt.org/z/685YPsd5G>
/// The goal is to avoid calling into `ptr::copy_nonoverlapping`, which makes function call (i.e., not inlined),
/// which slows down things.
#[inline(never)]
pub fn make_view(data: &[u8], block_id: u32, offset: u32) -> u128 {
    let len = data.len();

    // Generate specialized code for each potential small string length
    // to improve performance
    match len {
        0 => make_inlined_view::<0>(data),
        1 => make_inlined_view::<1>(data),
        2 => make_inlined_view::<2>(data),
        3 => make_inlined_view::<3>(data),
        4 => make_inlined_view::<4>(data),
        5 => make_inlined_view::<5>(data),
        6 => make_inlined_view::<6>(data),
        7 => make_inlined_view::<7>(data),
        8 => make_inlined_view::<8>(data),
        9 => make_inlined_view::<9>(data),
        10 => make_inlined_view::<10>(data),
        11 => make_inlined_view::<11>(data),
        12 => make_inlined_view::<12>(data),
        // When string is longer than 12 bytes, it can't be inlined, we create a ByteView instead.
        _ => {
            let view = ByteView {
                length: len as u32,
                // len > 12 in this arm, so the first 4 bytes always exist
                prefix: u32::from_le_bytes(data[0..4].try_into().unwrap()),
                buffer_index: block_id,
                offset,
            };
            view.as_u128()
        }
    }
}
578 | | |
#[cfg(test)]
mod tests {
    use core::str;

    use super::*;
    use crate::Array;

    /// Repeated long strings must reuse the same view and must not be copied again.
    #[test]
    fn test_string_view_deduplicate() {
        let value_1 = "long string to test string view";
        let value_2 = "not so similar string but long";

        let mut builder = StringViewBuilder::new()
            .with_deduplicate_strings()
            .with_fixed_block_size(value_1.len() as u32 * 2); // so that we will have multiple buffers

        let values = vec![
            Some(value_1),
            Some(value_2),
            Some("short"),
            Some(value_1),
            None,
            Some(value_2),
            Some(value_1),
        ];
        builder.extend(values.clone());

        let array = builder.finish_cloned();
        array.to_data().validate_full().unwrap();
        assert_eq!(array.data_buffers().len(), 1); // without duplication we would need 3 buffers.
        let actual: Vec<_> = array.iter().collect();
        assert_eq!(actual, values);

        let view0 = array.views().first().unwrap();
        let view3 = array.views().get(3).unwrap();
        let view6 = array.views().get(6).unwrap();

        assert_eq!(view0, view3);
        assert_eq!(view0, view6);

        assert_eq!(array.views().get(1), array.views().get(5));
    }

    /// The deduplicating builder must remain usable across repeated `finish` calls.
    #[test]
    fn test_string_view_deduplicate_after_finish() {
        let mut builder = StringViewBuilder::new().with_deduplicate_strings();

        let value_1 = "long string to test string view";
        let value_2 = "not so similar string but long";
        builder.append_value(value_1);
        let _array = builder.finish();
        builder.append_value(value_2);
        let _array = builder.finish();
        builder.append_value(value_1);
        let _array = builder.finish();
    }

    /// Mixes appended blocks, appended values and views into those blocks,
    /// then checks the error cases of `try_append_view`.
    #[test]
    fn test_string_view() {
        let b1 = Buffer::from(b"world\xFFbananas\xF0\x9F\x98\x81");
        let b2 = Buffer::from(b"cupcakes");
        let b3 = Buffer::from(b"Many strings are here contained of great length and verbosity");

        let mut v = StringViewBuilder::new();
        assert_eq!(v.append_block(b1), 0);

        v.append_value("This is a very long string that exceeds the inline length");
        v.append_value("This is another very long string that exceeds the inline length");

        // Appending a block flushes the in-progress block holding the two
        // values above, which therefore becomes block 1.
        assert_eq!(v.append_block(b2), 2);
        assert_eq!(v.append_block(b3), 3);

        // Test short strings
        v.try_append_view(0, 0, 5).unwrap(); // world
        v.try_append_view(0, 6, 7).unwrap(); // bananas
        v.try_append_view(2, 3, 5).unwrap(); // cakes
        v.try_append_view(2, 0, 3).unwrap(); // cup
        v.try_append_view(2, 0, 8).unwrap(); // cupcakes
        v.try_append_view(0, 13, 4).unwrap(); // 😁
        v.try_append_view(0, 13, 0).unwrap(); // (empty string)

        // Test longer strings
        v.try_append_view(3, 0, 16).unwrap(); // Many strings are
        v.try_append_view(1, 0, 19).unwrap(); // This is a very long
        v.try_append_view(3, 13, 27).unwrap(); // are here contained of great

        v.append_value("I do so like long strings");

        let array = v.finish_cloned();
        array.to_data().validate_full().unwrap();
        assert_eq!(array.data_buffers().len(), 5);
        let actual: Vec<_> = array.iter().flatten().collect();
        assert_eq!(
            actual,
            &[
                "This is a very long string that exceeds the inline length",
                "This is another very long string that exceeds the inline length",
                "world",
                "bananas",
                "cakes",
                "cup",
                "cupcakes",
                "😁",
                "",
                "Many strings are",
                "This is a very long",
                "are here contained of great",
                "I do so like long strings"
            ]
        );

        // Offset past the end of the block
        let err = v.try_append_view(0, u32::MAX, 1).unwrap_err();
        assert_eq!(err.to_string(), "Invalid argument error: Range 4294967295..4294967296 out of bounds for block of length 17");

        // Length past the end of the block
        let err = v.try_append_view(0, 1, u32::MAX).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Range 1..4294967296 out of bounds for block of length 17"
        );

        // Range splits the 4 byte emoji, so it is not valid UTF-8
        let err = v.try_append_view(0, 13, 2).unwrap_err();
        assert_eq!(err.to_string(), "Invalid argument error: Invalid view data");

        // Empty range that starts past the end of the block
        let err = v.try_append_view(0, 40, 0).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Range 40..40 out of bounds for block of length 17"
        );

        // Block index that does not exist
        let err = v.try_append_view(5, 0, 0).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: No block found with index 5"
        );
    }

    /// Compares the default exponential block growth against a fixed block size.
    #[test]
    fn test_string_view_with_block_size_growth() {
        let mut exp_builder = StringViewBuilder::new();
        let mut fixed_builder = StringViewBuilder::new().with_fixed_block_size(STARTING_BLOCK_SIZE);

        let long_string = str::from_utf8(&[b'a'; STARTING_BLOCK_SIZE as usize]).unwrap();

        for i in 0..9 {
            // 8k, 16k, 32k, 64k, 128k, 256k, 512k, 1M, 2M
            for _ in 0..(2_u32.pow(i)) {
                exp_builder.append_value(long_string);
                fixed_builder.append_value(long_string);
            }
            exp_builder.flush_in_progress();
            fixed_builder.flush_in_progress();

            // Every step only add one buffer, but the buffer size is much larger
            assert_eq!(exp_builder.completed.len(), i as usize + 1);
            assert_eq!(
                exp_builder.completed[i as usize].len(),
                STARTING_BLOCK_SIZE as usize * 2_usize.pow(i)
            );

            // This step we added 2^i blocks, the sum of blocks should be 2^(i+1) - 1
            assert_eq!(fixed_builder.completed.len(), 2_usize.pow(i + 1) - 1);

            // Every buffer is fixed size
            assert!(fixed_builder
                .completed
                .iter()
                .all(|b| b.len() == STARTING_BLOCK_SIZE as usize));
        }

        // Add one more value, and the buffer stop growing.
        exp_builder.append_value(long_string);
        exp_builder.flush_in_progress();
        assert_eq!(
            exp_builder.completed.last().unwrap().capacity(),
            MAX_BLOCK_SIZE as usize
        );
    }
}