/Users/andrewlamb/Software/arrow-rs/arrow-array/src/builder/generic_bytes_view_builder.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use std::any::Any; |
19 | | use std::marker::PhantomData; |
20 | | use std::sync::Arc; |
21 | | |
22 | | use arrow_buffer::{Buffer, NullBufferBuilder, ScalarBuffer}; |
23 | | use arrow_data::{ByteView, MAX_INLINE_VIEW_LEN}; |
24 | | use arrow_schema::ArrowError; |
25 | | use hashbrown::HashTable; |
26 | | use hashbrown::hash_table::Entry; |
27 | | |
28 | | use crate::builder::{ArrayBuilder, BinaryLikeArrayBuilder, StringLikeArrayBuilder}; |
29 | | use crate::types::bytes::ByteArrayNativeType; |
30 | | use crate::types::{BinaryViewType, ByteViewType, StringViewType}; |
31 | | use crate::{Array, ArrayRef, GenericByteViewArray}; |
32 | | |
/// Seed size of the exponential growth strategy (8KiB)
const STARTING_BLOCK_SIZE: u32 = 8 * 1024;
/// Largest size the exponential growth strategy will ever request (2MiB)
const MAX_BLOCK_SIZE: u32 = 2 * 1024 * 1024;

/// Policy deciding how many bytes to request for each newly allocated data block
enum BlockSizeGrowthStrategy {
    /// Every block is requested with the same `size`
    Fixed { size: u32 },
    /// Each request doubles `current_size` until it reaches [`MAX_BLOCK_SIZE`]
    Exponential { current_size: u32 },
}

impl BlockSizeGrowthStrategy {
    /// Returns the size to request for the next block, advancing the strategy's state.
    ///
    /// Note that the exponential strategy doubles *before* returning, so the
    /// first value handed out is twice the seed stored in `current_size`.
    fn next_size(&mut self) -> u32 {
        match self {
            Self::Fixed { size } => *size,
            Self::Exponential { current_size } if *current_size >= MAX_BLOCK_SIZE => {
                MAX_BLOCK_SIZE
            }
            Self::Exponential { current_size } => {
                // the seed and the cap are fixed, so doubling can't overflow
                let doubled = current_size.saturating_mul(2);
                *current_size = doubled;
                doubled
            }
        }
    }
}
57 | | |
/// A builder for [`GenericByteViewArray`]
///
/// A [`GenericByteViewArray`] consists of a list of data blocks containing string data,
/// and a list of views into those buffers.
///
/// See examples on [`StringViewBuilder`] and [`BinaryViewBuilder`]
///
/// This builder can be used in two ways
///
/// # Append Values
///
/// To avoid bump allocating, this builder allocates data in blocks whose size can be fixed
/// using [`GenericByteViewBuilder::with_fixed_block_size`]. [`GenericByteViewBuilder::append_value`]
/// writes values larger than [`MAX_INLINE_VIEW_LEN`] bytes to the current in-progress block, with
/// values of at most [`MAX_INLINE_VIEW_LEN`] bytes inlined into the views. If a value is appended
/// that will not fit in the in-progress block, that block is closed and a new block of
/// sufficient size is allocated.
///
/// # Append Views
///
/// Some use-cases may wish to reuse an existing allocation containing string data, for example,
/// when parsing data from a parquet data page. In such a case entire blocks can be appended
/// using [`GenericByteViewBuilder::append_block`] and then views into this block appended
/// using [`GenericByteViewBuilder::try_append_view`]
pub struct GenericByteViewBuilder<T: ByteViewType + ?Sized> {
    /// One 128-bit view per appended slot; null slots hold an all-zero view
    views_buffer: Vec<u128>,
    /// Validity of each appended slot
    null_buffer_builder: NullBufferBuilder,
    /// Data blocks that are finished and will no longer be written to
    completed: Vec<Buffer>,
    /// Data block currently being filled by appended values
    in_progress: Vec<u8>,
    /// Decides how large the next in-progress block should be
    block_size: BlockSizeGrowthStrategy,
    /// Some if deduplicating strings
    /// map `<string hash> -> <index to the views>`
    string_tracker: Option<(HashTable<usize>, ahash::RandomState)>,
    /// Ties the builder to the array type `T` without storing a value of it
    phantom: PhantomData<T>,
}
92 | | |
93 | | impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> { |
94 | | /// Creates a new [`GenericByteViewBuilder`]. |
95 | 92 | pub fn new() -> Self { |
96 | 92 | Self::with_capacity(1024) |
97 | 92 | } |
98 | | |
99 | | /// Creates a new [`GenericByteViewBuilder`] with space for `capacity` string values. |
100 | 166 | pub fn with_capacity(capacity: usize) -> Self { |
101 | 166 | Self { |
102 | 166 | views_buffer: Vec::with_capacity(capacity), |
103 | 166 | null_buffer_builder: NullBufferBuilder::new(capacity), |
104 | 166 | completed: vec![], |
105 | 166 | in_progress: vec![], |
106 | 166 | block_size: BlockSizeGrowthStrategy::Exponential { |
107 | 166 | current_size: STARTING_BLOCK_SIZE, |
108 | 166 | }, |
109 | 166 | string_tracker: None, |
110 | 166 | phantom: Default::default(), |
111 | 166 | } |
112 | 166 | } |
113 | | |
114 | | /// Set a fixed buffer size for variable length strings |
115 | | /// |
116 | | /// The block size is the size of the buffer used to store values greater |
117 | | /// than [`MAX_INLINE_VIEW_LEN`] bytes. The builder allocates new buffers when the current |
118 | | /// buffer is full. |
119 | | /// |
120 | | /// By default the builder balances buffer size and buffer count by |
121 | | /// growing buffer size exponentially from 8KB up to 2MB. The |
122 | | /// first buffer allocated is 8KB, then 16KB, then 32KB, etc up to 2MB. |
123 | | /// |
124 | | /// If this method is used, any new buffers allocated are |
125 | | /// exactly this size. This can be useful for advanced users |
126 | | /// that want to control the memory usage and buffer count. |
127 | | /// |
128 | | /// See <https://github.com/apache/arrow-rs/issues/6094> for more details on the implications. |
129 | 12 | pub fn with_fixed_block_size(self, block_size: u32) -> Self { |
130 | 12 | debug_assert!(block_size > 0, "Block size must be greater than 0"0 ); |
131 | 12 | Self { |
132 | 12 | block_size: BlockSizeGrowthStrategy::Fixed { size: block_size }, |
133 | 12 | ..self |
134 | 12 | } |
135 | 12 | } |
136 | | |
137 | | /// Deduplicate strings while building the array |
138 | | /// |
139 | | /// This will potentially decrease the memory usage if the array have repeated strings |
140 | | /// It will also increase the time to build the array as it needs to hash the strings |
141 | | pub fn with_deduplicate_strings(self) -> Self { |
142 | | Self { |
143 | | string_tracker: Some(( |
144 | | HashTable::with_capacity(self.views_buffer.capacity()), |
145 | | Default::default(), |
146 | | )), |
147 | | ..self |
148 | | } |
149 | | } |
150 | | |
151 | | /// Append a new data block returning the new block offset |
152 | | /// |
153 | | /// Note: this will first flush any in-progress block |
154 | | /// |
155 | | /// This allows appending views from blocks added using [`Self::append_block`]. See |
156 | | /// [`Self::append_value`] for appending individual values |
157 | | /// |
158 | | /// ``` |
159 | | /// # use arrow_array::builder::StringViewBuilder; |
160 | | /// let mut builder = StringViewBuilder::new(); |
161 | | /// |
162 | | /// let block = builder.append_block(b"helloworldbingobongo".into()); |
163 | | /// |
164 | | /// builder.try_append_view(block, 0, 5).unwrap(); |
165 | | /// builder.try_append_view(block, 5, 5).unwrap(); |
166 | | /// builder.try_append_view(block, 10, 5).unwrap(); |
167 | | /// builder.try_append_view(block, 15, 5).unwrap(); |
168 | | /// builder.try_append_view(block, 0, 15).unwrap(); |
169 | | /// let array = builder.finish(); |
170 | | /// |
171 | | /// let actual: Vec<_> = array.iter().flatten().collect(); |
172 | | /// let expected = &["hello", "world", "bingo", "bongo", "helloworldbingo"]; |
173 | | /// assert_eq!(actual, expected); |
174 | | /// ``` |
175 | 4 | pub fn append_block(&mut self, buffer: Buffer) -> u32 { |
176 | 4 | assert!(buffer.len() < u32::MAX as usize); |
177 | | |
178 | 4 | self.flush_in_progress(); |
179 | 4 | let offset = self.completed.len(); |
180 | 4 | self.push_completed(buffer); |
181 | 4 | offset as u32 |
182 | 4 | } |
183 | | |
184 | | /// Append a view of the given `block`, `offset` and `length` |
185 | | /// |
186 | | /// # Safety |
187 | | /// (1) The block must have been added using [`Self::append_block`] |
188 | | /// (2) The range `offset..offset+length` must be within the bounds of the block |
189 | | /// (3) The data in the block must be valid of type `T` |
190 | 16 | pub unsafe fn append_view_unchecked(&mut self, block: u32, offset: u32, len: u32) { |
191 | 16 | let b = unsafe { self.completed.get_unchecked(block as usize) }; |
192 | 16 | let start = offset as usize; |
193 | 16 | let end = start.saturating_add(len as usize); |
194 | 16 | let b = unsafe { b.get_unchecked(start..end) }; |
195 | | |
196 | 16 | let view = make_view(b, block, offset); |
197 | 16 | self.views_buffer.push(view); |
198 | 16 | self.null_buffer_builder.append_non_null(); |
199 | 16 | } |
200 | | |
201 | | /// Appends an array to the builder. |
202 | | /// This will flush any in-progress block and append the data buffers |
203 | | /// and add the (adapted) views. |
204 | 93 | pub fn append_array(&mut self, array: &GenericByteViewArray<T>) { |
205 | 93 | self.flush_in_progress(); |
206 | | // keep original views if this array is the first to be added or if there are no data buffers (all inline views) |
207 | 93 | let keep_views = self.completed.is_empty() || array.data_buffers()81 .is_empty81 (); |
208 | 93 | let starting_buffer = self.completed.len() as u32; |
209 | | |
210 | 93 | self.completed.extend(array.data_buffers().iter().cloned()); |
211 | | |
212 | 93 | if keep_views { |
213 | 14 | self.views_buffer.extend_from_slice(array.views()); |
214 | 14 | } else { |
215 | 22.3k | self.views_buffer79 .extend79 (array.views().iter()79 .map79 (|v| { |
216 | 22.3k | let mut byte_view = ByteView::from(*v); |
217 | 22.3k | if byte_view.length > MAX_INLINE_VIEW_LEN { |
218 | 6.37k | // Small views (<=12 bytes) are inlined, so only need to update large views |
219 | 6.37k | byte_view.buffer_index += starting_buffer; |
220 | 16.0k | }; |
221 | | |
222 | 22.3k | byte_view.as_u128() |
223 | 22.3k | })); |
224 | | } |
225 | | |
226 | 93 | if let Some(null_buffer41 ) = array.nulls() { |
227 | 41 | self.null_buffer_builder.append_buffer(null_buffer); |
228 | 52 | } else { |
229 | 52 | self.null_buffer_builder.append_n_non_nulls(array.len()); |
230 | 52 | } |
231 | 93 | } |
232 | | |
233 | | /// Try to append a view of the given `block`, `offset` and `length` |
234 | | /// |
235 | | /// See [`Self::append_block`] |
236 | | pub fn try_append_view(&mut self, block: u32, offset: u32, len: u32) -> Result<(), ArrowError> { |
237 | | let b = self.completed.get(block as usize).ok_or_else(|| { |
238 | | ArrowError::InvalidArgumentError(format!("No block found with index {block}")) |
239 | | })?; |
240 | | let start = offset as usize; |
241 | | let end = start.saturating_add(len as usize); |
242 | | |
243 | | let b = b.get(start..end).ok_or_else(|| { |
244 | | ArrowError::InvalidArgumentError(format!( |
245 | | "Range {start}..{end} out of bounds for block of length {}", |
246 | | b.len() |
247 | | )) |
248 | | })?; |
249 | | |
250 | | if T::Native::from_bytes_checked(b).is_none() { |
251 | | return Err(ArrowError::InvalidArgumentError( |
252 | | "Invalid view data".to_string(), |
253 | | )); |
254 | | } |
255 | | |
256 | | unsafe { |
257 | | self.append_view_unchecked(block, offset, len); |
258 | | } |
259 | | Ok(()) |
260 | | } |
261 | | |
262 | | /// Flushes the in progress block if any |
263 | | #[inline] |
264 | 458 | fn flush_in_progress(&mut self) { |
265 | 458 | if !self.in_progress.is_empty() { |
266 | 195 | let f = Buffer::from_vec(std::mem::take(&mut self.in_progress)); |
267 | 195 | self.push_completed(f) |
268 | 263 | } |
269 | 458 | } |
270 | | |
271 | | /// Append a block to `self.completed`, checking for overflow |
272 | | #[inline] |
273 | 199 | fn push_completed(&mut self, block: Buffer) { |
274 | 199 | assert!(block.len() < u32::MAX as usize, "Block too large"0 ); |
275 | 199 | assert!(self.completed.len() < u32::MAX as usize, "Too many blocks"0 ); |
276 | 199 | self.completed.push(block); |
277 | 199 | } |
278 | | |
279 | | /// Returns the value at the given index |
280 | | /// Useful if we want to know what value has been inserted to the builder |
281 | | /// The index has to be smaller than `self.len()`, otherwise it will panic |
282 | 0 | pub fn get_value(&self, index: usize) -> &[u8] { |
283 | 0 | let view = self.views_buffer.as_slice().get(index).unwrap(); |
284 | 0 | let len = *view as u32; |
285 | 0 | if len <= MAX_INLINE_VIEW_LEN { |
286 | | // # Safety |
287 | | // The view is valid from the builder |
288 | 0 | unsafe { GenericByteViewArray::<T>::inline_value(view, len as usize) } |
289 | | } else { |
290 | 0 | let view = ByteView::from(*view); |
291 | 0 | if view.buffer_index < self.completed.len() as u32 { |
292 | 0 | let block = &self.completed[view.buffer_index as usize]; |
293 | 0 | &block[view.offset as usize..view.offset as usize + view.length as usize] |
294 | | } else { |
295 | 0 | &self.in_progress[view.offset as usize..view.offset as usize + view.length as usize] |
296 | | } |
297 | | } |
298 | 0 | } |
299 | | |
300 | | /// Appends a value into the builder |
301 | | /// |
302 | | /// # Panics |
303 | | /// |
304 | | /// Panics if |
305 | | /// - String buffer count exceeds `u32::MAX` |
306 | | /// - String length exceeds `u32::MAX` |
307 | | #[inline] |
308 | 255k | pub fn append_value(&mut self, value: impl AsRef<T::Native>) { |
309 | 255k | self.try_append_value(value).unwrap() |
310 | 255k | } |
311 | | |
312 | | /// Appends a value into the builder |
313 | | /// |
314 | | /// # Errors |
315 | | /// |
316 | | /// Returns an error if: |
317 | | /// - String buffer count exceeds `u32::MAX` |
318 | | /// - String length exceeds `u32::MAX` |
319 | | #[inline] |
320 | 255k | pub fn try_append_value(&mut self, value: impl AsRef<T::Native>) -> Result<(), ArrowError> { |
321 | 255k | let v: &[u8] = value.as_ref().as_ref(); |
322 | 255k | let length: u32 = v.len().try_into().map_err(|_| {0 |
323 | 0 | ArrowError::InvalidArgumentError(format!("String length {} exceeds u32::MAX", v.len())) |
324 | 0 | })?; |
325 | | |
326 | 255k | if length <= MAX_INLINE_VIEW_LEN { |
327 | 205k | let mut view_buffer = [0; 16]; |
328 | 205k | view_buffer[0..4].copy_from_slice(&length.to_le_bytes()); |
329 | 205k | view_buffer[4..4 + v.len()].copy_from_slice(v); |
330 | 205k | self.views_buffer.push(u128::from_le_bytes(view_buffer)); |
331 | 205k | self.null_buffer_builder.append_non_null(); |
332 | 205k | return Ok(()); |
333 | 50.0k | } |
334 | | |
335 | | // Deduplication if: |
336 | | // (1) deduplication is enabled. |
337 | | // (2) len > 12 |
338 | 50.0k | if let Some((mut ht0 , hasher0 )) = self.string_tracker.take() { |
339 | 0 | let hash_val = hasher.hash_one(v); |
340 | 0 | let hasher_fn = |v: &_| hasher.hash_one(v); |
341 | | |
342 | 0 | let entry = ht.entry( |
343 | 0 | hash_val, |
344 | 0 | |idx| { |
345 | 0 | let stored_value = self.get_value(*idx); |
346 | 0 | v == stored_value |
347 | 0 | }, |
348 | 0 | hasher_fn, |
349 | | ); |
350 | 0 | match entry { |
351 | 0 | Entry::Occupied(occupied) => { |
352 | | // If the string already exists, we will directly use the view |
353 | 0 | let idx = occupied.get(); |
354 | 0 | self.views_buffer.push(self.views_buffer[*idx]); |
355 | 0 | self.null_buffer_builder.append_non_null(); |
356 | 0 | self.string_tracker = Some((ht, hasher)); |
357 | 0 | return Ok(()); |
358 | | } |
359 | 0 | Entry::Vacant(vacant) => { |
360 | 0 | // o.w. we insert the (string hash -> view index) |
361 | 0 | // the idx is current length of views_builder, as we are inserting a new view |
362 | 0 | vacant.insert(self.views_buffer.len()); |
363 | 0 | } |
364 | | } |
365 | 0 | self.string_tracker = Some((ht, hasher)); |
366 | 50.0k | } |
367 | | |
368 | 50.0k | let required_cap = self.in_progress.len() + v.len(); |
369 | 50.0k | if self.in_progress.capacity() < required_cap { |
370 | 195 | self.flush_in_progress(); |
371 | 195 | let to_reserve = v.len().max(self.block_size.next_size() as usize); |
372 | 195 | self.in_progress.reserve(to_reserve); |
373 | 49.8k | }; |
374 | | |
375 | 50.0k | let offset = self.in_progress.len() as u32; |
376 | 50.0k | self.in_progress.extend_from_slice(v); |
377 | | |
378 | 50.0k | let buffer_index: u32 = self.completed.len().try_into().map_err(|_| {0 |
379 | 0 | ArrowError::InvalidArgumentError(format!( |
380 | 0 | "Buffer count {} exceeds u32::MAX", |
381 | 0 | self.completed.len() |
382 | 0 | )) |
383 | 0 | })?; |
384 | | |
385 | 50.0k | let view = ByteView { |
386 | 50.0k | length, |
387 | 50.0k | // This won't panic as we checked the length of prefix earlier. |
388 | 50.0k | prefix: u32::from_le_bytes(v[0..4].try_into().unwrap()), |
389 | 50.0k | buffer_index, |
390 | 50.0k | offset, |
391 | 50.0k | }; |
392 | 50.0k | self.views_buffer.push(view.into()); |
393 | 50.0k | self.null_buffer_builder.append_non_null(); |
394 | | |
395 | 50.0k | Ok(()) |
396 | 255k | } |
397 | | |
398 | | /// Append an `Option` value into the builder |
399 | | #[inline] |
400 | 312k | pub fn append_option(&mut self, value: Option<impl AsRef<T::Native>>) { |
401 | 312k | match value { |
402 | 57.3k | None => self.append_null(), |
403 | 255k | Some(v) => self.append_value(v), |
404 | | }; |
405 | 312k | } |
406 | | |
407 | | /// Append a null value into the builder |
408 | | #[inline] |
409 | 57.3k | pub fn append_null(&mut self) { |
410 | 57.3k | self.null_buffer_builder.append_null(); |
411 | 57.3k | self.views_buffer.push(0); |
412 | 57.3k | } |
413 | | |
414 | | /// Builds the [`GenericByteViewArray`] and reset this builder |
415 | 166 | pub fn finish(&mut self) -> GenericByteViewArray<T> { |
416 | 166 | self.flush_in_progress(); |
417 | 166 | let completed = std::mem::take(&mut self.completed); |
418 | 166 | let nulls = self.null_buffer_builder.finish(); |
419 | 166 | if let Some((ht0 , _)) = self.string_tracker.as_mut() { |
420 | 0 | ht.clear(); |
421 | 166 | } |
422 | 166 | let views = std::mem::take(&mut self.views_buffer); |
423 | | // SAFETY: valid by construction |
424 | 166 | unsafe { GenericByteViewArray::new_unchecked(views.into(), completed, nulls) } |
425 | 166 | } |
426 | | |
427 | | /// Builds the [`GenericByteViewArray`] without resetting the builder |
428 | 0 | pub fn finish_cloned(&self) -> GenericByteViewArray<T> { |
429 | 0 | let mut completed = self.completed.clone(); |
430 | 0 | if !self.in_progress.is_empty() { |
431 | 0 | completed.push(Buffer::from_slice_ref(&self.in_progress)); |
432 | 0 | } |
433 | 0 | let len = self.views_buffer.len(); |
434 | 0 | let views = Buffer::from_slice_ref(self.views_buffer.as_slice()); |
435 | 0 | let views = ScalarBuffer::new(views, 0, len); |
436 | 0 | let nulls = self.null_buffer_builder.finish_cloned(); |
437 | | // SAFETY: valid by construction |
438 | 0 | unsafe { GenericByteViewArray::new_unchecked(views, completed, nulls) } |
439 | 0 | } |
440 | | |
441 | | /// Returns the current null buffer as a slice |
442 | | pub fn validity_slice(&self) -> Option<&[u8]> { |
443 | | self.null_buffer_builder.as_slice() |
444 | | } |
445 | | |
446 | | /// Return the allocated size of this builder in bytes, useful for memory accounting. |
447 | | pub fn allocated_size(&self) -> usize { |
448 | | let views = self.views_buffer.capacity() * std::mem::size_of::<u128>(); |
449 | | let null = self.null_buffer_builder.allocated_size(); |
450 | | let buffer_size = self.completed.iter().map(|b| b.capacity()).sum::<usize>(); |
451 | | let in_progress = self.in_progress.capacity(); |
452 | | let tracker = match &self.string_tracker { |
453 | | Some((ht, _)) => ht.capacity() * std::mem::size_of::<usize>(), |
454 | | None => 0, |
455 | | }; |
456 | | buffer_size + in_progress + tracker + views + null |
457 | | } |
458 | | } |
459 | | |
460 | | impl<T: ByteViewType + ?Sized> Default for GenericByteViewBuilder<T> { |
461 | | fn default() -> Self { |
462 | | Self::new() |
463 | | } |
464 | | } |
465 | | |
466 | | impl<T: ByteViewType + ?Sized> std::fmt::Debug for GenericByteViewBuilder<T> { |
467 | | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { |
468 | | write!(f, "{}ViewBuilder", T::PREFIX)?; |
469 | | f.debug_struct("") |
470 | | .field("views_buffer", &self.views_buffer) |
471 | | .field("in_progress", &self.in_progress) |
472 | | .field("completed", &self.completed) |
473 | | .field("null_buffer_builder", &self.null_buffer_builder) |
474 | | .finish() |
475 | | } |
476 | | } |
477 | | |
478 | | impl<T: ByteViewType + ?Sized> ArrayBuilder for GenericByteViewBuilder<T> { |
479 | 4 | fn len(&self) -> usize { |
480 | 4 | self.null_buffer_builder.len() |
481 | 4 | } |
482 | | |
483 | 0 | fn finish(&mut self) -> ArrayRef { |
484 | 0 | Arc::new(self.finish()) |
485 | 0 | } |
486 | | |
487 | 0 | fn finish_cloned(&self) -> ArrayRef { |
488 | 0 | Arc::new(self.finish_cloned()) |
489 | 0 | } |
490 | | |
491 | 0 | fn as_any(&self) -> &dyn Any { |
492 | 0 | self |
493 | 0 | } |
494 | | |
495 | 0 | fn as_any_mut(&mut self) -> &mut dyn Any { |
496 | 0 | self |
497 | 0 | } |
498 | | |
499 | 0 | fn into_box_any(self: Box<Self>) -> Box<dyn Any> { |
500 | 0 | self |
501 | 0 | } |
502 | | } |
503 | | |
504 | | impl<T: ByteViewType + ?Sized, V: AsRef<T::Native>> Extend<Option<V>> |
505 | | for GenericByteViewBuilder<T> |
506 | | { |
507 | | #[inline] |
508 | 45 | fn extend<I: IntoIterator<Item = Option<V>>>(&mut self, iter: I) { |
509 | 249k | for v249k in iter { |
510 | 249k | self.append_option(v) |
511 | | } |
512 | 45 | } |
513 | | } |
514 | | |
515 | | /// Array builder for [`StringViewArray`][crate::StringViewArray] |
516 | | /// |
517 | | /// Values can be appended using [`GenericByteViewBuilder::append_value`], and nulls with |
518 | | /// [`GenericByteViewBuilder::append_null`] as normal. |
519 | | /// |
520 | | /// # Example |
521 | | /// ``` |
522 | | /// # use arrow_array::builder::StringViewBuilder; |
523 | | /// # use arrow_array::StringViewArray; |
524 | | /// let mut builder = StringViewBuilder::new(); |
525 | | /// builder.append_value("hello"); |
526 | | /// builder.append_null(); |
527 | | /// builder.append_value("world"); |
528 | | /// let array = builder.finish(); |
529 | | /// |
530 | | /// let expected = vec![Some("hello"), None, Some("world")]; |
531 | | /// let actual: Vec<_> = array.iter().collect(); |
532 | | /// assert_eq!(expected, actual); |
533 | | /// ``` |
534 | | pub type StringViewBuilder = GenericByteViewBuilder<StringViewType>; |
535 | | |
536 | | impl StringLikeArrayBuilder for StringViewBuilder { |
537 | 0 | fn type_name() -> &'static str { |
538 | 0 | std::any::type_name::<StringViewBuilder>() |
539 | 0 | } |
540 | 0 | fn with_capacity(capacity: usize) -> Self { |
541 | 0 | Self::with_capacity(capacity) |
542 | 0 | } |
543 | 0 | fn append_value(&mut self, value: &str) { |
544 | 0 | Self::append_value(self, value); |
545 | 0 | } |
546 | 0 | fn append_null(&mut self) { |
547 | 0 | Self::append_null(self); |
548 | 0 | } |
549 | | } |
550 | | |
551 | | /// Array builder for [`BinaryViewArray`][crate::BinaryViewArray] |
552 | | /// |
553 | | /// Values can be appended using [`GenericByteViewBuilder::append_value`], and nulls with |
554 | | /// [`GenericByteViewBuilder::append_null`] as normal. |
555 | | /// |
556 | | /// # Example |
557 | | /// ``` |
558 | | /// # use arrow_array::builder::BinaryViewBuilder; |
559 | | /// use arrow_array::BinaryViewArray; |
560 | | /// let mut builder = BinaryViewBuilder::new(); |
561 | | /// builder.append_value("hello"); |
562 | | /// builder.append_null(); |
563 | | /// builder.append_value("world"); |
564 | | /// let array = builder.finish(); |
565 | | /// |
566 | | /// let expected: Vec<Option<&[u8]>> = vec![Some(b"hello"), None, Some(b"world")]; |
567 | | /// let actual: Vec<_> = array.iter().collect(); |
568 | | /// assert_eq!(expected, actual); |
569 | | /// ``` |
570 | | /// |
571 | | pub type BinaryViewBuilder = GenericByteViewBuilder<BinaryViewType>; |
572 | | |
573 | | impl BinaryLikeArrayBuilder for BinaryViewBuilder { |
574 | 0 | fn type_name() -> &'static str { |
575 | 0 | std::any::type_name::<BinaryViewBuilder>() |
576 | 0 | } |
577 | 0 | fn with_capacity(capacity: usize) -> Self { |
578 | 0 | Self::with_capacity(capacity) |
579 | 0 | } |
580 | 0 | fn append_value(&mut self, value: &[u8]) { |
581 | 0 | Self::append_value(self, value); |
582 | 0 | } |
583 | 0 | fn append_null(&mut self) { |
584 | 0 | Self::append_null(self); |
585 | 0 | } |
586 | | } |
587 | | |
/// Builds an inlined view from the first `LEN` bytes of `data`.
///
/// `LEN` is a compile time constant so that the compiler can generate
/// specialized code for every length. The view consists of the length as a
/// little endian `u32`, followed by the data and zero padding up to 16 bytes.
fn make_inlined_view<const LEN: usize>(data: &[u8]) -> u128 {
    let mut bytes = [0u8; 16];
    let (len_part, data_part) = bytes.split_at_mut(4);
    len_part.copy_from_slice(&(LEN as u32).to_le_bytes());
    data_part[..LEN].copy_from_slice(&data[..LEN]);
    u128::from_le_bytes(bytes)
}
596 | | |
/// Create a view based on the given data, block id and offset.
///
/// Data of at most 12 bytes is inlined into the view, in which case `block_id`
/// and `offset` are not used. Longer data is represented by a [`ByteView`]
/// holding the length, the first 4 bytes of the data, `block_id` and `offset`.
///
/// Note that the code below is carefully examined with x86_64 assembly code: <https://godbolt.org/z/685YPsd5G>
/// The goal is to avoid calling into `ptr::copy_nonoverlapping`, which makes function call (i.e., not inlined),
/// which slows down things.
#[inline(never)]
pub fn make_view(data: &[u8], block_id: u32, offset: u32) -> u128 {
    let len = data.len();

    // Generate specialized code for each potential small string length
    // to improve performance
    match len {
        0 => make_inlined_view::<0>(data),
        1 => make_inlined_view::<1>(data),
        2 => make_inlined_view::<2>(data),
        3 => make_inlined_view::<3>(data),
        4 => make_inlined_view::<4>(data),
        5 => make_inlined_view::<5>(data),
        6 => make_inlined_view::<6>(data),
        7 => make_inlined_view::<7>(data),
        8 => make_inlined_view::<8>(data),
        9 => make_inlined_view::<9>(data),
        10 => make_inlined_view::<10>(data),
        11 => make_inlined_view::<11>(data),
        12 => make_inlined_view::<12>(data),
        // When string is longer than 12 bytes, it can't be inlined, we create a ByteView instead.
        _ => {
            let view = ByteView {
                length: len as u32,
                // `len` is greater than 12 here, so the 4 byte prefix always exists
                prefix: u32::from_le_bytes(data[0..4].try_into().unwrap()),
                buffer_index: block_id,
                offset,
            };
            view.as_u128()
        }
    }
}
634 | | |
635 | | #[cfg(test)] |
636 | | mod tests { |
637 | | use core::str; |
638 | | |
639 | | use super::*; |
640 | | |
641 | | #[test] |
642 | | fn test_string_view_deduplicate() { |
643 | | let value_1 = "long string to test string view"; |
644 | | let value_2 = "not so similar string but long"; |
645 | | |
646 | | let mut builder = StringViewBuilder::new() |
647 | | .with_deduplicate_strings() |
648 | | .with_fixed_block_size(value_1.len() as u32 * 2); // so that we will have multiple buffers |
649 | | |
650 | | let values = vec![ |
651 | | Some(value_1), |
652 | | Some(value_2), |
653 | | Some("short"), |
654 | | Some(value_1), |
655 | | None, |
656 | | Some(value_2), |
657 | | Some(value_1), |
658 | | ]; |
659 | | builder.extend(values.clone()); |
660 | | |
661 | | let array = builder.finish_cloned(); |
662 | | array.to_data().validate_full().unwrap(); |
663 | | assert_eq!(array.data_buffers().len(), 1); // without duplication we would need 3 buffers. |
664 | | let actual: Vec<_> = array.iter().collect(); |
665 | | assert_eq!(actual, values); |
666 | | |
667 | | let view0 = array.views().first().unwrap(); |
668 | | let view3 = array.views().get(3).unwrap(); |
669 | | let view6 = array.views().get(6).unwrap(); |
670 | | |
671 | | assert_eq!(view0, view3); |
672 | | assert_eq!(view0, view6); |
673 | | |
674 | | assert_eq!(array.views().get(1), array.views().get(5)); |
675 | | } |
676 | | |
677 | | #[test] |
678 | | fn test_string_view_deduplicate_after_finish() { |
679 | | let mut builder = StringViewBuilder::new().with_deduplicate_strings(); |
680 | | |
681 | | let value_1 = "long string to test string view"; |
682 | | let value_2 = "not so similar string but long"; |
683 | | builder.append_value(value_1); |
684 | | let _array = builder.finish(); |
685 | | builder.append_value(value_2); |
686 | | let _array = builder.finish(); |
687 | | builder.append_value(value_1); |
688 | | let _array = builder.finish(); |
689 | | } |
690 | | |
    /// Mixes appended blocks, views into those blocks and appended values, then
    /// checks the resulting array and the errors reported for invalid views
    #[test]
    fn test_string_view() {
        let b1 = Buffer::from(b"world\xFFbananas\xF0\x9F\x98\x81");
        let b2 = Buffer::from(b"cupcakes");
        let b3 = Buffer::from(b"Many strings are here contained of great length and verbosity");

        let mut v = StringViewBuilder::new();
        assert_eq!(v.append_block(b1), 0);

        v.append_value("This is a very long string that exceeds the inline length");
        v.append_value("This is another very long string that exceeds the inline length");

        // the two appended values were flushed as block 1 before `b2` was added
        assert_eq!(v.append_block(b2), 2);
        assert_eq!(v.append_block(b3), 3);

        // Test short strings
        v.try_append_view(0, 0, 5).unwrap(); // world
        v.try_append_view(0, 6, 7).unwrap(); // bananas
        v.try_append_view(2, 3, 5).unwrap(); // cakes
        v.try_append_view(2, 0, 3).unwrap(); // cup
        v.try_append_view(2, 0, 8).unwrap(); // cupcakes
        v.try_append_view(0, 13, 4).unwrap(); // 😁
        v.try_append_view(0, 13, 0).unwrap(); //

        // Test longer strings
        v.try_append_view(3, 0, 16).unwrap(); // Many strings are
        v.try_append_view(1, 0, 19).unwrap(); // This is a very long
        v.try_append_view(3, 13, 27).unwrap(); // are here contained of great

        v.append_value("I do so like long strings");

        let array = v.finish_cloned();
        array.to_data().validate_full().unwrap();
        assert_eq!(array.data_buffers().len(), 5);
        let actual: Vec<_> = array.iter().flatten().collect();
        assert_eq!(
            actual,
            &[
                "This is a very long string that exceeds the inline length",
                "This is another very long string that exceeds the inline length",
                "world",
                "bananas",
                "cakes",
                "cup",
                "cupcakes",
                "😁",
                "",
                "Many strings are",
                "This is a very long",
                "are here contained of great",
                "I do so like long strings"
            ]
        );

        // offset past the end of the block
        let err = v.try_append_view(0, u32::MAX, 1).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Range 4294967295..4294967296 out of bounds for block of length 17"
        );

        // length past the end of the block
        let err = v.try_append_view(0, 1, u32::MAX).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Range 1..4294967296 out of bounds for block of length 17"
        );

        // range cuts the 4 byte emoji in half, which is not valid UTF-8
        let err = v.try_append_view(0, 13, 2).unwrap_err();
        assert_eq!(err.to_string(), "Invalid argument error: Invalid view data");

        // empty ranges must still start within the block
        let err = v.try_append_view(0, 40, 0).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Range 40..40 out of bounds for block of length 17"
        );

        // unknown block
        let err = v.try_append_view(5, 0, 0).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: No block found with index 5"
        );
    }
772 | | |
773 | | #[test] |
774 | | fn test_string_view_with_block_size_growth() { |
775 | | let mut exp_builder = StringViewBuilder::new(); |
776 | | let mut fixed_builder = StringViewBuilder::new().with_fixed_block_size(STARTING_BLOCK_SIZE); |
777 | | |
778 | | let long_string = str::from_utf8(&[b'a'; STARTING_BLOCK_SIZE as usize]).unwrap(); |
779 | | |
780 | | for i in 0..9 { |
781 | | // 8k, 16k, 32k, 64k, 128k, 256k, 512k, 1M, 2M |
782 | | for _ in 0..(2_u32.pow(i)) { |
783 | | exp_builder.append_value(long_string); |
784 | | fixed_builder.append_value(long_string); |
785 | | } |
786 | | exp_builder.flush_in_progress(); |
787 | | fixed_builder.flush_in_progress(); |
788 | | |
789 | | // Every step only add one buffer, but the buffer size is much larger |
790 | | assert_eq!(exp_builder.completed.len(), i as usize + 1); |
791 | | assert_eq!( |
792 | | exp_builder.completed[i as usize].len(), |
793 | | STARTING_BLOCK_SIZE as usize * 2_usize.pow(i) |
794 | | ); |
795 | | |
796 | | // This step we added 2^i blocks, the sum of blocks should be 2^(i+1) - 1 |
797 | | assert_eq!(fixed_builder.completed.len(), 2_usize.pow(i + 1) - 1); |
798 | | |
799 | | // Every buffer is fixed size |
800 | | assert!( |
801 | | fixed_builder |
802 | | .completed |
803 | | .iter() |
804 | | .all(|b| b.len() == STARTING_BLOCK_SIZE as usize) |
805 | | ); |
806 | | } |
807 | | |
808 | | // Add one more value, and the buffer stop growing. |
809 | | exp_builder.append_value(long_string); |
810 | | exp_builder.flush_in_progress(); |
811 | | assert_eq!( |
812 | | exp_builder.completed.last().unwrap().capacity(), |
813 | | MAX_BLOCK_SIZE as usize |
814 | | ); |
815 | | } |
816 | | } |