Coverage Report

Created: 2025-11-17 14:14

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/Users/andrewlamb/Software/arrow-rs/arrow-array/src/builder/generic_bytes_view_builder.rs
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
use std::any::Any;
19
use std::marker::PhantomData;
20
use std::sync::Arc;
21
22
use arrow_buffer::{Buffer, NullBufferBuilder, ScalarBuffer};
23
use arrow_data::{ByteView, MAX_INLINE_VIEW_LEN};
24
use arrow_schema::ArrowError;
25
use hashbrown::HashTable;
26
use hashbrown::hash_table::Entry;
27
28
use crate::builder::{ArrayBuilder, BinaryLikeArrayBuilder, StringLikeArrayBuilder};
29
use crate::types::bytes::ByteArrayNativeType;
30
use crate::types::{BinaryViewType, ByteViewType, StringViewType};
31
use crate::{Array, ArrayRef, GenericByteViewArray};
32
33
/// Initial value of the exponential growth state (8KiB).
const STARTING_BLOCK_SIZE: u32 = 8 * 1024; // 8KiB
/// Largest block size the exponential strategy will hand out (2MiB).
const MAX_BLOCK_SIZE: u32 = 2 * 1024 * 1024; // 2MiB

/// Policy used to choose the size requested for each newly allocated data block.
enum BlockSizeGrowthStrategy {
    /// Every request yields the same `size`.
    Fixed { size: u32 },
    /// Requests double `current_size` until it is no longer below [`MAX_BLOCK_SIZE`].
    Exponential { current_size: u32 },
}

impl BlockSizeGrowthStrategy {
    /// Returns the size to request for the next block and advances the growth state.
    ///
    /// * `Fixed` always returns its configured size.
    /// * `Exponential` doubles the stored size *before* returning it, so a state
    ///   initialised with [`STARTING_BLOCK_SIZE`] yields twice that value on the
    ///   first call. Once the stored size is no longer below [`MAX_BLOCK_SIZE`],
    ///   [`MAX_BLOCK_SIZE`] is returned and the state is left untouched.
    fn next_size(&mut self) -> u32 {
        let current_size = match self {
            Self::Fixed { size } => return *size,
            Self::Exponential { current_size } => current_size,
        };

        if *current_size >= MAX_BLOCK_SIZE {
            return MAX_BLOCK_SIZE;
        }

        // we have fixed start/end block sizes, so we can't overflow
        *current_size = current_size.saturating_mul(2);
        *current_size
    }
}
57
58
/// A builder for [`GenericByteViewArray`]
59
///
60
/// A [`GenericByteViewArray`] consists of a list of data blocks containing string data,
61
/// and a list of views into those buffers.
62
///
63
/// See examples on [`StringViewBuilder`] and [`BinaryViewBuilder`]
64
///
65
/// This builder can be used in two ways
66
///
67
/// # Append Values
68
///
69
/// To avoid bump allocating, this builder allocates data in fixed size blocks, configurable
70
/// using [`GenericByteViewBuilder::with_fixed_block_size`]. [`GenericByteViewBuilder::append_value`]
71
/// writes values larger than [`MAX_INLINE_VIEW_LEN`] bytes to the current in-progress block, with values smaller
72
/// than [`MAX_INLINE_VIEW_LEN`] bytes inlined into the views. If a value is appended that will not fit in the
73
/// in-progress block, it will be closed, and a new block of sufficient size allocated
74
///
75
/// # Append Views
76
///
77
/// Some use-cases may wish to reuse an existing allocation containing string data, for example,
78
/// when parsing data from a parquet data page. In such a case entire blocks can be appended
79
/// using [`GenericByteViewBuilder::append_block`] and then views into this block appended
80
/// using [`GenericByteViewBuilder::try_append_view`]
81
pub struct GenericByteViewBuilder<T: ByteViewType + ?Sized> {
82
    views_buffer: Vec<u128>,
83
    null_buffer_builder: NullBufferBuilder,
84
    completed: Vec<Buffer>,
85
    in_progress: Vec<u8>,
86
    block_size: BlockSizeGrowthStrategy,
87
    /// Some if deduplicating strings
88
    /// map `<string hash> -> <index to the views>`
89
    string_tracker: Option<(HashTable<usize>, ahash::RandomState)>,
90
    phantom: PhantomData<T>,
91
}
92
93
impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
94
    /// Creates a new [`GenericByteViewBuilder`].
95
92
    pub fn new() -> Self {
96
92
        Self::with_capacity(1024)
97
92
    }
98
99
    /// Creates a new [`GenericByteViewBuilder`] with space for `capacity` string values.
100
166
    pub fn with_capacity(capacity: usize) -> Self {
101
166
        Self {
102
166
            views_buffer: Vec::with_capacity(capacity),
103
166
            null_buffer_builder: NullBufferBuilder::new(capacity),
104
166
            completed: vec![],
105
166
            in_progress: vec![],
106
166
            block_size: BlockSizeGrowthStrategy::Exponential {
107
166
                current_size: STARTING_BLOCK_SIZE,
108
166
            },
109
166
            string_tracker: None,
110
166
            phantom: Default::default(),
111
166
        }
112
166
    }
113
114
    /// Set a fixed buffer size for variable length strings
115
    ///
116
    /// The block size is the size of the buffer used to store values greater
117
    /// than [`MAX_INLINE_VIEW_LEN`] bytes. The builder allocates new buffers when the current
118
    /// buffer is full.
119
    ///
120
    /// By default the builder balances buffer size and buffer count by
121
    /// growing buffer size exponentially from 8KB up to 2MB. The
122
    /// first buffer allocated is 8KB, then 16KB, then 32KB, etc up to 2MB.
123
    ///
124
    /// If this method is used, any new buffers allocated are
125
    /// exactly this size. This can be useful for advanced users
126
    /// that want to control the memory usage and buffer count.
127
    ///
128
    /// See <https://github.com/apache/arrow-rs/issues/6094> for more details on the implications.
129
12
    pub fn with_fixed_block_size(self, block_size: u32) -> Self {
130
12
        debug_assert!(block_size > 0, 
"Block size must be greater than 0"0
);
131
12
        Self {
132
12
            block_size: BlockSizeGrowthStrategy::Fixed { size: block_size },
133
12
            ..self
134
12
        }
135
12
    }
136
137
    /// Deduplicate strings while building the array
138
    ///
139
    /// This will potentially decrease the memory usage if the array have repeated strings
140
    /// It will also increase the time to build the array as it needs to hash the strings
141
    pub fn with_deduplicate_strings(self) -> Self {
142
        Self {
143
            string_tracker: Some((
144
                HashTable::with_capacity(self.views_buffer.capacity()),
145
                Default::default(),
146
            )),
147
            ..self
148
        }
149
    }
150
151
    /// Append a new data block returning the new block offset
152
    ///
153
    /// Note: this will first flush any in-progress block
154
    ///
155
    /// This allows appending views from blocks added using [`Self::append_block`]. See
156
    /// [`Self::append_value`] for appending individual values
157
    ///
158
    /// ```
159
    /// # use arrow_array::builder::StringViewBuilder;
160
    /// let mut builder = StringViewBuilder::new();
161
    ///
162
    /// let block = builder.append_block(b"helloworldbingobongo".into());
163
    ///
164
    /// builder.try_append_view(block, 0, 5).unwrap();
165
    /// builder.try_append_view(block, 5, 5).unwrap();
166
    /// builder.try_append_view(block, 10, 5).unwrap();
167
    /// builder.try_append_view(block, 15, 5).unwrap();
168
    /// builder.try_append_view(block, 0, 15).unwrap();
169
    /// let array = builder.finish();
170
    ///
171
    /// let actual: Vec<_> = array.iter().flatten().collect();
172
    /// let expected = &["hello", "world", "bingo", "bongo", "helloworldbingo"];
173
    /// assert_eq!(actual, expected);
174
    /// ```
175
4
    pub fn append_block(&mut self, buffer: Buffer) -> u32 {
176
4
        assert!(buffer.len() < u32::MAX as usize);
177
178
4
        self.flush_in_progress();
179
4
        let offset = self.completed.len();
180
4
        self.push_completed(buffer);
181
4
        offset as u32
182
4
    }
183
184
    /// Append a view of the given `block`, `offset` and `length`
185
    ///
186
    /// # Safety
187
    /// (1) The block must have been added using [`Self::append_block`]
188
    /// (2) The range `offset..offset+length` must be within the bounds of the block
189
    /// (3) The data in the block must be valid of type `T`
190
16
    pub unsafe fn append_view_unchecked(&mut self, block: u32, offset: u32, len: u32) {
191
16
        let b = unsafe { self.completed.get_unchecked(block as usize) };
192
16
        let start = offset as usize;
193
16
        let end = start.saturating_add(len as usize);
194
16
        let b = unsafe { b.get_unchecked(start..end) };
195
196
16
        let view = make_view(b, block, offset);
197
16
        self.views_buffer.push(view);
198
16
        self.null_buffer_builder.append_non_null();
199
16
    }
200
201
    /// Appends an array to the builder.
202
    /// This will flush any in-progress block and append the data buffers
203
    /// and add the (adapted) views.
204
93
    pub fn append_array(&mut self, array: &GenericByteViewArray<T>) {
205
93
        self.flush_in_progress();
206
        // keep original views if this array is the first to be added or if there are no data buffers (all inline views)
207
93
        let keep_views = self.completed.is_empty() || 
array.data_buffers()81
.
is_empty81
();
208
93
        let starting_buffer = self.completed.len() as u32;
209
210
93
        self.completed.extend(array.data_buffers().iter().cloned());
211
212
93
        if keep_views {
213
14
            self.views_buffer.extend_from_slice(array.views());
214
14
        } else {
215
22.3k
            
self.views_buffer79
.
extend79
(
array.views().iter()79
.
map79
(|v| {
216
22.3k
                let mut byte_view = ByteView::from(*v);
217
22.3k
                if byte_view.length > MAX_INLINE_VIEW_LEN {
218
6.37k
                    // Small views (<=12 bytes) are inlined, so only need to update large views
219
6.37k
                    byte_view.buffer_index += starting_buffer;
220
16.0k
                };
221
222
22.3k
                byte_view.as_u128()
223
22.3k
            }));
224
        }
225
226
93
        if let Some(
null_buffer41
) = array.nulls() {
227
41
            self.null_buffer_builder.append_buffer(null_buffer);
228
52
        } else {
229
52
            self.null_buffer_builder.append_n_non_nulls(array.len());
230
52
        }
231
93
    }
232
233
    /// Try to append a view of the given `block`, `offset` and `length`
234
    ///
235
    /// See [`Self::append_block`]
236
    pub fn try_append_view(&mut self, block: u32, offset: u32, len: u32) -> Result<(), ArrowError> {
237
        let b = self.completed.get(block as usize).ok_or_else(|| {
238
            ArrowError::InvalidArgumentError(format!("No block found with index {block}"))
239
        })?;
240
        let start = offset as usize;
241
        let end = start.saturating_add(len as usize);
242
243
        let b = b.get(start..end).ok_or_else(|| {
244
            ArrowError::InvalidArgumentError(format!(
245
                "Range {start}..{end} out of bounds for block of length {}",
246
                b.len()
247
            ))
248
        })?;
249
250
        if T::Native::from_bytes_checked(b).is_none() {
251
            return Err(ArrowError::InvalidArgumentError(
252
                "Invalid view data".to_string(),
253
            ));
254
        }
255
256
        unsafe {
257
            self.append_view_unchecked(block, offset, len);
258
        }
259
        Ok(())
260
    }
261
262
    /// Flushes the in progress block if any
263
    #[inline]
264
458
    fn flush_in_progress(&mut self) {
265
458
        if !self.in_progress.is_empty() {
266
195
            let f = Buffer::from_vec(std::mem::take(&mut self.in_progress));
267
195
            self.push_completed(f)
268
263
        }
269
458
    }
270
271
    /// Append a block to `self.completed`, checking for overflow
272
    #[inline]
273
199
    fn push_completed(&mut self, block: Buffer) {
274
199
        assert!(block.len() < u32::MAX as usize, 
"Block too large"0
);
275
199
        assert!(self.completed.len() < u32::MAX as usize, 
"Too many blocks"0
);
276
199
        self.completed.push(block);
277
199
    }
278
279
    /// Returns the value at the given index
280
    /// Useful if we want to know what value has been inserted to the builder
281
    /// The index has to be smaller than `self.len()`, otherwise it will panic
282
0
    pub fn get_value(&self, index: usize) -> &[u8] {
283
0
        let view = self.views_buffer.as_slice().get(index).unwrap();
284
0
        let len = *view as u32;
285
0
        if len <= MAX_INLINE_VIEW_LEN {
286
            // # Safety
287
            // The view is valid from the builder
288
0
            unsafe { GenericByteViewArray::<T>::inline_value(view, len as usize) }
289
        } else {
290
0
            let view = ByteView::from(*view);
291
0
            if view.buffer_index < self.completed.len() as u32 {
292
0
                let block = &self.completed[view.buffer_index as usize];
293
0
                &block[view.offset as usize..view.offset as usize + view.length as usize]
294
            } else {
295
0
                &self.in_progress[view.offset as usize..view.offset as usize + view.length as usize]
296
            }
297
        }
298
0
    }
299
300
    /// Appends a value into the builder
301
    ///
302
    /// # Panics
303
    ///
304
    /// Panics if
305
    /// - String buffer count exceeds `u32::MAX`
306
    /// - String length exceeds `u32::MAX`
307
    #[inline]
308
255k
    pub fn append_value(&mut self, value: impl AsRef<T::Native>) {
309
255k
        self.try_append_value(value).unwrap()
310
255k
    }
311
312
    /// Appends a value into the builder
313
    ///
314
    /// # Errors
315
    ///
316
    /// Returns an error if:
317
    /// - String buffer count exceeds `u32::MAX`
318
    /// - String length exceeds `u32::MAX`
319
    #[inline]
320
255k
    pub fn try_append_value(&mut self, value: impl AsRef<T::Native>) -> Result<(), ArrowError> {
321
255k
        let v: &[u8] = value.as_ref().as_ref();
322
255k
        let length: u32 = v.len().try_into().map_err(|_| 
{0
323
0
            ArrowError::InvalidArgumentError(format!("String length {} exceeds u32::MAX", v.len()))
324
0
        })?;
325
326
255k
        if length <= MAX_INLINE_VIEW_LEN {
327
205k
            let mut view_buffer = [0; 16];
328
205k
            view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
329
205k
            view_buffer[4..4 + v.len()].copy_from_slice(v);
330
205k
            self.views_buffer.push(u128::from_le_bytes(view_buffer));
331
205k
            self.null_buffer_builder.append_non_null();
332
205k
            return Ok(());
333
50.0k
        }
334
335
        // Deduplication if:
336
        // (1) deduplication is enabled.
337
        // (2) len > 12
338
50.0k
        if let Some((
mut ht0
,
hasher0
)) = self.string_tracker.take() {
339
0
            let hash_val = hasher.hash_one(v);
340
0
            let hasher_fn = |v: &_| hasher.hash_one(v);
341
342
0
            let entry = ht.entry(
343
0
                hash_val,
344
0
                |idx| {
345
0
                    let stored_value = self.get_value(*idx);
346
0
                    v == stored_value
347
0
                },
348
0
                hasher_fn,
349
            );
350
0
            match entry {
351
0
                Entry::Occupied(occupied) => {
352
                    // If the string already exists, we will directly use the view
353
0
                    let idx = occupied.get();
354
0
                    self.views_buffer.push(self.views_buffer[*idx]);
355
0
                    self.null_buffer_builder.append_non_null();
356
0
                    self.string_tracker = Some((ht, hasher));
357
0
                    return Ok(());
358
                }
359
0
                Entry::Vacant(vacant) => {
360
0
                    // o.w. we insert the (string hash -> view index)
361
0
                    // the idx is current length of views_builder, as we are inserting a new view
362
0
                    vacant.insert(self.views_buffer.len());
363
0
                }
364
            }
365
0
            self.string_tracker = Some((ht, hasher));
366
50.0k
        }
367
368
50.0k
        let required_cap = self.in_progress.len() + v.len();
369
50.0k
        if self.in_progress.capacity() < required_cap {
370
195
            self.flush_in_progress();
371
195
            let to_reserve = v.len().max(self.block_size.next_size() as usize);
372
195
            self.in_progress.reserve(to_reserve);
373
49.8k
        };
374
375
50.0k
        let offset = self.in_progress.len() as u32;
376
50.0k
        self.in_progress.extend_from_slice(v);
377
378
50.0k
        let buffer_index: u32 = self.completed.len().try_into().map_err(|_| 
{0
379
0
            ArrowError::InvalidArgumentError(format!(
380
0
                "Buffer count {} exceeds u32::MAX",
381
0
                self.completed.len()
382
0
            ))
383
0
        })?;
384
385
50.0k
        let view = ByteView {
386
50.0k
            length,
387
50.0k
            // This won't panic as we checked the length of prefix earlier.
388
50.0k
            prefix: u32::from_le_bytes(v[0..4].try_into().unwrap()),
389
50.0k
            buffer_index,
390
50.0k
            offset,
391
50.0k
        };
392
50.0k
        self.views_buffer.push(view.into());
393
50.0k
        self.null_buffer_builder.append_non_null();
394
395
50.0k
        Ok(())
396
255k
    }
397
398
    /// Append an `Option` value into the builder
399
    #[inline]
400
312k
    pub fn append_option(&mut self, value: Option<impl AsRef<T::Native>>) {
401
312k
        match value {
402
57.3k
            None => self.append_null(),
403
255k
            Some(v) => self.append_value(v),
404
        };
405
312k
    }
406
407
    /// Append a null value into the builder
408
    #[inline]
409
57.3k
    pub fn append_null(&mut self) {
410
57.3k
        self.null_buffer_builder.append_null();
411
57.3k
        self.views_buffer.push(0);
412
57.3k
    }
413
414
    /// Builds the [`GenericByteViewArray`] and reset this builder
415
166
    pub fn finish(&mut self) -> GenericByteViewArray<T> {
416
166
        self.flush_in_progress();
417
166
        let completed = std::mem::take(&mut self.completed);
418
166
        let nulls = self.null_buffer_builder.finish();
419
166
        if let Some((
ht0
, _)) = self.string_tracker.as_mut() {
420
0
            ht.clear();
421
166
        }
422
166
        let views = std::mem::take(&mut self.views_buffer);
423
        // SAFETY: valid by construction
424
166
        unsafe { GenericByteViewArray::new_unchecked(views.into(), completed, nulls) }
425
166
    }
426
427
    /// Builds the [`GenericByteViewArray`] without resetting the builder
428
0
    pub fn finish_cloned(&self) -> GenericByteViewArray<T> {
429
0
        let mut completed = self.completed.clone();
430
0
        if !self.in_progress.is_empty() {
431
0
            completed.push(Buffer::from_slice_ref(&self.in_progress));
432
0
        }
433
0
        let len = self.views_buffer.len();
434
0
        let views = Buffer::from_slice_ref(self.views_buffer.as_slice());
435
0
        let views = ScalarBuffer::new(views, 0, len);
436
0
        let nulls = self.null_buffer_builder.finish_cloned();
437
        // SAFETY: valid by construction
438
0
        unsafe { GenericByteViewArray::new_unchecked(views, completed, nulls) }
439
0
    }
440
441
    /// Returns the current null buffer as a slice
442
    pub fn validity_slice(&self) -> Option<&[u8]> {
443
        self.null_buffer_builder.as_slice()
444
    }
445
446
    /// Return the allocated size of this builder in bytes, useful for memory accounting.
447
    pub fn allocated_size(&self) -> usize {
448
        let views = self.views_buffer.capacity() * std::mem::size_of::<u128>();
449
        let null = self.null_buffer_builder.allocated_size();
450
        let buffer_size = self.completed.iter().map(|b| b.capacity()).sum::<usize>();
451
        let in_progress = self.in_progress.capacity();
452
        let tracker = match &self.string_tracker {
453
            Some((ht, _)) => ht.capacity() * std::mem::size_of::<usize>(),
454
            None => 0,
455
        };
456
        buffer_size + in_progress + tracker + views + null
457
    }
458
}
459
460
impl<T: ByteViewType + ?Sized> Default for GenericByteViewBuilder<T> {
461
    fn default() -> Self {
462
        Self::new()
463
    }
464
}
465
466
impl<T: ByteViewType + ?Sized> std::fmt::Debug for GenericByteViewBuilder<T> {
    /// Formats the builder as `<PREFIX>ViewBuilder { .. }`, listing the views,
    /// the in-progress block, the completed blocks and the null buffer builder
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // The type name is written by hand (it depends on `T`), so the struct
        // formatter is given an empty name
        write!(f, "{}ViewBuilder", T::PREFIX)?;

        let mut fields = f.debug_struct("");
        fields.field("views_buffer", &self.views_buffer);
        fields.field("in_progress", &self.in_progress);
        fields.field("completed", &self.completed);
        fields.field("null_buffer_builder", &self.null_buffer_builder);
        fields.finish()
    }
}
477
478
impl<T: ByteViewType + ?Sized> ArrayBuilder for GenericByteViewBuilder<T> {
479
4
    fn len(&self) -> usize {
480
4
        self.null_buffer_builder.len()
481
4
    }
482
483
0
    fn finish(&mut self) -> ArrayRef {
484
0
        Arc::new(self.finish())
485
0
    }
486
487
0
    fn finish_cloned(&self) -> ArrayRef {
488
0
        Arc::new(self.finish_cloned())
489
0
    }
490
491
0
    fn as_any(&self) -> &dyn Any {
492
0
        self
493
0
    }
494
495
0
    fn as_any_mut(&mut self) -> &mut dyn Any {
496
0
        self
497
0
    }
498
499
0
    fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
500
0
        self
501
0
    }
502
}
503
504
impl<T: ByteViewType + ?Sized, V: AsRef<T::Native>> Extend<Option<V>>
505
    for GenericByteViewBuilder<T>
506
{
507
    #[inline]
508
45
    fn extend<I: IntoIterator<Item = Option<V>>>(&mut self, iter: I) {
509
249k
        for 
v249k
in iter {
510
249k
            self.append_option(v)
511
        }
512
45
    }
513
}
514
515
/// Array builder for [`StringViewArray`][crate::StringViewArray]
516
///
517
/// Values can be appended using [`GenericByteViewBuilder::append_value`], and nulls with
518
/// [`GenericByteViewBuilder::append_null`] as normal.
519
///
520
/// # Example
521
/// ```
522
/// # use arrow_array::builder::StringViewBuilder;
523
/// # use arrow_array::StringViewArray;
524
/// let mut builder = StringViewBuilder::new();
525
/// builder.append_value("hello");
526
/// builder.append_null();
527
/// builder.append_value("world");
528
/// let array = builder.finish();
529
///
530
/// let expected = vec![Some("hello"), None, Some("world")];
531
/// let actual: Vec<_> = array.iter().collect();
532
/// assert_eq!(expected, actual);
533
/// ```
534
pub type StringViewBuilder = GenericByteViewBuilder<StringViewType>;
535
536
impl StringLikeArrayBuilder for StringViewBuilder {
537
0
    fn type_name() -> &'static str {
538
0
        std::any::type_name::<StringViewBuilder>()
539
0
    }
540
0
    fn with_capacity(capacity: usize) -> Self {
541
0
        Self::with_capacity(capacity)
542
0
    }
543
0
    fn append_value(&mut self, value: &str) {
544
0
        Self::append_value(self, value);
545
0
    }
546
0
    fn append_null(&mut self) {
547
0
        Self::append_null(self);
548
0
    }
549
}
550
551
///  Array builder for [`BinaryViewArray`][crate::BinaryViewArray]
552
///
553
/// Values can be appended using [`GenericByteViewBuilder::append_value`], and nulls with
554
/// [`GenericByteViewBuilder::append_null`] as normal.
555
///
556
/// # Example
557
/// ```
558
/// # use arrow_array::builder::BinaryViewBuilder;
559
/// use arrow_array::BinaryViewArray;
560
/// let mut builder = BinaryViewBuilder::new();
561
/// builder.append_value("hello");
562
/// builder.append_null();
563
/// builder.append_value("world");
564
/// let array = builder.finish();
565
///
566
/// let expected: Vec<Option<&[u8]>> = vec![Some(b"hello"), None, Some(b"world")];
567
/// let actual: Vec<_> = array.iter().collect();
568
/// assert_eq!(expected, actual);
569
/// ```
570
///
571
pub type BinaryViewBuilder = GenericByteViewBuilder<BinaryViewType>;
572
573
impl BinaryLikeArrayBuilder for BinaryViewBuilder {
574
0
    fn type_name() -> &'static str {
575
0
        std::any::type_name::<BinaryViewBuilder>()
576
0
    }
577
0
    fn with_capacity(capacity: usize) -> Self {
578
0
        Self::with_capacity(capacity)
579
0
    }
580
0
    fn append_value(&mut self, value: &[u8]) {
581
0
        Self::append_value(self, value);
582
0
    }
583
0
    fn append_null(&mut self) {
584
0
        Self::append_null(self);
585
0
    }
586
}
587
588
/// Builds an inlined view from the first `LEN` bytes of `data`.
///
/// The result, read as 16 little-endian bytes, holds `LEN` as a `u32` in bytes
/// `0..4`, followed by the data and zero padding. `LEN` is a const generic so
/// that the compiler can generate specialized code for every length.
///
/// Panics if `LEN` is larger than 12 or if `data` is shorter than `LEN`.
fn make_inlined_view<const LEN: usize>(data: &[u8]) -> u128 {
    let mut raw = [0u8; 16];
    let (len_bytes, payload) = raw.split_at_mut(4);
    len_bytes.copy_from_slice(&(LEN as u32).to_le_bytes());
    payload[..LEN].copy_from_slice(&data[..LEN]);
    u128::from_le_bytes(raw)
}
596
597
/// Create a view based on the given data, block id and offset.
598
///
599
/// Note that the code below is carefully examined with x86_64 assembly code: <https://godbolt.org/z/685YPsd5G>
600
/// The goal is to avoid calling into `ptr::copy_non_interleave`, which makes function call (i.e., not inlined),
601
/// which slows down things.
602
#[inline(never)]
603
16
pub fn make_view(data: &[u8], block_id: u32, offset: u32) -> u128 {
604
16
    let len = data.len();
605
606
    // Generate specialized code for each potential small string length
607
    // to improve performance
608
16
    match len {
609
0
        0 => make_inlined_view::<0>(data),
610
0
        1 => make_inlined_view::<1>(data),
611
0
        2 => make_inlined_view::<2>(data),
612
4
        3 => make_inlined_view::<3>(data),
613
5
        4 => make_inlined_view::<4>(data),
614
3
        5 => make_inlined_view::<5>(data),
615
0
        6 => make_inlined_view::<6>(data),
616
0
        7 => make_inlined_view::<7>(data),
617
0
        8 => make_inlined_view::<8>(data),
618
0
        9 => make_inlined_view::<9>(data),
619
0
        10 => make_inlined_view::<10>(data),
620
0
        11 => make_inlined_view::<11>(data),
621
0
        12 => make_inlined_view::<12>(data),
622
        // When string is longer than 12 bytes, it can't be inlined, we create a ByteView instead.
623
        _ => {
624
4
            let view = ByteView {
625
4
                length: len as u32,
626
4
                prefix: u32::from_le_bytes(data[0..4].try_into().unwrap()),
627
4
                buffer_index: block_id,
628
4
                offset,
629
4
            };
630
4
            view.as_u128()
631
        }
632
    }
633
16
}
634
635
#[cfg(test)]
636
mod tests {
637
    use core::str;
638
639
    use super::*;
640
641
    /// Repeated long strings must share a view (and therefore their data) when
    /// deduplication is enabled
    #[test]
    fn test_string_view_deduplicate() {
        let value_1 = "long string to test string view";
        let value_2 = "not so similar string but long";

        // so that we will have multiple buffers
        let block_size = value_1.len() as u32 * 2;
        let mut builder = StringViewBuilder::new()
            .with_deduplicate_strings()
            .with_fixed_block_size(block_size);

        let values = vec![
            Some(value_1),
            Some(value_2),
            Some("short"),
            Some(value_1),
            None,
            Some(value_2),
            Some(value_1),
        ];
        builder.extend(values.clone());

        let array = builder.finish_cloned();
        array.to_data().validate_full().unwrap();
        // without duplication we would need 3 buffers.
        assert_eq!(array.data_buffers().len(), 1);
        let actual: Vec<_> = array.iter().collect();
        assert_eq!(actual, values);

        let views = array.views();

        // every occurrence of `value_1` uses the same view
        assert_eq!(views[0], views[3]);
        assert_eq!(views[0], views[6]);

        // as does every occurrence of `value_2`
        assert_eq!(views[1], views[5]);
    }
676
677
    /// A deduplicating builder must remain usable after `finish`, including for
    /// a value that was already appended before the builder was reset
    #[test]
    fn test_string_view_deduplicate_after_finish() {
        let value_1 = "long string to test string view";
        let value_2 = "not so similar string but long";

        let mut builder = StringViewBuilder::new().with_deduplicate_strings();

        for value in [value_1, value_2, value_1] {
            builder.append_value(value);
            let _array = builder.finish();
        }
    }
690
691
    /// Mixes `append_block` / `try_append_view` with `append_value` and checks
    /// the resulting array as well as the errors reported for invalid views
    #[test]
    fn test_string_view() {
        let b1 = Buffer::from(b"world\xFFbananas\xF0\x9F\x98\x81");
        let b2 = Buffer::from(b"cupcakes");
        let b3 = Buffer::from(b"Many strings are here contained of great length and verbosity");

        let mut v = StringViewBuilder::new();
        assert_eq!(v.append_block(b1), 0);

        v.append_value("This is a very long string that exceeds the inline length");
        v.append_value("This is another very long string that exceeds the inline length");

        // Appending a block flushes the two values above into block 1 first
        assert_eq!(v.append_block(b2), 2);
        assert_eq!(v.append_block(b3), 3);

        // Test short strings, as (block, offset, len)
        let short_views = [
            (0, 0, 5),  // world
            (0, 6, 7),  // bananas
            (2, 3, 5),  // cakes
            (2, 0, 3),  // cup
            (2, 0, 8),  // cupcakes
            (0, 13, 4), // 😁
            (0, 13, 0), //
        ];
        for (block, offset, len) in short_views {
            v.try_append_view(block, offset, len).unwrap();
        }

        // Test longer strings, as (block, offset, len)
        let long_views = [
            (3, 0, 16),  // Many strings are
            (1, 0, 19),  // This is a very long
            (3, 13, 27), // are here contained of great
        ];
        for (block, offset, len) in long_views {
            v.try_append_view(block, offset, len).unwrap();
        }

        v.append_value("I do so like long strings");

        let array = v.finish_cloned();
        array.to_data().validate_full().unwrap();
        assert_eq!(array.data_buffers().len(), 5);
        let actual: Vec<_> = array.iter().flatten().collect();
        assert_eq!(
            actual,
            &[
                "This is a very long string that exceeds the inline length",
                "This is another very long string that exceeds the inline length",
                "world",
                "bananas",
                "cakes",
                "cup",
                "cupcakes",
                "😁",
                "",
                "Many strings are",
                "This is a very long",
                "are here contained of great",
                "I do so like long strings"
            ]
        );

        // Offset past the end of the block
        let err = v.try_append_view(0, u32::MAX, 1).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Range 4294967295..4294967296 out of bounds for block of length 17"
        );

        // Length past the end of the block
        let err = v.try_append_view(0, 1, u32::MAX).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Range 1..4294967296 out of bounds for block of length 17"
        );

        // Range that splits a multi-byte character
        let err = v.try_append_view(0, 13, 2).unwrap_err();
        assert_eq!(err.to_string(), "Invalid argument error: Invalid view data");

        // Empty range that starts past the end of the block
        let err = v.try_append_view(0, 40, 0).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Range 40..40 out of bounds for block of length 17"
        );

        // Unknown block
        let err = v.try_append_view(5, 0, 0).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: No block found with index 5"
        );
    }
772
773
    /// Compares the number and size of blocks produced by the default
    /// (exponential) strategy with those produced by a fixed block size
    #[test]
    fn test_string_view_with_block_size_growth() {
        let block_len = STARTING_BLOCK_SIZE as usize;

        let mut exp_builder = StringViewBuilder::new();
        let mut fixed_builder = StringViewBuilder::new().with_fixed_block_size(STARTING_BLOCK_SIZE);

        let long_string = str::from_utf8(&[b'a'; STARTING_BLOCK_SIZE as usize]).unwrap();

        for i in 0..9 {
            let step = i as usize;

            // Data appended per step: 8k, 16k, 32k, 64k, 128k, 256k, 512k, 1M, 2M
            for _ in 0..(2_u32.pow(i)) {
                exp_builder.append_value(long_string);
                fixed_builder.append_value(long_string);
            }
            exp_builder.flush_in_progress();
            fixed_builder.flush_in_progress();

            // Every step only add one buffer, but the buffer size is much larger
            assert_eq!(exp_builder.completed.len(), step + 1);
            assert_eq!(
                exp_builder.completed[step].len(),
                block_len * 2_usize.pow(i)
            );

            // This step we added 2^i blocks, the sum of blocks should be 2^(i+1) - 1
            assert_eq!(fixed_builder.completed.len(), 2_usize.pow(i + 1) - 1);

            // Every buffer is fixed size
            assert!(
                fixed_builder
                    .completed
                    .iter()
                    .all(|b| b.len() == block_len)
            );
        }

        // Add one more value, and the buffer stop growing.
        exp_builder.append_value(long_string);
        exp_builder.flush_in_progress();
        let last_block = exp_builder.completed.last().unwrap();
        assert_eq!(last_block.capacity(), MAX_BLOCK_SIZE as usize);
    }
816
}