Coverage Report

Created: 2025-08-26 07:03

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/Users/andrewlamb/Software/arrow-rs/arrow-array/src/builder/generic_bytes_view_builder.rs
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
use std::any::Any;
19
use std::marker::PhantomData;
20
use std::sync::Arc;
21
22
use arrow_buffer::{Buffer, NullBufferBuilder, ScalarBuffer};
23
use arrow_data::{ByteView, MAX_INLINE_VIEW_LEN};
24
use arrow_schema::ArrowError;
25
use hashbrown::hash_table::Entry;
26
use hashbrown::HashTable;
27
28
use crate::builder::ArrayBuilder;
29
use crate::types::bytes::ByteArrayNativeType;
30
use crate::types::{BinaryViewType, ByteViewType, StringViewType};
31
use crate::{Array, ArrayRef, GenericByteViewArray};
32
33
/// Initial block size used by the exponential growth strategy (8KiB).
const STARTING_BLOCK_SIZE: u32 = 8 * 1024; // 8KiB
/// Size at which the exponential growth strategy stops doubling (2MiB).
const MAX_BLOCK_SIZE: u32 = 2 * 1024 * 1024; // 2MiB

/// Policy that decides how large the next data block should be.
enum BlockSizeGrowthStrategy {
    /// Every block has the same `size`.
    Fixed { size: u32 },
    /// The size doubles on every request until it reaches [`MAX_BLOCK_SIZE`];
    /// `current_size` is the size handed out most recently (or the seed value).
    Exponential { current_size: u32 },
}

impl BlockSizeGrowthStrategy {
    /// Returns the size to use for the next block, advancing the internal
    /// state of the exponential strategy.
    ///
    /// Note that the exponential strategy doubles *before* returning, so the
    /// first value it yields is twice the seed it was created with.
    fn next_size(&mut self) -> u32 {
        let current_size = match self {
            Self::Fixed { size } => return *size,
            Self::Exponential { current_size } => current_size,
        };

        if *current_size >= MAX_BLOCK_SIZE {
            return MAX_BLOCK_SIZE;
        }

        // we have fixed start/end block sizes, so we can't overflow
        *current_size = current_size.saturating_mul(2);
        *current_size
    }
}
57
58
/// A builder for [`GenericByteViewArray`]
59
///
60
/// A [`GenericByteViewArray`] consists of a list of data blocks containing string data,
61
/// and a list of views into those buffers.
62
///
63
/// See examples on [`StringViewBuilder`] and [`BinaryViewBuilder`]
64
///
65
/// This builder can be used in two ways
66
///
67
/// # Append Values
68
///
69
/// To avoid bump allocating, this builder allocates data in fixed size blocks, configurable
70
/// using [`GenericByteViewBuilder::with_fixed_block_size`]. [`GenericByteViewBuilder::append_value`]
71
/// writes values larger than [`MAX_INLINE_VIEW_LEN`] bytes to the current in-progress block, with values smaller
72
/// than [`MAX_INLINE_VIEW_LEN`] bytes inlined into the views. If a value is appended that will not fit in the
73
/// in-progress block, it will be closed, and a new block of sufficient size allocated
74
///
75
/// # Append Views
76
///
77
/// Some use-cases may wish to reuse an existing allocation containing string data, for example,
78
/// when parsing data from a parquet data page. In such a case entire blocks can be appended
79
/// using [`GenericByteViewBuilder::append_block`] and then views into this block appended
80
/// using [`GenericByteViewBuilder::try_append_view`]
81
pub struct GenericByteViewBuilder<T: ByteViewType + ?Sized> {
82
    views_buffer: Vec<u128>,
83
    null_buffer_builder: NullBufferBuilder,
84
    completed: Vec<Buffer>,
85
    in_progress: Vec<u8>,
86
    block_size: BlockSizeGrowthStrategy,
87
    /// Some if deduplicating strings
88
    /// map `<string hash> -> <index to the views>`
89
    string_tracker: Option<(HashTable<usize>, ahash::RandomState)>,
90
    phantom: PhantomData<T>,
91
}
92
93
impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
94
    /// Creates a new [`GenericByteViewBuilder`].
95
    pub fn new() -> Self {
96
        Self::with_capacity(1024)
97
    }
98
99
    /// Creates a new [`GenericByteViewBuilder`] with space for `capacity` string values.
100
1
    pub fn with_capacity(capacity: usize) -> Self {
101
1
        Self {
102
1
            views_buffer: Vec::with_capacity(capacity),
103
1
            null_buffer_builder: NullBufferBuilder::new(capacity),
104
1
            completed: vec![],
105
1
            in_progress: vec![],
106
1
            block_size: BlockSizeGrowthStrategy::Exponential {
107
1
                current_size: STARTING_BLOCK_SIZE,
108
1
            },
109
1
            string_tracker: None,
110
1
            phantom: Default::default(),
111
1
        }
112
1
    }
113
114
    /// Set a fixed buffer size for variable length strings
115
    ///
116
    /// The block size is the size of the buffer used to store values greater
117
    /// than [`MAX_INLINE_VIEW_LEN`] bytes. The builder allocates new buffers when the current
118
    /// buffer is full.
119
    ///
120
    /// By default the builder balances buffer size and buffer count by
121
    /// growing buffer size exponentially from 8KB up to 2MB. The
122
    /// first buffer allocated is 8KB, then 16KB, then 32KB, etc up to 2MB.
123
    ///
124
    /// If this method is used, any new buffers allocated are  
125
    /// exactly this size. This can be useful for advanced users
126
    /// that want to control the memory usage and buffer count.
127
    ///
128
    /// See <https://github.com/apache/arrow-rs/issues/6094> for more details on the implications.
129
    pub fn with_fixed_block_size(self, block_size: u32) -> Self {
130
        debug_assert!(block_size > 0, "Block size must be greater than 0");
131
        Self {
132
            block_size: BlockSizeGrowthStrategy::Fixed { size: block_size },
133
            ..self
134
        }
135
    }
136
137
    /// Deduplicate strings while building the array
138
    ///
139
    /// This will potentially decrease the memory usage if the array have repeated strings
140
    /// It will also increase the time to build the array as it needs to hash the strings
141
    pub fn with_deduplicate_strings(self) -> Self {
142
        Self {
143
            string_tracker: Some((
144
                HashTable::with_capacity(self.views_buffer.capacity()),
145
                Default::default(),
146
            )),
147
            ..self
148
        }
149
    }
150
151
    /// Append a new data block returning the new block offset
152
    ///
153
    /// Note: this will first flush any in-progress block
154
    ///
155
    /// This allows appending views from blocks added using [`Self::append_block`]. See
156
    /// [`Self::append_value`] for appending individual values
157
    ///
158
    /// ```
159
    /// # use arrow_array::builder::StringViewBuilder;
160
    /// let mut builder = StringViewBuilder::new();
161
    ///
162
    /// let block = builder.append_block(b"helloworldbingobongo".into());
163
    ///
164
    /// builder.try_append_view(block, 0, 5).unwrap();
165
    /// builder.try_append_view(block, 5, 5).unwrap();
166
    /// builder.try_append_view(block, 10, 5).unwrap();
167
    /// builder.try_append_view(block, 15, 5).unwrap();
168
    /// builder.try_append_view(block, 0, 15).unwrap();
169
    /// let array = builder.finish();
170
    ///
171
    /// let actual: Vec<_> = array.iter().flatten().collect();
172
    /// let expected = &["hello", "world", "bingo", "bongo", "helloworldbingo"];
173
    /// assert_eq!(actual, expected);
174
    /// ```
175
0
    pub fn append_block(&mut self, buffer: Buffer) -> u32 {
176
0
        assert!(buffer.len() < u32::MAX as usize);
177
178
0
        self.flush_in_progress();
179
0
        let offset = self.completed.len();
180
0
        self.push_completed(buffer);
181
0
        offset as u32
182
0
    }
183
184
    /// Append a view of the given `block`, `offset` and `length`
185
    ///
186
    /// # Safety
187
    /// (1) The block must have been added using [`Self::append_block`]
188
    /// (2) The range `offset..offset+length` must be within the bounds of the block
189
    /// (3) The data in the block must be valid of type `T`
190
0
    pub unsafe fn append_view_unchecked(&mut self, block: u32, offset: u32, len: u32) {
191
0
        let b = self.completed.get_unchecked(block as usize);
192
0
        let start = offset as usize;
193
0
        let end = start.saturating_add(len as usize);
194
0
        let b = b.get_unchecked(start..end);
195
196
0
        let view = make_view(b, block, offset);
197
0
        self.views_buffer.push(view);
198
0
        self.null_buffer_builder.append_non_null();
199
0
    }
200
201
    /// Appends an array to the builder.
202
    /// This will flush any in-progress block and append the data buffers
203
    /// and add the (adapted) views.
204
0
    pub fn append_array(&mut self, array: &GenericByteViewArray<T>) {
205
0
        self.flush_in_progress();
206
        // keep original views if this array is the first to be added or if there are no data buffers (all inline views)
207
0
        let keep_views = self.completed.is_empty() || array.data_buffers().is_empty();
208
0
        let starting_buffer = self.completed.len() as u32;
209
210
0
        self.completed.extend(array.data_buffers().iter().cloned());
211
212
0
        if keep_views {
213
0
            self.views_buffer.extend_from_slice(array.views());
214
0
        } else {
215
0
            self.views_buffer.extend(array.views().iter().map(|v| {
216
0
                let mut byte_view = ByteView::from(*v);
217
0
                if byte_view.length > MAX_INLINE_VIEW_LEN {
218
0
                    // Small views (<=12 bytes) are inlined, so only need to update large views
219
0
                    byte_view.buffer_index += starting_buffer;
220
0
                };
221
222
0
                byte_view.as_u128()
223
0
            }));
224
        }
225
226
0
        if let Some(null_buffer) = array.nulls() {
227
0
            self.null_buffer_builder.append_buffer(null_buffer);
228
0
        } else {
229
0
            self.null_buffer_builder.append_n_non_nulls(array.len());
230
0
        }
231
0
    }
232
233
    /// Try to append a view of the given `block`, `offset` and `length`
234
    ///
235
    /// See [`Self::append_block`]
236
    pub fn try_append_view(&mut self, block: u32, offset: u32, len: u32) -> Result<(), ArrowError> {
237
        let b = self.completed.get(block as usize).ok_or_else(|| {
238
            ArrowError::InvalidArgumentError(format!("No block found with index {block}"))
239
        })?;
240
        let start = offset as usize;
241
        let end = start.saturating_add(len as usize);
242
243
        let b = b.get(start..end).ok_or_else(|| {
244
            ArrowError::InvalidArgumentError(format!(
245
                "Range {start}..{end} out of bounds for block of length {}",
246
                b.len()
247
            ))
248
        })?;
249
250
        if T::Native::from_bytes_checked(b).is_none() {
251
            return Err(ArrowError::InvalidArgumentError(
252
                "Invalid view data".to_string(),
253
            ));
254
        }
255
256
        unsafe {
257
            self.append_view_unchecked(block, offset, len);
258
        }
259
        Ok(())
260
    }
261
262
    /// Flushes the in progress block if any
263
    #[inline]
264
1
    fn flush_in_progress(&mut self) {
265
1
        if !self.in_progress.is_empty() {
266
0
            let f = Buffer::from_vec(std::mem::take(&mut self.in_progress));
267
0
            self.push_completed(f)
268
1
        }
269
1
    }
270
271
    /// Append a block to `self.completed`, checking for overflow
272
    #[inline]
273
0
    fn push_completed(&mut self, block: Buffer) {
274
0
        assert!(block.len() < u32::MAX as usize, "Block too large");
275
0
        assert!(self.completed.len() < u32::MAX as usize, "Too many blocks");
276
0
        self.completed.push(block);
277
0
    }
278
279
    /// Returns the value at the given index
280
    /// Useful if we want to know what value has been inserted to the builder
281
    /// The index has to be smaller than `self.len()`, otherwise it will panic
282
0
    pub fn get_value(&self, index: usize) -> &[u8] {
283
0
        let view = self.views_buffer.as_slice().get(index).unwrap();
284
0
        let len = *view as u32;
285
0
        if len <= MAX_INLINE_VIEW_LEN {
286
            // # Safety
287
            // The view is valid from the builder
288
0
            unsafe { GenericByteViewArray::<T>::inline_value(view, len as usize) }
289
        } else {
290
0
            let view = ByteView::from(*view);
291
0
            if view.buffer_index < self.completed.len() as u32 {
292
0
                let block = &self.completed[view.buffer_index as usize];
293
0
                &block[view.offset as usize..view.offset as usize + view.length as usize]
294
            } else {
295
0
                &self.in_progress[view.offset as usize..view.offset as usize + view.length as usize]
296
            }
297
        }
298
0
    }
299
300
    /// Appends a value into the builder
301
    ///
302
    /// # Panics
303
    ///
304
    /// Panics if
305
    /// - String buffer count exceeds `u32::MAX`
306
    /// - String length exceeds `u32::MAX`
307
    #[inline]
308
2
    pub fn append_value(&mut self, value: impl AsRef<T::Native>) {
309
2
        let v: &[u8] = value.as_ref().as_ref();
310
2
        let length: u32 = v.len().try_into().unwrap();
311
2
        if length <= MAX_INLINE_VIEW_LEN {
312
2
            let mut view_buffer = [0; 16];
313
2
            view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
314
2
            view_buffer[4..4 + v.len()].copy_from_slice(v);
315
2
            self.views_buffer.push(u128::from_le_bytes(view_buffer));
316
2
            self.null_buffer_builder.append_non_null();
317
2
            return;
318
0
        }
319
320
        // Deduplication if:
321
        // (1) deduplication is enabled.
322
        // (2) len > 12
323
0
        if let Some((mut ht, hasher)) = self.string_tracker.take() {
324
0
            let hash_val = hasher.hash_one(v);
325
0
            let hasher_fn = |v: &_| hasher.hash_one(v);
326
327
0
            let entry = ht.entry(
328
0
                hash_val,
329
0
                |idx| {
330
0
                    let stored_value = self.get_value(*idx);
331
0
                    v == stored_value
332
0
                },
333
0
                hasher_fn,
334
            );
335
0
            match entry {
336
0
                Entry::Occupied(occupied) => {
337
                    // If the string already exists, we will directly use the view
338
0
                    let idx = occupied.get();
339
0
                    self.views_buffer.push(self.views_buffer[*idx]);
340
0
                    self.null_buffer_builder.append_non_null();
341
0
                    self.string_tracker = Some((ht, hasher));
342
0
                    return;
343
                }
344
0
                Entry::Vacant(vacant) => {
345
0
                    // o.w. we insert the (string hash -> view index)
346
0
                    // the idx is current length of views_builder, as we are inserting a new view
347
0
                    vacant.insert(self.views_buffer.len());
348
0
                }
349
            }
350
0
            self.string_tracker = Some((ht, hasher));
351
0
        }
352
353
0
        let required_cap = self.in_progress.len() + v.len();
354
0
        if self.in_progress.capacity() < required_cap {
355
0
            self.flush_in_progress();
356
0
            let to_reserve = v.len().max(self.block_size.next_size() as usize);
357
0
            self.in_progress.reserve(to_reserve);
358
0
        };
359
0
        let offset = self.in_progress.len() as u32;
360
0
        self.in_progress.extend_from_slice(v);
361
362
0
        let view = ByteView {
363
0
            length,
364
0
            prefix: u32::from_le_bytes(v[0..4].try_into().unwrap()),
365
0
            buffer_index: self.completed.len() as u32,
366
0
            offset,
367
0
        };
368
0
        self.views_buffer.push(view.into());
369
0
        self.null_buffer_builder.append_non_null();
370
2
    }
371
372
    /// Append an `Option` value into the builder
373
    #[inline]
374
0
    pub fn append_option(&mut self, value: Option<impl AsRef<T::Native>>) {
375
0
        match value {
376
0
            None => self.append_null(),
377
0
            Some(v) => self.append_value(v),
378
        };
379
0
    }
380
381
    /// Append a null value into the builder
382
    #[inline]
383
0
    pub fn append_null(&mut self) {
384
0
        self.null_buffer_builder.append_null();
385
0
        self.views_buffer.push(0);
386
0
    }
387
388
    /// Builds the [`GenericByteViewArray`] and reset this builder
389
1
    pub fn finish(&mut self) -> GenericByteViewArray<T> {
390
1
        self.flush_in_progress();
391
1
        let completed = std::mem::take(&mut self.completed);
392
1
        let nulls = self.null_buffer_builder.finish();
393
1
        if let Some((
ref mut ht0
, _)) = self.string_tracker.as_mut() {
394
0
            ht.clear();
395
1
        }
396
1
        let views = std::mem::take(&mut self.views_buffer);
397
        // SAFETY: valid by construction
398
1
        unsafe { GenericByteViewArray::new_unchecked(views.into(), completed, nulls) }
399
1
    }
400
401
    /// Builds the [`GenericByteViewArray`] without resetting the builder
402
0
    pub fn finish_cloned(&self) -> GenericByteViewArray<T> {
403
0
        let mut completed = self.completed.clone();
404
0
        if !self.in_progress.is_empty() {
405
0
            completed.push(Buffer::from_slice_ref(&self.in_progress));
406
0
        }
407
0
        let len = self.views_buffer.len();
408
0
        let views = Buffer::from_slice_ref(self.views_buffer.as_slice());
409
0
        let views = ScalarBuffer::new(views, 0, len);
410
0
        let nulls = self.null_buffer_builder.finish_cloned();
411
        // SAFETY: valid by construction
412
0
        unsafe { GenericByteViewArray::new_unchecked(views, completed, nulls) }
413
0
    }
414
415
    /// Returns the current null buffer as a slice
416
    pub fn validity_slice(&self) -> Option<&[u8]> {
417
        self.null_buffer_builder.as_slice()
418
    }
419
420
    /// Return the allocated size of this builder in bytes, useful for memory accounting.
421
    pub fn allocated_size(&self) -> usize {
422
        let views = self.views_buffer.capacity() * std::mem::size_of::<u128>();
423
        let null = self.null_buffer_builder.allocated_size();
424
        let buffer_size = self.completed.iter().map(|b| b.capacity()).sum::<usize>();
425
        let in_progress = self.in_progress.capacity();
426
        let tracker = match &self.string_tracker {
427
            Some((ht, _)) => ht.capacity() * std::mem::size_of::<usize>(),
428
            None => 0,
429
        };
430
        buffer_size + in_progress + tracker + views + null
431
    }
432
}
433
434
impl<T: ByteViewType + ?Sized> Default for GenericByteViewBuilder<T> {
435
    fn default() -> Self {
436
        Self::new()
437
    }
438
}
439
440
/// Debug output is the type prefix followed by `ViewBuilder` and the views,
/// the in-progress block, the completed blocks and the null buffer builder.
impl<T: ByteViewType + ?Sized> std::fmt::Debug for GenericByteViewBuilder<T> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // The struct name is written by hand so that it can carry `T::PREFIX`;
        // the debug struct itself is therefore given an empty name.
        write!(f, "{}ViewBuilder", T::PREFIX)?;

        let mut out = f.debug_struct("");
        out.field("views_buffer", &self.views_buffer);
        out.field("in_progress", &self.in_progress);
        out.field("completed", &self.completed);
        out.field("null_buffer_builder", &self.null_buffer_builder);
        out.finish()
    }
}
451
452
impl<T: ByteViewType + ?Sized> ArrayBuilder for GenericByteViewBuilder<T> {
453
0
    fn len(&self) -> usize {
454
0
        self.null_buffer_builder.len()
455
0
    }
456
457
0
    fn finish(&mut self) -> ArrayRef {
458
0
        Arc::new(self.finish())
459
0
    }
460
461
0
    fn finish_cloned(&self) -> ArrayRef {
462
0
        Arc::new(self.finish_cloned())
463
0
    }
464
465
0
    fn as_any(&self) -> &dyn Any {
466
0
        self
467
0
    }
468
469
0
    fn as_any_mut(&mut self) -> &mut dyn Any {
470
0
        self
471
0
    }
472
473
0
    fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
474
0
        self
475
0
    }
476
}
477
478
impl<T: ByteViewType + ?Sized, V: AsRef<T::Native>> Extend<Option<V>>
479
    for GenericByteViewBuilder<T>
480
{
481
    #[inline]
482
0
    fn extend<I: IntoIterator<Item = Option<V>>>(&mut self, iter: I) {
483
0
        for v in iter {
484
0
            self.append_option(v)
485
        }
486
0
    }
487
}
488
489
/// Array builder for [`StringViewArray`][crate::StringViewArray]
490
///
491
/// Values can be appended using [`GenericByteViewBuilder::append_value`], and nulls with
492
/// [`GenericByteViewBuilder::append_null`] as normal.
493
///
494
/// # Example
495
/// ```
496
/// # use arrow_array::builder::StringViewBuilder;
497
/// # use arrow_array::StringViewArray;
498
/// let mut builder = StringViewBuilder::new();
499
/// builder.append_value("hello");
500
/// builder.append_null();
501
/// builder.append_value("world");
502
/// let array = builder.finish();
503
///
504
/// let expected = vec![Some("hello"), None, Some("world")];
505
/// let actual: Vec<_> = array.iter().collect();
506
/// assert_eq!(expected, actual);
507
/// ```
508
pub type StringViewBuilder = GenericByteViewBuilder<StringViewType>;
509
510
///  Array builder for [`BinaryViewArray`][crate::BinaryViewArray]
511
///
512
/// Values can be appended using [`GenericByteViewBuilder::append_value`], and nulls with
513
/// [`GenericByteViewBuilder::append_null`] as normal.
514
///
515
/// # Example
516
/// ```
517
/// # use arrow_array::builder::BinaryViewBuilder;
518
/// use arrow_array::BinaryViewArray;
519
/// let mut builder = BinaryViewBuilder::new();
520
/// builder.append_value("hello");
521
/// builder.append_null();
522
/// builder.append_value("world");
523
/// let array = builder.finish();
524
///
525
/// let expected: Vec<Option<&[u8]>> = vec![Some(b"hello"), None, Some(b"world")];
526
/// let actual: Vec<_> = array.iter().collect();
527
/// assert_eq!(expected, actual);
528
/// ```
529
///
530
pub type BinaryViewBuilder = GenericByteViewBuilder<BinaryViewType>;
531
532
/// Builds an inline view from the first `LEN` bytes of `data`.
///
/// `LEN` is a compile time constant so that the compiler can generate
/// specialized code for each length. The view is the little-endian `u32`
/// length followed by the bytes, zero padded to 16 bytes.
///
/// Panics if `LEN` is larger than 12 or if `data` holds fewer than `LEN` bytes.
fn make_inlined_view<const LEN: usize>(data: &[u8]) -> u128 {
    let mut bytes = [0u8; 16];
    let (len_field, payload) = bytes.split_at_mut(4);
    len_field.copy_from_slice(&(LEN as u32).to_le_bytes());
    payload[..LEN].copy_from_slice(&data[..LEN]);
    u128::from_le_bytes(bytes)
}
540
541
/// Create a view based on the given data, block id and offset.
542
///
543
/// Note that the code below is carefully examined with x86_64 assembly code: <https://godbolt.org/z/685YPsd5G>
544
/// The goal is to avoid calling into `ptr::copy_non_interleave`, which makes function call (i.e., not inlined),
545
/// which slows down things.
546
#[inline(never)]
547
0
pub fn make_view(data: &[u8], block_id: u32, offset: u32) -> u128 {
548
0
    let len = data.len();
549
550
    // Generate specialized code for each potential small string length
551
    // to improve performance
552
0
    match len {
553
0
        0 => make_inlined_view::<0>(data),
554
0
        1 => make_inlined_view::<1>(data),
555
0
        2 => make_inlined_view::<2>(data),
556
0
        3 => make_inlined_view::<3>(data),
557
0
        4 => make_inlined_view::<4>(data),
558
0
        5 => make_inlined_view::<5>(data),
559
0
        6 => make_inlined_view::<6>(data),
560
0
        7 => make_inlined_view::<7>(data),
561
0
        8 => make_inlined_view::<8>(data),
562
0
        9 => make_inlined_view::<9>(data),
563
0
        10 => make_inlined_view::<10>(data),
564
0
        11 => make_inlined_view::<11>(data),
565
0
        12 => make_inlined_view::<12>(data),
566
        // When string is longer than 12 bytes, it can't be inlined, we create a ByteView instead.
567
        _ => {
568
0
            let view = ByteView {
569
0
                length: len as u32,
570
0
                prefix: u32::from_le_bytes(data[0..4].try_into().unwrap()),
571
0
                buffer_index: block_id,
572
0
                offset,
573
0
            };
574
0
            view.as_u128()
575
        }
576
    }
577
0
}
578
579
#[cfg(test)]
580
mod tests {
581
    use core::str;
582
583
    use super::*;
584
    use crate::Array;
585
586
    /// Repeated long values must reuse the view (and bytes) of their first
    /// occurrence when deduplication is enabled.
    #[test]
    fn test_string_view_deduplicate() {
        let first = "long string to test string view";
        let second = "not so similar string but long";

        // A block fits only two long values, so storing the repeated values
        // again would require multiple buffers
        let block_size = first.len() as u32 * 2;
        let mut builder = StringViewBuilder::new()
            .with_deduplicate_strings()
            .with_fixed_block_size(block_size);

        let values = vec![
            Some(first),
            Some(second),
            Some("short"),
            Some(first),
            None,
            Some(second),
            Some(first),
        ];
        builder.extend(values.clone());

        let array = builder.finish_cloned();
        array.to_data().validate_full().unwrap();
        assert_eq!(array.data_buffers().len(), 1); // without deduplication we would need 3 buffers.
        let actual: Vec<_> = array.iter().collect();
        assert_eq!(actual, values);

        let views = array.views();

        // rows 0, 3 and 6 all hold `first`
        assert_eq!(views[0], views[3]);
        assert_eq!(views[0], views[6]);

        // rows 1 and 5 both hold `second`
        assert_eq!(views[1], views[5]);
    }
621
622
    /// A deduplicating builder must stay usable across several `finish` calls,
    /// including for a value that was already seen before an earlier `finish`.
    #[test]
    fn test_string_view_deduplicate_after_finish() {
        let mut builder = StringViewBuilder::new().with_deduplicate_strings();

        let first = "long string to test string view";
        let second = "not so similar string but long";

        for value in [first, second, first] {
            builder.append_value(value);
            let _array = builder.finish();
        }
    }
635
636
    /// Mixes appended blocks, appended values and views into both, then checks
    /// the resulting array and the errors reported for invalid views.
    #[test]
    fn test_string_view() {
        let b1 = Buffer::from(b"world\xFFbananas\xF0\x9F\x98\x81");
        let b2 = Buffer::from(b"cupcakes");
        let b3 = Buffer::from(b"Many strings are here contained of great length and verbosity");

        let mut v = StringViewBuilder::new();
        assert_eq!(v.append_block(b1), 0);

        // Both values go to the in-progress block, which becomes block 1 once
        // the next block is appended
        v.append_value("This is a very long string that exceeds the inline length");
        v.append_value("This is another very long string that exceeds the inline length");

        assert_eq!(v.append_block(b2), 2);
        assert_eq!(v.append_block(b3), 3);

        // Test short strings
        v.try_append_view(0, 0, 5).unwrap(); // world
        v.try_append_view(0, 6, 7).unwrap(); // bananas
        v.try_append_view(2, 3, 5).unwrap(); // cakes
        v.try_append_view(2, 0, 3).unwrap(); // cup
        v.try_append_view(2, 0, 8).unwrap(); // cupcakes
        v.try_append_view(0, 13, 4).unwrap(); // 😁
        v.try_append_view(0, 13, 0).unwrap(); // (empty)

        // Test longer strings
        v.try_append_view(3, 0, 16).unwrap(); // Many strings are
        v.try_append_view(1, 0, 19).unwrap(); // This is a very long
        v.try_append_view(3, 13, 27).unwrap(); // are here contained of great

        v.append_value("I do so like long strings");

        let array = v.finish_cloned();
        array.to_data().validate_full().unwrap();
        assert_eq!(array.data_buffers().len(), 5);

        let expected = [
            "This is a very long string that exceeds the inline length",
            "This is another very long string that exceeds the inline length",
            "world",
            "bananas",
            "cakes",
            "cup",
            "cupcakes",
            "😁",
            "",
            "Many strings are",
            "This is a very long",
            "are here contained of great",
            "I do so like long strings",
        ];
        let actual: Vec<_> = array.iter().flatten().collect();
        assert_eq!(actual, &expected);

        // Offset past the end of the block
        let err = v.try_append_view(0, u32::MAX, 1).unwrap_err();
        assert_eq!(err.to_string(), "Invalid argument error: Range 4294967295..4294967296 out of bounds for block of length 17");

        // Length past the end of the block
        let err = v.try_append_view(0, 1, u32::MAX).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Range 1..4294967296 out of bounds for block of length 17"
        );

        // Range cutting through a multi-byte character
        let err = v.try_append_view(0, 13, 2).unwrap_err();
        assert_eq!(err.to_string(), "Invalid argument error: Invalid view data");

        // Empty range that starts outside of the block
        let err = v.try_append_view(0, 40, 0).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Range 40..40 out of bounds for block of length 17"
        );

        // Unknown block
        let err = v.try_append_view(5, 0, 0).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: No block found with index 5"
        );
    }
714
715
    /// Compares the number and size of blocks produced by the default
    /// (exponential) strategy with those of a fixed block size.
    #[test]
    fn test_string_view_with_block_size_growth() {
        let mut exp_builder = StringViewBuilder::new();
        let mut fixed_builder = StringViewBuilder::new().with_fixed_block_size(STARTING_BLOCK_SIZE);

        let value_len = STARTING_BLOCK_SIZE as usize;
        let long_string = str::from_utf8(&[b'a'; STARTING_BLOCK_SIZE as usize]).unwrap();

        for step in 0..9_u32 {
            // 8k, 16k, 32k, 64k, 128k, 256k, 512k, 1M, 2M
            let values_in_step = 2_u32.pow(step);
            for _ in 0..values_in_step {
                exp_builder.append_value(long_string);
                fixed_builder.append_value(long_string);
            }
            exp_builder.flush_in_progress();
            fixed_builder.flush_in_progress();

            // Every step only add one buffer, but the buffer size is much larger
            let exp_blocks = &exp_builder.completed;
            assert_eq!(exp_blocks.len(), step as usize + 1);
            assert_eq!(
                exp_blocks[step as usize].len(),
                value_len * 2_usize.pow(step)
            );

            // This step we added 2^i blocks, the sum of blocks should be 2^(i+1) - 1
            let fixed_blocks = &fixed_builder.completed;
            assert_eq!(fixed_blocks.len(), 2_usize.pow(step + 1) - 1);

            // Every buffer is fixed size
            assert!(fixed_blocks.iter().all(|b| b.len() == value_len));
        }

        // Add one more value, and the buffer stop growing.
        exp_builder.append_value(long_string);
        exp_builder.flush_in_progress();
        let last_block = exp_builder.completed.last().unwrap();
        assert_eq!(last_block.capacity(), MAX_BLOCK_SIZE as usize);
    }
756
}