Coverage Report

Created: 2025-11-17 14:14

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/Users/andrewlamb/Software/arrow-rs/arrow-array/src/array/byte_view_array.rs
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
use crate::array::print_long_array;
19
use crate::builder::{ArrayBuilder, GenericByteViewBuilder};
20
use crate::iterator::ArrayIter;
21
use crate::types::bytes::ByteArrayNativeType;
22
use crate::types::{BinaryViewType, ByteViewType, StringViewType};
23
use crate::{Array, ArrayAccessor, ArrayRef, GenericByteArray, OffsetSizeTrait, Scalar};
24
use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, ScalarBuffer};
25
use arrow_data::{ArrayData, ArrayDataBuilder, ByteView, MAX_INLINE_VIEW_LEN};
26
use arrow_schema::{ArrowError, DataType};
27
use core::str;
28
use num_traits::ToPrimitive;
29
use std::any::Any;
30
use std::cmp::Ordering;
31
use std::fmt::Debug;
32
use std::marker::PhantomData;
33
use std::sync::Arc;
34
35
use super::ByteArrayType;
36
37
/// [Variable-size Binary View Layout]: An array of variable length bytes views.
38
///
39
/// This array type is used to store variable length byte data (e.g. Strings, Binary)
40
/// and has efficient operations such as `take`, `filter`, and comparison.
41
///
42
/// [Variable-size Binary View Layout]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout
43
///
44
/// This is different from [`GenericByteArray`], which also stores variable
45
/// length byte data, as it represents strings with an offset and length. `take`
46
/// and `filter` like operations are implemented by manipulating the "views"
47
/// (`u128`) without modifying the bytes. Each view also stores an inlined
48
/// prefix which speeds up comparisons.
49
///
50
/// # See Also
51
///
52
/// * [`StringViewArray`] for storing utf8 encoded string data
53
/// * [`BinaryViewArray`] for storing bytes
54
/// * [`ByteView`] to interpret `u128`s layout of the views.
55
///
56
/// [`ByteView`]: arrow_data::ByteView
57
///
58
/// # Layout: "views" and buffers
59
///
60
/// A `GenericByteViewArray` stores variable length byte strings. An array of
61
/// `N` elements is stored as `N` fixed length "views" and a variable number
62
/// of variable length "buffers".
63
///
64
/// Each view is a `u128` value whose layout is different depending on the
65
/// length of the string stored at that location:
66
///
67
/// ```text
68
///                         ┌──────┬────────────────────────┐
69
///                         │length│      string value      │
70
///    Strings (len <= 12)  │      │    (padded with 0)     │
71
///                         └──────┴────────────────────────┘
72
///                          0    31                      127
73
///
74
///                         ┌───────┬───────┬───────┬───────┐
75
///                         │length │prefix │  buf  │offset │
76
///    Strings (len > 12)   │       │       │ index │       │
77
///                         └───────┴───────┴───────┴───────┘
78
///                          0    31       63      95    127
79
/// ```
80
///
81
/// * Strings with length <= 12 ([`MAX_INLINE_VIEW_LEN`]) are stored directly in
82
///   the view. See [`Self::inline_value`] to access the inlined prefix from a
83
///   short view.
84
///
85
/// * Strings with length > 12: The first four bytes are stored inline in the
86
///   view and the entire string is stored in one of the buffers. See [`ByteView`]
87
///   to access the fields of these views.
88
///
89
/// As with other arrays, the optimized kernels in [`arrow_compute`] are likely
90
/// the easiest and fastest way to work with this data. However, it is possible
91
/// to access the views and buffers directly for more control.
92
///
93
/// For example
94
///
95
/// ```rust
96
/// # use arrow_array::StringViewArray;
97
/// # use arrow_array::Array;
98
/// use arrow_data::ByteView;
99
/// let array = StringViewArray::from(vec![
100
///   "hello",
101
///   "this string is longer than 12 bytes",
102
///   "this string is also longer than 12 bytes"
103
/// ]);
104
///
105
/// // ** Examine the first view (short string) **
106
/// assert!(array.is_valid(0)); // Check for nulls
107
/// let short_view: u128 = array.views()[0]; // "hello"
108
/// // get length of the string
109
/// let len = short_view as u32;
110
/// assert_eq!(len, 5); // strings of 12 bytes or fewer are stored in the view
111
/// // SAFETY: `view` is a valid view
112
/// let value = unsafe {
113
///   StringViewArray::inline_value(&short_view, len as usize)
114
/// };
115
/// assert_eq!(value, b"hello");
116
///
117
/// // ** Examine the third view (long string) **
118
/// assert!(array.is_valid(2)); // Check for nulls
119
/// let long_view: u128 = array.views()[2]; // "this string is also longer than 12 bytes"
120
/// let len = long_view as u32;
121
/// assert_eq!(len, 40); // strings longer than 12 bytes are stored in the buffer
122
/// let view = ByteView::from(long_view); // use ByteView to access the fields
123
/// assert_eq!(view.length, 40);
124
/// assert_eq!(view.buffer_index, 0);
125
/// assert_eq!(view.offset, 35); // data starts after the first long string
126
/// // Views for long strings store a 4 byte prefix
127
/// let prefix = view.prefix.to_le_bytes();
128
/// assert_eq!(&prefix, b"this");
129
/// let value = array.value(2); // get the string value (see `value` implementation for how to access the bytes directly)
130
/// assert_eq!(value, "this string is also longer than 12 bytes");
131
/// ```
132
///
133
/// [`MAX_INLINE_VIEW_LEN`]: arrow_data::MAX_INLINE_VIEW_LEN
134
/// [`arrow_compute`]: https://docs.rs/arrow/latest/arrow/compute/index.html
135
///
136
/// Unlike [`GenericByteArray`], there are no constraints on the offsets other
137
/// than they must point into a valid buffer. In particular, they can be out of order,
138
/// non-contiguous and overlapping.
139
///
140
/// For example, in the following diagram, the strings "FishWasInTownTodayYay" and
141
/// "CrumpleFacedFish" are both longer than 12 bytes and thus are stored in a
142
/// separate buffer while the string "LavaMonster" is stored inlined in the
143
/// view. In this case, the same bytes for "Fish" are used to store both strings.
144
///
145
/// [`ByteView`]: arrow_data::ByteView
146
///
147
/// ```text
148
///                                                                            ┌───┐
149
///                         ┌──────┬──────┬──────┬──────┐               offset │...│
150
/// "FishWasInTownTodayYay" │  21  │ Fish │  0   │ 115  │─ ─              103  │Mr.│
151
///                         └──────┴──────┴──────┴──────┘   │      ┌ ─ ─ ─ ─ ▶ │Cru│
152
///                         ┌──────┬──────┬──────┬──────┐                      │mpl│
153
/// "CrumpleFacedFish"      │  16  │ Crum │  0   │ 103  │─ ─│─ ─ ─ ┘           │eFa│
154
///                         └──────┴──────┴──────┴──────┘                      │ced│
155
///                         ┌──────┬────────────────────┐   └ ─ ─ ─ ─ ─ ─ ─ ─ ▶│Fis│
156
/// "LavaMonster"           │  11  │   LavaMonster      │                      │hWa│
157
///                         └──────┴────────────────────┘               offset │sIn│
158
///                                                                       115  │Tow│
159
///                                                                            │nTo│
160
///                                                                            │day│
161
///                                  u128 "views"                              │Yay│
162
///                                                                   buffer 0 │...│
163
///                                                                            └───┘
164
/// ```
165
pub struct GenericByteViewArray<T: ByteViewType + ?Sized> {
166
    /// Arrow data type of this array; the constructors shown here set it to `T::DATA_TYPE`
    data_type: DataType,
167
    /// One `u128` "view" per element (see the layout description above)
    views: ScalarBuffer<u128>,
168
    /// Data buffers that the views of long (non-inlined) values point into
    buffers: Vec<Buffer>,
169
    /// Zero-sized marker tying the array to its [`ByteViewType`]
    phantom: PhantomData<T>,
170
    /// Optional null buffer; `try_new` requires its length to equal `views.len()`
    nulls: Option<NullBuffer>,
171
}
172
173
impl<T: ByteViewType + ?Sized> Clone for GenericByteViewArray<T> {
174
108
    fn clone(&self) -> Self {
175
108
        Self {
176
108
            data_type: T::DATA_TYPE,
177
108
            views: self.views.clone(),
178
108
            buffers: self.buffers.clone(),
179
108
            nulls: self.nulls.clone(),
180
108
            phantom: Default::default(),
181
108
        }
182
108
    }
183
}
184
185
impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
186
    /// Create a new [`GenericByteViewArray`] from the provided parts, panicking on failure
187
    ///
188
    /// # Panics
189
    ///
190
    /// Panics if [`GenericByteViewArray::try_new`] returns an error
191
    pub fn new(views: ScalarBuffer<u128>, buffers: Vec<Buffer>, nulls: Option<NullBuffer>) -> Self {
192
        Self::try_new(views, buffers, nulls).unwrap()
193
    }
194
195
    /// Create a new [`GenericByteViewArray`] from the provided parts, returning an error on failure
196
    ///
197
    /// # Errors
198
    ///
199
    /// * `views.len() != nulls.len()`
200
    /// * [ByteViewType::validate] fails
201
2
    pub fn try_new(
202
2
        views: ScalarBuffer<u128>,
203
2
        buffers: Vec<Buffer>,
204
2
        nulls: Option<NullBuffer>,
205
2
    ) -> Result<Self, ArrowError> {
206
2
        T::validate(&views, &buffers)
?0
;
207
208
2
        if let Some(
n0
) = nulls.as_ref() {
209
0
            if n.len() != views.len() {
210
0
                return Err(ArrowError::InvalidArgumentError(format!(
211
0
                    "Incorrect length of null buffer for {}ViewArray, expected {} got {}",
212
0
                    T::PREFIX,
213
0
                    views.len(),
214
0
                    n.len(),
215
0
                )));
216
0
            }
217
2
        }
218
219
2
        Ok(Self {
220
2
            data_type: T::DATA_TYPE,
221
2
            views,
222
2
            buffers,
223
2
            nulls,
224
2
            phantom: Default::default(),
225
2
        })
226
2
    }
227
228
    /// Create a new [`GenericByteViewArray`] from the provided parts, without validation
229
    ///
230
    /// # Safety
231
    ///
232
    /// Safe if [`Self::try_new`] would not error
233
216
    pub unsafe fn new_unchecked(
234
216
        views: ScalarBuffer<u128>,
235
216
        buffers: Vec<Buffer>,
236
216
        nulls: Option<NullBuffer>,
237
216
    ) -> Self {
238
216
        if cfg!(feature = "force_validate") {
239
0
            return Self::new(views, buffers, nulls);
240
216
        }
241
242
216
        Self {
243
216
            data_type: T::DATA_TYPE,
244
216
            phantom: Default::default(),
245
216
            views,
246
216
            buffers,
247
216
            nulls,
248
216
        }
249
216
    }
250
251
    /// Create a new [`GenericByteViewArray`] of length `len` where all values are null
252
    pub fn new_null(len: usize) -> Self {
253
        Self {
254
            data_type: T::DATA_TYPE,
255
            views: vec![0; len].into(),
256
            buffers: vec![],
257
            nulls: Some(NullBuffer::new_null(len)),
258
            phantom: Default::default(),
259
        }
260
    }
261
262
    /// Create a new [`Scalar`] from `value`
263
    pub fn new_scalar(value: impl AsRef<T::Native>) -> Scalar<Self> {
264
        Scalar::new(Self::from_iter_values(std::iter::once(value)))
265
    }
266
267
    /// Creates a [`GenericByteViewArray`] based on an iterator of values without nulls
268
2
    pub fn from_iter_values<Ptr, I>(iter: I) -> Self
269
2
    where
270
2
        Ptr: AsRef<T::Native>,
271
2
        I: IntoIterator<Item = Ptr>,
272
    {
273
2
        let iter = iter.into_iter();
274
2
        let mut builder = GenericByteViewBuilder::<T>::with_capacity(iter.size_hint().0);
275
7
        for 
v5
in iter {
276
5
            builder.append_value(v);
277
5
        }
278
2
        builder.finish()
279
2
    }
280
281
    /// Deconstruct this array into its constituent parts
282
    pub fn into_parts(self) -> (ScalarBuffer<u128>, Vec<Buffer>, Option<NullBuffer>) {
283
        (self.views, self.buffers, self.nulls)
284
    }
285
286
    /// Returns the views buffer
287
    #[inline]
288
409
    pub fn views(&self) -> &ScalarBuffer<u128> {
289
409
        &self.views
290
409
    }
291
292
    /// Returns the buffers storing string data
293
    #[inline]
294
600
    pub fn data_buffers(&self) -> &[Buffer] {
295
600
        &self.buffers
296
600
    }
297
298
    /// Returns the element at index `i`
299
    ///
300
    /// Note: This method does not check for nulls and the value is arbitrary
301
    /// (but still well-defined) if [`is_null`](Self::is_null) returns true for the index.
302
    ///
303
    /// # Panics
304
    /// Panics if index `i` is out of bounds.
305
0
    pub fn value(&self, i: usize) -> &T::Native {
306
0
        assert!(
307
0
            i < self.len(),
308
0
            "Trying to access an element at index {} from a {}ViewArray of length {}",
309
            i,
310
            T::PREFIX,
311
0
            self.len()
312
        );
313
314
0
        unsafe { self.value_unchecked(i) }
315
0
    }
316
317
    /// Returns the element at index `i` without bounds checking
318
    ///
319
    /// Note: This method does not check for nulls and the value is arbitrary
320
    /// if [`is_null`](Self::is_null) returns true for the index.
321
    ///
322
    /// # Safety
323
    ///
324
    /// Caller is responsible for ensuring that the index is within the bounds
325
    /// of the array
326
47.9k
    pub unsafe fn value_unchecked(&self, idx: usize) -> &T::Native {
327
47.9k
        let v = unsafe { self.views.get_unchecked(idx) };
328
47.9k
        let len = *v as u32;
329
47.9k
        let b = if len <= MAX_INLINE_VIEW_LEN {
330
30.8k
            unsafe { Self::inline_value(v, len as usize) }
331
        } else {
332
17.1k
            let view = ByteView::from(*v);
333
17.1k
            let data = unsafe { self.buffers.get_unchecked(view.buffer_index as usize) };
334
17.1k
            let offset = view.offset as usize;
335
17.1k
            unsafe { data.get_unchecked(offset..offset + len as usize) }
336
        };
337
47.9k
        unsafe { T::Native::from_bytes_unchecked(b) }
338
47.9k
    }
339
340
    /// Returns the first `len` bytes the inline value of the view.
341
    ///
342
    /// # Safety
343
    /// - The `view` must be a valid element from `Self::views()` that adheres to the view layout.
344
    /// - The `len` must be the length of the inlined value. It should never be larger than [`MAX_INLINE_VIEW_LEN`].
345
    #[inline(always)]
346
30.8k
    pub unsafe fn inline_value(view: &u128, len: usize) -> &[u8] {
347
30.8k
        debug_assert!(len <= MAX_INLINE_VIEW_LEN as usize);
348
        unsafe {
349
30.8k
            std::slice::from_raw_parts((view as *const u128 as *const u8).wrapping_add(4), len)
350
        }
351
30.8k
    }
352
353
    /// Constructs a new iterator for iterating over the values of this array
354
87
    pub fn iter(&self) -> ArrayIter<&Self> {
355
87
        ArrayIter::new(self)
356
87
    }
357
358
    /// Returns an iterator over the bytes of this array, including null values
359
    pub fn bytes_iter(&self) -> impl Iterator<Item = &[u8]> {
360
        self.views.iter().map(move |v| {
361
            let len = *v as u32;
362
            if len <= MAX_INLINE_VIEW_LEN {
363
                unsafe { Self::inline_value(v, len as usize) }
364
            } else {
365
                let view = ByteView::from(*v);
366
                let data = &self.buffers[view.buffer_index as usize];
367
                let offset = view.offset as usize;
368
                unsafe { data.get_unchecked(offset..offset + len as usize) }
369
            }
370
        })
371
    }
372
373
    /// Returns an iterator over the first `prefix_len` bytes of each array
374
    /// element, including null values.
375
    ///
376
    /// If `prefix_len` is larger than the element's length, the iterator will
377
    /// return an empty slice (`&[]`).
378
    pub fn prefix_bytes_iter(&self, prefix_len: usize) -> impl Iterator<Item = &[u8]> {
379
        self.views().into_iter().map(move |v| {
380
            let len = (*v as u32) as usize;
381
382
            if len < prefix_len {
383
                return &[] as &[u8];
384
            }
385
386
            if prefix_len <= 4 || len as u32 <= MAX_INLINE_VIEW_LEN {
387
                unsafe { StringViewArray::inline_value(v, prefix_len) }
388
            } else {
389
                let view = ByteView::from(*v);
390
                let data = unsafe {
391
                    self.data_buffers()
392
                        .get_unchecked(view.buffer_index as usize)
393
                };
394
                let offset = view.offset as usize;
395
                unsafe { data.get_unchecked(offset..offset + prefix_len) }
396
            }
397
        })
398
    }
399
400
    /// Returns an iterator over the last `suffix_len` bytes of each array
401
    /// element, including null values.
402
    ///
403
    /// Note that for [`StringViewArray`] the last bytes may start in the middle
404
    /// of a UTF-8 codepoint, and thus may not be a valid `&str`.
405
    ///
406
    /// If `suffix_len` is larger than the element's length, the iterator will
407
    /// return an empty slice (`&[]`).
408
    pub fn suffix_bytes_iter(&self, suffix_len: usize) -> impl Iterator<Item = &[u8]> {
409
        self.views().into_iter().map(move |v| {
410
            let len = (*v as u32) as usize;
411
412
            if len < suffix_len {
413
                return &[] as &[u8];
414
            }
415
416
            if len as u32 <= MAX_INLINE_VIEW_LEN {
417
                unsafe { &StringViewArray::inline_value(v, len)[len - suffix_len..] }
418
            } else {
419
                let view = ByteView::from(*v);
420
                let data = unsafe {
421
                    self.data_buffers()
422
                        .get_unchecked(view.buffer_index as usize)
423
                };
424
                let offset = view.offset as usize;
425
                unsafe { data.get_unchecked(offset + len - suffix_len..offset + len) }
426
            }
427
        })
428
    }
429
430
    /// Returns a zero-copy slice of this array with the indicated offset and length.
431
53
    pub fn slice(&self, offset: usize, length: usize) -> Self {
432
        Self {
433
53
            data_type: T::DATA_TYPE,
434
53
            views: self.views.slice(offset, length),
435
53
            buffers: self.buffers.clone(),
436
53
            nulls: self.nulls.as_ref().map(|n| 
n35
.
slice35
(
offset35
,
length35
)),
437
53
            phantom: Default::default(),
438
        }
439
53
    }
440
441
    /// Returns a "compacted" version of this array
442
    ///
443
    /// The original array will *not* be modified
444
    ///
445
    /// # Garbage Collection
446
    ///
447
    /// Before GC:
448
    /// ```text
449
    ///                                        ┌──────┐
450
    ///                                        │......│
451
    ///                                        │......│
452
    /// ┌────────────────────┐       ┌ ─ ─ ─ ▶ │Data1 │   Large buffer
453
    /// │       View 1       │─ ─ ─ ─          │......│  with data that
454
    /// ├────────────────────┤                 │......│ is not referred
455
    /// │       View 2       │─ ─ ─ ─ ─ ─ ─ ─▶ │Data2 │ to by View 1 or
456
    /// └────────────────────┘                 │......│      View 2
457
    ///                                        │......│
458
    ///    2 views, refer to                   │......│
459
    ///   small portions of a                  └──────┘
460
    ///      large buffer
461
    /// ```
462
    ///
463
    /// After GC:
464
    ///
465
    /// ```text
466
    /// ┌────────────────────┐                 ┌─────┐    After gc, only
467
    /// │       View 1       │─ ─ ─ ─ ─ ─ ─ ─▶ │Data1│     data that is
468
    /// ├────────────────────┤       ┌ ─ ─ ─ ▶ │Data2│    pointed to by
469
    /// │       View 2       │─ ─ ─ ─          └─────┘     the views is
470
    /// └────────────────────┘                                 left
471
    ///
472
    ///
473
    ///         2 views
474
    /// ```
475
    /// This method will compact the data buffers by recreating the view array and only include the data
476
    /// that is pointed to by the views.
477
    ///
478
    /// Note that it will copy the array regardless of whether the original array is compact.
479
    /// Use with caution as this can be an expensive operation, only use it when you are sure that the view
480
    /// array is significantly smaller than when it is originally created, e.g., after filtering or slicing.
481
    ///
482
    /// Note: this function does not attempt to canonicalize / deduplicate values. For this
483
    /// feature see [`GenericByteViewBuilder::with_deduplicate_strings`].
484
    pub fn gc(&self) -> Self {
485
        // 1) Read basic properties once
486
        let len = self.len(); // number of elements
487
        let nulls = self.nulls().cloned(); // reuse & clone existing null bitmap
488
489
        // 1.5) Fast path: if there are no buffers, just reuse original views and no data blocks
490
        if self.data_buffers().is_empty() {
491
            return unsafe {
492
                GenericByteViewArray::new_unchecked(
493
                    self.views().clone(),
494
                    vec![], // empty data blocks
495
                    nulls,
496
                )
497
            };
498
        }
499
500
        // 2) Calculate total size of all non-inline data and detect if any exists
501
        let total_large = self.total_buffer_bytes_used();
502
503
        // 2.5) Fast path: if there is no non-inline data, avoid buffer allocation & processing
504
        if total_large == 0 {
505
            // Views are inline-only or all null; just reuse original views and no data blocks
506
            return unsafe {
507
                GenericByteViewArray::new_unchecked(
508
                    self.views().clone(),
509
                    vec![], // empty data blocks
510
                    nulls,
511
                )
512
            };
513
        }
514
515
        let (views_buf, data_blocks) = if total_large < i32::MAX as usize {
516
            // fast path, the entire data fits in a single buffer
517
            // 3) Allocate exactly capacity for all non-inline data
518
            let mut data_buf = Vec::with_capacity(total_large);
519
520
            // 4) Iterate over views and process each inline/non-inline view
521
            let views_buf: Vec<u128> = (0..len)
522
                .map(|i| unsafe { self.copy_view_to_buffer(i, 0, &mut data_buf) })
523
                .collect();
524
            let data_block = Buffer::from_vec(data_buf);
525
            let data_blocks = vec![data_block];
526
            (views_buf, data_blocks)
527
        } else {
528
            // slow path, need to split into multiple buffers
529
530
            struct GcCopyGroup {
531
                total_buffer_bytes: usize,
532
                total_len: usize,
533
            }
534
535
            impl GcCopyGroup {
536
0
                fn new(total_buffer_bytes: u32, total_len: usize) -> Self {
537
0
                    Self {
538
0
                        total_buffer_bytes: total_buffer_bytes as usize,
539
0
                        total_len,
540
0
                    }
541
0
                }
542
            }
543
544
            let mut groups = Vec::new();
545
            let mut current_length = 0;
546
            let mut current_elements = 0;
547
548
            for view in self.views() {
549
                let len = *view as u32;
550
                if len > MAX_INLINE_VIEW_LEN {
551
                    if current_length + len > i32::MAX as u32 {
552
                        // Start a new group
553
                        groups.push(GcCopyGroup::new(current_length, current_elements));
554
                        current_length = 0;
555
                        current_elements = 0;
556
                    }
557
                    current_length += len;
558
                    current_elements += 1;
559
                }
560
            }
561
            if current_elements != 0 {
562
                groups.push(GcCopyGroup::new(current_length, current_elements));
563
            }
564
            debug_assert!(groups.len() <= i32::MAX as usize);
565
566
            // 3) Copy the buffers group by group
567
            let mut views_buf = Vec::with_capacity(len);
568
            let mut data_blocks = Vec::with_capacity(groups.len());
569
570
            let mut current_view_idx = 0;
571
572
            for (group_idx, gc_copy_group) in groups.iter().enumerate() {
573
                let mut data_buf = Vec::with_capacity(gc_copy_group.total_buffer_bytes);
574
575
                // Directly push views to avoid intermediate Vec allocation
576
                let new_views = (current_view_idx..current_view_idx + gc_copy_group.total_len).map(
577
                    |view_idx| {
578
                        // safety: the view index came from iterating over valid range
579
                        unsafe {
580
                            self.copy_view_to_buffer(view_idx, group_idx as i32, &mut data_buf)
581
                        }
582
                    },
583
                );
584
                views_buf.extend(new_views);
585
586
                data_blocks.push(Buffer::from_vec(data_buf));
587
                current_view_idx += gc_copy_group.total_len;
588
            }
589
            (views_buf, data_blocks)
590
        };
591
592
        // 5) Wrap up views buffer
593
        let views_scalar = ScalarBuffer::from(views_buf);
594
595
        // SAFETY: views_scalar, data_blocks, and nulls are correctly aligned and sized
596
        unsafe { GenericByteViewArray::new_unchecked(views_scalar, data_blocks, nulls) }
597
    }
598
599
    /// Copy the i‑th view into `data_buf` if it refers to an out‑of‑line buffer.
600
    ///
601
    /// # Safety
602
    ///
603
    /// - `i < self.len()`.
604
    /// - Every element in `self.views()` must currently refer to a valid slice
605
    ///   inside one of `self.buffers`.
606
    /// - `data_buf` must be ready to have additional bytes appended.
607
    /// - After this call, the returned view will have its
608
    ///   `buffer_index` reset to `buffer_idx` and its `offset` updated so that it points
609
    ///   into the bytes just appended at the end of `data_buf`.
610
    #[inline(always)]
611
    unsafe fn copy_view_to_buffer(
612
        &self,
613
        i: usize,
614
        buffer_idx: i32,
615
        data_buf: &mut Vec<u8>,
616
    ) -> u128 {
617
        // SAFETY: `i < self.len()` ensures this is in‑bounds.
618
        let raw_view = unsafe { *self.views().get_unchecked(i) };
619
        let mut bv = ByteView::from(raw_view);
620
621
        // Inline‑small views stay as‑is.
622
        if bv.length <= MAX_INLINE_VIEW_LEN {
623
            raw_view
624
        } else {
625
            // SAFETY: `bv.buffer_index` and `bv.offset..bv.offset+bv.length`
626
            // must both lie within valid ranges for `self.buffers`.
627
            let buffer = unsafe { self.buffers.get_unchecked(bv.buffer_index as usize) };
628
            let start = bv.offset as usize;
629
            let end = start + bv.length as usize;
630
            let slice = unsafe { buffer.get_unchecked(start..end) };
631
632
            // Copy out‑of‑line data into our single “0” buffer.
633
            let new_offset = data_buf.len() as u32;
634
            data_buf.extend_from_slice(slice);
635
636
            bv.buffer_index = buffer_idx as u32;
637
            bv.offset = new_offset;
638
            bv.into()
639
        }
640
    }
641
642
    /// Returns the total number of bytes used by all non inlined views in all
643
    /// buffers.
644
    ///
645
    /// Note this does not account for views that point at the same underlying
646
    /// data in buffers
647
    ///
648
    /// For example, if the array has three strings views:
649
    /// * View with length = 9 (inlined)
650
    /// * View with length = 32 (non inlined)
651
    /// * View with length = 16 (non inlined)
652
    ///
653
    /// Then this method would report 48
654
90
    pub fn total_buffer_bytes_used(&self) -> usize {
655
90
        self.views()
656
90
            .iter()
657
27.5k
            .
map90
(|v| {
658
27.5k
                let len = *v as u32;
659
27.5k
                if len > MAX_INLINE_VIEW_LEN {
660
9.21k
                    len as usize
661
                } else {
662
18.2k
                    0
663
                }
664
27.5k
            })
665
90
            .sum()
666
90
    }
667
668
    /// Compare two [`GenericByteViewArray`] at index `left_idx` and `right_idx`
669
    ///
670
    /// Comparing two ByteView types are non-trivial.
671
    /// It takes a bit of patience to understand why we don't just compare two &[u8] directly.
672
    ///
673
    /// ByteView types give us the following two advantages, and we need to be careful not to lose them:
674
    /// (1) For string/byte smaller than [`MAX_INLINE_VIEW_LEN`] bytes, the entire data is inlined in the view.
675
    ///     Meaning that reading one array element requires only one memory access
676
    ///     (two memory access required for StringArray, one for offset buffer, the other for value buffer).
677
    ///
678
    /// (2) For string/byte larger than [`MAX_INLINE_VIEW_LEN`] bytes, we can still be faster than (for certain operations) StringArray/ByteArray,
679
    ///     thanks to the inlined 4 bytes.
680
    ///     Consider equality check:
681
    ///     If the first four bytes of the two strings are different, we can return false immediately (with just one memory access).
682
    ///
683
    /// If we directly compare two &[u8], we materialize the entire string (i.e., make multiple memory accesses), which might be unnecessary.
684
    /// - Most of the time (eq, ord), we only need to look at the first 4 bytes to know the answer,
685
    ///   e.g., if the inlined 4 bytes are different, we can directly return unequal without looking at the full string.
686
    ///
687
    /// # Order check flow
688
    /// (1) if both string are smaller than [`MAX_INLINE_VIEW_LEN`] bytes, we can directly compare the data inlined to the view.
689
    /// (2) if any of the string is larger than [`MAX_INLINE_VIEW_LEN`] bytes, we need to compare the full string.
690
    ///     (2.1) if the inlined 4 bytes are different, we can return the result immediately.
691
    ///     (2.2) o.w., we need to compare the full string.
692
    ///
693
    /// # Safety
694
    /// The left/right_idx must within range of each array
695
    pub unsafe fn compare_unchecked(
696
        left: &GenericByteViewArray<T>,
697
        left_idx: usize,
698
        right: &GenericByteViewArray<T>,
699
        right_idx: usize,
700
    ) -> Ordering {
701
        let l_view = unsafe { left.views().get_unchecked(left_idx) };
702
        let l_byte_view = ByteView::from(*l_view);
703
704
        let r_view = unsafe { right.views().get_unchecked(right_idx) };
705
        let r_byte_view = ByteView::from(*r_view);
706
707
        let l_len = l_byte_view.length;
708
        let r_len = r_byte_view.length;
709
710
        if l_len <= 12 && r_len <= 12 {
711
            return Self::inline_key_fast(*l_view).cmp(&Self::inline_key_fast(*r_view));
712
        }
713
714
        // one of the string is larger than 12 bytes,
715
        // we then try to compare the inlined data first
716
717
        // Note: In theory, ByteView is only used for string which is larger than 12 bytes,
718
        // but we can still use it to get the inlined prefix for shorter strings.
719
        // The prefix is always the first 4 bytes of the view, for both short and long strings.
720
        let l_inlined_be = l_byte_view.prefix.swap_bytes();
721
        let r_inlined_be = r_byte_view.prefix.swap_bytes();
722
        if l_inlined_be != r_inlined_be {
723
            return l_inlined_be.cmp(&r_inlined_be);
724
        }
725
726
        // unfortunately, we need to compare the full data
727
        let l_full_data: &[u8] = unsafe { left.value_unchecked(left_idx).as_ref() };
728
        let r_full_data: &[u8] = unsafe { right.value_unchecked(right_idx).as_ref() };
729
730
        l_full_data.cmp(r_full_data)
731
    }
732
733
    /// Packs the raw `u128` of an inline view into a 128-bit sort key.
    ///
    /// Layout of the returned key, from most to least significant bits:
    ///
    /// - Bits 127..32 (96 bits): bytes 4..16 of the view (the inline data), with the
    ///   first data byte in the most significant position.
    /// - Bits 31..0 (32 bits): the length stored in the view.
    ///
    /// Comparing two keys as plain integers therefore compares the inline bytes
    /// lexicographically and only falls back to the length when all 12 data bytes tie.
    ///
    /// # Why include the length?
    ///
    /// Inline data is zero padded, so the 12 data bytes alone cannot tell a value apart
    /// from the same value followed by `\0` bytes. Placing the length in the low 32 bits
    /// breaks that tie so that the shorter value sorts first.
    ///
    /// Example: comparing "bar" (3 bytes) vs "bar\0" (4 bytes)
    ///
    /// | String     | Bytes 0..4 (length LE) | Bytes 4..16 (data + padding) |
    /// |------------|------------------------|------------------------------|
    /// | `"bar"`    | `03 00 00 00`          | `62 61 72` + 9 x `00`        |
    /// | `"bar\0"`  | `04 00 00 00`          | `62 61 72 00` + 8 x `00`     |
    ///
    /// Both data parts are `62 61 72 00 .. 00`, so only the length differs:
    ///
    /// ```text
    /// key("bar")   = 0x62617200_00000000_00000000_00000003
    /// key("bar\0") = 0x62617200_00000000_00000000_00000004
    /// => key("bar") < key("bar\0")
    /// ```
    ///
    /// # How the key is built
    ///
    /// - `raw.swap_bytes()` reverses the 16 bytes, so the byte that is first in the view's
    ///   little-endian encoding becomes the most significant byte of the integer.
    /// - Shifting left by 32 bits discards the four (now leading) length bytes and leaves
    ///   the 12 data bytes in the high 96 bits with zeros below them.
    /// - `raw as u32` is the length as a number; OR-ing it into the low 32 bits is the
    ///   same as appending its big-endian bytes.
    ///
    /// All steps operate on integer values, so the result does not depend on the
    /// endianness of the host.
    ///
    /// Note: `compare_unchecked` only calls this when both lengths are at most 12, i.e.
    /// when bytes 4..16 of the view really are inline data.
    #[inline(always)]
    pub fn inline_key_fast(raw: u128) -> u128 {
        // Low 32 bits of the view: the length (numeric truncation, endianness-free).
        let length = raw as u32;

        // Data bytes in big-endian significance order, occupying bits 127..32.
        let data_high = raw.swap_bytes() << 32;

        data_high | u128::from(length)
    }
845
}
846
847
impl<T: ByteViewType + ?Sized> Debug for GenericByteViewArray<T> {
848
0
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
849
0
        write!(f, "{}ViewArray\n[\n", T::PREFIX)?;
850
0
        print_long_array(self, f, |array, index, f| {
851
0
            std::fmt::Debug::fmt(&array.value(index), f)
852
0
        })?;
853
0
        write!(f, "]")
854
0
    }
855
}
856
857
impl<T: ByteViewType + ?Sized> Array for GenericByteViewArray<T> {
858
510
    fn as_any(&self) -> &dyn Any {
859
510
        self
860
510
    }
861
862
108
    fn to_data(&self) -> ArrayData {
863
108
        self.clone().into()
864
108
    }
865
866
0
    fn into_data(self) -> ArrayData {
867
0
        self.into()
868
0
    }
869
870
520
    fn data_type(&self) -> &DataType {
871
520
        &self.data_type
872
520
    }
873
874
53
    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
875
53
        Arc::new(self.slice(offset, length))
876
53
    }
877
878
663
    fn len(&self) -> usize {
879
663
        self.views.len()
880
663
    }
881
882
0
    fn is_empty(&self) -> bool {
883
0
        self.views.is_empty()
884
0
    }
885
886
0
    fn shrink_to_fit(&mut self) {
887
0
        self.views.shrink_to_fit();
888
0
        self.buffers.iter_mut().for_each(|b| b.shrink_to_fit());
889
0
        self.buffers.shrink_to_fit();
890
0
        if let Some(nulls) = &mut self.nulls {
891
0
            nulls.shrink_to_fit();
892
0
        }
893
0
    }
894
895
0
    fn offset(&self) -> usize {
896
0
        0
897
0
    }
898
899
445
    fn nulls(&self) -> Option<&NullBuffer> {
900
445
        self.nulls.as_ref()
901
445
    }
902
903
0
    fn logical_null_count(&self) -> usize {
904
        // More efficient that the default implementation
905
0
        self.null_count()
906
0
    }
907
908
0
    fn get_buffer_memory_size(&self) -> usize {
909
0
        let mut sum = self.buffers.iter().map(|b| b.capacity()).sum::<usize>();
910
0
        sum += self.views.inner().capacity();
911
0
        if let Some(x) = &self.nulls {
912
0
            sum += x.buffer().capacity()
913
0
        }
914
0
        sum
915
0
    }
916
917
0
    fn get_array_memory_size(&self) -> usize {
918
0
        std::mem::size_of::<Self>() + self.get_buffer_memory_size()
919
0
    }
920
}
921
922
impl<'a, T: ByteViewType + ?Sized> ArrayAccessor for &'a GenericByteViewArray<T> {
923
    type Item = &'a T::Native;
924
925
    fn value(&self, index: usize) -> Self::Item {
926
        GenericByteViewArray::value(self, index)
927
    }
928
929
47.9k
    unsafe fn value_unchecked(&self, index: usize) -> Self::Item {
930
47.9k
        unsafe { GenericByteViewArray::value_unchecked(self, index) }
931
47.9k
    }
932
}
933
934
impl<'a, T: ByteViewType + ?Sized> IntoIterator for &'a GenericByteViewArray<T> {
935
    type Item = Option<&'a T::Native>;
936
    type IntoIter = ArrayIter<Self>;
937
938
    fn into_iter(self) -> Self::IntoIter {
939
        ArrayIter::new(self)
940
    }
941
}
942
943
impl<T: ByteViewType + ?Sized> From<ArrayData> for GenericByteViewArray<T> {
944
86
    fn from(value: ArrayData) -> Self {
945
86
        let views = value.buffers()[0].clone();
946
86
        let views = ScalarBuffer::new(views, value.offset(), value.len());
947
86
        let buffers = value.buffers()[1..].to_vec();
948
86
        Self {
949
86
            data_type: T::DATA_TYPE,
950
86
            views,
951
86
            buffers,
952
86
            nulls: value.nulls().cloned(),
953
86
            phantom: Default::default(),
954
86
        }
955
86
    }
956
}
957
958
/// Efficiently convert a [`GenericByteArray`] to a [`GenericByteViewArray`]
959
///
960
/// For example this method can convert a [`StringArray`] to a
961
/// [`StringViewArray`].
962
///
963
/// If the offsets are all less than u32::MAX, the new [`GenericByteViewArray`]
964
/// is built without copying the underlying string data (views are created
965
/// directly into the existing buffer)
966
///
967
/// [`StringArray`]: crate::StringArray
968
impl<FROM, V> From<&GenericByteArray<FROM>> for GenericByteViewArray<V>
969
where
970
    FROM: ByteArrayType,
971
    FROM::Offset: OffsetSizeTrait + ToPrimitive,
972
    V: ByteViewType<Native = FROM::Native>,
973
{
974
4
    fn from(byte_array: &GenericByteArray<FROM>) -> Self {
975
4
        let offsets = byte_array.offsets();
976
977
4
        let can_reuse_buffer = match offsets.last() {
978
4
            Some(offset) => offset.as_usize() < u32::MAX as usize,
979
0
            None => true,
980
        };
981
982
4
        if can_reuse_buffer {
983
            // build views directly pointing to the existing buffer
984
4
            let len = byte_array.len();
985
4
            let mut views_builder = GenericByteViewBuilder::<V>::with_capacity(len);
986
4
            let str_values_buf = byte_array.values().clone();
987
4
            let block = views_builder.append_block(str_values_buf);
988
20
            for (i, w) in 
offsets.windows(2)4
.
enumerate4
() {
989
20
                let offset = w[0].as_usize();
990
20
                let end = w[1].as_usize();
991
20
                let length = end - offset;
992
993
20
                if byte_array.is_null(i) {
994
4
                    views_builder.append_null();
995
4
                } else {
996
                    // Safety: the input was a valid array so it valid UTF8 (if string). And
997
                    // all offsets were valid
998
                    unsafe {
999
16
                        views_builder.append_view_unchecked(block, offset as u32, length as u32)
1000
                    }
1001
                }
1002
            }
1003
4
            assert_eq!(views_builder.len(), len);
1004
4
            views_builder.finish()
1005
        } else {
1006
            // Otherwise, create a new buffer for large strings
1007
            // TODO: the original buffer could still be used
1008
            // by making multiple slices of u32::MAX length
1009
0
            GenericByteViewArray::<V>::from_iter(byte_array.iter())
1010
        }
1011
4
    }
1012
}
1013
1014
impl<T: ByteViewType + ?Sized> From<GenericByteViewArray<T>> for ArrayData {
1015
108
    fn from(mut array: GenericByteViewArray<T>) -> Self {
1016
108
        let len = array.len();
1017
108
        array.buffers.insert(0, array.views.into_inner());
1018
108
        let builder = ArrayDataBuilder::new(T::DATA_TYPE)
1019
108
            .len(len)
1020
108
            .buffers(array.buffers)
1021
108
            .nulls(array.nulls);
1022
1023
108
        unsafe { builder.build_unchecked() }
1024
108
    }
1025
}
1026
1027
impl<'a, Ptr, T> FromIterator<&'a Option<Ptr>> for GenericByteViewArray<T>
1028
where
1029
    Ptr: AsRef<T::Native> + 'a,
1030
    T: ByteViewType + ?Sized,
1031
{
1032
1
    fn from_iter<I: IntoIterator<Item = &'a Option<Ptr>>>(iter: I) -> Self {
1033
1
        iter.into_iter()
1034
1.00k
            .
map1
(|o| o.as_ref().map(|p|
p667
.
as_ref667
()))
1035
1
            .collect()
1036
1
    }
1037
}
1038
1039
impl<Ptr, T: ByteViewType + ?Sized> FromIterator<Option<Ptr>> for GenericByteViewArray<T>
1040
where
1041
    Ptr: AsRef<T::Native>,
1042
{
1043
45
    fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self {
1044
45
        let iter = iter.into_iter();
1045
45
        let mut builder = GenericByteViewBuilder::<T>::with_capacity(iter.size_hint().0);
1046
45
        builder.extend(iter);
1047
45
        builder.finish()
1048
45
    }
1049
}
1050
1051
/// A [`GenericByteViewArray`] of `[u8]`
1052
///
1053
/// See [`GenericByteViewArray`] for format and layout details.
1054
///
1055
/// # Example
1056
/// ```
1057
/// use arrow_array::BinaryViewArray;
1058
/// let array = BinaryViewArray::from_iter_values(vec![b"hello" as &[u8], b"world", b"lulu", b"large payload over 12 bytes"]);
1059
/// assert_eq!(array.value(0), b"hello");
1060
/// assert_eq!(array.value(3), b"large payload over 12 bytes");
1061
/// ```
1062
pub type BinaryViewArray = GenericByteViewArray<BinaryViewType>;
1063
1064
impl BinaryViewArray {
1065
    /// Convert the [`BinaryViewArray`] to [`StringViewArray`]
1066
    /// If items not utf8 data, validate will fail and error returned.
1067
0
    pub fn to_string_view(self) -> Result<StringViewArray, ArrowError> {
1068
0
        StringViewType::validate(self.views(), self.data_buffers())?;
1069
0
        unsafe { Ok(self.to_string_view_unchecked()) }
1070
0
    }
1071
1072
    /// Convert the [`BinaryViewArray`] to [`StringViewArray`]
1073
    /// # Safety
1074
    /// Caller is responsible for ensuring that items in array are utf8 data.
1075
0
    pub unsafe fn to_string_view_unchecked(self) -> StringViewArray {
1076
0
        unsafe { StringViewArray::new_unchecked(self.views, self.buffers, self.nulls) }
1077
0
    }
1078
}
1079
1080
impl From<Vec<&[u8]>> for BinaryViewArray {
1081
0
    fn from(v: Vec<&[u8]>) -> Self {
1082
0
        Self::from_iter_values(v)
1083
0
    }
1084
}
1085
1086
impl From<Vec<Option<&[u8]>>> for BinaryViewArray {
1087
0
    fn from(v: Vec<Option<&[u8]>>) -> Self {
1088
0
        v.into_iter().collect()
1089
0
    }
1090
}
1091
1092
/// A [`GenericByteViewArray`] that stores utf8 data
1093
///
1094
/// See [`GenericByteViewArray`] for format and layout details.
1095
///
1096
/// # Example
1097
/// ```
1098
/// use arrow_array::StringViewArray;
1099
/// let array = StringViewArray::from_iter_values(vec!["hello", "world", "lulu", "large payload over 12 bytes"]);
1100
/// assert_eq!(array.value(0), "hello");
1101
/// assert_eq!(array.value(3), "large payload over 12 bytes");
1102
/// ```
1103
pub type StringViewArray = GenericByteViewArray<StringViewType>;
1104
1105
impl StringViewArray {
1106
    /// Convert the [`StringViewArray`] to [`BinaryViewArray`]
1107
0
    pub fn to_binary_view(self) -> BinaryViewArray {
1108
0
        unsafe { BinaryViewArray::new_unchecked(self.views, self.buffers, self.nulls) }
1109
0
    }
1110
1111
    /// Returns true if all data within this array is ASCII
1112
0
    pub fn is_ascii(&self) -> bool {
1113
        // Alternative (but incorrect): directly check the underlying buffers
1114
        // (1) Our string view might be sparse, i.e., a subset of the buffers,
1115
        //      so even if the buffer is not ascii, we can still be ascii.
1116
        // (2) It is quite difficult to know the range of each buffer (unlike StringArray)
1117
        // This means that this operation is quite expensive, shall we cache the result?
1118
        //  i.e. track `is_ascii` in the builder.
1119
0
        self.iter().all(|v| match v {
1120
0
            Some(v) => v.is_ascii(),
1121
0
            None => true,
1122
0
        })
1123
0
    }
1124
}
1125
1126
impl From<Vec<&str>> for StringViewArray {
1127
2
    fn from(v: Vec<&str>) -> Self {
1128
2
        Self::from_iter_values(v)
1129
2
    }
1130
}
1131
1132
impl From<Vec<Option<&str>>> for StringViewArray {
1133
2
    fn from(v: Vec<Option<&str>>) -> Self {
1134
2
        v.into_iter().collect()
1135
2
    }
1136
}
1137
1138
impl From<Vec<String>> for StringViewArray {
1139
0
    fn from(v: Vec<String>) -> Self {
1140
0
        Self::from_iter_values(v)
1141
0
    }
1142
}
1143
1144
impl From<Vec<Option<String>>> for StringViewArray {
1145
0
    fn from(v: Vec<Option<String>>) -> Self {
1146
0
        v.into_iter().collect()
1147
0
    }
1148
}
1149
1150
#[cfg(test)]
1151
mod tests {
1152
    use crate::builder::{BinaryViewBuilder, StringViewBuilder};
1153
    use crate::types::BinaryViewType;
1154
    use crate::{
1155
        Array, BinaryViewArray, GenericBinaryArray, GenericByteViewArray, StringViewArray,
1156
    };
1157
    use arrow_buffer::{Buffer, ScalarBuffer};
1158
    use arrow_data::{ByteView, MAX_INLINE_VIEW_LEN};
1159
    use rand::prelude::StdRng;
1160
    use rand::{Rng, SeedableRng};
1161
1162
    const BLOCK_SIZE: u32 = 8;
1163
1164
    #[test]
1165
    fn try_new_string() {
1166
        let array = StringViewArray::from_iter_values(vec![
1167
            "hello",
1168
            "world",
1169
            "lulu",
1170
            "large payload over 12 bytes",
1171
        ]);
1172
        assert_eq!(array.value(0), "hello");
1173
        assert_eq!(array.value(3), "large payload over 12 bytes");
1174
    }
1175
1176
    #[test]
1177
    fn try_new_binary() {
1178
        let array = BinaryViewArray::from_iter_values(vec![
1179
            b"hello".as_slice(),
1180
            b"world".as_slice(),
1181
            b"lulu".as_slice(),
1182
            b"large payload over 12 bytes".as_slice(),
1183
        ]);
1184
        assert_eq!(array.value(0), b"hello");
1185
        assert_eq!(array.value(3), b"large payload over 12 bytes");
1186
    }
1187
1188
    #[test]
1189
    fn try_new_empty_string() {
1190
        // test empty array
1191
        let array = {
1192
            let mut builder = StringViewBuilder::new();
1193
            builder.finish()
1194
        };
1195
        assert!(array.is_empty());
1196
    }
1197
1198
    #[test]
1199
    fn try_new_empty_binary() {
1200
        // test empty array
1201
        let array = {
1202
            let mut builder = BinaryViewBuilder::new();
1203
            builder.finish()
1204
        };
1205
        assert!(array.is_empty());
1206
    }
1207
1208
    #[test]
1209
    fn test_append_string() {
1210
        // test builder append
1211
        let array = {
1212
            let mut builder = StringViewBuilder::new();
1213
            builder.append_value("hello");
1214
            builder.append_null();
1215
            builder.append_option(Some("large payload over 12 bytes"));
1216
            builder.finish()
1217
        };
1218
        assert_eq!(array.value(0), "hello");
1219
        assert!(array.is_null(1));
1220
        assert_eq!(array.value(2), "large payload over 12 bytes");
1221
    }
1222
1223
    #[test]
1224
    fn test_append_binary() {
1225
        // test builder append
1226
        let array = {
1227
            let mut builder = BinaryViewBuilder::new();
1228
            builder.append_value(b"hello");
1229
            builder.append_null();
1230
            builder.append_option(Some(b"large payload over 12 bytes"));
1231
            builder.finish()
1232
        };
1233
        assert_eq!(array.value(0), b"hello");
1234
        assert!(array.is_null(1));
1235
        assert_eq!(array.value(2), b"large payload over 12 bytes");
1236
    }
1237
1238
    #[test]
1239
    fn test_in_progress_recreation() {
1240
        let array = {
1241
            // make a builder with small block size.
1242
            let mut builder = StringViewBuilder::new().with_fixed_block_size(14);
1243
            builder.append_value("large payload over 12 bytes");
1244
            builder.append_option(Some("another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created"));
1245
            builder.finish()
1246
        };
1247
        assert_eq!(array.value(0), "large payload over 12 bytes");
1248
        assert_eq!(
1249
            array.value(1),
1250
            "another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created"
1251
        );
1252
        assert_eq!(2, array.buffers.len());
1253
    }
1254
1255
    #[test]
1256
    #[should_panic(expected = "Invalid buffer index at 0: got index 3 but only has 1 buffers")]
1257
    fn new_with_invalid_view_data() {
1258
        let v = "large payload over 12 bytes";
1259
        let view = ByteView::new(13, &v.as_bytes()[0..4])
1260
            .with_buffer_index(3)
1261
            .with_offset(1);
1262
        let views = ScalarBuffer::from(vec![view.into()]);
1263
        let buffers = vec![Buffer::from_slice_ref(v)];
1264
        StringViewArray::new(views, buffers, None);
1265
    }
1266
1267
    #[test]
1268
    #[should_panic(
1269
        expected = "Encountered non-UTF-8 data at index 0: invalid utf-8 sequence of 1 bytes from index 0"
1270
    )]
1271
    fn new_with_invalid_utf8_data() {
1272
        let v: Vec<u8> = vec![
1273
            // invalid UTF8
1274
            0xf0, 0x80, 0x80, 0x80, // more bytes to make it larger than 12
1275
            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1276
        ];
1277
        let view = ByteView::new(v.len() as u32, &v[0..4]);
1278
        let views = ScalarBuffer::from(vec![view.into()]);
1279
        let buffers = vec![Buffer::from_slice_ref(v)];
1280
        StringViewArray::new(views, buffers, None);
1281
    }
1282
1283
    #[test]
1284
    #[should_panic(expected = "View at index 0 contained non-zero padding for string of length 1")]
1285
    fn new_with_invalid_zero_padding() {
1286
        let mut data = [0; 12];
1287
        data[0] = b'H';
1288
        data[11] = 1; // no zero padding
1289
1290
        let mut view_buffer = [0; 16];
1291
        view_buffer[0..4].copy_from_slice(&1u32.to_le_bytes());
1292
        view_buffer[4..].copy_from_slice(&data);
1293
1294
        let view = ByteView::from(u128::from_le_bytes(view_buffer));
1295
        let views = ScalarBuffer::from(vec![view.into()]);
1296
        let buffers = vec![];
1297
        StringViewArray::new(views, buffers, None);
1298
    }
1299
1300
    #[test]
1301
    #[should_panic(expected = "Mismatch between embedded prefix and data")]
1302
    fn test_mismatch_between_embedded_prefix_and_data() {
1303
        let input_str_1 = "Hello, Rustaceans!";
1304
        let input_str_2 = "Hallo, Rustaceans!";
1305
        let length = input_str_1.len() as u32;
1306
        assert!(input_str_1.len() > 12);
1307
1308
        let mut view_buffer = [0; 16];
1309
        view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
1310
        view_buffer[4..8].copy_from_slice(&input_str_1.as_bytes()[0..4]);
1311
        view_buffer[8..12].copy_from_slice(&0u32.to_le_bytes());
1312
        view_buffer[12..].copy_from_slice(&0u32.to_le_bytes());
1313
        let view = ByteView::from(u128::from_le_bytes(view_buffer));
1314
        let views = ScalarBuffer::from(vec![view.into()]);
1315
        let buffers = vec![Buffer::from_slice_ref(input_str_2.as_bytes())];
1316
1317
        StringViewArray::new(views, buffers, None);
1318
    }
1319
1320
    #[test]
1321
    fn test_gc() {
1322
        let test_data = [
1323
            Some("longer than 12 bytes"),
1324
            Some("short"),
1325
            Some("t"),
1326
            Some("longer than 12 bytes"),
1327
            None,
1328
            Some("short"),
1329
        ];
1330
1331
        let array = {
1332
            let mut builder = StringViewBuilder::new().with_fixed_block_size(8); // create multiple buffers
1333
            test_data.into_iter().for_each(|v| builder.append_option(v));
1334
            builder.finish()
1335
        };
1336
        assert!(array.buffers.len() > 1);
1337
1338
        fn check_gc(to_test: &StringViewArray) {
1339
            let gc = to_test.gc();
1340
            assert_ne!(to_test.data_buffers().len(), gc.data_buffers().len());
1341
1342
            to_test.iter().zip(gc.iter()).for_each(|(a, b)| {
1343
                assert_eq!(a, b);
1344
            });
1345
            assert_eq!(to_test.len(), gc.len());
1346
        }
1347
1348
        check_gc(&array);
1349
        check_gc(&array.slice(1, 3));
1350
        check_gc(&array.slice(2, 1));
1351
        check_gc(&array.slice(2, 2));
1352
        check_gc(&array.slice(3, 1));
1353
    }
1354
1355
    /// 1) Empty array: no elements, expect gc to return empty with no data buffers
1356
    #[test]
1357
    fn test_gc_empty_array() {
1358
        let array = StringViewBuilder::new()
1359
            .with_fixed_block_size(BLOCK_SIZE)
1360
            .finish();
1361
        let gced = array.gc();
1362
        // length and null count remain zero
1363
        assert_eq!(gced.len(), 0);
1364
        assert_eq!(gced.null_count(), 0);
1365
        // no underlying data buffers should be allocated
1366
        assert!(
1367
            gced.data_buffers().is_empty(),
1368
            "Expected no data buffers for empty array"
1369
        );
1370
    }
1371
1372
    /// 2) All inline values (<= INLINE_LEN): capacity-only data buffer, same values
1373
    #[test]
1374
    fn test_gc_all_inline() {
1375
        let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
1376
        // append many short strings, each exactly INLINE_LEN long
1377
        for _ in 0..100 {
1378
            let s = "A".repeat(MAX_INLINE_VIEW_LEN as usize);
1379
            builder.append_option(Some(&s));
1380
        }
1381
        let array = builder.finish();
1382
        let gced = array.gc();
1383
        // Since all views fit inline, data buffer is empty
1384
        assert_eq!(
1385
            gced.data_buffers().len(),
1386
            0,
1387
            "Should have no data buffers for inline values"
1388
        );
1389
        assert_eq!(gced.len(), 100);
1390
        // verify element-wise equality
1391
        array.iter().zip(gced.iter()).for_each(|(orig, got)| {
1392
            assert_eq!(orig, got, "Inline value mismatch after gc");
1393
        });
1394
    }
1395
1396
    /// 3) All large values (> INLINE_LEN): each must be copied into the new data buffer
1397
    #[test]
1398
    fn test_gc_all_large() {
1399
        let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
1400
        let large_str = "X".repeat(MAX_INLINE_VIEW_LEN as usize + 5);
1401
        // append multiple large strings
1402
        for _ in 0..50 {
1403
            builder.append_option(Some(&large_str));
1404
        }
1405
        let array = builder.finish();
1406
        let gced = array.gc();
1407
        // New data buffers should be populated (one or more blocks)
1408
        assert!(
1409
            !gced.data_buffers().is_empty(),
1410
            "Expected data buffers for large values"
1411
        );
1412
        assert_eq!(gced.len(), 50);
1413
        // verify that every large string emerges unchanged
1414
        array.iter().zip(gced.iter()).for_each(|(orig, got)| {
1415
            assert_eq!(orig, got, "Large view mismatch after gc");
1416
        });
1417
    }
1418
1419
    /// 4) All null elements: ensure null bitmap handling path is correct
1420
    #[test]
1421
    fn test_gc_all_nulls() {
1422
        let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
1423
        for _ in 0..20 {
1424
            builder.append_null();
1425
        }
1426
        let array = builder.finish();
1427
        let gced = array.gc();
1428
        // length and null count match
1429
        assert_eq!(gced.len(), 20);
1430
        assert_eq!(gced.null_count(), 20);
1431
        // data buffers remain empty for null-only array
1432
        assert!(
1433
            gced.data_buffers().is_empty(),
1434
            "No data should be stored for nulls"
1435
        );
1436
    }
1437
1438
    /// 5) Nulls, short values and long values mixed at random, then garbage
    /// collected through several different slices of the same array
    #[test]
    fn test_gc_random_mixed_and_slices() {
        let mut rng = StdRng::seed_from_u64(42);
        let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
        // Shadow copy of everything appended to the builder; the source of
        // truth the gc output is compared against
        let mut original: Vec<Option<String>> = Vec::new();

        for _ in 0..200 {
            // Each entry is null with probability 0.1
            if rng.random_bool(0.1) {
                builder.append_null();
                original.push(None);
                continue;
            }

            // Otherwise draw a length from 0..(2 * MAX_INLINE_VIEW_LEN)
            let len = rng.random_range(0..(MAX_INLINE_VIEW_LEN * 2));
            let s: String = "A".repeat(len as usize);
            builder.append_option(Some(&s));
            original.push(Some(s));
        }

        let array = builder.finish();

        // Several (offset, length) windows so that gc is exercised on sliced input
        for &(offset, slice_len) in &[(0, 50), (10, 100), (150, 30)] {
            let gced = array.slice(offset, slice_len).gc();
            // The same window taken from the shadow copy
            let expected = &original[offset..(offset + slice_len)];

            assert_eq!(gced.len(), slice_len, "Slice length mismatch");
            // Every element must match its counterpart in the shadow copy
            for (got, expect) in gced.iter().zip(expected) {
                assert_eq!(
                    got,
                    expect.as_deref(),
                    "Value mismatch in mixed slice after gc"
                );
            }
        }
    }
1478
1479
    /// Runs `gc` on an array whose views together reference more than 4 GiB
    /// (more than `u32::MAX` bytes) and checks that the result has the same
    /// length, no nulls, identical values, and does not end up with exactly
    /// one data buffer.
    #[test]
    #[cfg_attr(miri, ignore)] // Takes too long
    fn test_gc_huge_array() {
        // Construct multiple 128 MiB BinaryView entries so total > 4 GiB
        let block_len: usize = 128 * 1024 * 1024; // 128 MiB per view
        let num_views: usize = 36;
        // View lengths are passed as `u32`; convert once, with a check, rather
        // than casting at every call site
        let view_len = u32::try_from(block_len).expect("block length fits in u32");

        // Create two 128 MiB data blocks, each filled with its own byte pattern.
        // The `u8` suffix is required: `Buffer::from_vec` is generic, so an
        // unsuffixed integer literal falls back to `i32` and each block would
        // occupy 4 * block_len bytes instead of block_len bytes.
        let buffer = Buffer::from_vec(vec![0xABu8; block_len]);
        let buffer2 = Buffer::from_vec(vec![0xFFu8; block_len]);

        // Append the first block, then add half of the views pointing at it
        let mut builder = BinaryViewBuilder::new();
        let block_id = builder.append_block(buffer);
        for _ in 0..num_views / 2 {
            builder
                .try_append_view(block_id, 0, view_len)
                .expect("append view into 128MiB block");
        }
        // Append the second block and point the remaining views at it
        let block_id2 = builder.append_block(buffer2);
        for _ in 0..num_views / 2 {
            builder
                .try_append_view(block_id2, 0, view_len)
                .expect("append view into 128MiB block");
        }

        let array = builder.finish();
        // Precondition of the test: the referenced bytes do not fit in a u32
        let total = array.total_buffer_bytes_used();
        assert!(
            total > u32::MAX as usize,
            "Expected total non-inline bytes to exceed 4 GiB, got {}",
            total
        );

        // Run gc and verify correctness
        let gced = array.gc();
        assert_eq!(gced.len(), num_views, "Length mismatch after gc");
        assert_eq!(gced.null_count(), 0, "Null count mismatch after gc");
        assert_ne!(
            gced.data_buffers().len(),
            1,
            "gc with huge buffer should not consolidate data into a single buffer"
        );

        // Element-wise equality check across the entire array
        for (orig, got) in array.iter().zip(gced.iter()) {
            assert_eq!(orig, got, "Value mismatch after gc on huge array");
        }
    }
1528
1529
    /// Two arrays holding the same logical values must compare equal, both to
    /// their own clones and to each other, even when their builders were
    /// configured with different fixed block sizes.
    #[test]
    fn test_eq() {
        let test_data = [
            Some("longer than 12 bytes"),
            None,
            Some("short"),
            Some("again, this is longer than 12 bytes"),
        ];

        // First array: fixed block size of 8
        let mut builder = StringViewBuilder::new().with_fixed_block_size(8);
        for v in test_data {
            builder.append_option(v);
        }
        let array1 = builder.finish();

        // Second array: the same values, but a fixed block size of 100 so the
        // data is laid out differently
        let mut builder = StringViewBuilder::new().with_fixed_block_size(100);
        for v in test_data {
            builder.append_option(v);
        }
        let array2 = builder.finish();

        assert_eq!(array1, array1.clone());
        assert_eq!(array2, array2.clone());
        assert_eq!(array1, array2);
    }
1553
1554
    /// Integration test for `inline_key_fast`. It checks that:
    ///
    /// 1. Keys increase monotonically over inputs of growing length and
    ///    varying lexical content.
    /// 2. Key order agrees with the byte-wise order of the same values read
    ///    back from a `GenericBinaryArray`.
    ///
    /// The inputs include the “bar” vs. “bar\0” pair, which shows why the
    /// length has to take part in the key even though all inline bytes fit in
    /// 12 bytes, as well as inputs that would sort incorrectly if the inline
    /// bytes were compared in reversed order.
    ///
    /// Every pair of neighbouring inputs must produce strictly increasing keys.
    #[test]
    fn test_inline_key_fast_various_lengths_and_lexical() {
        /// Builds the raw `u128` of an inline view:
        /// - `length`: number of meaningful bytes (must be ≤ 12)
        /// - `data`: the inline bytes themselves, exactly `length` of them
        ///
        /// Bytes 0..4 hold `length` in little-endian order; the bytes after
        /// that hold `data`, and whatever is left stays zero.
        fn make_raw_inline(length: u32, data: &[u8]) -> u128 {
            assert!(length as usize <= 12, "Inline length must be ≤ 12");
            assert!(
                data.len() == length as usize,
                "Data length must match `length`"
            );

            let mut raw_bytes = [0u8; 16];
            let (len_field, payload) = raw_bytes.split_at_mut(4);
            len_field.copy_from_slice(&length.to_le_bytes()); // little-endian length
            payload[..data.len()].copy_from_slice(data); // inline data
            u128::from_le_bytes(raw_bytes)
        }

        // Inputs listed in strictly increasing byte-wise order: a range of
        // lengths and lexical variations plus the special cases noted inline
        let test_inputs: Vec<&[u8]> = vec![
            b"a",
            b"aa",
            b"aaa",
            b"aab",
            b"abcd",
            b"abcde",
            b"abcdef",
            b"abcdefg",
            b"abcdefgh",
            b"abcdefghi",
            b"abcdefghij",
            b"abcdefghijk",
            b"abcdefghijkl",
            // Byte-order reversal check: if the inline bytes were compared
            // back to front, "backend one" would be treated as "eno dnekcab"
            // and sort incorrectly relative to "backend two".
            b"backend one",
            b"backend two",
            // Length tie-breaker check: "bar" (3 bytes) and "bar\0" (4 bytes)
            // have identical zero-padded inline data, so only the length can
            // tell them apart.
            b"bar",
            b"bar\0",
            // More shared-prefix cases mixing lexical and length tie-breaking
            b"than12Byt",
            b"than12Bytes",
            b"than12Bytes\0",
            b"than12Bytesx",
            b"than12Bytex",
            b"than12Bytez",
            // Plain lexical cases
            b"xyy",
            b"xyz",
            b"xza",
        ];

        // Route the inputs through a GenericBinaryArray so the comparison is
        // made against the values as the array hands them back
        let optional_inputs: Vec<Option<&[u8]>> = test_inputs.iter().copied().map(Some).collect();
        let array: GenericBinaryArray<i32> = GenericBinaryArray::from(optional_inputs);
        let values: Vec<&[u8]> = (0..array.len()).map(|i| array.value(i)).collect();

        // Key of a value when it is stored as an inline view
        let key_of = |bytes: &[u8]| {
            GenericByteViewArray::<BinaryViewType>::inline_key_fast(make_raw_inline(
                bytes.len() as u32,
                bytes,
            ))
        };

        for pair in values.windows(2) {
            let (v1, v2) = (pair[0], pair[1]);

            // The inputs themselves must be in increasing order
            assert!(v1 < v2, "Array compare failed: {v1:?} !< {v2:?}");

            // ...and the keys must reflect exactly that order
            let key1 = key_of(v1);
            let key2 = key_of(v2);

            assert!(
                key1 < key2,
                "Key compare failed: key({v1:?})=0x{key1:032x} !< key({v2:?})=0x{key2:032x}",
            );
        }
    }
1654
}