/Users/andrewlamb/Software/arrow-rs/arrow-array/src/array/byte_view_array.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use crate::array::print_long_array; |
19 | | use crate::builder::{ArrayBuilder, GenericByteViewBuilder}; |
20 | | use crate::iterator::ArrayIter; |
21 | | use crate::types::bytes::ByteArrayNativeType; |
22 | | use crate::types::{BinaryViewType, ByteViewType, StringViewType}; |
23 | | use crate::{Array, ArrayAccessor, ArrayRef, GenericByteArray, OffsetSizeTrait, Scalar}; |
24 | | use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, ScalarBuffer}; |
25 | | use arrow_data::{ArrayData, ArrayDataBuilder, ByteView, MAX_INLINE_VIEW_LEN}; |
26 | | use arrow_schema::{ArrowError, DataType}; |
27 | | use core::str; |
28 | | use num::ToPrimitive; |
29 | | use std::any::Any; |
30 | | use std::cmp::Ordering; |
31 | | use std::fmt::Debug; |
32 | | use std::marker::PhantomData; |
33 | | use std::sync::Arc; |
34 | | |
35 | | use super::ByteArrayType; |
36 | | |
37 | | /// [Variable-size Binary View Layout]: An array of variable length bytes views. |
38 | | /// |
39 | | /// This array type is used to store variable length byte data (e.g. Strings, Binary) |
40 | | /// and has efficient operations such as `take`, `filter`, and comparison. |
41 | | /// |
42 | | /// [Variable-size Binary View Layout]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout |
43 | | /// |
44 | | /// This is different from [`GenericByteArray`], which also stores variable |
45 | | /// length byte data, as it represents strings with an offset and length. `take` |
46 | | /// and `filter` like operations are implemented by manipulating the "views" |
47 | | /// (`u128`) without modifying the bytes. Each view also stores an inlined |
48 | | /// prefix which speeds up comparisons. |
49 | | /// |
50 | | /// # See Also |
51 | | /// |
52 | | /// * [`StringViewArray`] for storing utf8 encoded string data |
53 | | /// * [`BinaryViewArray`] for storing bytes |
54 | | /// * [`ByteView`] to interpret `u128`s layout of the views. |
55 | | /// |
56 | | /// [`ByteView`]: arrow_data::ByteView |
57 | | /// |
58 | | /// # Layout: "views" and buffers |
59 | | /// |
60 | | /// A `GenericByteViewArray` stores variable length byte strings. An array of |
61 | | /// `N` elements is stored as `N` fixed length "views" and a variable number |
62 | | /// of variable length "buffers". |
63 | | /// |
64 | | /// Each view is a `u128` value whose layout is different depending on the |
65 | | /// length of the string stored at that location: |
66 | | /// |
67 | | /// ```text |
68 | | /// ┌──────┬────────────────────────┐ |
69 | | /// │length│ string value │ |
70 | | /// Strings (len <= 12) │ │ (padded with 0) │ |
71 | | /// └──────┴────────────────────────┘ |
72 | | /// 0 31 127 |
73 | | /// |
74 | | /// ┌───────┬───────┬───────┬───────┐ |
75 | | /// │length │prefix │ buf │offset │ |
76 | | /// Strings (len > 12) │ │ │ index │ │ |
77 | | /// └───────┴───────┴───────┴───────┘ |
78 | | /// 0 31 63 95 127 |
79 | | /// ``` |
80 | | /// |
81 | | /// * Strings with length <= 12 ([`MAX_INLINE_VIEW_LEN`]) are stored directly in |
82 | | /// the view. See [`Self::inline_value`] to access the inlined prefix from a |
83 | | /// short view. |
84 | | /// |
85 | | /// * Strings with length > 12: The first four bytes are stored inline in the |
86 | | /// view and the entire string is stored in one of the buffers. See [`ByteView`] |
87 | | /// to access the fields of these views. |
88 | | /// |
89 | | /// As with other arrays, the optimized kernels in [`arrow_compute`] are likely |
90 | | /// the easiest and fastest way to work with this data. However, it is possible |
91 | | /// to access the views and buffers directly for more control. |
92 | | /// |
93 | | /// For example |
94 | | /// |
95 | | /// ```rust |
96 | | /// # use arrow_array::StringViewArray; |
97 | | /// # use arrow_array::Array; |
98 | | /// use arrow_data::ByteView; |
99 | | /// let array = StringViewArray::from(vec![ |
100 | | /// "hello", |
101 | | /// "this string is longer than 12 bytes", |
102 | | /// "this string is also longer than 12 bytes" |
103 | | /// ]); |
104 | | /// |
105 | | /// // ** Examine the first view (short string) ** |
106 | | /// assert!(array.is_valid(0)); // Check for nulls |
107 | | /// let short_view: u128 = array.views()[0]; // "hello" |
108 | | /// // get length of the string |
109 | | /// let len = short_view as u32; |
110 | | /// assert_eq!(len, 5); // strings of 12 bytes or fewer are stored inline in the view |
111 | | /// // SAFETY: `view` is a valid view |
112 | | /// let value = unsafe { |
113 | | /// StringViewArray::inline_value(&short_view, len as usize) |
114 | | /// }; |
115 | | /// assert_eq!(value, b"hello"); |
116 | | /// |
117 | | /// // ** Examine the third view (long string) ** |
118 | | /// assert!(array.is_valid(2)); // Check for nulls |
119 | | /// let long_view: u128 = array.views()[2]; // "this string is also longer than 12 bytes" |
120 | | /// let len = long_view as u32; |
121 | | /// assert_eq!(len, 40); // strings longer than 12 bytes are stored in the buffer |
122 | | /// let view = ByteView::from(long_view); // use ByteView to access the fields |
123 | | /// assert_eq!(view.length, 40); |
124 | | /// assert_eq!(view.buffer_index, 0); |
125 | | /// assert_eq!(view.offset, 35); // data starts after the first long string |
126 | | /// // Views for long strings store a 4 byte prefix |
127 | | /// let prefix = view.prefix.to_le_bytes(); |
128 | | /// assert_eq!(&prefix, b"this"); |
129 | | /// let value = array.value(2); // get the string value (see `value` implementation for how to access the bytes directly) |
130 | | /// assert_eq!(value, "this string is also longer than 12 bytes"); |
131 | | /// ``` |
132 | | /// |
133 | | /// [`MAX_INLINE_VIEW_LEN`]: arrow_data::MAX_INLINE_VIEW_LEN |
134 | | /// [`arrow_compute`]: https://docs.rs/arrow/latest/arrow/compute/index.html |
135 | | /// |
136 | | /// Unlike [`GenericByteArray`], there are no constraints on the offsets other |
137 | | /// than they must point into a valid buffer. However, they can be out of order, |
138 | | /// non continuous and overlapping. |
139 | | /// |
140 | | /// For example, in the following diagram, the strings "FishWasInTownTodayYay" and |
141 | | /// "CrumpleFacedFish" are both longer than 12 bytes and thus are stored in a |
142 | | /// separate buffer while the string "LavaMonster" is stored inlined in the |
143 | | /// view. In this case, the same bytes for "Fish" are used to store both strings. |
144 | | /// |
145 | | /// [`ByteView`]: arrow_data::ByteView |
146 | | /// |
147 | | /// ```text |
148 | | /// ┌───┐ |
149 | | /// ┌──────┬──────┬──────┬──────┐ offset │...│ |
150 | | /// "FishWasInTownTodayYay" │ 21 │ Fish │ 0 │ 115 │─ ─ 103 │Mr.│ |
151 | | /// └──────┴──────┴──────┴──────┘ │ ┌ ─ ─ ─ ─ ▶ │Cru│ |
152 | | /// ┌──────┬──────┬──────┬──────┐ │mpl│ |
153 | | /// "CrumpleFacedFish" │ 16 │ Crum │ 0 │ 103 │─ ─│─ ─ ─ ┘ │eFa│ |
154 | | /// └──────┴──────┴──────┴──────┘ │ced│ |
155 | | /// ┌──────┬────────────────────┐ └ ─ ─ ─ ─ ─ ─ ─ ─ ▶│Fis│ |
156 | | /// "LavaMonster" │ 11 │ LavaMonster │ │hWa│ |
157 | | /// └──────┴────────────────────┘ offset │sIn│ |
158 | | /// 115 │Tow│ |
159 | | /// │nTo│ |
160 | | /// │day│ |
161 | | /// u128 "views" │Yay│ |
162 | | /// buffer 0 │...│ |
163 | | /// └───┘ |
164 | | /// ``` |
165 | | pub struct GenericByteViewArray<T: ByteViewType + ?Sized> { |
166 | | data_type: DataType, |
167 | | views: ScalarBuffer<u128>, |
168 | | buffers: Vec<Buffer>, |
169 | | phantom: PhantomData<T>, |
170 | | nulls: Option<NullBuffer>, |
171 | | } |
172 | | |
173 | | impl<T: ByteViewType + ?Sized> Clone for GenericByteViewArray<T> { |
174 | 0 | fn clone(&self) -> Self { |
175 | 0 | Self { |
176 | 0 | data_type: T::DATA_TYPE, |
177 | 0 | views: self.views.clone(), |
178 | 0 | buffers: self.buffers.clone(), |
179 | 0 | nulls: self.nulls.clone(), |
180 | 0 | phantom: Default::default(), |
181 | 0 | } |
182 | 0 | } |
183 | | } |
184 | | |
185 | | impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> { |
186 | | /// Create a new [`GenericByteViewArray`] from the provided parts, panicking on failure |
187 | | /// |
188 | | /// # Panics |
189 | | /// |
190 | | /// Panics if [`GenericByteViewArray::try_new`] returns an error |
191 | | pub fn new(views: ScalarBuffer<u128>, buffers: Vec<Buffer>, nulls: Option<NullBuffer>) -> Self { |
192 | | Self::try_new(views, buffers, nulls).unwrap() |
193 | | } |
194 | | |
195 | | /// Create a new [`GenericByteViewArray`] from the provided parts, returning an error on failure |
196 | | /// |
197 | | /// # Errors |
198 | | /// |
199 | | /// * `views.len() != nulls.len()` |
200 | | /// * [ByteViewType::validate] fails |
201 | | pub fn try_new( |
202 | | views: ScalarBuffer<u128>, |
203 | | buffers: Vec<Buffer>, |
204 | | nulls: Option<NullBuffer>, |
205 | | ) -> Result<Self, ArrowError> { |
206 | | T::validate(&views, &buffers)?; |
207 | | |
208 | | if let Some(n) = nulls.as_ref() { |
209 | | if n.len() != views.len() { |
210 | | return Err(ArrowError::InvalidArgumentError(format!( |
211 | | "Incorrect length of null buffer for {}ViewArray, expected {} got {}", |
212 | | T::PREFIX, |
213 | | views.len(), |
214 | | n.len(), |
215 | | ))); |
216 | | } |
217 | | } |
218 | | |
219 | | Ok(Self { |
220 | | data_type: T::DATA_TYPE, |
221 | | views, |
222 | | buffers, |
223 | | nulls, |
224 | | phantom: Default::default(), |
225 | | }) |
226 | | } |
227 | | |
228 | | /// Create a new [`GenericByteViewArray`] from the provided parts, without validation |
229 | | /// |
230 | | /// # Safety |
231 | | /// |
232 | | /// Safe if [`Self::try_new`] would not error |
233 | 1 | pub unsafe fn new_unchecked( |
234 | 1 | views: ScalarBuffer<u128>, |
235 | 1 | buffers: Vec<Buffer>, |
236 | 1 | nulls: Option<NullBuffer>, |
237 | 1 | ) -> Self { |
238 | 1 | if cfg!(feature = "force_validate") { |
239 | 0 | return Self::new(views, buffers, nulls); |
240 | 1 | } |
241 | | |
242 | 1 | Self { |
243 | 1 | data_type: T::DATA_TYPE, |
244 | 1 | phantom: Default::default(), |
245 | 1 | views, |
246 | 1 | buffers, |
247 | 1 | nulls, |
248 | 1 | } |
249 | 1 | } |
250 | | |
251 | | /// Create a new [`GenericByteViewArray`] of length `len` where all values are null |
252 | | pub fn new_null(len: usize) -> Self { |
253 | | Self { |
254 | | data_type: T::DATA_TYPE, |
255 | | views: vec![0; len].into(), |
256 | | buffers: vec![], |
257 | | nulls: Some(NullBuffer::new_null(len)), |
258 | | phantom: Default::default(), |
259 | | } |
260 | | } |
261 | | |
262 | | /// Create a new [`Scalar`] from `value` |
263 | | pub fn new_scalar(value: impl AsRef<T::Native>) -> Scalar<Self> { |
264 | | Scalar::new(Self::from_iter_values(std::iter::once(value))) |
265 | | } |
266 | | |
267 | | /// Creates a [`GenericByteViewArray`] based on an iterator of values without nulls |
268 | 1 | pub fn from_iter_values<Ptr, I>(iter: I) -> Self |
269 | 1 | where |
270 | 1 | Ptr: AsRef<T::Native>, |
271 | 1 | I: IntoIterator<Item = Ptr>, |
272 | | { |
273 | 1 | let iter = iter.into_iter(); |
274 | 1 | let mut builder = GenericByteViewBuilder::<T>::with_capacity(iter.size_hint().0); |
275 | 3 | for v in iter { |
276 | 2 | builder.append_value(v); |
277 | 2 | } |
278 | 1 | builder.finish() |
279 | 1 | } |
280 | | |
281 | | /// Deconstruct this array into its constituent parts |
282 | | pub fn into_parts(self) -> (ScalarBuffer<u128>, Vec<Buffer>, Option<NullBuffer>) { |
283 | | (self.views, self.buffers, self.nulls) |
284 | | } |
285 | | |
286 | | /// Returns the views buffer |
287 | | #[inline] |
288 | 0 | pub fn views(&self) -> &ScalarBuffer<u128> { |
289 | 0 | &self.views |
290 | 0 | } |
291 | | |
292 | | /// Returns the buffers storing string data |
293 | | #[inline] |
294 | 0 | pub fn data_buffers(&self) -> &[Buffer] { |
295 | 0 | &self.buffers |
296 | 0 | } |
297 | | |
298 | | /// Returns the element at index `i` |
299 | | /// |
300 | | /// Note: This method does not check for nulls and the value is arbitrary |
301 | | /// (but still well-defined) if [`is_null`](Self::is_null) returns true for the index. |
302 | | /// |
303 | | /// # Panics |
304 | | /// Panics if index `i` is out of bounds. |
305 | 0 | pub fn value(&self, i: usize) -> &T::Native { |
306 | 0 | assert!( |
307 | 0 | i < self.len(), |
308 | 0 | "Trying to access an element at index {} from a {}ViewArray of length {}", |
309 | | i, |
310 | | T::PREFIX, |
311 | 0 | self.len() |
312 | | ); |
313 | | |
314 | 0 | unsafe { self.value_unchecked(i) } |
315 | 0 | } |
316 | | |
317 | | /// Returns the element at index `i` without bounds checking |
318 | | /// |
319 | | /// Note: This method does not check for nulls and the value is arbitrary |
320 | | /// if [`is_null`](Self::is_null) returns true for the index. |
321 | | /// |
322 | | /// # Safety |
323 | | /// |
324 | | /// Caller is responsible for ensuring that the index is within the bounds |
325 | | /// of the array |
326 | 0 | pub unsafe fn value_unchecked(&self, idx: usize) -> &T::Native { |
327 | 0 | let v = self.views.get_unchecked(idx); |
328 | 0 | let len = *v as u32; |
329 | 0 | let b = if len <= MAX_INLINE_VIEW_LEN { |
330 | 0 | Self::inline_value(v, len as usize) |
331 | | } else { |
332 | 0 | let view = ByteView::from(*v); |
333 | 0 | let data = self.buffers.get_unchecked(view.buffer_index as usize); |
334 | 0 | let offset = view.offset as usize; |
335 | 0 | data.get_unchecked(offset..offset + len as usize) |
336 | | }; |
337 | 0 | T::Native::from_bytes_unchecked(b) |
338 | 0 | } |
339 | | |
340 | | /// Returns the first `len` bytes of the inline value of the view. |
341 | | /// |
342 | | /// # Safety |
343 | | /// - The `view` must be a valid element from `Self::views()` that adheres to the view layout. |
344 | | /// - The `len` must be the length of the inlined value. It should never be larger than [`MAX_INLINE_VIEW_LEN`]. |
345 | | #[inline(always)] |
346 | 0 | pub unsafe fn inline_value(view: &u128, len: usize) -> &[u8] { |
347 | 0 | debug_assert!(len <= MAX_INLINE_VIEW_LEN as usize); |
348 | 0 | std::slice::from_raw_parts((view as *const u128 as *const u8).wrapping_add(4), len) |
349 | 0 | } |
350 | | |
351 | | /// Constructs a new iterator for iterating over the values of this array |
352 | 0 | pub fn iter(&self) -> ArrayIter<&Self> { |
353 | 0 | ArrayIter::new(self) |
354 | 0 | } |
355 | | |
356 | | /// Returns an iterator over the bytes of this array, including null values |
357 | | pub fn bytes_iter(&self) -> impl Iterator<Item = &[u8]> { |
358 | | self.views.iter().map(move |v| { |
359 | | let len = *v as u32; |
360 | | if len <= MAX_INLINE_VIEW_LEN { |
361 | | unsafe { Self::inline_value(v, len as usize) } |
362 | | } else { |
363 | | let view = ByteView::from(*v); |
364 | | let data = &self.buffers[view.buffer_index as usize]; |
365 | | let offset = view.offset as usize; |
366 | | unsafe { data.get_unchecked(offset..offset + len as usize) } |
367 | | } |
368 | | }) |
369 | | } |
370 | | |
371 | | /// Returns an iterator over the first `prefix_len` bytes of each array |
372 | | /// element, including null values. |
373 | | /// |
374 | | /// If `prefix_len` is larger than the element's length, the iterator will |
375 | | /// return an empty slice (`&[]`). |
376 | 0 | pub fn prefix_bytes_iter(&self, prefix_len: usize) -> impl Iterator<Item = &[u8]> { |
377 | 0 | self.views().into_iter().map(move |v| { |
378 | 0 | let len = (*v as u32) as usize; |
379 | | |
380 | 0 | if len < prefix_len { |
381 | 0 | return &[] as &[u8]; |
382 | 0 | } |
383 | | |
384 | 0 | if prefix_len <= 4 || len as u32 <= MAX_INLINE_VIEW_LEN { |
385 | 0 | unsafe { StringViewArray::inline_value(v, prefix_len) } |
386 | | } else { |
387 | 0 | let view = ByteView::from(*v); |
388 | 0 | let data = unsafe { |
389 | 0 | self.data_buffers() |
390 | 0 | .get_unchecked(view.buffer_index as usize) |
391 | | }; |
392 | 0 | let offset = view.offset as usize; |
393 | 0 | unsafe { data.get_unchecked(offset..offset + prefix_len) } |
394 | | } |
395 | 0 | }) |
396 | 0 | } |
397 | | |
398 | | /// Returns an iterator over the last `suffix_len` bytes of each array |
399 | | /// element, including null values. |
400 | | /// |
401 | | /// Note that for [`StringViewArray`] the last bytes may start in the middle |
402 | | /// of a UTF-8 codepoint, and thus may not be a valid `&str`. |
403 | | /// |
404 | | /// If `suffix_len` is larger than the element's length, the iterator will |
405 | | /// return an empty slice (`&[]`). |
406 | 0 | pub fn suffix_bytes_iter(&self, suffix_len: usize) -> impl Iterator<Item = &[u8]> { |
407 | 0 | self.views().into_iter().map(move |v| { |
408 | 0 | let len = (*v as u32) as usize; |
409 | | |
410 | 0 | if len < suffix_len { |
411 | 0 | return &[] as &[u8]; |
412 | 0 | } |
413 | | |
414 | 0 | if len as u32 <= MAX_INLINE_VIEW_LEN { |
415 | 0 | unsafe { &StringViewArray::inline_value(v, len)[len - suffix_len..] } |
416 | | } else { |
417 | 0 | let view = ByteView::from(*v); |
418 | 0 | let data = unsafe { |
419 | 0 | self.data_buffers() |
420 | 0 | .get_unchecked(view.buffer_index as usize) |
421 | | }; |
422 | 0 | let offset = view.offset as usize; |
423 | 0 | unsafe { data.get_unchecked(offset + len - suffix_len..offset + len) } |
424 | | } |
425 | 0 | }) |
426 | 0 | } |
427 | | |
428 | | /// Returns a zero-copy slice of this array with the indicated offset and length. |
429 | 0 | pub fn slice(&self, offset: usize, length: usize) -> Self { |
430 | | Self { |
431 | 0 | data_type: T::DATA_TYPE, |
432 | 0 | views: self.views.slice(offset, length), |
433 | 0 | buffers: self.buffers.clone(), |
434 | 0 | nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)), |
435 | 0 | phantom: Default::default(), |
436 | | } |
437 | 0 | } |
438 | | |
439 | | /// Returns a "compacted" version of this array |
440 | | /// |
441 | | /// The original array will *not* be modified |
442 | | /// |
443 | | /// # Garbage Collection |
444 | | /// |
445 | | /// Before GC: |
446 | | /// ```text |
447 | | /// ┌──────┐ |
448 | | /// │......│ |
449 | | /// │......│ |
450 | | /// ┌────────────────────┐ ┌ ─ ─ ─ ▶ │Data1 │ Large buffer |
451 | | /// │ View 1 │─ ─ ─ ─ │......│ with data that |
452 | | /// ├────────────────────┤ │......│ is not referred |
453 | | /// │ View 2 │─ ─ ─ ─ ─ ─ ─ ─▶ │Data2 │ to by View 1 or |
454 | | /// └────────────────────┘ │......│ View 2 |
455 | | /// │......│ |
456 | | /// 2 views, refer to │......│ |
457 | | /// small portions of a └──────┘ |
458 | | /// large buffer |
459 | | /// ``` |
460 | | /// |
461 | | /// After GC: |
462 | | /// |
463 | | /// ```text |
464 | | /// ┌────────────────────┐ ┌─────┐ After gc, only |
465 | | /// │ View 1 │─ ─ ─ ─ ─ ─ ─ ─▶ │Data1│ data that is |
466 | | /// ├────────────────────┤ ┌ ─ ─ ─ ▶ │Data2│ pointed to by |
467 | | /// │ View 2 │─ ─ ─ ─ └─────┘ the views is |
468 | | /// └────────────────────┘ left |
469 | | /// |
470 | | /// |
471 | | /// 2 views |
472 | | /// ``` |
473 | | /// This method will compact the data buffers by recreating the view array and only include the data |
474 | | /// that is pointed to by the views. |
475 | | /// |
476 | | /// Note that it will copy the array regardless of whether the original array is compact. |
477 | | /// Use with caution as this can be an expensive operation, only use it when you are sure that the view |
478 | | /// array is significantly smaller than when it is originally created, e.g., after filtering or slicing. |
479 | | /// |
480 | | /// Note: this function does not attempt to canonicalize / deduplicate values. For this |
481 | | /// feature see [`GenericByteViewBuilder::with_deduplicate_strings`]. |
482 | | pub fn gc(&self) -> Self { |
483 | | // 1) Read basic properties once |
484 | | let len = self.len(); // number of elements |
485 | | let nulls = self.nulls().cloned(); // reuse & clone existing null bitmap |
486 | | |
487 | | // 1.5) Fast path: if there are no buffers, just reuse original views and no data blocks |
488 | | if self.data_buffers().is_empty() { |
489 | | return unsafe { |
490 | | GenericByteViewArray::new_unchecked( |
491 | | self.views().clone(), |
492 | | vec![], // empty data blocks |
493 | | nulls, |
494 | | ) |
495 | | }; |
496 | | } |
497 | | |
498 | | // 2) Calculate total size of all non-inline data and detect if any exists |
499 | | let total_large = self.total_buffer_bytes_used(); |
500 | | |
501 | | // 2.5) Fast path: if there is no non-inline data, avoid buffer allocation & processing |
502 | | if total_large == 0 { |
503 | | // Views are inline-only or all null; just reuse original views and no data blocks |
504 | | return unsafe { |
505 | | GenericByteViewArray::new_unchecked( |
506 | | self.views().clone(), |
507 | | vec![], // empty data blocks |
508 | | nulls, |
509 | | ) |
510 | | }; |
511 | | } |
512 | | |
513 | | // 3) Allocate exactly capacity for all non-inline data |
514 | | let mut data_buf = Vec::with_capacity(total_large); |
515 | | |
516 | | // 4) Iterate over views and process each inline/non-inline view |
517 | | let views_buf: Vec<u128> = (0..len) |
518 | | .map(|i| unsafe { self.copy_view_to_buffer(i, &mut data_buf) }) |
519 | | .collect(); |
520 | | |
521 | | // 5) Wrap up buffers |
522 | | let data_block = Buffer::from_vec(data_buf); |
523 | | let views_scalar = ScalarBuffer::from(views_buf); |
524 | | let data_blocks = vec![data_block]; |
525 | | |
526 | | // SAFETY: views_scalar, data_blocks, and nulls are correctly aligned and sized |
527 | | unsafe { GenericByteViewArray::new_unchecked(views_scalar, data_blocks, nulls) } |
528 | | } |
529 | | |
530 | | /// Copy the i‑th view into `data_buf` if it refers to an out‑of‑line buffer. |
531 | | /// |
532 | | /// # Safety |
533 | | /// |
534 | | /// - `i < self.len()`. |
535 | | /// - Every element in `self.views()` must currently refer to a valid slice |
536 | | /// inside one of `self.buffers`. |
537 | | /// - `data_buf` must be ready to have additional bytes appended. |
538 | | /// - After this call, the returned view will have its |
539 | | /// `buffer_index` reset to `0` and its `offset` updated so that it points |
540 | | /// into the bytes just appended at the end of `data_buf`. |
541 | | #[inline(always)] |
542 | | unsafe fn copy_view_to_buffer(&self, i: usize, data_buf: &mut Vec<u8>) -> u128 { |
543 | | // SAFETY: `i < self.len()` ensures this is in‑bounds. |
544 | | let raw_view = *self.views().get_unchecked(i); |
545 | | let mut bv = ByteView::from(raw_view); |
546 | | |
547 | | // Inline‑small views stay as‑is. |
548 | | if bv.length <= MAX_INLINE_VIEW_LEN { |
549 | | raw_view |
550 | | } else { |
551 | | // SAFETY: `bv.buffer_index` and `bv.offset..bv.offset+bv.length` |
552 | | // must both lie within valid ranges for `self.buffers`. |
553 | | let buffer = self.buffers.get_unchecked(bv.buffer_index as usize); |
554 | | let start = bv.offset as usize; |
555 | | let end = start + bv.length as usize; |
556 | | let slice = buffer.get_unchecked(start..end); |
557 | | |
558 | | // Copy out‑of‑line data into our single “0” buffer. |
559 | | let new_offset = data_buf.len() as u32; |
560 | | data_buf.extend_from_slice(slice); |
561 | | |
562 | | bv.buffer_index = 0; |
563 | | bv.offset = new_offset; |
564 | | bv.into() |
565 | | } |
566 | | } |
567 | | |
568 | | /// Returns the total number of bytes used by all non inlined views in all |
569 | | /// buffers. |
570 | | /// |
571 | | /// Note this does not account for views that point at the same underlying |
572 | | /// data in buffers |
573 | | /// |
574 | | /// For example, if the array has three strings views: |
575 | | /// * View with length = 9 (inlined) |
576 | | /// * View with length = 32 (non inlined) |
577 | | /// * View with length = 16 (non inlined) |
578 | | /// |
579 | | /// Then this method would report 48 |
580 | 0 | pub fn total_buffer_bytes_used(&self) -> usize { |
581 | 0 | self.views() |
582 | 0 | .iter() |
583 | 0 | .map(|v| { |
584 | 0 | let len = *v as u32; |
585 | 0 | if len > MAX_INLINE_VIEW_LEN { |
586 | 0 | len as usize |
587 | | } else { |
588 | 0 | 0 |
589 | | } |
590 | 0 | }) |
591 | 0 | .sum() |
592 | 0 | } |
593 | | |
594 | | /// Compare two [`GenericByteViewArray`] at index `left_idx` and `right_idx` |
595 | | /// |
596 | | /// Comparing two ByteView types is non-trivial. |
597 | | /// It takes a bit of patience to understand why we don't just compare two &[u8] directly. |
598 | | /// |
599 | | /// ByteView types give us the following two advantages, and we need to be careful not to lose them: |
600 | | /// (1) For string/byte smaller than [`MAX_INLINE_VIEW_LEN`] bytes, the entire data is inlined in the view. |
601 | | /// Meaning that reading one array element requires only one memory access |
602 | | /// (two memory access required for StringArray, one for offset buffer, the other for value buffer). |
603 | | /// |
604 | | /// (2) For string/byte larger than [`MAX_INLINE_VIEW_LEN`] bytes, we can still be faster than (for certain operations) StringArray/ByteArray, |
605 | | /// thanks to the inlined 4 bytes. |
606 | | /// Consider equality check: |
607 | | /// If the first four bytes of the two strings are different, we can return false immediately (with just one memory access). |
608 | | /// |
609 | | /// If we directly compare two &[u8], we materialize the entire string (i.e., make multiple memory accesses), which might be unnecessary. |
610 | | /// - Most of the time (eq, ord), we only need to look at the first 4 bytes to know the answer, |
611 | | /// e.g., if the inlined 4 bytes are different, we can directly return unequal without looking at the full string. |
612 | | /// |
613 | | /// # Order check flow |
614 | | /// (1) if both strings are at most [`MAX_INLINE_VIEW_LEN`] bytes long, we can directly compare the data inlined to the view. |
615 | | /// (2) if either string is longer than [`MAX_INLINE_VIEW_LEN`] bytes, we need to compare the full string. |
616 | | /// (2.1) if the inlined 4 bytes are different, we can return the result immediately. |
617 | | /// (2.2) o.w., we need to compare the full string. |
618 | | /// |
619 | | /// # Safety |
620 | | /// The left/right_idx must be within range of each array |
621 | 0 | pub unsafe fn compare_unchecked( |
622 | 0 | left: &GenericByteViewArray<T>, |
623 | 0 | left_idx: usize, |
624 | 0 | right: &GenericByteViewArray<T>, |
625 | 0 | right_idx: usize, |
626 | 0 | ) -> Ordering { |
627 | 0 | let l_view = left.views().get_unchecked(left_idx); |
628 | 0 | let l_byte_view = ByteView::from(*l_view); |
629 | | |
630 | 0 | let r_view = right.views().get_unchecked(right_idx); |
631 | 0 | let r_byte_view = ByteView::from(*r_view); |
632 | | |
633 | 0 | let l_len = l_byte_view.length; |
634 | 0 | let r_len = r_byte_view.length; |
635 | | |
636 | 0 | if l_len <= 12 && r_len <= 12 { |
637 | 0 | return Self::inline_key_fast(*l_view).cmp(&Self::inline_key_fast(*r_view)); |
638 | 0 | } |
639 | | |
640 | | // one of the string is larger than 12 bytes, |
641 | | // we then try to compare the inlined data first |
642 | | |
643 | | // Note: In theory, ByteView is only used for string which is larger than 12 bytes, |
644 | | // but we can still use it to get the inlined prefix for shorter strings. |
645 | | // The prefix is always the first 4 bytes of the view, for both short and long strings. |
646 | 0 | let l_inlined_be = l_byte_view.prefix.swap_bytes(); |
647 | 0 | let r_inlined_be = r_byte_view.prefix.swap_bytes(); |
648 | 0 | if l_inlined_be != r_inlined_be { |
649 | 0 | return l_inlined_be.cmp(&r_inlined_be); |
650 | 0 | } |
651 | | |
652 | | // unfortunately, we need to compare the full data |
653 | 0 | let l_full_data: &[u8] = unsafe { left.value_unchecked(left_idx).as_ref() }; |
654 | 0 | let r_full_data: &[u8] = unsafe { right.value_unchecked(right_idx).as_ref() }; |
655 | | |
656 | 0 | l_full_data.cmp(r_full_data) |
657 | 0 | } |
658 | | |
659 | | /// Builds a 128-bit composite key for an inline value: |
660 | | /// |
661 | | /// - High 96 bits: the inline data in big-endian byte order (for correct lexicographical sorting). |
662 | | /// - Low 32 bits: the length in big-endian byte order, acting as a tiebreaker so shorter strings |
663 | | /// (or those with fewer meaningful bytes) always numerically sort before longer ones. |
664 | | /// |
665 | | /// This function extracts the length and the 12-byte inline string data from the raw |
666 | | /// little-endian `u128` representation, converts them to big-endian ordering, and packs them |
667 | | /// into a single `u128` value suitable for fast, branchless comparisons. |
668 | | /// |
669 | | /// # Why include length? |
670 | | /// |
671 | | /// A pure 96-bit content comparison can’t distinguish between two values whose inline bytes |
672 | | /// compare equal—either because one is a true prefix of the other or because zero-padding |
673 | | /// hides extra bytes. By tucking the 32-bit length into the lower bits, a single `u128` compare |
674 | | /// handles both content and length in one go. |
675 | | /// |
676 | | /// Example: comparing "bar" (3 bytes) vs "bar\0" (4 bytes) |
677 | | /// |
678 | | /// | String | Bytes 0–4 (length LE) | Bytes 4–16 (data + padding) | |
679 | | /// |------------|-----------------------|---------------------------------| |
680 | | /// | `"bar"` | `03 00 00 00` | `62 61 72` + 9 × `00` | |
681 | | /// | `"bar\0"`| `04 00 00 00` | `62 61 72 00` + 8 × `00` | |
682 | | /// |
683 | | /// Both inline parts become `62 61 72 00…00`, so they tie on content. The length field |
684 | | /// then differentiates: |
685 | | /// |
686 | | /// ```text |
687 | | /// key("bar") = 0x62617200000000000000000000000003 |
688 | | /// key("bar\0") = 0x62617200000000000000000000000004 |
689 | | /// ⇒ key("bar") < key("bar\0") |
690 | | /// ``` |
691 | | /// # Inlining and Endianness |
692 | | /// |
693 | | /// - We start by calling `.to_le_bytes()` on the `raw` `u128`, because Rust’s native in‑memory |
694 | | /// representation is little‑endian on x86/ARM. |
695 | | /// - We extract the low 32 bits numerically (`raw as u32`)—this step is endianness‑free. |
696 | | /// - We copy the 12 bytes of inline data (original order) into `buf[0..12]`. |
697 | | /// - We serialize `length` as big‑endian into `buf[12..16]`. |
698 | | /// - Finally, `u128::from_be_bytes(buf)` treats `buf[0]` as the most significant byte |
699 | | /// and `buf[15]` as the least significant, producing a `u128` whose integer value |
700 | | /// directly encodes “inline data then length” in big‑endian form. |
701 | | /// |
702 | | /// This ensures that a simple `u128` comparison is equivalent to the desired |
703 | | /// lexicographical comparison of the inline bytes followed by length. |
704 | | #[inline(always)] |
705 | 0 | pub fn inline_key_fast(raw: u128) -> u128 { |
706 | | // 1. Decompose `raw` into little‑endian bytes: |
707 | | // - raw_bytes[0..4] = length in LE |
708 | | // - raw_bytes[4..16] = inline string data |
709 | 0 | let raw_bytes = raw.to_le_bytes(); |
710 | | |
711 | | // 2. Numerically truncate to get the low 32‑bit length (endianness‑free). |
712 | 0 | let length = raw as u32; |
713 | | |
714 | | // 3. Build a 16‑byte buffer in big‑endian order: |
715 | | // - buf[0..12] = inline string bytes (in original order) |
716 | | // - buf[12..16] = length.to_be_bytes() (BE) |
717 | 0 | let mut buf = [0u8; 16]; |
718 | 0 | buf[0..12].copy_from_slice(&raw_bytes[4..16]); // inline data |
719 | | |
720 | | // Why convert length to big-endian for comparison? |
721 | | // |
722 | | // Rust (on most platforms) stores integers in little-endian format, |
723 | | // meaning the least significant byte is at the lowest memory address. |
724 | | // For example, a u32 value like 0x22345677 is stored in memory as: |
725 | | // |
726 | | // [0x77, 0x56, 0x34, 0x22] // little-endian layout |
727 | | // ^ ^ ^ ^ |
728 | | // LSB ↑↑↑ MSB |
729 | | // |
730 | | // This layout is efficient for arithmetic but *not* suitable for |
731 | | // lexicographic (dictionary-style) comparison of byte arrays. |
732 | | // |
733 | | // To compare values by byte order—e.g., for sorted keys or binary trees— |
734 | | // we must convert them to **big-endian**, where: |
735 | | // |
736 | | // - The most significant byte (MSB) comes first (index 0) |
737 | | // - The least significant byte (LSB) comes last (index N-1) |
738 | | // |
739 | | // In big-endian, the same u32 = 0x22345677 would be represented as: |
740 | | // |
741 | | // [0x22, 0x34, 0x56, 0x77] |
742 | | // |
743 | | // This ordering aligns with natural string/byte sorting, so calling |
744 | | // `.to_be_bytes()` allows us to construct |
745 | | // keys where standard numeric comparison (e.g., `<`, `>`) behaves |
746 | | // like lexicographic byte comparison. |
747 | 0 | buf[12..16].copy_from_slice(&length.to_be_bytes()); // length in BE |
748 | | |
749 | | // 4. Deserialize the buffer as a big‑endian u128: |
750 | | // buf[0] is MSB, buf[15] is LSB. |
751 | | // Details: |
752 | | // Note on endianness and layout: |
753 | | // |
754 | | // Although `buf[0]` is stored at the lowest memory address, |
755 | | // calling `u128::from_be_bytes(buf)` interprets it as the **most significant byte (MSB)**, |
756 | | // and `buf[15]` as the **least significant byte (LSB)**. |
757 | | // |
758 | | // This is the core principle of **big-endian decoding**: |
759 | | // - Byte at index 0 maps to bits 127..120 (highest) |
760 | | // - Byte at index 1 maps to bits 119..112 |
761 | | // - ... |
762 | | // - Byte at index 15 maps to bits 7..0 (lowest) |
763 | | // |
764 | | // So even though memory layout goes from low to high (left to right), |
765 | | // big-endian treats the **first byte** as highest in value. |
766 | | // |
767 | | // This guarantees that comparing two `u128` keys is equivalent to lexicographically |
768 | | // comparing the original inline bytes, followed by length. |
769 | 0 | u128::from_be_bytes(buf) |
770 | 0 | } |
771 | | } |
772 | | |
773 | | impl<T: ByteViewType + ?Sized> Debug for GenericByteViewArray<T> { |
774 | 0 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { |
775 | 0 | write!(f, "{}ViewArray\n[\n", T::PREFIX)?; |
776 | 0 | print_long_array(self, f, |array, index, f| { |
777 | 0 | std::fmt::Debug::fmt(&array.value(index), f) |
778 | 0 | })?; |
779 | 0 | write!(f, "]") |
780 | 0 | } |
781 | | } |
782 | | |
783 | | impl<T: ByteViewType + ?Sized> Array for GenericByteViewArray<T> { |
784 | 1 | fn as_any(&self) -> &dyn Any { |
785 | 1 | self |
786 | 1 | } |
787 | | |
788 | 0 | fn to_data(&self) -> ArrayData { |
789 | 0 | self.clone().into() |
790 | 0 | } |
791 | | |
792 | 0 | fn into_data(self) -> ArrayData { |
793 | 0 | self.into() |
794 | 0 | } |
795 | | |
796 | 2 | fn data_type(&self) -> &DataType { |
797 | 2 | &self.data_type |
798 | 2 | } |
799 | | |
800 | 0 | fn slice(&self, offset: usize, length: usize) -> ArrayRef { |
801 | 0 | Arc::new(self.slice(offset, length)) |
802 | 0 | } |
803 | | |
804 | 2 | fn len(&self) -> usize { |
805 | 2 | self.views.len() |
806 | 2 | } |
807 | | |
808 | 0 | fn is_empty(&self) -> bool { |
809 | 0 | self.views.is_empty() |
810 | 0 | } |
811 | | |
812 | 0 | fn shrink_to_fit(&mut self) { |
813 | 0 | self.views.shrink_to_fit(); |
814 | 0 | self.buffers.iter_mut().for_each(|b| b.shrink_to_fit()); |
815 | 0 | self.buffers.shrink_to_fit(); |
816 | 0 | if let Some(nulls) = &mut self.nulls { |
817 | 0 | nulls.shrink_to_fit(); |
818 | 0 | } |
819 | 0 | } |
820 | | |
821 | 0 | fn offset(&self) -> usize { |
822 | 0 | 0 |
823 | 0 | } |
824 | | |
825 | 2 | fn nulls(&self) -> Option<&NullBuffer> { |
826 | 2 | self.nulls.as_ref() |
827 | 2 | } |
828 | | |
829 | 0 | fn logical_null_count(&self) -> usize { |
830 | | // More efficient than the default implementation |
831 | 0 | self.null_count() |
832 | 0 | } |
833 | | |
834 | 0 | fn get_buffer_memory_size(&self) -> usize { |
835 | 0 | let mut sum = self.buffers.iter().map(|b| b.capacity()).sum::<usize>(); |
836 | 0 | sum += self.views.inner().capacity(); |
837 | 0 | if let Some(x) = &self.nulls { |
838 | 0 | sum += x.buffer().capacity() |
839 | 0 | } |
840 | 0 | sum |
841 | 0 | } |
842 | | |
843 | 0 | fn get_array_memory_size(&self) -> usize { |
844 | 0 | std::mem::size_of::<Self>() + self.get_buffer_memory_size() |
845 | 0 | } |
846 | | } |
847 | | |
848 | | impl<'a, T: ByteViewType + ?Sized> ArrayAccessor for &'a GenericByteViewArray<T> { |
849 | | type Item = &'a T::Native; |
850 | | |
851 | 0 | fn value(&self, index: usize) -> Self::Item { |
852 | 0 | GenericByteViewArray::value(self, index) |
853 | 0 | } |
854 | | |
855 | 0 | unsafe fn value_unchecked(&self, index: usize) -> Self::Item { |
856 | 0 | GenericByteViewArray::value_unchecked(self, index) |
857 | 0 | } |
858 | | } |
859 | | |
860 | | impl<'a, T: ByteViewType + ?Sized> IntoIterator for &'a GenericByteViewArray<T> { |
861 | | type Item = Option<&'a T::Native>; |
862 | | type IntoIter = ArrayIter<Self>; |
863 | | |
864 | | fn into_iter(self) -> Self::IntoIter { |
865 | | ArrayIter::new(self) |
866 | | } |
867 | | } |
868 | | |
869 | | impl<T: ByteViewType + ?Sized> From<ArrayData> for GenericByteViewArray<T> { |
870 | 0 | fn from(value: ArrayData) -> Self { |
871 | 0 | let views = value.buffers()[0].clone(); |
872 | 0 | let views = ScalarBuffer::new(views, value.offset(), value.len()); |
873 | 0 | let buffers = value.buffers()[1..].to_vec(); |
874 | 0 | Self { |
875 | 0 | data_type: T::DATA_TYPE, |
876 | 0 | views, |
877 | 0 | buffers, |
878 | 0 | nulls: value.nulls().cloned(), |
879 | 0 | phantom: Default::default(), |
880 | 0 | } |
881 | 0 | } |
882 | | } |
883 | | |
884 | | /// Efficiently convert a [`GenericByteArray`] to a [`GenericByteViewArray`] |
885 | | /// |
886 | | /// For example this method can convert a [`StringArray`] to a |
887 | | /// [`StringViewArray`]. |
888 | | /// |
889 | | /// If the offsets are all less than u32::MAX, the new [`GenericByteViewArray`] |
890 | | /// is built without copying the underlying string data (views are created |
891 | | /// directly into the existing buffer) |
892 | | /// |
893 | | /// [`StringArray`]: crate::StringArray |
894 | | impl<FROM, V> From<&GenericByteArray<FROM>> for GenericByteViewArray<V> |
895 | | where |
896 | | FROM: ByteArrayType, |
897 | | FROM::Offset: OffsetSizeTrait + ToPrimitive, |
898 | | V: ByteViewType<Native = FROM::Native>, |
899 | | { |
900 | 0 | fn from(byte_array: &GenericByteArray<FROM>) -> Self { |
901 | 0 | let offsets = byte_array.offsets(); |
902 | | |
903 | 0 | let can_reuse_buffer = match offsets.last() { |
904 | 0 | Some(offset) => offset.as_usize() < u32::MAX as usize, |
905 | 0 | None => true, |
906 | | }; |
907 | | |
908 | 0 | if can_reuse_buffer { |
909 | | // build views directly pointing to the existing buffer |
910 | 0 | let len = byte_array.len(); |
911 | 0 | let mut views_builder = GenericByteViewBuilder::<V>::with_capacity(len); |
912 | 0 | let str_values_buf = byte_array.values().clone(); |
913 | 0 | let block = views_builder.append_block(str_values_buf); |
914 | 0 | for (i, w) in offsets.windows(2).enumerate() { |
915 | 0 | let offset = w[0].as_usize(); |
916 | 0 | let end = w[1].as_usize(); |
917 | 0 | let length = end - offset; |
918 | | |
919 | 0 | if byte_array.is_null(i) { |
920 | 0 | views_builder.append_null(); |
921 | 0 | } else { |
922 | | // SAFETY: the input was a valid array, so its data is valid UTF-8 (if string) and |
923 | | // all offsets are valid; the last offset is below u32::MAX, so the `as u32` casts are lossless |
924 | | unsafe { |
925 | 0 | views_builder.append_view_unchecked(block, offset as u32, length as u32) |
926 | | } |
927 | | } |
928 | | } |
929 | 0 | assert_eq!(views_builder.len(), len); |
930 | 0 | views_builder.finish() |
931 | | } else { |
932 | | // Otherwise, create a new buffer for large strings |
933 | | // TODO: the original buffer could still be used |
934 | | // by making multiple slices of u32::MAX length |
935 | 0 | GenericByteViewArray::<V>::from_iter(byte_array.iter()) |
936 | | } |
937 | 0 | } |
938 | | } |
939 | | |
940 | | impl<T: ByteViewType + ?Sized> From<GenericByteViewArray<T>> for ArrayData { |
941 | 0 | fn from(mut array: GenericByteViewArray<T>) -> Self { |
942 | 0 | let len = array.len(); |
943 | 0 | array.buffers.insert(0, array.views.into_inner()); |
944 | 0 | let builder = ArrayDataBuilder::new(T::DATA_TYPE) |
945 | 0 | .len(len) |
946 | 0 | .buffers(array.buffers) |
947 | 0 | .nulls(array.nulls); |
948 | | |
949 | 0 | unsafe { builder.build_unchecked() } |
950 | 0 | } |
951 | | } |
952 | | |
953 | | impl<'a, Ptr, T> FromIterator<&'a Option<Ptr>> for GenericByteViewArray<T> |
954 | | where |
955 | | Ptr: AsRef<T::Native> + 'a, |
956 | | T: ByteViewType + ?Sized, |
957 | | { |
958 | | fn from_iter<I: IntoIterator<Item = &'a Option<Ptr>>>(iter: I) -> Self { |
959 | | iter.into_iter() |
960 | | .map(|o| o.as_ref().map(|p| p.as_ref())) |
961 | | .collect() |
962 | | } |
963 | | } |
964 | | |
965 | | impl<Ptr, T: ByteViewType + ?Sized> FromIterator<Option<Ptr>> for GenericByteViewArray<T> |
966 | | where |
967 | | Ptr: AsRef<T::Native>, |
968 | | { |
969 | 0 | fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self { |
970 | 0 | let iter = iter.into_iter(); |
971 | 0 | let mut builder = GenericByteViewBuilder::<T>::with_capacity(iter.size_hint().0); |
972 | 0 | builder.extend(iter); |
973 | 0 | builder.finish() |
974 | 0 | } |
975 | | } |
976 | | |
977 | | /// A [`GenericByteViewArray`] of `[u8]` |
978 | | /// |
979 | | /// See [`GenericByteViewArray`] for format and layout details. |
980 | | /// |
981 | | /// # Example |
982 | | /// ``` |
983 | | /// use arrow_array::BinaryViewArray; |
984 | | /// let array = BinaryViewArray::from_iter_values(vec![b"hello" as &[u8], b"world", b"lulu", b"large payload over 12 bytes"]); |
985 | | /// assert_eq!(array.value(0), b"hello"); |
986 | | /// assert_eq!(array.value(3), b"large payload over 12 bytes"); |
987 | | /// ``` |
988 | | pub type BinaryViewArray = GenericByteViewArray<BinaryViewType>; |
989 | | |
990 | | impl BinaryViewArray { |
991 | | /// Convert the [`BinaryViewArray`] to [`StringViewArray`] |
992 | | /// If any item is not valid UTF-8, validation fails and an error is returned. |
993 | 0 | pub fn to_string_view(self) -> Result<StringViewArray, ArrowError> { |
994 | 0 | StringViewType::validate(self.views(), self.data_buffers())?; |
995 | 0 | unsafe { Ok(self.to_string_view_unchecked()) } |
996 | 0 | } |
997 | | |
998 | | /// Convert the [`BinaryViewArray`] to [`StringViewArray`] |
999 | | /// # Safety |
1000 | | /// Caller is responsible for ensuring that items in array are utf8 data. |
1001 | 0 | pub unsafe fn to_string_view_unchecked(self) -> StringViewArray { |
1002 | 0 | StringViewArray::new_unchecked(self.views, self.buffers, self.nulls) |
1003 | 0 | } |
1004 | | } |
1005 | | |
1006 | | impl From<Vec<&[u8]>> for BinaryViewArray { |
1007 | 0 | fn from(v: Vec<&[u8]>) -> Self { |
1008 | 0 | Self::from_iter_values(v) |
1009 | 0 | } |
1010 | | } |
1011 | | |
1012 | | impl From<Vec<Option<&[u8]>>> for BinaryViewArray { |
1013 | 0 | fn from(v: Vec<Option<&[u8]>>) -> Self { |
1014 | 0 | v.into_iter().collect() |
1015 | 0 | } |
1016 | | } |
1017 | | |
1018 | | /// A [`GenericByteViewArray`] that stores utf8 data |
1019 | | /// |
1020 | | /// See [`GenericByteViewArray`] for format and layout details. |
1021 | | /// |
1022 | | /// # Example |
1023 | | /// ``` |
1024 | | /// use arrow_array::StringViewArray; |
1025 | | /// let array = StringViewArray::from_iter_values(vec!["hello", "world", "lulu", "large payload over 12 bytes"]); |
1026 | | /// assert_eq!(array.value(0), "hello"); |
1027 | | /// assert_eq!(array.value(3), "large payload over 12 bytes"); |
1028 | | /// ``` |
1029 | | pub type StringViewArray = GenericByteViewArray<StringViewType>; |
1030 | | |
1031 | | impl StringViewArray { |
1032 | | /// Convert the [`StringViewArray`] to [`BinaryViewArray`] |
1033 | 0 | pub fn to_binary_view(self) -> BinaryViewArray { |
1034 | 0 | unsafe { BinaryViewArray::new_unchecked(self.views, self.buffers, self.nulls) } |
1035 | 0 | } |
1036 | | |
1037 | | /// Returns true if all data within this array is ASCII |
1038 | 0 | pub fn is_ascii(&self) -> bool { |
1039 | | // Alternative (but incorrect): directly check the underlying buffers |
1040 | | // (1) Our string view might be sparse, i.e., a subset of the buffers, |
1041 | | // so even if the buffer is not ascii, we can still be ascii. |
1042 | | // (2) It is quite difficult to know the range of each buffer (unlike StringArray) |
1043 | | // This means that this operation is quite expensive, shall we cache the result? |
1044 | | // i.e. track `is_ascii` in the builder. |
1045 | 0 | self.iter().all(|v| match v { |
1046 | 0 | Some(v) => v.is_ascii(), |
1047 | 0 | None => true, |
1048 | 0 | }) |
1049 | 0 | } |
1050 | | } |
1051 | | |
1052 | | impl From<Vec<&str>> for StringViewArray { |
1053 | 1 | fn from(v: Vec<&str>) -> Self { |
1054 | 1 | Self::from_iter_values(v) |
1055 | 1 | } |
1056 | | } |
1057 | | |
1058 | | impl From<Vec<Option<&str>>> for StringViewArray { |
1059 | 0 | fn from(v: Vec<Option<&str>>) -> Self { |
1060 | 0 | v.into_iter().collect() |
1061 | 0 | } |
1062 | | } |
1063 | | |
1064 | | impl From<Vec<String>> for StringViewArray { |
1065 | 0 | fn from(v: Vec<String>) -> Self { |
1066 | 0 | Self::from_iter_values(v) |
1067 | 0 | } |
1068 | | } |
1069 | | |
1070 | | impl From<Vec<Option<String>>> for StringViewArray { |
1071 | 0 | fn from(v: Vec<Option<String>>) -> Self { |
1072 | 0 | v.into_iter().collect() |
1073 | 0 | } |
1074 | | } |
1075 | | |
1076 | | #[cfg(test)] |
1077 | | mod tests { |
1078 | | use crate::builder::{BinaryViewBuilder, StringViewBuilder}; |
1079 | | use crate::types::BinaryViewType; |
1080 | | use crate::{ |
1081 | | Array, BinaryViewArray, GenericBinaryArray, GenericByteViewArray, StringViewArray, |
1082 | | }; |
1083 | | use arrow_buffer::{Buffer, ScalarBuffer}; |
1084 | | use arrow_data::{ByteView, MAX_INLINE_VIEW_LEN}; |
1085 | | use rand::prelude::StdRng; |
1086 | | use rand::{Rng, SeedableRng}; |
1087 | | |
1088 | | const BLOCK_SIZE: u32 = 8; |
1089 | | |
1090 | | #[test] |
1091 | | fn try_new_string() { |
1092 | | let array = StringViewArray::from_iter_values(vec![ |
1093 | | "hello", |
1094 | | "world", |
1095 | | "lulu", |
1096 | | "large payload over 12 bytes", |
1097 | | ]); |
1098 | | assert_eq!(array.value(0), "hello"); |
1099 | | assert_eq!(array.value(3), "large payload over 12 bytes"); |
1100 | | } |
1101 | | |
1102 | | #[test] |
1103 | | fn try_new_binary() { |
1104 | | let array = BinaryViewArray::from_iter_values(vec![ |
1105 | | b"hello".as_slice(), |
1106 | | b"world".as_slice(), |
1107 | | b"lulu".as_slice(), |
1108 | | b"large payload over 12 bytes".as_slice(), |
1109 | | ]); |
1110 | | assert_eq!(array.value(0), b"hello"); |
1111 | | assert_eq!(array.value(3), b"large payload over 12 bytes"); |
1112 | | } |
1113 | | |
1114 | | #[test] |
1115 | | fn try_new_empty_string() { |
1116 | | // test empty array |
1117 | | let array = { |
1118 | | let mut builder = StringViewBuilder::new(); |
1119 | | builder.finish() |
1120 | | }; |
1121 | | assert!(array.is_empty()); |
1122 | | } |
1123 | | |
1124 | | #[test] |
1125 | | fn try_new_empty_binary() { |
1126 | | // test empty array |
1127 | | let array = { |
1128 | | let mut builder = BinaryViewBuilder::new(); |
1129 | | builder.finish() |
1130 | | }; |
1131 | | assert!(array.is_empty()); |
1132 | | } |
1133 | | |
1134 | | #[test] |
1135 | | fn test_append_string() { |
1136 | | // test builder append |
1137 | | let array = { |
1138 | | let mut builder = StringViewBuilder::new(); |
1139 | | builder.append_value("hello"); |
1140 | | builder.append_null(); |
1141 | | builder.append_option(Some("large payload over 12 bytes")); |
1142 | | builder.finish() |
1143 | | }; |
1144 | | assert_eq!(array.value(0), "hello"); |
1145 | | assert!(array.is_null(1)); |
1146 | | assert_eq!(array.value(2), "large payload over 12 bytes"); |
1147 | | } |
1148 | | |
1149 | | #[test] |
1150 | | fn test_append_binary() { |
1151 | | // test builder append |
1152 | | let array = { |
1153 | | let mut builder = BinaryViewBuilder::new(); |
1154 | | builder.append_value(b"hello"); |
1155 | | builder.append_null(); |
1156 | | builder.append_option(Some(b"large payload over 12 bytes")); |
1157 | | builder.finish() |
1158 | | }; |
1159 | | assert_eq!(array.value(0), b"hello"); |
1160 | | assert!(array.is_null(1)); |
1161 | | assert_eq!(array.value(2), b"large payload over 12 bytes"); |
1162 | | } |
1163 | | |
1164 | | #[test] |
1165 | | fn test_in_progress_recreation() { |
1166 | | let array = { |
1167 | | // make a builder with small block size. |
1168 | | let mut builder = StringViewBuilder::new().with_fixed_block_size(14); |
1169 | | builder.append_value("large payload over 12 bytes"); |
1170 | | builder.append_option(Some("another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created")); |
1171 | | builder.finish() |
1172 | | }; |
1173 | | assert_eq!(array.value(0), "large payload over 12 bytes"); |
1174 | | assert_eq!(array.value(1), "another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created"); |
1175 | | assert_eq!(2, array.buffers.len()); |
1176 | | } |
1177 | | |
1178 | | #[test] |
1179 | | #[should_panic(expected = "Invalid buffer index at 0: got index 3 but only has 1 buffers")] |
1180 | | fn new_with_invalid_view_data() { |
1181 | | let v = "large payload over 12 bytes"; |
1182 | | let view = ByteView::new(13, &v.as_bytes()[0..4]) |
1183 | | .with_buffer_index(3) |
1184 | | .with_offset(1); |
1185 | | let views = ScalarBuffer::from(vec![view.into()]); |
1186 | | let buffers = vec![Buffer::from_slice_ref(v)]; |
1187 | | StringViewArray::new(views, buffers, None); |
1188 | | } |
1189 | | |
1190 | | #[test] |
1191 | | #[should_panic( |
1192 | | expected = "Encountered non-UTF-8 data at index 0: invalid utf-8 sequence of 1 bytes from index 0" |
1193 | | )] |
1194 | | fn new_with_invalid_utf8_data() { |
1195 | | let v: Vec<u8> = vec![ |
1196 | | // invalid UTF8 |
1197 | | 0xf0, 0x80, 0x80, 0x80, // more bytes to make it larger than 12 |
1198 | | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
1199 | | ]; |
1200 | | let view = ByteView::new(v.len() as u32, &v[0..4]); |
1201 | | let views = ScalarBuffer::from(vec![view.into()]); |
1202 | | let buffers = vec![Buffer::from_slice_ref(v)]; |
1203 | | StringViewArray::new(views, buffers, None); |
1204 | | } |
1205 | | |
1206 | | #[test] |
1207 | | #[should_panic(expected = "View at index 0 contained non-zero padding for string of length 1")] |
1208 | | fn new_with_invalid_zero_padding() { |
1209 | | let mut data = [0; 12]; |
1210 | | data[0] = b'H'; |
1211 | | data[11] = 1; // no zero padding |
1212 | | |
1213 | | let mut view_buffer = [0; 16]; |
1214 | | view_buffer[0..4].copy_from_slice(&1u32.to_le_bytes()); |
1215 | | view_buffer[4..].copy_from_slice(&data); |
1216 | | |
1217 | | let view = ByteView::from(u128::from_le_bytes(view_buffer)); |
1218 | | let views = ScalarBuffer::from(vec![view.into()]); |
1219 | | let buffers = vec![]; |
1220 | | StringViewArray::new(views, buffers, None); |
1221 | | } |
1222 | | |
1223 | | #[test] |
1224 | | #[should_panic(expected = "Mismatch between embedded prefix and data")] |
1225 | | fn test_mismatch_between_embedded_prefix_and_data() { |
1226 | | let input_str_1 = "Hello, Rustaceans!"; |
1227 | | let input_str_2 = "Hallo, Rustaceans!"; |
1228 | | let length = input_str_1.len() as u32; |
1229 | | assert!(input_str_1.len() > 12); |
1230 | | |
1231 | | let mut view_buffer = [0; 16]; |
1232 | | view_buffer[0..4].copy_from_slice(&length.to_le_bytes()); |
1233 | | view_buffer[4..8].copy_from_slice(&input_str_1.as_bytes()[0..4]); |
1234 | | view_buffer[8..12].copy_from_slice(&0u32.to_le_bytes()); |
1235 | | view_buffer[12..].copy_from_slice(&0u32.to_le_bytes()); |
1236 | | let view = ByteView::from(u128::from_le_bytes(view_buffer)); |
1237 | | let views = ScalarBuffer::from(vec![view.into()]); |
1238 | | let buffers = vec![Buffer::from_slice_ref(input_str_2.as_bytes())]; |
1239 | | |
1240 | | StringViewArray::new(views, buffers, None); |
1241 | | } |
1242 | | |
1243 | | #[test] |
1244 | | fn test_gc() { |
1245 | | let test_data = [ |
1246 | | Some("longer than 12 bytes"), |
1247 | | Some("short"), |
1248 | | Some("t"), |
1249 | | Some("longer than 12 bytes"), |
1250 | | None, |
1251 | | Some("short"), |
1252 | | ]; |
1253 | | |
1254 | | let array = { |
1255 | | let mut builder = StringViewBuilder::new().with_fixed_block_size(8); // create multiple buffers |
1256 | | test_data.into_iter().for_each(|v| builder.append_option(v)); |
1257 | | builder.finish() |
1258 | | }; |
1259 | | assert!(array.buffers.len() > 1); |
1260 | | |
1261 | | fn check_gc(to_test: &StringViewArray) { |
1262 | | let gc = to_test.gc(); |
1263 | | assert_ne!(to_test.data_buffers().len(), gc.data_buffers().len()); |
1264 | | |
1265 | | to_test.iter().zip(gc.iter()).for_each(|(a, b)| { |
1266 | | assert_eq!(a, b); |
1267 | | }); |
1268 | | assert_eq!(to_test.len(), gc.len()); |
1269 | | } |
1270 | | |
1271 | | check_gc(&array); |
1272 | | check_gc(&array.slice(1, 3)); |
1273 | | check_gc(&array.slice(2, 1)); |
1274 | | check_gc(&array.slice(2, 2)); |
1275 | | check_gc(&array.slice(3, 1)); |
1276 | | } |
1277 | | |
1278 | | /// 1) Empty array: no elements, expect gc to return empty with no data buffers |
1279 | | #[test] |
1280 | | fn test_gc_empty_array() { |
1281 | | let array = StringViewBuilder::new() |
1282 | | .with_fixed_block_size(BLOCK_SIZE) |
1283 | | .finish(); |
1284 | | let gced = array.gc(); |
1285 | | // length and null count remain zero |
1286 | | assert_eq!(gced.len(), 0); |
1287 | | assert_eq!(gced.null_count(), 0); |
1288 | | // no underlying data buffers should be allocated |
1289 | | assert!( |
1290 | | gced.data_buffers().is_empty(), |
1291 | | "Expected no data buffers for empty array" |
1292 | | ); |
1293 | | } |
1294 | | |
1295 | | /// 2) All inline values (<= MAX_INLINE_VIEW_LEN): no data buffers after gc, same values |
1296 | | #[test] |
1297 | | fn test_gc_all_inline() { |
1298 | | let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE); |
1299 | | // append many short strings, each exactly MAX_INLINE_VIEW_LEN long |
1300 | | for _ in 0..100 { |
1301 | | let s = "A".repeat(MAX_INLINE_VIEW_LEN as usize); |
1302 | | builder.append_option(Some(&s)); |
1303 | | } |
1304 | | let array = builder.finish(); |
1305 | | let gced = array.gc(); |
1306 | | // Since all views fit inline, data buffer is empty |
1307 | | assert_eq!( |
1308 | | gced.data_buffers().len(), |
1309 | | 0, |
1310 | | "Should have no data buffers for inline values" |
1311 | | ); |
1312 | | assert_eq!(gced.len(), 100); |
1313 | | // verify element-wise equality |
1314 | | array.iter().zip(gced.iter()).for_each(|(orig, got)| { |
1315 | | assert_eq!(orig, got, "Inline value mismatch after gc"); |
1316 | | }); |
1317 | | } |
1318 | | |
1319 | | /// 3) All large values (> MAX_INLINE_VIEW_LEN): each must be copied into the new data buffer |
1320 | | #[test] |
1321 | | fn test_gc_all_large() { |
1322 | | let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE); |
1323 | | let large_str = "X".repeat(MAX_INLINE_VIEW_LEN as usize + 5); |
1324 | | // append multiple large strings |
1325 | | for _ in 0..50 { |
1326 | | builder.append_option(Some(&large_str)); |
1327 | | } |
1328 | | let array = builder.finish(); |
1329 | | let gced = array.gc(); |
1330 | | // New data buffers should be populated (one or more blocks) |
1331 | | assert!( |
1332 | | !gced.data_buffers().is_empty(), |
1333 | | "Expected data buffers for large values" |
1334 | | ); |
1335 | | assert_eq!(gced.len(), 50); |
1336 | | // verify that every large string emerges unchanged |
1337 | | array.iter().zip(gced.iter()).for_each(|(orig, got)| { |
1338 | | assert_eq!(orig, got, "Large view mismatch after gc"); |
1339 | | }); |
1340 | | } |
1341 | | |
1342 | | /// 4) All null elements: ensure null bitmap handling path is correct |
1343 | | #[test] |
1344 | | fn test_gc_all_nulls() { |
1345 | | let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE); |
1346 | | for _ in 0..20 { |
1347 | | builder.append_null(); |
1348 | | } |
1349 | | let array = builder.finish(); |
1350 | | let gced = array.gc(); |
1351 | | // length and null count match |
1352 | | assert_eq!(gced.len(), 20); |
1353 | | assert_eq!(gced.null_count(), 20); |
1354 | | // data buffers remain empty for null-only array |
1355 | | assert!( |
1356 | | gced.data_buffers().is_empty(), |
1357 | | "No data should be stored for nulls" |
1358 | | ); |
1359 | | } |
1360 | | |
1361 | | /// 5) Random mix of inline, large, and null values with slicing tests |
1362 | | #[test] |
1363 | | fn test_gc_random_mixed_and_slices() { |
1364 | | let mut rng = StdRng::seed_from_u64(42); |
1365 | | let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE); |
1366 | | // Keep a Vec of original Option<String> for later comparison |
1367 | | let mut original: Vec<Option<String>> = Vec::new(); |
1368 | | |
1369 | | for _ in 0..200 { |
1370 | | if rng.random_bool(0.1) { |
1371 | | // 10% nulls |
1372 | | builder.append_null(); |
1373 | | original.push(None); |
1374 | | } else { |
1375 | | // random length between 0 and twice the inline limit |
1376 | | let len = rng.random_range(0..(MAX_INLINE_VIEW_LEN * 2)); |
1377 | | let s: String = "A".repeat(len as usize); |
1378 | | builder.append_option(Some(&s)); |
1379 | | original.push(Some(s)); |
1380 | | } |
1381 | | } |
1382 | | |
1383 | | let array = builder.finish(); |
1384 | | // Test multiple slice ranges to ensure offset logic is correct |
1385 | | for (offset, slice_len) in &[(0, 50), (10, 100), (150, 30)] { |
1386 | | let sliced = array.slice(*offset, *slice_len); |
1387 | | let gced = sliced.gc(); |
1388 | | // Build expected slice of Option<&str> |
1389 | | let expected: Vec<Option<&str>> = original[*offset..(*offset + *slice_len)] |
1390 | | .iter() |
1391 | | .map(|opt| opt.as_deref()) |
1392 | | .collect(); |
1393 | | |
1394 | | assert_eq!(gced.len(), *slice_len, "Slice length mismatch"); |
1395 | | // Compare element-wise |
1396 | | gced.iter().zip(expected.iter()).for_each(|(got, expect)| { |
1397 | | assert_eq!(got, *expect, "Value mismatch in mixed slice after gc"); |
1398 | | }); |
1399 | | } |
1400 | | } |
1401 | | |
1402 | | #[test] |
1403 | | fn test_eq() { |
1404 | | let test_data = [ |
1405 | | Some("longer than 12 bytes"), |
1406 | | None, |
1407 | | Some("short"), |
1408 | | Some("again, this is longer than 12 bytes"), |
1409 | | ]; |
1410 | | |
1411 | | let array1 = { |
1412 | | let mut builder = StringViewBuilder::new().with_fixed_block_size(8); |
1413 | | test_data.into_iter().for_each(|v| builder.append_option(v)); |
1414 | | builder.finish() |
1415 | | }; |
1416 | | let array2 = { |
1417 | | // create a new array with the same data but different layout |
1418 | | let mut builder = StringViewBuilder::new().with_fixed_block_size(100); |
1419 | | test_data.into_iter().for_each(|v| builder.append_option(v)); |
1420 | | builder.finish() |
1421 | | }; |
1422 | | assert_eq!(array1, array1.clone()); |
1423 | | assert_eq!(array2, array2.clone()); |
1424 | | assert_eq!(array1, array2); |
1425 | | } |
1426 | | |
1427 | | /// Integration tests for `inline_key_fast` covering: |
1428 | | /// |
1429 | | /// 1. Monotonic ordering across increasing lengths and lexical variations. |
1430 | | /// 2. Cross-check against `GenericBinaryArray` comparison to ensure semantic equivalence. |
1431 | | /// |
1432 | | /// This also includes a specific test for the “bar” vs. “bar\0” case, demonstrating why |
1433 | | /// the length field is required even when all inline bytes fit in 12 bytes. |
1434 | | /// |
1435 | | /// The test includes strings that verify correct byte order (prevent reversal bugs), |
1436 | | /// and length-based tie-breaking in the composite key. |
1437 | | /// |
1438 | | /// The test confirms that `inline_key_fast` produces keys which sort consistently |
1439 | | /// with the expected lexicographical order of the raw byte arrays. |
1440 | | #[test] |
1441 | | fn test_inline_key_fast_various_lengths_and_lexical() { |
1442 | | /// Helper to create a raw u128 value representing an inline ByteView: |
1443 | | /// - `length`: number of meaningful bytes (must be ≤ 12) |
1444 | | /// - `data`: the actual inline data bytes |
1445 | | /// |
1446 | | /// The first 4 bytes encode length in little-endian, |
1447 | | /// the following 12 bytes contain the inline string data (zero-padded). |
1448 | | fn make_raw_inline(length: u32, data: &[u8]) -> u128 { |
1449 | | assert!(length as usize <= 12, "Inline length must be ≤ 12"); |
1450 | | assert!( |
1451 | | data.len() == length as usize, |
1452 | | "Data length must match `length`" |
1453 | | ); |
1454 | | |
1455 | | let mut raw_bytes = [0u8; 16]; |
1456 | | raw_bytes[0..4].copy_from_slice(&length.to_le_bytes()); // length stored little-endian |
1457 | | raw_bytes[4..(4 + data.len())].copy_from_slice(data); // inline data |
1458 | | u128::from_le_bytes(raw_bytes) |
1459 | | } |
1460 | | |
1461 | | // Test inputs: various lengths and lexical orders, |
1462 | | // plus special cases for byte order and length tie-breaking |
1463 | | let test_inputs: Vec<&[u8]> = vec![ |
1464 | | b"a", |
1465 | | b"aa", |
1466 | | b"aaa", |
1467 | | b"aab", |
1468 | | b"abcd", |
1469 | | b"abcde", |
1470 | | b"abcdef", |
1471 | | b"abcdefg", |
1472 | | b"abcdefgh", |
1473 | | b"abcdefghi", |
1474 | | b"abcdefghij", |
1475 | | b"abcdefghijk", |
1476 | | b"abcdefghijkl", |
1477 | | // Tests for byte-order reversal bug: |
1478 | | // Without the fix, "backend one" would compare as "eno dnekcab", |
1479 | | // causing incorrect sort order relative to "backend two". |
1480 | | b"backend one", |
1481 | | b"backend two", |
1482 | | // Tests length-tiebreaker logic: |
1483 | | // "bar" (3 bytes) and "bar\0" (4 bytes) have identical inline data, |
1484 | | // so only the length differentiates their ordering. |
1485 | | b"bar", |
1486 | | b"bar\0", |
1487 | | // Additional lexical and length tie-breaking cases with same prefix, in correct lex order: |
1488 | | b"than12Byt", |
1489 | | b"than12Bytes", |
1490 | | b"than12Bytes\0", |
1491 | | b"than12Bytesx", |
1492 | | b"than12Bytex", |
1493 | | b"than12Bytez", |
1494 | | // Additional lexical tests |
1495 | | b"xyy", |
1496 | | b"xyz", |
1497 | | b"xza", |
1498 | | ]; |
1499 | | |
1500 | | // Create a GenericBinaryArray for cross-comparison of lex order |
1501 | | let array: GenericBinaryArray<i32> = |
1502 | | GenericBinaryArray::from(test_inputs.iter().map(|s| Some(*s)).collect::<Vec<_>>()); |
1503 | | |
1504 | | for i in 0..array.len() - 1 { |
1505 | | let v1 = array.value(i); |
1506 | | let v2 = array.value(i + 1); |
1507 | | |
1508 | | // Assert the array's natural lexical ordering is correct |
1509 | | assert!(v1 < v2, "Array compare failed: {v1:?} !< {v2:?}"); |
1510 | | |
1511 | | // Assert the keys produced by inline_key_fast reflect the same ordering |
1512 | | let key1 = GenericByteViewArray::<BinaryViewType>::inline_key_fast(make_raw_inline( |
1513 | | v1.len() as u32, |
1514 | | v1, |
1515 | | )); |
1516 | | let key2 = GenericByteViewArray::<BinaryViewType>::inline_key_fast(make_raw_inline( |
1517 | | v2.len() as u32, |
1518 | | v2, |
1519 | | )); |
1520 | | |
1521 | | assert!( |
1522 | | key1 < key2, |
1523 | | "Key compare failed: key({v1:?})=0x{key1:032x} !< key({v2:?})=0x{key2:032x}", |
1524 | | ); |
1525 | | } |
1526 | | } |
1527 | | } |