/Users/andrewlamb/Software/arrow-rs/arrow-array/src/array/byte_view_array.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use crate::array::print_long_array; |
19 | | use crate::builder::{ArrayBuilder, GenericByteViewBuilder}; |
20 | | use crate::iterator::ArrayIter; |
21 | | use crate::types::bytes::ByteArrayNativeType; |
22 | | use crate::types::{BinaryViewType, ByteViewType, StringViewType}; |
23 | | use crate::{Array, ArrayAccessor, ArrayRef, GenericByteArray, OffsetSizeTrait, Scalar}; |
24 | | use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, ScalarBuffer}; |
25 | | use arrow_data::{ArrayData, ArrayDataBuilder, ByteView, MAX_INLINE_VIEW_LEN}; |
26 | | use arrow_schema::{ArrowError, DataType}; |
27 | | use core::str; |
28 | | use num_traits::ToPrimitive; |
29 | | use std::any::Any; |
30 | | use std::cmp::Ordering; |
31 | | use std::fmt::Debug; |
32 | | use std::marker::PhantomData; |
33 | | use std::sync::Arc; |
34 | | |
35 | | use super::ByteArrayType; |
36 | | |
37 | | /// [Variable-size Binary View Layout]: An array of variable length bytes views. |
38 | | /// |
39 | | /// This array type is used to store variable length byte data (e.g. Strings, Binary) |
40 | | /// and has efficient operations such as `take`, `filter`, and comparison. |
41 | | /// |
42 | | /// [Variable-size Binary View Layout]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout |
43 | | /// |
44 | | /// This is different from [`GenericByteArray`], which also stores variable |
45 | | /// length byte data, as it represents strings with an offset and length. `take` |
46 | | /// and `filter` like operations are implemented by manipulating the "views" |
47 | | /// (`u128`) without modifying the bytes. Each view also stores an inlined |
48 | | /// prefix which speed up comparisons. |
49 | | /// |
50 | | /// # See Also |
51 | | /// |
52 | | /// * [`StringViewArray`] for storing utf8 encoded string data |
53 | | /// * [`BinaryViewArray`] for storing bytes |
54 | | /// * [`ByteView`] to interpret `u128`s layout of the views. |
55 | | /// |
56 | | /// [`ByteView`]: arrow_data::ByteView |
57 | | /// |
58 | | /// # Layout: "views" and buffers |
59 | | /// |
60 | | /// A `GenericByteViewArray` stores variable length byte strings. An array of |
61 | | /// `N` elements is stored as `N` fixed length "views" and a variable number |
62 | | /// of variable length "buffers". |
63 | | /// |
64 | | /// Each view is a `u128` value whose layout is different depending on the |
65 | | /// length of the string stored at that location: |
66 | | /// |
67 | | /// ```text |
68 | | /// ┌──────┬────────────────────────┐ |
69 | | /// │length│ string value │ |
70 | | /// Strings (len <= 12) │ │ (padded with 0) │ |
71 | | /// └──────┴────────────────────────┘ |
72 | | /// 0 31 127 |
73 | | /// |
74 | | /// ┌───────┬───────┬───────┬───────┐ |
75 | | /// │length │prefix │ buf │offset │ |
76 | | /// Strings (len > 12) │ │ │ index │ │ |
77 | | /// └───────┴───────┴───────┴───────┘ |
78 | | /// 0 31 63 95 127 |
79 | | /// ``` |
80 | | /// |
81 | | /// * Strings with length <= 12 ([`MAX_INLINE_VIEW_LEN`]) are stored directly in |
82 | | /// the view. See [`Self::inline_value`] to access the inlined prefix from a |
83 | | /// short view. |
84 | | /// |
85 | | /// * Strings with length > 12: The first four bytes are stored inline in the |
86 | | /// view and the entire string is stored in one of the buffers. See [`ByteView`] |
87 | | /// to access the fields of the these views. |
88 | | /// |
89 | | /// As with other arrays, the optimized kernels in [`arrow_compute`] are likely |
90 | | /// the easiest and fastest way to work with this data. However, it is possible |
91 | | /// to access the views and buffers directly for more control. |
92 | | /// |
93 | | /// For example |
94 | | /// |
95 | | /// ```rust |
96 | | /// # use arrow_array::StringViewArray; |
97 | | /// # use arrow_array::Array; |
98 | | /// use arrow_data::ByteView; |
99 | | /// let array = StringViewArray::from(vec![ |
100 | | /// "hello", |
101 | | /// "this string is longer than 12 bytes", |
102 | | /// "this string is also longer than 12 bytes" |
103 | | /// ]); |
104 | | /// |
105 | | /// // ** Examine the first view (short string) ** |
106 | | /// assert!(array.is_valid(0)); // Check for nulls |
107 | | /// let short_view: u128 = array.views()[0]; // "hello" |
108 | | /// // get length of the string |
109 | | /// let len = short_view as u32; |
110 | | /// assert_eq!(len, 5); // strings less than 12 bytes are stored in the view |
111 | | /// // SAFETY: `view` is a valid view |
112 | | /// let value = unsafe { |
113 | | /// StringViewArray::inline_value(&short_view, len as usize) |
114 | | /// }; |
115 | | /// assert_eq!(value, b"hello"); |
116 | | /// |
117 | | /// // ** Examine the third view (long string) ** |
118 | | /// assert!(array.is_valid(12)); // Check for nulls |
119 | | /// let long_view: u128 = array.views()[2]; // "this string is also longer than 12 bytes" |
120 | | /// let len = long_view as u32; |
121 | | /// assert_eq!(len, 40); // strings longer than 12 bytes are stored in the buffer |
122 | | /// let view = ByteView::from(long_view); // use ByteView to access the fields |
123 | | /// assert_eq!(view.length, 40); |
124 | | /// assert_eq!(view.buffer_index, 0); |
125 | | /// assert_eq!(view.offset, 35); // data starts after the first long string |
126 | | /// // Views for long strings store a 4 byte prefix |
127 | | /// let prefix = view.prefix.to_le_bytes(); |
128 | | /// assert_eq!(&prefix, b"this"); |
129 | | /// let value = array.value(2); // get the string value (see `value` implementation for how to access the bytes directly) |
130 | | /// assert_eq!(value, "this string is also longer than 12 bytes"); |
131 | | /// ``` |
132 | | /// |
133 | | /// [`MAX_INLINE_VIEW_LEN`]: arrow_data::MAX_INLINE_VIEW_LEN |
134 | | /// [`arrow_compute`]: https://docs.rs/arrow/latest/arrow/compute/index.html |
135 | | /// |
136 | | /// Unlike [`GenericByteArray`], there are no constraints on the offsets other |
137 | | /// than they must point into a valid buffer. However, they can be out of order, |
138 | | /// non continuous and overlapping. |
139 | | /// |
140 | | /// For example, in the following diagram, the strings "FishWasInTownToday" and |
141 | | /// "CrumpleFacedFish" are both longer than 12 bytes and thus are stored in a |
142 | | /// separate buffer while the string "LavaMonster" is stored inlined in the |
143 | | /// view. In this case, the same bytes for "Fish" are used to store both strings. |
144 | | /// |
145 | | /// [`ByteView`]: arrow_data::ByteView |
146 | | /// |
147 | | /// ```text |
148 | | /// ┌───┐ |
149 | | /// ┌──────┬──────┬──────┬──────┐ offset │...│ |
150 | | /// "FishWasInTownTodayYay" │ 21 │ Fish │ 0 │ 115 │─ ─ 103 │Mr.│ |
151 | | /// └──────┴──────┴──────┴──────┘ │ ┌ ─ ─ ─ ─ ▶ │Cru│ |
152 | | /// ┌──────┬──────┬──────┬──────┐ │mpl│ |
153 | | /// "CrumpleFacedFish" │ 16 │ Crum │ 0 │ 103 │─ ─│─ ─ ─ ┘ │eFa│ |
154 | | /// └──────┴──────┴──────┴──────┘ │ced│ |
155 | | /// ┌──────┬────────────────────┐ └ ─ ─ ─ ─ ─ ─ ─ ─ ▶│Fis│ |
156 | | /// "LavaMonster" │ 11 │ LavaMonster │ │hWa│ |
157 | | /// └──────┴────────────────────┘ offset │sIn│ |
158 | | /// 115 │Tow│ |
159 | | /// │nTo│ |
160 | | /// │day│ |
161 | | /// u128 "views" │Yay│ |
162 | | /// buffer 0 │...│ |
163 | | /// └───┘ |
164 | | /// ``` |
165 | | pub struct GenericByteViewArray<T: ByteViewType + ?Sized> { |
166 | | data_type: DataType, |
167 | | views: ScalarBuffer<u128>, |
168 | | buffers: Vec<Buffer>, |
169 | | phantom: PhantomData<T>, |
170 | | nulls: Option<NullBuffer>, |
171 | | } |
172 | | |
173 | | impl<T: ByteViewType + ?Sized> Clone for GenericByteViewArray<T> { |
174 | 108 | fn clone(&self) -> Self { |
175 | 108 | Self { |
176 | 108 | data_type: T::DATA_TYPE, |
177 | 108 | views: self.views.clone(), |
178 | 108 | buffers: self.buffers.clone(), |
179 | 108 | nulls: self.nulls.clone(), |
180 | 108 | phantom: Default::default(), |
181 | 108 | } |
182 | 108 | } |
183 | | } |
184 | | |
185 | | impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> { |
186 | | /// Create a new [`GenericByteViewArray`] from the provided parts, panicking on failure |
187 | | /// |
188 | | /// # Panics |
189 | | /// |
190 | | /// Panics if [`GenericByteViewArray::try_new`] returns an error |
191 | | pub fn new(views: ScalarBuffer<u128>, buffers: Vec<Buffer>, nulls: Option<NullBuffer>) -> Self { |
192 | | Self::try_new(views, buffers, nulls).unwrap() |
193 | | } |
194 | | |
195 | | /// Create a new [`GenericByteViewArray`] from the provided parts, returning an error on failure |
196 | | /// |
197 | | /// # Errors |
198 | | /// |
199 | | /// * `views.len() != nulls.len()` |
200 | | /// * [ByteViewType::validate] fails |
201 | 2 | pub fn try_new( |
202 | 2 | views: ScalarBuffer<u128>, |
203 | 2 | buffers: Vec<Buffer>, |
204 | 2 | nulls: Option<NullBuffer>, |
205 | 2 | ) -> Result<Self, ArrowError> { |
206 | 2 | T::validate(&views, &buffers)?0 ; |
207 | | |
208 | 2 | if let Some(n0 ) = nulls.as_ref() { |
209 | 0 | if n.len() != views.len() { |
210 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
211 | 0 | "Incorrect length of null buffer for {}ViewArray, expected {} got {}", |
212 | 0 | T::PREFIX, |
213 | 0 | views.len(), |
214 | 0 | n.len(), |
215 | 0 | ))); |
216 | 0 | } |
217 | 2 | } |
218 | | |
219 | 2 | Ok(Self { |
220 | 2 | data_type: T::DATA_TYPE, |
221 | 2 | views, |
222 | 2 | buffers, |
223 | 2 | nulls, |
224 | 2 | phantom: Default::default(), |
225 | 2 | }) |
226 | 2 | } |
227 | | |
228 | | /// Create a new [`GenericByteViewArray`] from the provided parts, without validation |
229 | | /// |
230 | | /// # Safety |
231 | | /// |
232 | | /// Safe if [`Self::try_new`] would not error |
233 | 216 | pub unsafe fn new_unchecked( |
234 | 216 | views: ScalarBuffer<u128>, |
235 | 216 | buffers: Vec<Buffer>, |
236 | 216 | nulls: Option<NullBuffer>, |
237 | 216 | ) -> Self { |
238 | 216 | if cfg!(feature = "force_validate") { |
239 | 0 | return Self::new(views, buffers, nulls); |
240 | 216 | } |
241 | | |
242 | 216 | Self { |
243 | 216 | data_type: T::DATA_TYPE, |
244 | 216 | phantom: Default::default(), |
245 | 216 | views, |
246 | 216 | buffers, |
247 | 216 | nulls, |
248 | 216 | } |
249 | 216 | } |
250 | | |
251 | | /// Create a new [`GenericByteViewArray`] of length `len` where all values are null |
252 | | pub fn new_null(len: usize) -> Self { |
253 | | Self { |
254 | | data_type: T::DATA_TYPE, |
255 | | views: vec![0; len].into(), |
256 | | buffers: vec![], |
257 | | nulls: Some(NullBuffer::new_null(len)), |
258 | | phantom: Default::default(), |
259 | | } |
260 | | } |
261 | | |
262 | | /// Create a new [`Scalar`] from `value` |
263 | | pub fn new_scalar(value: impl AsRef<T::Native>) -> Scalar<Self> { |
264 | | Scalar::new(Self::from_iter_values(std::iter::once(value))) |
265 | | } |
266 | | |
267 | | /// Creates a [`GenericByteViewArray`] based on an iterator of values without nulls |
268 | 2 | pub fn from_iter_values<Ptr, I>(iter: I) -> Self |
269 | 2 | where |
270 | 2 | Ptr: AsRef<T::Native>, |
271 | 2 | I: IntoIterator<Item = Ptr>, |
272 | | { |
273 | 2 | let iter = iter.into_iter(); |
274 | 2 | let mut builder = GenericByteViewBuilder::<T>::with_capacity(iter.size_hint().0); |
275 | 7 | for v5 in iter { |
276 | 5 | builder.append_value(v); |
277 | 5 | } |
278 | 2 | builder.finish() |
279 | 2 | } |
280 | | |
281 | | /// Deconstruct this array into its constituent parts |
282 | | pub fn into_parts(self) -> (ScalarBuffer<u128>, Vec<Buffer>, Option<NullBuffer>) { |
283 | | (self.views, self.buffers, self.nulls) |
284 | | } |
285 | | |
286 | | /// Returns the views buffer |
287 | | #[inline] |
288 | 409 | pub fn views(&self) -> &ScalarBuffer<u128> { |
289 | 409 | &self.views |
290 | 409 | } |
291 | | |
292 | | /// Returns the buffers storing string data |
293 | | #[inline] |
294 | 600 | pub fn data_buffers(&self) -> &[Buffer] { |
295 | 600 | &self.buffers |
296 | 600 | } |
297 | | |
298 | | /// Returns the element at index `i` |
299 | | /// |
300 | | /// Note: This method does not check for nulls and the value is arbitrary |
301 | | /// (but still well-defined) if [`is_null`](Self::is_null) returns true for the index. |
302 | | /// |
303 | | /// # Panics |
304 | | /// Panics if index `i` is out of bounds. |
305 | 0 | pub fn value(&self, i: usize) -> &T::Native { |
306 | 0 | assert!( |
307 | 0 | i < self.len(), |
308 | 0 | "Trying to access an element at index {} from a {}ViewArray of length {}", |
309 | | i, |
310 | | T::PREFIX, |
311 | 0 | self.len() |
312 | | ); |
313 | | |
314 | 0 | unsafe { self.value_unchecked(i) } |
315 | 0 | } |
316 | | |
317 | | /// Returns the element at index `i` without bounds checking |
318 | | /// |
319 | | /// Note: This method does not check for nulls and the value is arbitrary |
320 | | /// if [`is_null`](Self::is_null) returns true for the index. |
321 | | /// |
322 | | /// # Safety |
323 | | /// |
324 | | /// Caller is responsible for ensuring that the index is within the bounds |
325 | | /// of the array |
326 | 47.9k | pub unsafe fn value_unchecked(&self, idx: usize) -> &T::Native { |
327 | 47.9k | let v = unsafe { self.views.get_unchecked(idx) }; |
328 | 47.9k | let len = *v as u32; |
329 | 47.9k | let b = if len <= MAX_INLINE_VIEW_LEN { |
330 | 30.8k | unsafe { Self::inline_value(v, len as usize) } |
331 | | } else { |
332 | 17.1k | let view = ByteView::from(*v); |
333 | 17.1k | let data = unsafe { self.buffers.get_unchecked(view.buffer_index as usize) }; |
334 | 17.1k | let offset = view.offset as usize; |
335 | 17.1k | unsafe { data.get_unchecked(offset..offset + len as usize) } |
336 | | }; |
337 | 47.9k | unsafe { T::Native::from_bytes_unchecked(b) } |
338 | 47.9k | } |
339 | | |
340 | | /// Returns the first `len` bytes the inline value of the view. |
341 | | /// |
342 | | /// # Safety |
343 | | /// - The `view` must be a valid element from `Self::views()` that adheres to the view layout. |
344 | | /// - The `len` must be the length of the inlined value. It should never be larger than [`MAX_INLINE_VIEW_LEN`]. |
345 | | #[inline(always)] |
346 | 30.8k | pub unsafe fn inline_value(view: &u128, len: usize) -> &[u8] { |
347 | 30.8k | debug_assert!(len <= MAX_INLINE_VIEW_LEN as usize); |
348 | | unsafe { |
349 | 30.8k | std::slice::from_raw_parts((view as *const u128 as *const u8).wrapping_add(4), len) |
350 | | } |
351 | 30.8k | } |
352 | | |
353 | | /// Constructs a new iterator for iterating over the values of this array |
354 | 87 | pub fn iter(&self) -> ArrayIter<&Self> { |
355 | 87 | ArrayIter::new(self) |
356 | 87 | } |
357 | | |
358 | | /// Returns an iterator over the bytes of this array, including null values |
359 | | pub fn bytes_iter(&self) -> impl Iterator<Item = &[u8]> { |
360 | | self.views.iter().map(move |v| { |
361 | | let len = *v as u32; |
362 | | if len <= MAX_INLINE_VIEW_LEN { |
363 | | unsafe { Self::inline_value(v, len as usize) } |
364 | | } else { |
365 | | let view = ByteView::from(*v); |
366 | | let data = &self.buffers[view.buffer_index as usize]; |
367 | | let offset = view.offset as usize; |
368 | | unsafe { data.get_unchecked(offset..offset + len as usize) } |
369 | | } |
370 | | }) |
371 | | } |
372 | | |
373 | | /// Returns an iterator over the first `prefix_len` bytes of each array |
374 | | /// element, including null values. |
375 | | /// |
376 | | /// If `prefix_len` is larger than the element's length, the iterator will |
377 | | /// return an empty slice (`&[]`). |
378 | | pub fn prefix_bytes_iter(&self, prefix_len: usize) -> impl Iterator<Item = &[u8]> { |
379 | | self.views().into_iter().map(move |v| { |
380 | | let len = (*v as u32) as usize; |
381 | | |
382 | | if len < prefix_len { |
383 | | return &[] as &[u8]; |
384 | | } |
385 | | |
386 | | if prefix_len <= 4 || len as u32 <= MAX_INLINE_VIEW_LEN { |
387 | | unsafe { StringViewArray::inline_value(v, prefix_len) } |
388 | | } else { |
389 | | let view = ByteView::from(*v); |
390 | | let data = unsafe { |
391 | | self.data_buffers() |
392 | | .get_unchecked(view.buffer_index as usize) |
393 | | }; |
394 | | let offset = view.offset as usize; |
395 | | unsafe { data.get_unchecked(offset..offset + prefix_len) } |
396 | | } |
397 | | }) |
398 | | } |
399 | | |
400 | | /// Returns an iterator over the last `suffix_len` bytes of each array |
401 | | /// element, including null values. |
402 | | /// |
403 | | /// Note that for [`StringViewArray`] the last bytes may start in the middle |
404 | | /// of a UTF-8 codepoint, and thus may not be a valid `&str`. |
405 | | /// |
406 | | /// If `suffix_len` is larger than the element's length, the iterator will |
407 | | /// return an empty slice (`&[]`). |
408 | | pub fn suffix_bytes_iter(&self, suffix_len: usize) -> impl Iterator<Item = &[u8]> { |
409 | | self.views().into_iter().map(move |v| { |
410 | | let len = (*v as u32) as usize; |
411 | | |
412 | | if len < suffix_len { |
413 | | return &[] as &[u8]; |
414 | | } |
415 | | |
416 | | if len as u32 <= MAX_INLINE_VIEW_LEN { |
417 | | unsafe { &StringViewArray::inline_value(v, len)[len - suffix_len..] } |
418 | | } else { |
419 | | let view = ByteView::from(*v); |
420 | | let data = unsafe { |
421 | | self.data_buffers() |
422 | | .get_unchecked(view.buffer_index as usize) |
423 | | }; |
424 | | let offset = view.offset as usize; |
425 | | unsafe { data.get_unchecked(offset + len - suffix_len..offset + len) } |
426 | | } |
427 | | }) |
428 | | } |
429 | | |
430 | | /// Returns a zero-copy slice of this array with the indicated offset and length. |
431 | 53 | pub fn slice(&self, offset: usize, length: usize) -> Self { |
432 | | Self { |
433 | 53 | data_type: T::DATA_TYPE, |
434 | 53 | views: self.views.slice(offset, length), |
435 | 53 | buffers: self.buffers.clone(), |
436 | 53 | nulls: self.nulls.as_ref().map(|n| n35 .slice35 (offset35 , length35 )), |
437 | 53 | phantom: Default::default(), |
438 | | } |
439 | 53 | } |
440 | | |
441 | | /// Returns a "compacted" version of this array |
442 | | /// |
443 | | /// The original array will *not* be modified |
444 | | /// |
445 | | /// # Garbage Collection |
446 | | /// |
447 | | /// Before GC: |
448 | | /// ```text |
449 | | /// ┌──────┐ |
450 | | /// │......│ |
451 | | /// │......│ |
452 | | /// ┌────────────────────┐ ┌ ─ ─ ─ ▶ │Data1 │ Large buffer |
453 | | /// │ View 1 │─ ─ ─ ─ │......│ with data that |
454 | | /// ├────────────────────┤ │......│ is not referred |
455 | | /// │ View 2 │─ ─ ─ ─ ─ ─ ─ ─▶ │Data2 │ to by View 1 or |
456 | | /// └────────────────────┘ │......│ View 2 |
457 | | /// │......│ |
458 | | /// 2 views, refer to │......│ |
459 | | /// small portions of a └──────┘ |
460 | | /// large buffer |
461 | | /// ``` |
462 | | /// |
463 | | /// After GC: |
464 | | /// |
465 | | /// ```text |
466 | | /// ┌────────────────────┐ ┌─────┐ After gc, only |
467 | | /// │ View 1 │─ ─ ─ ─ ─ ─ ─ ─▶ │Data1│ data that is |
468 | | /// ├────────────────────┤ ┌ ─ ─ ─ ▶ │Data2│ pointed to by |
469 | | /// │ View 2 │─ ─ ─ ─ └─────┘ the views is |
470 | | /// └────────────────────┘ left |
471 | | /// |
472 | | /// |
473 | | /// 2 views |
474 | | /// ``` |
475 | | /// This method will compact the data buffers by recreating the view array and only include the data |
476 | | /// that is pointed to by the views. |
477 | | /// |
478 | | /// Note that it will copy the array regardless of whether the original array is compact. |
479 | | /// Use with caution as this can be an expensive operation, only use it when you are sure that the view |
480 | | /// array is significantly smaller than when it is originally created, e.g., after filtering or slicing. |
481 | | /// |
482 | | /// Note: this function does not attempt to canonicalize / deduplicate values. For this |
483 | | /// feature see [`GenericByteViewBuilder::with_deduplicate_strings`]. |
484 | | pub fn gc(&self) -> Self { |
485 | | // 1) Read basic properties once |
486 | | let len = self.len(); // number of elements |
487 | | let nulls = self.nulls().cloned(); // reuse & clone existing null bitmap |
488 | | |
489 | | // 1.5) Fast path: if there are no buffers, just reuse original views and no data blocks |
490 | | if self.data_buffers().is_empty() { |
491 | | return unsafe { |
492 | | GenericByteViewArray::new_unchecked( |
493 | | self.views().clone(), |
494 | | vec![], // empty data blocks |
495 | | nulls, |
496 | | ) |
497 | | }; |
498 | | } |
499 | | |
500 | | // 2) Calculate total size of all non-inline data and detect if any exists |
501 | | let total_large = self.total_buffer_bytes_used(); |
502 | | |
503 | | // 2.5) Fast path: if there is no non-inline data, avoid buffer allocation & processing |
504 | | if total_large == 0 { |
505 | | // Views are inline-only or all null; just reuse original views and no data blocks |
506 | | return unsafe { |
507 | | GenericByteViewArray::new_unchecked( |
508 | | self.views().clone(), |
509 | | vec![], // empty data blocks |
510 | | nulls, |
511 | | ) |
512 | | }; |
513 | | } |
514 | | |
515 | | let (views_buf, data_blocks) = if total_large < i32::MAX as usize { |
516 | | // fast path, the entire data fits in a single buffer |
517 | | // 3) Allocate exactly capacity for all non-inline data |
518 | | let mut data_buf = Vec::with_capacity(total_large); |
519 | | |
520 | | // 4) Iterate over views and process each inline/non-inline view |
521 | | let views_buf: Vec<u128> = (0..len) |
522 | | .map(|i| unsafe { self.copy_view_to_buffer(i, 0, &mut data_buf) }) |
523 | | .collect(); |
524 | | let data_block = Buffer::from_vec(data_buf); |
525 | | let data_blocks = vec![data_block]; |
526 | | (views_buf, data_blocks) |
527 | | } else { |
528 | | // slow path, need to split into multiple buffers |
529 | | |
530 | | struct GcCopyGroup { |
531 | | total_buffer_bytes: usize, |
532 | | total_len: usize, |
533 | | } |
534 | | |
535 | | impl GcCopyGroup { |
536 | 0 | fn new(total_buffer_bytes: u32, total_len: usize) -> Self { |
537 | 0 | Self { |
538 | 0 | total_buffer_bytes: total_buffer_bytes as usize, |
539 | 0 | total_len, |
540 | 0 | } |
541 | 0 | } |
542 | | } |
543 | | |
544 | | let mut groups = Vec::new(); |
545 | | let mut current_length = 0; |
546 | | let mut current_elements = 0; |
547 | | |
548 | | for view in self.views() { |
549 | | let len = *view as u32; |
550 | | if len > MAX_INLINE_VIEW_LEN { |
551 | | if current_length + len > i32::MAX as u32 { |
552 | | // Start a new group |
553 | | groups.push(GcCopyGroup::new(current_length, current_elements)); |
554 | | current_length = 0; |
555 | | current_elements = 0; |
556 | | } |
557 | | current_length += len; |
558 | | current_elements += 1; |
559 | | } |
560 | | } |
561 | | if current_elements != 0 { |
562 | | groups.push(GcCopyGroup::new(current_length, current_elements)); |
563 | | } |
564 | | debug_assert!(groups.len() <= i32::MAX as usize); |
565 | | |
566 | | // 3) Copy the buffers group by group |
567 | | let mut views_buf = Vec::with_capacity(len); |
568 | | let mut data_blocks = Vec::with_capacity(groups.len()); |
569 | | |
570 | | let mut current_view_idx = 0; |
571 | | |
572 | | for (group_idx, gc_copy_group) in groups.iter().enumerate() { |
573 | | let mut data_buf = Vec::with_capacity(gc_copy_group.total_buffer_bytes); |
574 | | |
575 | | // Directly push views to avoid intermediate Vec allocation |
576 | | let new_views = (current_view_idx..current_view_idx + gc_copy_group.total_len).map( |
577 | | |view_idx| { |
578 | | // safety: the view index came from iterating over valid range |
579 | | unsafe { |
580 | | self.copy_view_to_buffer(view_idx, group_idx as i32, &mut data_buf) |
581 | | } |
582 | | }, |
583 | | ); |
584 | | views_buf.extend(new_views); |
585 | | |
586 | | data_blocks.push(Buffer::from_vec(data_buf)); |
587 | | current_view_idx += gc_copy_group.total_len; |
588 | | } |
589 | | (views_buf, data_blocks) |
590 | | }; |
591 | | |
592 | | // 5) Wrap up views buffer |
593 | | let views_scalar = ScalarBuffer::from(views_buf); |
594 | | |
595 | | // SAFETY: views_scalar, data_blocks, and nulls are correctly aligned and sized |
596 | | unsafe { GenericByteViewArray::new_unchecked(views_scalar, data_blocks, nulls) } |
597 | | } |
598 | | |
599 | | /// Copy the i‑th view into `data_buf` if it refers to an out‑of‑line buffer. |
600 | | /// |
601 | | /// # Safety |
602 | | /// |
603 | | /// - `i < self.len()`. |
604 | | /// - Every element in `self.views()` must currently refer to a valid slice |
605 | | /// inside one of `self.buffers`. |
606 | | /// - `data_buf` must be ready to have additional bytes appended. |
607 | | /// - After this call, the returned view will have its |
608 | | /// `buffer_index` reset to `buffer_idx` and its `offset` updated so that it points |
609 | | /// into the bytes just appended at the end of `data_buf`. |
610 | | #[inline(always)] |
611 | | unsafe fn copy_view_to_buffer( |
612 | | &self, |
613 | | i: usize, |
614 | | buffer_idx: i32, |
615 | | data_buf: &mut Vec<u8>, |
616 | | ) -> u128 { |
617 | | // SAFETY: `i < self.len()` ensures this is in‑bounds. |
618 | | let raw_view = unsafe { *self.views().get_unchecked(i) }; |
619 | | let mut bv = ByteView::from(raw_view); |
620 | | |
621 | | // Inline‑small views stay as‑is. |
622 | | if bv.length <= MAX_INLINE_VIEW_LEN { |
623 | | raw_view |
624 | | } else { |
625 | | // SAFETY: `bv.buffer_index` and `bv.offset..bv.offset+bv.length` |
626 | | // must both lie within valid ranges for `self.buffers`. |
627 | | let buffer = unsafe { self.buffers.get_unchecked(bv.buffer_index as usize) }; |
628 | | let start = bv.offset as usize; |
629 | | let end = start + bv.length as usize; |
630 | | let slice = unsafe { buffer.get_unchecked(start..end) }; |
631 | | |
632 | | // Copy out‑of‑line data into our single “0” buffer. |
633 | | let new_offset = data_buf.len() as u32; |
634 | | data_buf.extend_from_slice(slice); |
635 | | |
636 | | bv.buffer_index = buffer_idx as u32; |
637 | | bv.offset = new_offset; |
638 | | bv.into() |
639 | | } |
640 | | } |
641 | | |
642 | | /// Returns the total number of bytes used by all non inlined views in all |
643 | | /// buffers. |
644 | | /// |
645 | | /// Note this does not account for views that point at the same underlying |
646 | | /// data in buffers |
647 | | /// |
648 | | /// For example, if the array has three strings views: |
649 | | /// * View with length = 9 (inlined) |
650 | | /// * View with length = 32 (non inlined) |
651 | | /// * View with length = 16 (non inlined) |
652 | | /// |
653 | | /// Then this method would report 48 |
654 | 90 | pub fn total_buffer_bytes_used(&self) -> usize { |
655 | 90 | self.views() |
656 | 90 | .iter() |
657 | 27.5k | .map90 (|v| { |
658 | 27.5k | let len = *v as u32; |
659 | 27.5k | if len > MAX_INLINE_VIEW_LEN { |
660 | 9.21k | len as usize |
661 | | } else { |
662 | 18.2k | 0 |
663 | | } |
664 | 27.5k | }) |
665 | 90 | .sum() |
666 | 90 | } |
667 | | |
668 | | /// Compare two [`GenericByteViewArray`] at index `left_idx` and `right_idx` |
669 | | /// |
670 | | /// Comparing two ByteView types are non-trivial. |
671 | | /// It takes a bit of patience to understand why we don't just compare two &[u8] directly. |
672 | | /// |
673 | | /// ByteView types give us the following two advantages, and we need to be careful not to lose them: |
674 | | /// (1) For string/byte smaller than [`MAX_INLINE_VIEW_LEN`] bytes, the entire data is inlined in the view. |
675 | | /// Meaning that reading one array element requires only one memory access |
676 | | /// (two memory access required for StringArray, one for offset buffer, the other for value buffer). |
677 | | /// |
678 | | /// (2) For string/byte larger than [`MAX_INLINE_VIEW_LEN`] bytes, we can still be faster than (for certain operations) StringArray/ByteArray, |
679 | | /// thanks to the inlined 4 bytes. |
680 | | /// Consider equality check: |
681 | | /// If the first four bytes of the two strings are different, we can return false immediately (with just one memory access). |
682 | | /// |
683 | | /// If we directly compare two &[u8], we materialize the entire string (i.e., make multiple memory accesses), which might be unnecessary. |
684 | | /// - Most of the time (eq, ord), we only need to look at the first 4 bytes to know the answer, |
685 | | /// e.g., if the inlined 4 bytes are different, we can directly return unequal without looking at the full string. |
686 | | /// |
687 | | /// # Order check flow |
688 | | /// (1) if both string are smaller than [`MAX_INLINE_VIEW_LEN`] bytes, we can directly compare the data inlined to the view. |
689 | | /// (2) if any of the string is larger than [`MAX_INLINE_VIEW_LEN`] bytes, we need to compare the full string. |
690 | | /// (2.1) if the inlined 4 bytes are different, we can return the result immediately. |
691 | | /// (2.2) o.w., we need to compare the full string. |
692 | | /// |
693 | | /// # Safety |
694 | | /// The left/right_idx must within range of each array |
695 | | pub unsafe fn compare_unchecked( |
696 | | left: &GenericByteViewArray<T>, |
697 | | left_idx: usize, |
698 | | right: &GenericByteViewArray<T>, |
699 | | right_idx: usize, |
700 | | ) -> Ordering { |
701 | | let l_view = unsafe { left.views().get_unchecked(left_idx) }; |
702 | | let l_byte_view = ByteView::from(*l_view); |
703 | | |
704 | | let r_view = unsafe { right.views().get_unchecked(right_idx) }; |
705 | | let r_byte_view = ByteView::from(*r_view); |
706 | | |
707 | | let l_len = l_byte_view.length; |
708 | | let r_len = r_byte_view.length; |
709 | | |
710 | | if l_len <= 12 && r_len <= 12 { |
711 | | return Self::inline_key_fast(*l_view).cmp(&Self::inline_key_fast(*r_view)); |
712 | | } |
713 | | |
714 | | // one of the string is larger than 12 bytes, |
715 | | // we then try to compare the inlined data first |
716 | | |
717 | | // Note: In theory, ByteView is only used for string which is larger than 12 bytes, |
718 | | // but we can still use it to get the inlined prefix for shorter strings. |
719 | | // The prefix is always the first 4 bytes of the view, for both short and long strings. |
720 | | let l_inlined_be = l_byte_view.prefix.swap_bytes(); |
721 | | let r_inlined_be = r_byte_view.prefix.swap_bytes(); |
722 | | if l_inlined_be != r_inlined_be { |
723 | | return l_inlined_be.cmp(&r_inlined_be); |
724 | | } |
725 | | |
726 | | // unfortunately, we need to compare the full data |
727 | | let l_full_data: &[u8] = unsafe { left.value_unchecked(left_idx).as_ref() }; |
728 | | let r_full_data: &[u8] = unsafe { right.value_unchecked(right_idx).as_ref() }; |
729 | | |
730 | | l_full_data.cmp(r_full_data) |
731 | | } |
732 | | |
733 | | /// Builds a 128-bit composite key for an inline value: |
734 | | /// |
735 | | /// - High 96 bits: the inline data in big-endian byte order (for correct lexicographical sorting). |
736 | | /// - Low 32 bits: the length in big-endian byte order, acting as a tiebreaker so shorter strings |
737 | | /// (or those with fewer meaningful bytes) always numerically sort before longer ones. |
738 | | /// |
739 | | /// This function extracts the length and the 12-byte inline string data from the raw |
740 | | /// little-endian `u128` representation, converts them to big-endian ordering, and packs them |
741 | | /// into a single `u128` value suitable for fast, branchless comparisons. |
742 | | /// |
743 | | /// # Why include length? |
744 | | /// |
745 | | /// A pure 96-bit content comparison can’t distinguish between two values whose inline bytes |
746 | | /// compare equal—either because one is a true prefix of the other or because zero-padding |
747 | | /// hides extra bytes. By tucking the 32-bit length into the lower bits, a single `u128` compare |
748 | | /// handles both content and length in one go. |
749 | | /// |
750 | | /// Example: comparing "bar" (3 bytes) vs "bar\0" (4 bytes) |
751 | | /// |
752 | | /// | String | Bytes 0–4 (length LE) | Bytes 4–16 (data + padding) | |
753 | | /// |------------|-----------------------|---------------------------------| |
754 | | /// | `"bar"` | `03 00 00 00` | `62 61 72` + 9 × `00` | |
755 | | /// | `"bar\0"`| `04 00 00 00` | `62 61 72 00` + 8 × `00` | |
756 | | /// |
757 | | /// Both inline parts become `62 61 72 00…00`, so they tie on content. The length field |
758 | | /// then differentiates: |
759 | | /// |
760 | | /// ```text |
761 | | /// key("bar") = 0x0000000000000000000062617200000003 |
762 | | /// key("bar\0") = 0x0000000000000000000062617200000004 |
763 | | /// ⇒ key("bar") < key("bar\0") |
764 | | /// ``` |
765 | | /// # Inlining and Endianness |
766 | | /// |
767 | | /// - We start by calling `.to_le_bytes()` on the `raw` `u128`, because Rust’s native in‑memory |
768 | | /// representation is little‑endian on x86/ARM. |
769 | | /// - We extract the low 32 bits numerically (`raw as u32`)—this step is endianness‑free. |
770 | | /// - We copy the 12 bytes of inline data (original order) into `buf[0..12]`. |
771 | | /// - We serialize `length` as big‑endian into `buf[12..16]`. |
772 | | /// - Finally, `u128::from_be_bytes(buf)` treats `buf[0]` as the most significant byte |
773 | | /// and `buf[15]` as the least significant, producing a `u128` whose integer value |
774 | | /// directly encodes “inline data then length” in big‑endian form. |
775 | | /// |
776 | | /// This ensures that a simple `u128` comparison is equivalent to the desired |
777 | | /// lexicographical comparison of the inline bytes followed by length. |
778 | | #[inline(always)] |
779 | | pub fn inline_key_fast(raw: u128) -> u128 { |
780 | | // 1. Decompose `raw` into little‑endian bytes: |
781 | | // - raw_bytes[0..4] = length in LE |
782 | | // - raw_bytes[4..16] = inline string data |
783 | | let raw_bytes = raw.to_le_bytes(); |
784 | | |
785 | | // 2. Numerically truncate to get the low 32‑bit length (endianness‑free). |
786 | | let length = raw as u32; |
787 | | |
788 | | // 3. Build a 16‑byte buffer in big‑endian order: |
789 | | // - buf[0..12] = inline string bytes (in original order) |
790 | | // - buf[12..16] = length.to_be_bytes() (BE) |
791 | | let mut buf = [0u8; 16]; |
792 | | buf[0..12].copy_from_slice(&raw_bytes[4..16]); // inline data |
793 | | |
794 | | // Why convert length to big-endian for comparison? |
795 | | // |
796 | | // Rust (on most platforms) stores integers in little-endian format, |
797 | | // meaning the least significant byte is at the lowest memory address. |
798 | | // For example, an u32 value like 0x22345677 is stored in memory as: |
799 | | // |
800 | | // [0x77, 0x56, 0x34, 0x22] // little-endian layout |
801 | | // ^ ^ ^ ^ |
802 | | // LSB ↑↑↑ MSB |
803 | | // |
804 | | // This layout is efficient for arithmetic but *not* suitable for |
805 | | // lexicographic (dictionary-style) comparison of byte arrays. |
806 | | // |
807 | | // To compare values by byte order—e.g., for sorted keys or binary trees— |
808 | | // we must convert them to **big-endian**, where: |
809 | | // |
810 | | // - The most significant byte (MSB) comes first (index 0) |
811 | | // - The least significant byte (LSB) comes last (index N-1) |
812 | | // |
813 | | // In big-endian, the same u32 = 0x22345677 would be represented as: |
814 | | // |
815 | | // [0x22, 0x34, 0x56, 0x77] |
816 | | // |
817 | | // This ordering aligns with natural string/byte sorting, so calling |
818 | | // `.to_be_bytes()` allows us to construct |
819 | | // keys where standard numeric comparison (e.g., `<`, `>`) behaves |
820 | | // like lexicographic byte comparison. |
821 | | buf[12..16].copy_from_slice(&length.to_be_bytes()); // length in BE |
822 | | |
823 | | // 4. Deserialize the buffer as a big‑endian u128: |
824 | | // buf[0] is MSB, buf[15] is LSB. |
825 | | // Details: |
826 | | // Note on endianness and layout: |
827 | | // |
828 | | // Although `buf[0]` is stored at the lowest memory address, |
829 | | // calling `u128::from_be_bytes(buf)` interprets it as the **most significant byte (MSB)**, |
830 | | // and `buf[15]` as the **least significant byte (LSB)**. |
831 | | // |
832 | | // This is the core principle of **big-endian decoding**: |
833 | | // - Byte at index 0 maps to bits 127..120 (highest) |
834 | | // - Byte at index 1 maps to bits 119..112 |
835 | | // - ... |
836 | | // - Byte at index 15 maps to bits 7..0 (lowest) |
837 | | // |
838 | | // So even though memory layout goes from low to high (left to right), |
839 | | // big-endian treats the **first byte** as highest in value. |
840 | | // |
841 | | // This guarantees that comparing two `u128` keys is equivalent to lexicographically |
842 | | // comparing the original inline bytes, followed by length. |
843 | | u128::from_be_bytes(buf) |
844 | | } |
845 | | } |
846 | | |
847 | | impl<T: ByteViewType + ?Sized> Debug for GenericByteViewArray<T> { |
848 | 0 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { |
849 | 0 | write!(f, "{}ViewArray\n[\n", T::PREFIX)?; |
850 | 0 | print_long_array(self, f, |array, index, f| { |
851 | 0 | std::fmt::Debug::fmt(&array.value(index), f) |
852 | 0 | })?; |
853 | 0 | write!(f, "]") |
854 | 0 | } |
855 | | } |
856 | | |
857 | | impl<T: ByteViewType + ?Sized> Array for GenericByteViewArray<T> { |
858 | 510 | fn as_any(&self) -> &dyn Any { |
859 | 510 | self |
860 | 510 | } |
861 | | |
862 | 108 | fn to_data(&self) -> ArrayData { |
863 | 108 | self.clone().into() |
864 | 108 | } |
865 | | |
866 | 0 | fn into_data(self) -> ArrayData { |
867 | 0 | self.into() |
868 | 0 | } |
869 | | |
870 | 520 | fn data_type(&self) -> &DataType { |
871 | 520 | &self.data_type |
872 | 520 | } |
873 | | |
874 | 53 | fn slice(&self, offset: usize, length: usize) -> ArrayRef { |
875 | 53 | Arc::new(self.slice(offset, length)) |
876 | 53 | } |
877 | | |
878 | 663 | fn len(&self) -> usize { |
879 | 663 | self.views.len() |
880 | 663 | } |
881 | | |
882 | 0 | fn is_empty(&self) -> bool { |
883 | 0 | self.views.is_empty() |
884 | 0 | } |
885 | | |
886 | 0 | fn shrink_to_fit(&mut self) { |
887 | 0 | self.views.shrink_to_fit(); |
888 | 0 | self.buffers.iter_mut().for_each(|b| b.shrink_to_fit()); |
889 | 0 | self.buffers.shrink_to_fit(); |
890 | 0 | if let Some(nulls) = &mut self.nulls { |
891 | 0 | nulls.shrink_to_fit(); |
892 | 0 | } |
893 | 0 | } |
894 | | |
895 | 0 | fn offset(&self) -> usize { |
896 | 0 | 0 |
897 | 0 | } |
898 | | |
899 | 445 | fn nulls(&self) -> Option<&NullBuffer> { |
900 | 445 | self.nulls.as_ref() |
901 | 445 | } |
902 | | |
903 | 0 | fn logical_null_count(&self) -> usize { |
904 | | // More efficient that the default implementation |
905 | 0 | self.null_count() |
906 | 0 | } |
907 | | |
908 | 0 | fn get_buffer_memory_size(&self) -> usize { |
909 | 0 | let mut sum = self.buffers.iter().map(|b| b.capacity()).sum::<usize>(); |
910 | 0 | sum += self.views.inner().capacity(); |
911 | 0 | if let Some(x) = &self.nulls { |
912 | 0 | sum += x.buffer().capacity() |
913 | 0 | } |
914 | 0 | sum |
915 | 0 | } |
916 | | |
917 | 0 | fn get_array_memory_size(&self) -> usize { |
918 | 0 | std::mem::size_of::<Self>() + self.get_buffer_memory_size() |
919 | 0 | } |
920 | | } |
921 | | |
922 | | impl<'a, T: ByteViewType + ?Sized> ArrayAccessor for &'a GenericByteViewArray<T> { |
923 | | type Item = &'a T::Native; |
924 | | |
925 | | fn value(&self, index: usize) -> Self::Item { |
926 | | GenericByteViewArray::value(self, index) |
927 | | } |
928 | | |
929 | 47.9k | unsafe fn value_unchecked(&self, index: usize) -> Self::Item { |
930 | 47.9k | unsafe { GenericByteViewArray::value_unchecked(self, index) } |
931 | 47.9k | } |
932 | | } |
933 | | |
934 | | impl<'a, T: ByteViewType + ?Sized> IntoIterator for &'a GenericByteViewArray<T> { |
935 | | type Item = Option<&'a T::Native>; |
936 | | type IntoIter = ArrayIter<Self>; |
937 | | |
938 | | fn into_iter(self) -> Self::IntoIter { |
939 | | ArrayIter::new(self) |
940 | | } |
941 | | } |
942 | | |
943 | | impl<T: ByteViewType + ?Sized> From<ArrayData> for GenericByteViewArray<T> { |
944 | 86 | fn from(value: ArrayData) -> Self { |
945 | 86 | let views = value.buffers()[0].clone(); |
946 | 86 | let views = ScalarBuffer::new(views, value.offset(), value.len()); |
947 | 86 | let buffers = value.buffers()[1..].to_vec(); |
948 | 86 | Self { |
949 | 86 | data_type: T::DATA_TYPE, |
950 | 86 | views, |
951 | 86 | buffers, |
952 | 86 | nulls: value.nulls().cloned(), |
953 | 86 | phantom: Default::default(), |
954 | 86 | } |
955 | 86 | } |
956 | | } |
957 | | |
958 | | /// Efficiently convert a [`GenericByteArray`] to a [`GenericByteViewArray`] |
959 | | /// |
960 | | /// For example this method can convert a [`StringArray`] to a |
961 | | /// [`StringViewArray`]. |
962 | | /// |
963 | | /// If the offsets are all less than u32::MAX, the new [`GenericByteViewArray`] |
964 | | /// is built without copying the underlying string data (views are created |
965 | | /// directly into the existing buffer) |
966 | | /// |
967 | | /// [`StringArray`]: crate::StringArray |
968 | | impl<FROM, V> From<&GenericByteArray<FROM>> for GenericByteViewArray<V> |
969 | | where |
970 | | FROM: ByteArrayType, |
971 | | FROM::Offset: OffsetSizeTrait + ToPrimitive, |
972 | | V: ByteViewType<Native = FROM::Native>, |
973 | | { |
974 | 4 | fn from(byte_array: &GenericByteArray<FROM>) -> Self { |
975 | 4 | let offsets = byte_array.offsets(); |
976 | | |
977 | 4 | let can_reuse_buffer = match offsets.last() { |
978 | 4 | Some(offset) => offset.as_usize() < u32::MAX as usize, |
979 | 0 | None => true, |
980 | | }; |
981 | | |
982 | 4 | if can_reuse_buffer { |
983 | | // build views directly pointing to the existing buffer |
984 | 4 | let len = byte_array.len(); |
985 | 4 | let mut views_builder = GenericByteViewBuilder::<V>::with_capacity(len); |
986 | 4 | let str_values_buf = byte_array.values().clone(); |
987 | 4 | let block = views_builder.append_block(str_values_buf); |
988 | 20 | for (i, w) in offsets.windows(2)4 .enumerate4 () { |
989 | 20 | let offset = w[0].as_usize(); |
990 | 20 | let end = w[1].as_usize(); |
991 | 20 | let length = end - offset; |
992 | | |
993 | 20 | if byte_array.is_null(i) { |
994 | 4 | views_builder.append_null(); |
995 | 4 | } else { |
996 | | // Safety: the input was a valid array so it valid UTF8 (if string). And |
997 | | // all offsets were valid |
998 | | unsafe { |
999 | 16 | views_builder.append_view_unchecked(block, offset as u32, length as u32) |
1000 | | } |
1001 | | } |
1002 | | } |
1003 | 4 | assert_eq!(views_builder.len(), len); |
1004 | 4 | views_builder.finish() |
1005 | | } else { |
1006 | | // Otherwise, create a new buffer for large strings |
1007 | | // TODO: the original buffer could still be used |
1008 | | // by making multiple slices of u32::MAX length |
1009 | 0 | GenericByteViewArray::<V>::from_iter(byte_array.iter()) |
1010 | | } |
1011 | 4 | } |
1012 | | } |
1013 | | |
1014 | | impl<T: ByteViewType + ?Sized> From<GenericByteViewArray<T>> for ArrayData { |
1015 | 108 | fn from(mut array: GenericByteViewArray<T>) -> Self { |
1016 | 108 | let len = array.len(); |
1017 | 108 | array.buffers.insert(0, array.views.into_inner()); |
1018 | 108 | let builder = ArrayDataBuilder::new(T::DATA_TYPE) |
1019 | 108 | .len(len) |
1020 | 108 | .buffers(array.buffers) |
1021 | 108 | .nulls(array.nulls); |
1022 | | |
1023 | 108 | unsafe { builder.build_unchecked() } |
1024 | 108 | } |
1025 | | } |
1026 | | |
1027 | | impl<'a, Ptr, T> FromIterator<&'a Option<Ptr>> for GenericByteViewArray<T> |
1028 | | where |
1029 | | Ptr: AsRef<T::Native> + 'a, |
1030 | | T: ByteViewType + ?Sized, |
1031 | | { |
1032 | 1 | fn from_iter<I: IntoIterator<Item = &'a Option<Ptr>>>(iter: I) -> Self { |
1033 | 1 | iter.into_iter() |
1034 | 1.00k | .map1 (|o| o.as_ref().map(|p| p667 .as_ref667 ())) |
1035 | 1 | .collect() |
1036 | 1 | } |
1037 | | } |
1038 | | |
1039 | | impl<Ptr, T: ByteViewType + ?Sized> FromIterator<Option<Ptr>> for GenericByteViewArray<T> |
1040 | | where |
1041 | | Ptr: AsRef<T::Native>, |
1042 | | { |
1043 | 45 | fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self { |
1044 | 45 | let iter = iter.into_iter(); |
1045 | 45 | let mut builder = GenericByteViewBuilder::<T>::with_capacity(iter.size_hint().0); |
1046 | 45 | builder.extend(iter); |
1047 | 45 | builder.finish() |
1048 | 45 | } |
1049 | | } |
1050 | | |
1051 | | /// A [`GenericByteViewArray`] of `[u8]` |
1052 | | /// |
1053 | | /// See [`GenericByteViewArray`] for format and layout details. |
1054 | | /// |
1055 | | /// # Example |
1056 | | /// ``` |
1057 | | /// use arrow_array::BinaryViewArray; |
1058 | | /// let array = BinaryViewArray::from_iter_values(vec![b"hello" as &[u8], b"world", b"lulu", b"large payload over 12 bytes"]); |
1059 | | /// assert_eq!(array.value(0), b"hello"); |
1060 | | /// assert_eq!(array.value(3), b"large payload over 12 bytes"); |
1061 | | /// ``` |
1062 | | pub type BinaryViewArray = GenericByteViewArray<BinaryViewType>; |
1063 | | |
1064 | | impl BinaryViewArray { |
1065 | | /// Convert the [`BinaryViewArray`] to [`StringViewArray`] |
1066 | | /// If items not utf8 data, validate will fail and error returned. |
1067 | 0 | pub fn to_string_view(self) -> Result<StringViewArray, ArrowError> { |
1068 | 0 | StringViewType::validate(self.views(), self.data_buffers())?; |
1069 | 0 | unsafe { Ok(self.to_string_view_unchecked()) } |
1070 | 0 | } |
1071 | | |
1072 | | /// Convert the [`BinaryViewArray`] to [`StringViewArray`] |
1073 | | /// # Safety |
1074 | | /// Caller is responsible for ensuring that items in array are utf8 data. |
1075 | 0 | pub unsafe fn to_string_view_unchecked(self) -> StringViewArray { |
1076 | 0 | unsafe { StringViewArray::new_unchecked(self.views, self.buffers, self.nulls) } |
1077 | 0 | } |
1078 | | } |
1079 | | |
1080 | | impl From<Vec<&[u8]>> for BinaryViewArray { |
1081 | 0 | fn from(v: Vec<&[u8]>) -> Self { |
1082 | 0 | Self::from_iter_values(v) |
1083 | 0 | } |
1084 | | } |
1085 | | |
1086 | | impl From<Vec<Option<&[u8]>>> for BinaryViewArray { |
1087 | 0 | fn from(v: Vec<Option<&[u8]>>) -> Self { |
1088 | 0 | v.into_iter().collect() |
1089 | 0 | } |
1090 | | } |
1091 | | |
1092 | | /// A [`GenericByteViewArray`] that stores utf8 data |
1093 | | /// |
1094 | | /// See [`GenericByteViewArray`] for format and layout details. |
1095 | | /// |
1096 | | /// # Example |
1097 | | /// ``` |
1098 | | /// use arrow_array::StringViewArray; |
1099 | | /// let array = StringViewArray::from_iter_values(vec!["hello", "world", "lulu", "large payload over 12 bytes"]); |
1100 | | /// assert_eq!(array.value(0), "hello"); |
1101 | | /// assert_eq!(array.value(3), "large payload over 12 bytes"); |
1102 | | /// ``` |
1103 | | pub type StringViewArray = GenericByteViewArray<StringViewType>; |
1104 | | |
1105 | | impl StringViewArray { |
1106 | | /// Convert the [`StringViewArray`] to [`BinaryViewArray`] |
1107 | 0 | pub fn to_binary_view(self) -> BinaryViewArray { |
1108 | 0 | unsafe { BinaryViewArray::new_unchecked(self.views, self.buffers, self.nulls) } |
1109 | 0 | } |
1110 | | |
1111 | | /// Returns true if all data within this array is ASCII |
1112 | 0 | pub fn is_ascii(&self) -> bool { |
1113 | | // Alternative (but incorrect): directly check the underlying buffers |
1114 | | // (1) Our string view might be sparse, i.e., a subset of the buffers, |
1115 | | // so even if the buffer is not ascii, we can still be ascii. |
1116 | | // (2) It is quite difficult to know the range of each buffer (unlike StringArray) |
1117 | | // This means that this operation is quite expensive, shall we cache the result? |
1118 | | // i.e. track `is_ascii` in the builder. |
1119 | 0 | self.iter().all(|v| match v { |
1120 | 0 | Some(v) => v.is_ascii(), |
1121 | 0 | None => true, |
1122 | 0 | }) |
1123 | 0 | } |
1124 | | } |
1125 | | |
1126 | | impl From<Vec<&str>> for StringViewArray { |
1127 | 2 | fn from(v: Vec<&str>) -> Self { |
1128 | 2 | Self::from_iter_values(v) |
1129 | 2 | } |
1130 | | } |
1131 | | |
1132 | | impl From<Vec<Option<&str>>> for StringViewArray { |
1133 | 2 | fn from(v: Vec<Option<&str>>) -> Self { |
1134 | 2 | v.into_iter().collect() |
1135 | 2 | } |
1136 | | } |
1137 | | |
1138 | | impl From<Vec<String>> for StringViewArray { |
1139 | 0 | fn from(v: Vec<String>) -> Self { |
1140 | 0 | Self::from_iter_values(v) |
1141 | 0 | } |
1142 | | } |
1143 | | |
1144 | | impl From<Vec<Option<String>>> for StringViewArray { |
1145 | 0 | fn from(v: Vec<Option<String>>) -> Self { |
1146 | 0 | v.into_iter().collect() |
1147 | 0 | } |
1148 | | } |
1149 | | |
1150 | | #[cfg(test)] |
1151 | | mod tests { |
1152 | | use crate::builder::{BinaryViewBuilder, StringViewBuilder}; |
1153 | | use crate::types::BinaryViewType; |
1154 | | use crate::{ |
1155 | | Array, BinaryViewArray, GenericBinaryArray, GenericByteViewArray, StringViewArray, |
1156 | | }; |
1157 | | use arrow_buffer::{Buffer, ScalarBuffer}; |
1158 | | use arrow_data::{ByteView, MAX_INLINE_VIEW_LEN}; |
1159 | | use rand::prelude::StdRng; |
1160 | | use rand::{Rng, SeedableRng}; |
1161 | | |
1162 | | const BLOCK_SIZE: u32 = 8; |
1163 | | |
1164 | | #[test] |
1165 | | fn try_new_string() { |
1166 | | let array = StringViewArray::from_iter_values(vec![ |
1167 | | "hello", |
1168 | | "world", |
1169 | | "lulu", |
1170 | | "large payload over 12 bytes", |
1171 | | ]); |
1172 | | assert_eq!(array.value(0), "hello"); |
1173 | | assert_eq!(array.value(3), "large payload over 12 bytes"); |
1174 | | } |
1175 | | |
1176 | | #[test] |
1177 | | fn try_new_binary() { |
1178 | | let array = BinaryViewArray::from_iter_values(vec![ |
1179 | | b"hello".as_slice(), |
1180 | | b"world".as_slice(), |
1181 | | b"lulu".as_slice(), |
1182 | | b"large payload over 12 bytes".as_slice(), |
1183 | | ]); |
1184 | | assert_eq!(array.value(0), b"hello"); |
1185 | | assert_eq!(array.value(3), b"large payload over 12 bytes"); |
1186 | | } |
1187 | | |
1188 | | #[test] |
1189 | | fn try_new_empty_string() { |
1190 | | // test empty array |
1191 | | let array = { |
1192 | | let mut builder = StringViewBuilder::new(); |
1193 | | builder.finish() |
1194 | | }; |
1195 | | assert!(array.is_empty()); |
1196 | | } |
1197 | | |
1198 | | #[test] |
1199 | | fn try_new_empty_binary() { |
1200 | | // test empty array |
1201 | | let array = { |
1202 | | let mut builder = BinaryViewBuilder::new(); |
1203 | | builder.finish() |
1204 | | }; |
1205 | | assert!(array.is_empty()); |
1206 | | } |
1207 | | |
1208 | | #[test] |
1209 | | fn test_append_string() { |
1210 | | // test builder append |
1211 | | let array = { |
1212 | | let mut builder = StringViewBuilder::new(); |
1213 | | builder.append_value("hello"); |
1214 | | builder.append_null(); |
1215 | | builder.append_option(Some("large payload over 12 bytes")); |
1216 | | builder.finish() |
1217 | | }; |
1218 | | assert_eq!(array.value(0), "hello"); |
1219 | | assert!(array.is_null(1)); |
1220 | | assert_eq!(array.value(2), "large payload over 12 bytes"); |
1221 | | } |
1222 | | |
1223 | | #[test] |
1224 | | fn test_append_binary() { |
1225 | | // test builder append |
1226 | | let array = { |
1227 | | let mut builder = BinaryViewBuilder::new(); |
1228 | | builder.append_value(b"hello"); |
1229 | | builder.append_null(); |
1230 | | builder.append_option(Some(b"large payload over 12 bytes")); |
1231 | | builder.finish() |
1232 | | }; |
1233 | | assert_eq!(array.value(0), b"hello"); |
1234 | | assert!(array.is_null(1)); |
1235 | | assert_eq!(array.value(2), b"large payload over 12 bytes"); |
1236 | | } |
1237 | | |
1238 | | #[test] |
1239 | | fn test_in_progress_recreation() { |
1240 | | let array = { |
1241 | | // make a builder with small block size. |
1242 | | let mut builder = StringViewBuilder::new().with_fixed_block_size(14); |
1243 | | builder.append_value("large payload over 12 bytes"); |
1244 | | builder.append_option(Some("another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created")); |
1245 | | builder.finish() |
1246 | | }; |
1247 | | assert_eq!(array.value(0), "large payload over 12 bytes"); |
1248 | | assert_eq!( |
1249 | | array.value(1), |
1250 | | "another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created" |
1251 | | ); |
1252 | | assert_eq!(2, array.buffers.len()); |
1253 | | } |
1254 | | |
1255 | | #[test] |
1256 | | #[should_panic(expected = "Invalid buffer index at 0: got index 3 but only has 1 buffers")] |
1257 | | fn new_with_invalid_view_data() { |
1258 | | let v = "large payload over 12 bytes"; |
1259 | | let view = ByteView::new(13, &v.as_bytes()[0..4]) |
1260 | | .with_buffer_index(3) |
1261 | | .with_offset(1); |
1262 | | let views = ScalarBuffer::from(vec![view.into()]); |
1263 | | let buffers = vec![Buffer::from_slice_ref(v)]; |
1264 | | StringViewArray::new(views, buffers, None); |
1265 | | } |
1266 | | |
1267 | | #[test] |
1268 | | #[should_panic( |
1269 | | expected = "Encountered non-UTF-8 data at index 0: invalid utf-8 sequence of 1 bytes from index 0" |
1270 | | )] |
1271 | | fn new_with_invalid_utf8_data() { |
1272 | | let v: Vec<u8> = vec![ |
1273 | | // invalid UTF8 |
1274 | | 0xf0, 0x80, 0x80, 0x80, // more bytes to make it larger than 12 |
1275 | | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
1276 | | ]; |
1277 | | let view = ByteView::new(v.len() as u32, &v[0..4]); |
1278 | | let views = ScalarBuffer::from(vec![view.into()]); |
1279 | | let buffers = vec![Buffer::from_slice_ref(v)]; |
1280 | | StringViewArray::new(views, buffers, None); |
1281 | | } |
1282 | | |
1283 | | #[test] |
1284 | | #[should_panic(expected = "View at index 0 contained non-zero padding for string of length 1")] |
1285 | | fn new_with_invalid_zero_padding() { |
1286 | | let mut data = [0; 12]; |
1287 | | data[0] = b'H'; |
1288 | | data[11] = 1; // no zero padding |
1289 | | |
1290 | | let mut view_buffer = [0; 16]; |
1291 | | view_buffer[0..4].copy_from_slice(&1u32.to_le_bytes()); |
1292 | | view_buffer[4..].copy_from_slice(&data); |
1293 | | |
1294 | | let view = ByteView::from(u128::from_le_bytes(view_buffer)); |
1295 | | let views = ScalarBuffer::from(vec![view.into()]); |
1296 | | let buffers = vec![]; |
1297 | | StringViewArray::new(views, buffers, None); |
1298 | | } |
1299 | | |
1300 | | #[test] |
1301 | | #[should_panic(expected = "Mismatch between embedded prefix and data")] |
1302 | | fn test_mismatch_between_embedded_prefix_and_data() { |
1303 | | let input_str_1 = "Hello, Rustaceans!"; |
1304 | | let input_str_2 = "Hallo, Rustaceans!"; |
1305 | | let length = input_str_1.len() as u32; |
1306 | | assert!(input_str_1.len() > 12); |
1307 | | |
1308 | | let mut view_buffer = [0; 16]; |
1309 | | view_buffer[0..4].copy_from_slice(&length.to_le_bytes()); |
1310 | | view_buffer[4..8].copy_from_slice(&input_str_1.as_bytes()[0..4]); |
1311 | | view_buffer[8..12].copy_from_slice(&0u32.to_le_bytes()); |
1312 | | view_buffer[12..].copy_from_slice(&0u32.to_le_bytes()); |
1313 | | let view = ByteView::from(u128::from_le_bytes(view_buffer)); |
1314 | | let views = ScalarBuffer::from(vec![view.into()]); |
1315 | | let buffers = vec![Buffer::from_slice_ref(input_str_2.as_bytes())]; |
1316 | | |
1317 | | StringViewArray::new(views, buffers, None); |
1318 | | } |
1319 | | |
1320 | | #[test] |
1321 | | fn test_gc() { |
1322 | | let test_data = [ |
1323 | | Some("longer than 12 bytes"), |
1324 | | Some("short"), |
1325 | | Some("t"), |
1326 | | Some("longer than 12 bytes"), |
1327 | | None, |
1328 | | Some("short"), |
1329 | | ]; |
1330 | | |
1331 | | let array = { |
1332 | | let mut builder = StringViewBuilder::new().with_fixed_block_size(8); // create multiple buffers |
1333 | | test_data.into_iter().for_each(|v| builder.append_option(v)); |
1334 | | builder.finish() |
1335 | | }; |
1336 | | assert!(array.buffers.len() > 1); |
1337 | | |
1338 | | fn check_gc(to_test: &StringViewArray) { |
1339 | | let gc = to_test.gc(); |
1340 | | assert_ne!(to_test.data_buffers().len(), gc.data_buffers().len()); |
1341 | | |
1342 | | to_test.iter().zip(gc.iter()).for_each(|(a, b)| { |
1343 | | assert_eq!(a, b); |
1344 | | }); |
1345 | | assert_eq!(to_test.len(), gc.len()); |
1346 | | } |
1347 | | |
1348 | | check_gc(&array); |
1349 | | check_gc(&array.slice(1, 3)); |
1350 | | check_gc(&array.slice(2, 1)); |
1351 | | check_gc(&array.slice(2, 2)); |
1352 | | check_gc(&array.slice(3, 1)); |
1353 | | } |
1354 | | |
1355 | | /// 1) Empty array: no elements, expect gc to return empty with no data buffers |
1356 | | #[test] |
1357 | | fn test_gc_empty_array() { |
1358 | | let array = StringViewBuilder::new() |
1359 | | .with_fixed_block_size(BLOCK_SIZE) |
1360 | | .finish(); |
1361 | | let gced = array.gc(); |
1362 | | // length and null count remain zero |
1363 | | assert_eq!(gced.len(), 0); |
1364 | | assert_eq!(gced.null_count(), 0); |
1365 | | // no underlying data buffers should be allocated |
1366 | | assert!( |
1367 | | gced.data_buffers().is_empty(), |
1368 | | "Expected no data buffers for empty array" |
1369 | | ); |
1370 | | } |
1371 | | |
1372 | | /// 2) All inline values (<= INLINE_LEN): capacity-only data buffer, same values |
1373 | | #[test] |
1374 | | fn test_gc_all_inline() { |
1375 | | let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE); |
1376 | | // append many short strings, each exactly INLINE_LEN long |
1377 | | for _ in 0..100 { |
1378 | | let s = "A".repeat(MAX_INLINE_VIEW_LEN as usize); |
1379 | | builder.append_option(Some(&s)); |
1380 | | } |
1381 | | let array = builder.finish(); |
1382 | | let gced = array.gc(); |
1383 | | // Since all views fit inline, data buffer is empty |
1384 | | assert_eq!( |
1385 | | gced.data_buffers().len(), |
1386 | | 0, |
1387 | | "Should have no data buffers for inline values" |
1388 | | ); |
1389 | | assert_eq!(gced.len(), 100); |
1390 | | // verify element-wise equality |
1391 | | array.iter().zip(gced.iter()).for_each(|(orig, got)| { |
1392 | | assert_eq!(orig, got, "Inline value mismatch after gc"); |
1393 | | }); |
1394 | | } |
1395 | | |
1396 | | /// 3) All large values (> INLINE_LEN): each must be copied into the new data buffer |
1397 | | #[test] |
1398 | | fn test_gc_all_large() { |
1399 | | let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE); |
1400 | | let large_str = "X".repeat(MAX_INLINE_VIEW_LEN as usize + 5); |
1401 | | // append multiple large strings |
1402 | | for _ in 0..50 { |
1403 | | builder.append_option(Some(&large_str)); |
1404 | | } |
1405 | | let array = builder.finish(); |
1406 | | let gced = array.gc(); |
1407 | | // New data buffers should be populated (one or more blocks) |
1408 | | assert!( |
1409 | | !gced.data_buffers().is_empty(), |
1410 | | "Expected data buffers for large values" |
1411 | | ); |
1412 | | assert_eq!(gced.len(), 50); |
1413 | | // verify that every large string emerges unchanged |
1414 | | array.iter().zip(gced.iter()).for_each(|(orig, got)| { |
1415 | | assert_eq!(orig, got, "Large view mismatch after gc"); |
1416 | | }); |
1417 | | } |
1418 | | |
1419 | | /// 4) All null elements: ensure null bitmap handling path is correct |
1420 | | #[test] |
1421 | | fn test_gc_all_nulls() { |
1422 | | let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE); |
1423 | | for _ in 0..20 { |
1424 | | builder.append_null(); |
1425 | | } |
1426 | | let array = builder.finish(); |
1427 | | let gced = array.gc(); |
1428 | | // length and null count match |
1429 | | assert_eq!(gced.len(), 20); |
1430 | | assert_eq!(gced.null_count(), 20); |
1431 | | // data buffers remain empty for null-only array |
1432 | | assert!( |
1433 | | gced.data_buffers().is_empty(), |
1434 | | "No data should be stored for nulls" |
1435 | | ); |
1436 | | } |
1437 | | |
1438 | | /// 5) Random mix of inline, large, and null values with slicing tests |
1439 | | #[test] |
1440 | | fn test_gc_random_mixed_and_slices() { |
1441 | | let mut rng = StdRng::seed_from_u64(42); |
1442 | | let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE); |
1443 | | // Keep a Vec of original Option<String> for later comparison |
1444 | | let mut original: Vec<Option<String>> = Vec::new(); |
1445 | | |
1446 | | for _ in 0..200 { |
1447 | | if rng.random_bool(0.1) { |
1448 | | // 10% nulls |
1449 | | builder.append_null(); |
1450 | | original.push(None); |
1451 | | } else { |
1452 | | // random length between 0 and twice the inline limit |
1453 | | let len = rng.random_range(0..(MAX_INLINE_VIEW_LEN * 2)); |
1454 | | let s: String = "A".repeat(len as usize); |
1455 | | builder.append_option(Some(&s)); |
1456 | | original.push(Some(s)); |
1457 | | } |
1458 | | } |
1459 | | |
1460 | | let array = builder.finish(); |
1461 | | // Test multiple slice ranges to ensure offset logic is correct |
1462 | | for (offset, slice_len) in &[(0, 50), (10, 100), (150, 30)] { |
1463 | | let sliced = array.slice(*offset, *slice_len); |
1464 | | let gced = sliced.gc(); |
1465 | | // Build expected slice of Option<&str> |
1466 | | let expected: Vec<Option<&str>> = original[*offset..(*offset + *slice_len)] |
1467 | | .iter() |
1468 | | .map(|opt| opt.as_deref()) |
1469 | | .collect(); |
1470 | | |
1471 | | assert_eq!(gced.len(), *slice_len, "Slice length mismatch"); |
1472 | | // Compare element-wise |
1473 | | gced.iter().zip(expected.iter()).for_each(|(got, expect)| { |
1474 | | assert_eq!(got, *expect, "Value mismatch in mixed slice after gc"); |
1475 | | }); |
1476 | | } |
1477 | | } |
1478 | | |
1479 | | #[test] |
1480 | | #[cfg_attr(miri, ignore)] // Takes too long |
1481 | | fn test_gc_huge_array() { |
1482 | | // Construct multiple 128 MiB BinaryView entries so total > 4 GiB |
1483 | | let block_len: usize = 128 * 1024 * 1024; // 128 MiB per view |
1484 | | let num_views: usize = 36; |
1485 | | |
1486 | | // Create a single 128 MiB data block with a simple byte pattern |
1487 | | let buffer = Buffer::from_vec(vec![0xAB; block_len]); |
1488 | | let buffer2 = Buffer::from_vec(vec![0xFF; block_len]); |
1489 | | |
1490 | | // Append this block and then add many views pointing to it |
1491 | | let mut builder = BinaryViewBuilder::new(); |
1492 | | let block_id = builder.append_block(buffer); |
1493 | | for _ in 0..num_views / 2 { |
1494 | | builder |
1495 | | .try_append_view(block_id, 0, block_len as u32) |
1496 | | .expect("append view into 128MiB block"); |
1497 | | } |
1498 | | let block_id2 = builder.append_block(buffer2); |
1499 | | for _ in 0..num_views / 2 { |
1500 | | builder |
1501 | | .try_append_view(block_id2, 0, block_len as u32) |
1502 | | .expect("append view into 128MiB block"); |
1503 | | } |
1504 | | |
1505 | | let array = builder.finish(); |
1506 | | let total = array.total_buffer_bytes_used(); |
1507 | | assert!( |
1508 | | total > u32::MAX as usize, |
1509 | | "Expected total non-inline bytes to exceed 4 GiB, got {}", |
1510 | | total |
1511 | | ); |
1512 | | |
1513 | | // Run gc and verify correctness |
1514 | | let gced = array.gc(); |
1515 | | assert_eq!(gced.len(), num_views, "Length mismatch after gc"); |
1516 | | assert_eq!(gced.null_count(), 0, "Null count mismatch after gc"); |
1517 | | assert_ne!( |
1518 | | gced.data_buffers().len(), |
1519 | | 1, |
1520 | | "gc with huge buffer should not consolidate data into a single buffer" |
1521 | | ); |
1522 | | |
1523 | | // Element-wise equality check across the entire array |
1524 | | array.iter().zip(gced.iter()).for_each(|(orig, got)| { |
1525 | | assert_eq!(orig, got, "Value mismatch after gc on huge array"); |
1526 | | }); |
1527 | | } |
1528 | | |
1529 | | #[test] |
1530 | | fn test_eq() { |
1531 | | let test_data = [ |
1532 | | Some("longer than 12 bytes"), |
1533 | | None, |
1534 | | Some("short"), |
1535 | | Some("again, this is longer than 12 bytes"), |
1536 | | ]; |
1537 | | |
1538 | | let array1 = { |
1539 | | let mut builder = StringViewBuilder::new().with_fixed_block_size(8); |
1540 | | test_data.into_iter().for_each(|v| builder.append_option(v)); |
1541 | | builder.finish() |
1542 | | }; |
1543 | | let array2 = { |
1544 | | // create a new array with the same data but different layout |
1545 | | let mut builder = StringViewBuilder::new().with_fixed_block_size(100); |
1546 | | test_data.into_iter().for_each(|v| builder.append_option(v)); |
1547 | | builder.finish() |
1548 | | }; |
1549 | | assert_eq!(array1, array1.clone()); |
1550 | | assert_eq!(array2, array2.clone()); |
1551 | | assert_eq!(array1, array2); |
1552 | | } |
1553 | | |
1554 | | /// Integration tests for `inline_key_fast` covering: |
1555 | | /// |
1556 | | /// 1. Monotonic ordering across increasing lengths and lexical variations. |
1557 | | /// 2. Cross-check against `GenericBinaryArray` comparison to ensure semantic equivalence. |
1558 | | /// |
1559 | | /// This also includes a specific test for the “bar” vs. “bar\0” case, demonstrating why |
1560 | | /// the length field is required even when all inline bytes fit in 12 bytes. |
1561 | | /// |
1562 | | /// The test includes strings that verify correct byte order (prevent reversal bugs), |
1563 | | /// and length-based tie-breaking in the composite key. |
1564 | | /// |
1565 | | /// The test confirms that `inline_key_fast` produces keys which sort consistently |
1566 | | /// with the expected lexicographical order of the raw byte arrays. |
1567 | | #[test] |
1568 | | fn test_inline_key_fast_various_lengths_and_lexical() { |
1569 | | /// Helper to create a raw u128 value representing an inline ByteView: |
1570 | | /// - `length`: number of meaningful bytes (must be ≤ 12) |
1571 | | /// - `data`: the actual inline data bytes |
1572 | | /// |
1573 | | /// The first 4 bytes encode length in little-endian, |
1574 | | /// the following 12 bytes contain the inline string data (unpadded). |
1575 | | fn make_raw_inline(length: u32, data: &[u8]) -> u128 { |
1576 | | assert!(length as usize <= 12, "Inline length must be ≤ 12"); |
1577 | | assert!( |
1578 | | data.len() == length as usize, |
1579 | | "Data length must match `length`" |
1580 | | ); |
1581 | | |
1582 | | let mut raw_bytes = [0u8; 16]; |
1583 | | raw_bytes[0..4].copy_from_slice(&length.to_le_bytes()); // length stored little-endian |
1584 | | raw_bytes[4..(4 + data.len())].copy_from_slice(data); // inline data |
1585 | | u128::from_le_bytes(raw_bytes) |
1586 | | } |
1587 | | |
1588 | | // Test inputs: various lengths and lexical orders, |
1589 | | // plus special cases for byte order and length tie-breaking |
1590 | | let test_inputs: Vec<&[u8]> = vec![ |
1591 | | b"a", |
1592 | | b"aa", |
1593 | | b"aaa", |
1594 | | b"aab", |
1595 | | b"abcd", |
1596 | | b"abcde", |
1597 | | b"abcdef", |
1598 | | b"abcdefg", |
1599 | | b"abcdefgh", |
1600 | | b"abcdefghi", |
1601 | | b"abcdefghij", |
1602 | | b"abcdefghijk", |
1603 | | b"abcdefghijkl", |
1604 | | // Tests for byte-order reversal bug: |
1605 | | // Without the fix, "backend one" would compare as "eno dnekcab", |
1606 | | // causing incorrect sort order relative to "backend two". |
1607 | | b"backend one", |
1608 | | b"backend two", |
1609 | | // Tests length-tiebreaker logic: |
1610 | | // "bar" (3 bytes) and "bar\0" (4 bytes) have identical inline data, |
1611 | | // so only the length differentiates their ordering. |
1612 | | b"bar", |
1613 | | b"bar\0", |
1614 | | // Additional lexical and length tie-breaking cases with same prefix, in correct lex order: |
1615 | | b"than12Byt", |
1616 | | b"than12Bytes", |
1617 | | b"than12Bytes\0", |
1618 | | b"than12Bytesx", |
1619 | | b"than12Bytex", |
1620 | | b"than12Bytez", |
1621 | | // Additional lexical tests |
1622 | | b"xyy", |
1623 | | b"xyz", |
1624 | | b"xza", |
1625 | | ]; |
1626 | | |
1627 | | // Create a GenericBinaryArray for cross-comparison of lex order |
1628 | | let array: GenericBinaryArray<i32> = |
1629 | | GenericBinaryArray::from(test_inputs.iter().map(|s| Some(*s)).collect::<Vec<_>>()); |
1630 | | |
1631 | | for i in 0..array.len() - 1 { |
1632 | | let v1 = array.value(i); |
1633 | | let v2 = array.value(i + 1); |
1634 | | |
1635 | | // Assert the array's natural lexical ordering is correct |
1636 | | assert!(v1 < v2, "Array compare failed: {v1:?} !< {v2:?}"); |
1637 | | |
1638 | | // Assert the keys produced by inline_key_fast reflect the same ordering |
1639 | | let key1 = GenericByteViewArray::<BinaryViewType>::inline_key_fast(make_raw_inline( |
1640 | | v1.len() as u32, |
1641 | | v1, |
1642 | | )); |
1643 | | let key2 = GenericByteViewArray::<BinaryViewType>::inline_key_fast(make_raw_inline( |
1644 | | v2.len() as u32, |
1645 | | v2, |
1646 | | )); |
1647 | | |
1648 | | assert!( |
1649 | | key1 < key2, |
1650 | | "Key compare failed: key({v1:?})=0x{key1:032x} !< key({v2:?})=0x{key2:032x}", |
1651 | | ); |
1652 | | } |
1653 | | } |
1654 | | } |