-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Implement arrow-row encoding/decoding for view types #5922
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
6063832
e46a741
6c6c942
ee2673f
195a15d
bec69a5
fe28ef1
2890708
3a55827
b6e9b40
5d835e9
de104e1
8cba128
e415e30
c72a549
8374798
cecc1f4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,19 +16,22 @@ | |
| // under the License. | ||
|
|
||
| use crate::array::print_long_array; | ||
| use crate::builder::GenericByteViewBuilder; | ||
| use crate::builder::{ArrayBuilder, GenericByteViewBuilder}; | ||
| use crate::iterator::ArrayIter; | ||
| use crate::types::bytes::ByteArrayNativeType; | ||
| use crate::types::{BinaryViewType, ByteViewType, StringViewType}; | ||
| use crate::{Array, ArrayAccessor, ArrayRef, Scalar}; | ||
| use arrow_buffer::{Buffer, NullBuffer, ScalarBuffer}; | ||
| use crate::{Array, ArrayAccessor, ArrayRef, GenericByteArray, OffsetSizeTrait, Scalar}; | ||
| use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, ScalarBuffer}; | ||
| use arrow_data::{ArrayData, ArrayDataBuilder, ByteView}; | ||
| use arrow_schema::{ArrowError, DataType}; | ||
| use num::ToPrimitive; | ||
| use std::any::Any; | ||
| use std::fmt::Debug; | ||
| use std::marker::PhantomData; | ||
| use std::sync::Arc; | ||
|
|
||
| use super::ByteArrayType; | ||
|
|
||
| /// [Variable-size Binary View Layout]: An array of variable length bytes view arrays. | ||
| /// | ||
| /// Different than [`crate::GenericByteArray`] as it stores both an offset and length | ||
|
|
@@ -429,6 +432,51 @@ impl<T: ByteViewType + ?Sized> From<ArrayData> for GenericByteViewArray<T> { | |
| } | ||
| } | ||
|
|
||
| /// Converts a [`GenericByteArray`] into a [`GenericByteViewArray`], reusing the existing values buffer when possible: | ||
| /// if the last offset is less than `u32::MAX`, the values buffer is appended as a single block and every non-null value becomes a (block, offset, length) view; otherwise the array is rebuilt with `from_iter`. | ||
| impl<FROM, V> From<&GenericByteArray<FROM>> for GenericByteViewArray<V> | ||
| where | ||
| FROM: ByteArrayType, | ||
| FROM::Offset: OffsetSizeTrait + ToPrimitive, | ||
| V: ByteViewType<Native = FROM::Native>, | ||
| { | ||
| fn from(byte_array: &GenericByteArray<FROM>) -> Self { | ||
| let offsets = byte_array.offsets(); | ||
|
|
||
| let can_reuse_buffer = match offsets.last() { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nice |
||
| Some(offset) => offset.as_usize() < u32::MAX as usize, | ||
| None => true, | ||
| }; | ||
|
|
||
| if can_reuse_buffer { | ||
| let len = byte_array.len(); | ||
| let mut views_builder = GenericByteViewBuilder::<V>::with_capacity(len); | ||
| let str_values_buf = byte_array.values().clone(); | ||
| let block = views_builder.append_block(str_values_buf); | ||
| for (i, w) in offsets.windows(2).enumerate() { | ||
| let offset = w[0].as_usize(); | ||
| let end = w[1].as_usize(); | ||
| let length = end - offset; | ||
|
|
||
| if byte_array.is_null(i) { | ||
| views_builder.append_null(); | ||
| } else { | ||
| // SAFETY: the input was a valid array, so its values are valid UTF-8 (if string) and | ||
| // all offsets are valid; the `u32` casts rely on the last-offset check above | ||
| unsafe { | ||
| views_builder.append_view_unchecked(block, offset as u32, length as u32) | ||
| } | ||
| } | ||
| } | ||
| assert_eq!(views_builder.len(), len); | ||
| views_builder.finish() | ||
| } else { | ||
| // TODO: the first u32::MAX can still be reused | ||
| GenericByteViewArray::<V>::from_iter(byte_array.iter()) | ||
| } | ||
| } | ||
| } | ||
|
|
||
| impl<T: ByteViewType + ?Sized> From<GenericByteViewArray<T>> for ArrayData { | ||
| fn from(mut array: GenericByteViewArray<T>) -> Self { | ||
| let len = array.len(); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -135,6 +135,21 @@ fn compare_bytes<T: ByteArrayType>( | |
| }) | ||
| } | ||
|
|
||
| /// Builds a [`DynComparator`] over two byte view arrays of type `T`. | ||
| /// | ||
| /// Both inputs are downcast with `as_byte_view::<T>()`. The downcast arrays are cloned so that | ||
| /// the returned `move` closure owns them; each `(i, j)` comparison is delegated to | ||
| /// `crate::cmp::compare_byte_view`, and `opts` is forwarded to `compare`. | ||
| fn compare_byte_view<T: ByteViewType>( | ||
| left: &dyn Array, | ||
| right: &dyn Array, | ||
| opts: SortOptions, | ||
| ) -> DynComparator { | ||
| let left = left.as_byte_view::<T>(); | ||
| let right = right.as_byte_view::<T>(); | ||
|
|
||
| let l = left.clone(); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I considered if we could avoid these |
||
| let r = right.clone(); | ||
| compare(left, right, opts, move |i, j| { | ||
| crate::cmp::compare_byte_view(&l, i, &r, j) | ||
| }) | ||
| } | ||
|
|
||
| fn compare_dict<K: ArrowDictionaryKeyType>( | ||
| left: &dyn Array, | ||
| right: &dyn Array, | ||
|
|
@@ -342,8 +357,10 @@ pub fn make_comparator( | |
| (Boolean, Boolean) => Ok(compare_boolean(left, right, opts)), | ||
| (Utf8, Utf8) => Ok(compare_bytes::<Utf8Type>(left, right, opts)), | ||
| (LargeUtf8, LargeUtf8) => Ok(compare_bytes::<LargeUtf8Type>(left, right, opts)), | ||
| (Utf8View, Utf8View) => Ok(compare_byte_view::<StringViewType>(left, right, opts)), | ||
| (Binary, Binary) => Ok(compare_bytes::<BinaryType>(left, right, opts)), | ||
| (LargeBinary, LargeBinary) => Ok(compare_bytes::<LargeBinaryType>(left, right, opts)), | ||
| (BinaryView, BinaryView) => Ok(compare_byte_view::<BinaryViewType>(left, right, opts)), | ||
| (FixedSizeBinary(_), FixedSizeBinary(_)) => { | ||
| let left = left.as_fixed_size_binary(); | ||
| let right = right.as_fixed_size_binary(); | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.