/Users/andrewlamb/Software/arrow-rs/arrow-array/src/array/byte_array.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use crate::array::{get_offsets, print_long_array}; |
19 | | use crate::builder::GenericByteBuilder; |
20 | | use crate::iterator::ArrayIter; |
21 | | use crate::types::ByteArrayType; |
22 | | use crate::types::bytes::ByteArrayNativeType; |
23 | | use crate::{Array, ArrayAccessor, ArrayRef, OffsetSizeTrait, Scalar}; |
24 | | use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer}; |
25 | | use arrow_buffer::{NullBuffer, OffsetBuffer}; |
26 | | use arrow_data::{ArrayData, ArrayDataBuilder}; |
27 | | use arrow_schema::{ArrowError, DataType}; |
28 | | use std::any::Any; |
29 | | use std::sync::Arc; |
30 | | |
31 | | /// An array of [variable length byte arrays](https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-layout), held as an offsets buffer, a values buffer and an optional null buffer |
32 | | /// |
33 | | /// See [`StringArray`] and [`LargeStringArray`] for storing UTF-8 encoded string data |
34 | | /// |
35 | | /// See [`BinaryArray`] and [`LargeBinaryArray`] for storing arbitrary bytes |
36 | | /// |
37 | | /// # Example: From a Vec |
38 | | /// |
39 | | /// ``` |
40 | | /// # use arrow_array::{Array, GenericByteArray, types::Utf8Type}; |
41 | | /// let arr: GenericByteArray<Utf8Type> = vec!["hello", "world", ""].into(); |
42 | | /// assert_eq!(arr.value_data(), b"helloworld"); |
43 | | /// assert_eq!(arr.value_offsets(), &[0, 5, 10, 10]); |
44 | | /// let values: Vec<_> = arr.iter().collect(); |
45 | | /// assert_eq!(values, &[Some("hello"), Some("world"), Some("")]); |
46 | | /// ``` |
47 | | /// |
48 | | /// # Example: From an optional Vec |
49 | | /// |
50 | | /// ``` |
51 | | /// # use arrow_array::{Array, GenericByteArray, types::Utf8Type}; |
52 | | /// let arr: GenericByteArray<Utf8Type> = vec![Some("hello"), Some("world"), Some(""), None].into(); |
53 | | /// assert_eq!(arr.value_data(), b"helloworld"); |
54 | | /// assert_eq!(arr.value_offsets(), &[0, 5, 10, 10, 10]); |
55 | | /// let values: Vec<_> = arr.iter().collect(); |
56 | | /// assert_eq!(values, &[Some("hello"), Some("world"), Some(""), None]); |
57 | | /// ``` |
58 | | /// |
59 | | /// # Example: From an iterator of options |
60 | | /// |
61 | | /// ``` |
62 | | /// # use arrow_array::{Array, GenericByteArray, types::Utf8Type}; |
63 | | /// let arr: GenericByteArray<Utf8Type> = (0..5).map(|x| (x % 2 == 0).then(|| x.to_string())).collect(); |
64 | | /// let values: Vec<_> = arr.iter().collect(); |
65 | | /// assert_eq!(values, &[Some("0"), None, Some("2"), None, Some("4")]); |
66 | | /// ``` |
67 | | /// |
68 | | /// # Example: Using Builder |
69 | | /// |
70 | | /// ``` |
71 | | /// # use arrow_array::Array; |
72 | | /// # use arrow_array::builder::GenericByteBuilder; |
73 | | /// # use arrow_array::types::Utf8Type; |
74 | | /// let mut builder = GenericByteBuilder::<Utf8Type>::new(); |
75 | | /// builder.append_value("hello"); |
76 | | /// builder.append_null(); |
77 | | /// builder.append_value("world"); |
78 | | /// let array = builder.finish(); |
79 | | /// let values: Vec<_> = array.iter().collect(); |
80 | | /// assert_eq!(values, &[Some("hello"), None, Some("world")]); |
81 | | /// ``` |
82 | | /// |
83 | | /// [`StringArray`]: crate::StringArray |
84 | | /// [`LargeStringArray`]: crate::LargeStringArray |
85 | | /// [`BinaryArray`]: crate::BinaryArray |
86 | | /// [`LargeBinaryArray`]: crate::LargeBinaryArray |
87 | | pub struct GenericByteArray<T: ByteArrayType> { |
88 | | data_type: DataType, |
89 | | value_offsets: OffsetBuffer<T::Offset>, |
90 | | value_data: Buffer, |
91 | | nulls: Option<NullBuffer>, |
92 | | } |
93 | | |
94 | | impl<T: ByteArrayType> Clone for GenericByteArray<T> { |
95 | 160k | fn clone(&self) -> Self { |
96 | 160k | Self { |
97 | 160k | data_type: T::DATA_TYPE, |
98 | 160k | value_offsets: self.value_offsets.clone(), |
99 | 160k | value_data: self.value_data.clone(), |
100 | 160k | nulls: self.nulls.clone(), |
101 | 160k | } |
102 | 160k | } |
103 | | } |
104 | | |
105 | | impl<T: ByteArrayType> GenericByteArray<T> { |
106 | | /// Data type of the array, as defined by `T::DATA_TYPE`. |
107 | | pub const DATA_TYPE: DataType = T::DATA_TYPE; |
108 | | |
109 | | /// Create a new [`GenericByteArray`] from the provided parts, panicking on failure |
110 | | /// |
111 | | /// # Panics |
112 | | /// |
113 | | /// Panics if [`GenericByteArray::try_new`] returns an error |
114 | 1 | pub fn new( |
115 | 1 | offsets: OffsetBuffer<T::Offset>, |
116 | 1 | values: Buffer, |
117 | 1 | nulls: Option<NullBuffer>, |
118 | 1 | ) -> Self { |
119 | 1 | Self::try_new(offsets, values, nulls).unwrap() |
120 | 1 | } |
121 | | |
122 | | /// Create a new [`GenericByteArray`] from the provided parts, returning an error on failure |
123 | | /// |
124 | | /// # Errors |
125 | | /// |
126 | | /// * `offsets.len() - 1 != nulls.len()` |
127 | | /// * Any consecutive pair of `offsets` does not denote a valid slice of `values` |
128 | 1 | pub fn try_new( |
129 | 1 | offsets: OffsetBuffer<T::Offset>, |
130 | 1 | values: Buffer, |
131 | 1 | nulls: Option<NullBuffer>, |
132 | 1 | ) -> Result<Self, ArrowError> { |
133 | 1 | let len = offsets.len() - 1; |
134 | | |
135 | | // Verify that each pair of offsets is a valid slices of values |
136 | 1 | T::validate(&offsets, &values)?0 ; |
137 | | |
138 | 1 | if let Some(n) = nulls.as_ref() { |
139 | 1 | if n.len() != len { |
140 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
141 | 0 | "Incorrect length of null buffer for {}{}Array, expected {len} got {}", |
142 | 0 | T::Offset::PREFIX, |
143 | 0 | T::PREFIX, |
144 | 0 | n.len(), |
145 | 0 | ))); |
146 | 1 | } |
147 | 0 | } |
148 | | |
149 | 1 | Ok(Self { |
150 | 1 | data_type: T::DATA_TYPE, |
151 | 1 | value_offsets: offsets, |
152 | 1 | value_data: values, |
153 | 1 | nulls, |
154 | 1 | }) |
155 | 1 | } |
156 | | |
157 | | /// Create a new [`GenericByteArray`] from the provided parts, without validation |
158 | | /// |
159 | | /// # Safety |
160 | | /// |
161 | | /// Safe if [`Self::try_new`] would not error |
162 | 41 | pub unsafe fn new_unchecked( |
163 | 41 | offsets: OffsetBuffer<T::Offset>, |
164 | 41 | values: Buffer, |
165 | 41 | nulls: Option<NullBuffer>, |
166 | 41 | ) -> Self { |
167 | 41 | if cfg!(feature = "force_validate") { |
168 | 0 | return Self::new(offsets, values, nulls); |
169 | 41 | } |
170 | 41 | Self { |
171 | 41 | data_type: T::DATA_TYPE, |
172 | 41 | value_offsets: offsets, |
173 | 41 | value_data: values, |
174 | 41 | nulls, |
175 | 41 | } |
176 | 41 | } |
177 | | |
178 | | /// Create a new [`GenericByteArray`] of length `len` where all values are null |
179 | 32 | pub fn new_null(len: usize) -> Self { |
180 | 32 | Self { |
181 | 32 | data_type: T::DATA_TYPE, |
182 | 32 | value_offsets: OffsetBuffer::new_zeroed(len), |
183 | 32 | value_data: MutableBuffer::new(0).into(), |
184 | 32 | nulls: Some(NullBuffer::new_null(len)), |
185 | 32 | } |
186 | 32 | } |
187 | | |
188 | | /// Create a new [`Scalar`] from `v` |
189 | | pub fn new_scalar(value: impl AsRef<T::Native>) -> Scalar<Self> { |
190 | | Scalar::new(Self::from_iter_values(std::iter::once(value))) |
191 | | } |
192 | | |
193 | | /// Create a new [`GenericByteArray`] where `value` is repeated `repeat_count` times. |
194 | | /// |
195 | | /// # Panics |
196 | | /// This will panic if value's length multiplied by `repeat_count` overflows usize. |
197 | | /// |
198 | | pub fn new_repeated(value: impl AsRef<T::Native>, repeat_count: usize) -> Self { |
199 | | let s: &[u8] = value.as_ref().as_ref(); |
200 | | let value_offsets = OffsetBuffer::from_repeated_length(s.len(), repeat_count); |
201 | | let bytes: Buffer = { |
202 | | let mut mutable_buffer = MutableBuffer::with_capacity(0); |
203 | | mutable_buffer.repeat_slice_n_times(s, repeat_count); |
204 | | |
205 | | mutable_buffer.into() |
206 | | }; |
207 | | |
208 | | Self { |
209 | | data_type: T::DATA_TYPE, |
210 | | value_data: bytes, |
211 | | value_offsets, |
212 | | nulls: None, |
213 | | } |
214 | | } |
215 | | |
216 | | /// Creates a [`GenericByteArray`] based on an iterator of values without nulls |
217 | 93 | pub fn from_iter_values<Ptr, I>(iter: I) -> Self |
218 | 93 | where |
219 | 93 | Ptr: AsRef<T::Native>, |
220 | 93 | I: IntoIterator<Item = Ptr>, |
221 | | { |
222 | 93 | let iter = iter.into_iter(); |
223 | 93 | let (_, data_len) = iter.size_hint(); |
224 | 93 | let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound. |
225 | | |
226 | 93 | let mut offsets = MutableBuffer::new((data_len + 1) * std::mem::size_of::<T::Offset>()); |
227 | 93 | offsets.push(T::Offset::usize_as(0)); |
228 | | |
229 | 93 | let mut values = MutableBuffer::new(0); |
230 | 743 | for s650 in iter { |
231 | 650 | let s: &[u8] = s.as_ref().as_ref(); |
232 | 650 | values.extend_from_slice(s); |
233 | 650 | offsets.push(T::Offset::usize_as(values.len())); |
234 | 650 | } |
235 | | |
236 | 93 | T::Offset::from_usize(values.len()).expect("offset overflow"); |
237 | 93 | let offsets = Buffer::from(offsets); |
238 | | |
239 | | // Safety: valid by construction |
240 | 93 | let value_offsets = unsafe { OffsetBuffer::new_unchecked(offsets.into()) }; |
241 | | |
242 | 93 | Self { |
243 | 93 | data_type: T::DATA_TYPE, |
244 | 93 | value_data: values.into(), |
245 | 93 | value_offsets, |
246 | 93 | nulls: None, |
247 | 93 | } |
248 | 93 | } |
249 | | |
250 | | /// Deconstruct this array into its constituent parts |
251 | | pub fn into_parts(self) -> (OffsetBuffer<T::Offset>, Buffer, Option<NullBuffer>) { |
252 | | (self.value_offsets, self.value_data, self.nulls) |
253 | | } |
254 | | |
255 | | /// Returns the length for value at index `i`. |
256 | | /// # Panics |
257 | | /// Panics if index `i` is out of bounds. |
258 | | #[inline] |
259 | | pub fn value_length(&self, i: usize) -> T::Offset { |
260 | | let offsets = self.value_offsets(); |
261 | | offsets[i + 1] - offsets[i] |
262 | | } |
263 | | |
264 | | /// Returns a reference to the offsets of this array |
265 | | /// |
266 | | /// Unlike [`Self::value_offsets`] this returns the [`OffsetBuffer`] |
267 | | /// allowing for zero-copy cloning |
268 | | #[inline] |
269 | 137 | pub fn offsets(&self) -> &OffsetBuffer<T::Offset> { |
270 | 137 | &self.value_offsets |
271 | 137 | } |
272 | | |
273 | | /// Returns the values of this array |
274 | | /// |
275 | | /// Unlike [`Self::value_data`] this returns the [`Buffer`] |
276 | | /// allowing for zero-copy cloning |
277 | | #[inline] |
278 | 165 | pub fn values(&self) -> &Buffer { |
279 | 165 | &self.value_data |
280 | 165 | } |
281 | | |
282 | | /// Returns the raw value data |
283 | 176 | pub fn value_data(&self) -> &[u8] { |
284 | 176 | self.value_data.as_slice() |
285 | 176 | } |
286 | | |
287 | | /// Returns true if all data within this array is ASCII |
288 | | pub fn is_ascii(&self) -> bool { |
289 | | let offsets = self.value_offsets(); |
290 | | let start = offsets.first().unwrap(); |
291 | | let end = offsets.last().unwrap(); |
292 | | self.value_data()[start.as_usize()..end.as_usize()].is_ascii() |
293 | | } |
294 | | |
295 | | /// Returns the offset values in the offsets buffer |
296 | | #[inline] |
297 | 174k | pub fn value_offsets(&self) -> &[T::Offset] { |
298 | 174k | &self.value_offsets |
299 | 174k | } |
300 | | |
301 | | /// Returns the element at index `i` |
302 | | /// |
303 | | /// Note: This method does not check for nulls and the value is arbitrary |
304 | | /// if [`is_null`](Self::is_null) returns true for the index. |
305 | | /// |
306 | | /// # Safety |
307 | | /// Caller is responsible for ensuring that the index is within the bounds of the array |
308 | 87.2k | pub unsafe fn value_unchecked(&self, i: usize) -> &T::Native { |
309 | 87.2k | let end = *unsafe { self.value_offsets().get_unchecked(i + 1) }; |
310 | 87.2k | let start = *unsafe { self.value_offsets().get_unchecked(i) }; |
311 | | |
312 | | // Soundness |
313 | | // pointer alignment & location is ensured by RawPtrBox |
314 | | // buffer bounds/offset is ensured by the value_offset invariants |
315 | | |
316 | | // Safety of `to_isize().unwrap()` |
317 | | // `start` and `end` are &OffsetSize, which is a generic type that implements the |
318 | | // OffsetSizeTrait. Currently, only i32 and i64 implement OffsetSizeTrait, |
319 | | // both of which should cleanly cast to isize on an architecture that supports |
320 | | // 32/64-bit offsets |
321 | 87.2k | let b = unsafe { |
322 | 87.2k | std::slice::from_raw_parts( |
323 | 87.2k | self.value_data |
324 | 87.2k | .as_ptr() |
325 | 87.2k | .offset(start.to_isize().unwrap_unchecked()), |
326 | 87.2k | (end - start).to_usize().unwrap_unchecked(), |
327 | | ) |
328 | | }; |
329 | | |
330 | | // SAFETY: |
331 | | // ArrayData is valid |
332 | 87.2k | unsafe { T::Native::from_bytes_unchecked(b) } |
333 | 87.2k | } |
334 | | |
335 | | /// Returns the element at index `i` |
336 | | /// |
337 | | /// Note: This method does not check for nulls and the value is arbitrary |
338 | | /// (but still well-defined) if [`is_null`](Self::is_null) returns true for the index. |
339 | | /// |
340 | | /// # Panics |
341 | | /// Panics if index `i` is out of bounds. |
342 | 83.6k | pub fn value(&self, i: usize) -> &T::Native { |
343 | 83.6k | assert!( |
344 | 83.6k | i < self.len(), |
345 | 0 | "Trying to access an element at index {} from a {}{}Array of length {}", |
346 | | i, |
347 | | T::Offset::PREFIX, |
348 | | T::PREFIX, |
349 | 0 | self.len() |
350 | | ); |
351 | | // SAFETY: |
352 | | // Verified length above |
353 | 83.6k | unsafe { self.value_unchecked(i) } |
354 | 83.6k | } |
355 | | |
356 | | /// constructs a new iterator |
357 | 107 | pub fn iter(&self) -> ArrayIter<&Self> { |
358 | 107 | ArrayIter::new(self) |
359 | 107 | } |
360 | | |
361 | | /// Returns a zero-copy slice of this array with the indicated offset and length. |
362 | 220 | pub fn slice(&self, offset: usize, length: usize) -> Self { |
363 | | Self { |
364 | 220 | data_type: T::DATA_TYPE, |
365 | 220 | value_offsets: self.value_offsets.slice(offset, length), |
366 | 220 | value_data: self.value_data.clone(), |
367 | 220 | nulls: self.nulls.as_ref().map(|n| n204 .slice204 (offset204 , length204 )), |
368 | | } |
369 | 220 | } |
370 | | |
371 | | /// Returns `GenericByteBuilder` of this byte array for mutating its values if the underlying |
372 | | /// offset and data buffers are not shared by others. |
373 | | pub fn into_builder(self) -> Result<GenericByteBuilder<T>, Self> { |
374 | | let len = self.len(); |
375 | | let value_len = T::Offset::as_usize(self.value_offsets()[len] - self.value_offsets()[0]); |
376 | | |
377 | | let data = self.into_data(); |
378 | | let null_bit_buffer = data.nulls().map(|b| b.inner().sliced()); |
379 | | |
380 | | let element_len = std::mem::size_of::<T::Offset>(); |
381 | | let offset_buffer = data.buffers()[0] |
382 | | .slice_with_length(data.offset() * element_len, (len + 1) * element_len); |
383 | | |
384 | | let element_len = std::mem::size_of::<u8>(); |
385 | | let value_buffer = data.buffers()[1] |
386 | | .slice_with_length(data.offset() * element_len, value_len * element_len); |
387 | | |
388 | | drop(data); |
389 | | |
390 | | let try_mutable_null_buffer = match null_bit_buffer { |
391 | | None => Ok(None), |
392 | | Some(null_buffer) => { |
393 | | // Null buffer exists, tries to make it mutable |
394 | | null_buffer.into_mutable().map(Some) |
395 | | } |
396 | | }; |
397 | | |
398 | | let try_mutable_buffers = match try_mutable_null_buffer { |
399 | | Ok(mutable_null_buffer) => { |
400 | | // Got mutable null buffer, tries to get mutable value buffer |
401 | | let try_mutable_offset_buffer = offset_buffer.into_mutable(); |
402 | | let try_mutable_value_buffer = value_buffer.into_mutable(); |
403 | | |
404 | | // try_mutable_offset_buffer.map(...).map_err(...) doesn't work as the compiler complains |
405 | | // mutable_null_buffer is moved into map closure. |
406 | | match (try_mutable_offset_buffer, try_mutable_value_buffer) { |
407 | | (Ok(mutable_offset_buffer), Ok(mutable_value_buffer)) => unsafe { |
408 | | Ok(GenericByteBuilder::<T>::new_from_buffer( |
409 | | mutable_offset_buffer, |
410 | | mutable_value_buffer, |
411 | | mutable_null_buffer, |
412 | | )) |
413 | | }, |
414 | | (Ok(mutable_offset_buffer), Err(value_buffer)) => Err(( |
415 | | mutable_offset_buffer.into(), |
416 | | value_buffer, |
417 | | mutable_null_buffer.map(|b| b.into()), |
418 | | )), |
419 | | (Err(offset_buffer), Ok(mutable_value_buffer)) => Err(( |
420 | | offset_buffer, |
421 | | mutable_value_buffer.into(), |
422 | | mutable_null_buffer.map(|b| b.into()), |
423 | | )), |
424 | | (Err(offset_buffer), Err(value_buffer)) => Err(( |
425 | | offset_buffer, |
426 | | value_buffer, |
427 | | mutable_null_buffer.map(|b| b.into()), |
428 | | )), |
429 | | } |
430 | | } |
431 | | Err(mutable_null_buffer) => { |
432 | | // Unable to get mutable null buffer |
433 | | Err((offset_buffer, value_buffer, Some(mutable_null_buffer))) |
434 | | } |
435 | | }; |
436 | | |
437 | | match try_mutable_buffers { |
438 | | Ok(builder) => Ok(builder), |
439 | | Err((offset_buffer, value_buffer, null_bit_buffer)) => { |
440 | | let builder = ArrayData::builder(T::DATA_TYPE) |
441 | | .len(len) |
442 | | .add_buffer(offset_buffer) |
443 | | .add_buffer(value_buffer) |
444 | | .null_bit_buffer(null_bit_buffer); |
445 | | |
446 | | let array_data = unsafe { builder.build_unchecked() }; |
447 | | let array = GenericByteArray::<T>::from(array_data); |
448 | | |
449 | | Err(array) |
450 | | } |
451 | | } |
452 | | } |
453 | | } |
454 | | |
455 | | impl<T: ByteArrayType> std::fmt::Debug for GenericByteArray<T> { |
456 | 0 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { |
457 | 0 | write!(f, "{}{}Array\n[\n", T::Offset::PREFIX, T::PREFIX)?; |
458 | 0 | print_long_array(self, f, |array, index, f| { |
459 | 0 | std::fmt::Debug::fmt(&array.value(index), f) |
460 | 0 | })?; |
461 | 0 | write!(f, "]") |
462 | 0 | } |
463 | | } |
464 | | |
465 | | impl<T: ByteArrayType> Array for GenericByteArray<T> { |
466 | 160k | fn as_any(&self) -> &dyn Any { |
467 | 160k | self |
468 | 160k | } |
469 | | |
470 | 160k | fn to_data(&self) -> ArrayData { |
471 | 160k | self.clone().into() |
472 | 160k | } |
473 | | |
474 | 80.1k | fn into_data(self) -> ArrayData { |
475 | 80.1k | self.into() |
476 | 80.1k | } |
477 | | |
478 | 241k | fn data_type(&self) -> &DataType { |
479 | 241k | &self.data_type |
480 | 241k | } |
481 | | |
482 | 115 | fn slice(&self, offset: usize, length: usize) -> ArrayRef { |
483 | 115 | Arc::new(self.slice(offset, length)) |
484 | 115 | } |
485 | | |
486 | 565k | fn len(&self) -> usize { |
487 | 565k | self.value_offsets.len() - 1 |
488 | 565k | } |
489 | | |
490 | 20 | fn is_empty(&self) -> bool { |
491 | 20 | self.value_offsets.len() <= 1 |
492 | 20 | } |
493 | | |
494 | 0 | fn shrink_to_fit(&mut self) { |
495 | 0 | self.value_offsets.shrink_to_fit(); |
496 | 0 | self.value_data.shrink_to_fit(); |
497 | 0 | if let Some(nulls) = &mut self.nulls { |
498 | 0 | nulls.shrink_to_fit(); |
499 | 0 | } |
500 | 0 | } |
501 | | |
502 | 0 | fn offset(&self) -> usize { |
503 | 0 | 0 |
504 | 0 | } |
505 | | |
506 | 160k | fn nulls(&self) -> Option<&NullBuffer> { |
507 | 160k | self.nulls.as_ref() |
508 | 160k | } |
509 | | |
510 | 3 | fn logical_null_count(&self) -> usize { |
511 | | // More efficient that the default implementation |
512 | 3 | self.null_count() |
513 | 3 | } |
514 | | |
515 | 0 | fn get_buffer_memory_size(&self) -> usize { |
516 | 0 | let mut sum = self.value_offsets.inner().inner().capacity(); |
517 | 0 | sum += self.value_data.capacity(); |
518 | 0 | if let Some(x) = &self.nulls { |
519 | 0 | sum += x.buffer().capacity() |
520 | 0 | } |
521 | 0 | sum |
522 | 0 | } |
523 | | |
524 | 0 | fn get_array_memory_size(&self) -> usize { |
525 | 0 | std::mem::size_of::<Self>() + self.get_buffer_memory_size() |
526 | 0 | } |
527 | | } |
528 | | |
529 | | impl<'a, T: ByteArrayType> ArrayAccessor for &'a GenericByteArray<T> { |
530 | | type Item = &'a T::Native; |
531 | | |
532 | | fn value(&self, index: usize) -> Self::Item { |
533 | | GenericByteArray::value(self, index) |
534 | | } |
535 | | |
536 | 3.57k | unsafe fn value_unchecked(&self, index: usize) -> Self::Item { |
537 | 3.57k | unsafe { GenericByteArray::value_unchecked(self, index) } |
538 | 3.57k | } |
539 | | } |
540 | | |
541 | | impl<T: ByteArrayType> From<ArrayData> for GenericByteArray<T> { |
542 | 160k | fn from(data: ArrayData) -> Self { |
543 | 160k | assert_eq!( |
544 | 160k | data.data_type(), |
545 | 160k | &Self::DATA_TYPE, |
546 | 0 | "{}{}Array expects DataType::{}", |
547 | | T::Offset::PREFIX, |
548 | | T::PREFIX, |
549 | 0 | Self::DATA_TYPE |
550 | | ); |
551 | 160k | assert_eq!( |
552 | 160k | data.buffers().len(), |
553 | | 2, |
554 | 0 | "{}{}Array data should contain 2 buffers only (offsets and values)", |
555 | | T::Offset::PREFIX, |
556 | | T::PREFIX, |
557 | | ); |
558 | | // SAFETY: |
559 | | // ArrayData is valid, and verified type above |
560 | 160k | let value_offsets = unsafe { get_offsets(&data) }; |
561 | 160k | let value_data = data.buffers()[1].clone(); |
562 | 160k | Self { |
563 | 160k | value_offsets, |
564 | 160k | value_data, |
565 | 160k | data_type: T::DATA_TYPE, |
566 | 160k | nulls: data.nulls().cloned(), |
567 | 160k | } |
568 | 160k | } |
569 | | } |
570 | | |
571 | | impl<T: ByteArrayType> From<GenericByteArray<T>> for ArrayData { |
572 | 240k | fn from(array: GenericByteArray<T>) -> Self { |
573 | 240k | let len = array.len(); |
574 | | |
575 | 240k | let offsets = array.value_offsets.into_inner().into_inner(); |
576 | 240k | let builder = ArrayDataBuilder::new(array.data_type) |
577 | 240k | .len(len) |
578 | 240k | .buffers(vec![offsets, array.value_data]) |
579 | 240k | .nulls(array.nulls); |
580 | | |
581 | 240k | unsafe { builder.build_unchecked() } |
582 | 240k | } |
583 | | } |
584 | | |
585 | | impl<'a, T: ByteArrayType> IntoIterator for &'a GenericByteArray<T> { |
586 | | type Item = Option<&'a T::Native>; |
587 | | type IntoIter = ArrayIter<Self>; |
588 | | |
589 | 6 | fn into_iter(self) -> Self::IntoIter { |
590 | 6 | ArrayIter::new(self) |
591 | 6 | } |
592 | | } |
593 | | |
594 | | impl<'a, Ptr, T: ByteArrayType> FromIterator<&'a Option<Ptr>> for GenericByteArray<T> |
595 | | where |
596 | | Ptr: AsRef<T::Native> + 'a, |
597 | | { |
598 | | fn from_iter<I: IntoIterator<Item = &'a Option<Ptr>>>(iter: I) -> Self { |
599 | | iter.into_iter() |
600 | | .map(|o| o.as_ref().map(|p| p.as_ref())) |
601 | | .collect() |
602 | | } |
603 | | } |
604 | | |
605 | | impl<Ptr, T: ByteArrayType> FromIterator<Option<Ptr>> for GenericByteArray<T> |
606 | | where |
607 | | Ptr: AsRef<T::Native>, |
608 | | { |
609 | 204 | fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self { |
610 | 204 | let iter = iter.into_iter(); |
611 | 204 | let mut builder = GenericByteBuilder::with_capacity(iter.size_hint().0, 1024); |
612 | 204 | builder.extend(iter); |
613 | 204 | builder.finish() |
614 | 204 | } |
615 | | } |
616 | | |
#[cfg(test)]
mod tests {
    use crate::{Array, BinaryArray, StringArray};
    use arrow_buffer::{Buffer, NullBuffer, OffsetBuffer};

    /// Walks through the accepted and rejected inputs of `new` / `try_new`
    /// for both string and binary arrays.
    #[test]
    fn try_new() {
        let ascii = Buffer::from_slice_ref("helloworld");
        let in_range = OffsetBuffer::new(vec![0, 5, 10].into());
        StringArray::new(in_range.clone(), ascii.clone(), None);

        // A null buffer with three entries for two values is rejected
        let too_many_nulls = NullBuffer::new_null(3);
        let message =
            StringArray::try_new(in_range.clone(), ascii.clone(), Some(too_many_nulls.clone()))
                .unwrap_err()
                .to_string();
        assert_eq!(
            message,
            "Invalid argument error: Incorrect length of null buffer for StringArray, expected 2 got 3"
        );

        let message = BinaryArray::try_new(in_range.clone(), ascii.clone(), Some(too_many_nulls))
            .unwrap_err()
            .to_string();
        assert_eq!(
            message,
            "Invalid argument error: Incorrect length of null buffer for BinaryArray, expected 2 got 3"
        );

        // Invalid UTF-8 is rejected for strings but accepted for binary
        let invalid_utf8 = Buffer::from_slice_ref(b"he\xFFloworld");
        let message = StringArray::try_new(in_range.clone(), invalid_utf8.clone(), None)
            .unwrap_err()
            .to_string();
        assert_eq!(
            message,
            "Invalid argument error: Encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 2"
        );

        BinaryArray::new(in_range, invalid_utf8, None);

        // The last offset points one byte past the end of the ASCII data
        let past_end = OffsetBuffer::new(vec![0, 5, 11].into());
        let message = StringArray::try_new(past_end.clone(), ascii.clone(), None)
            .unwrap_err()
            .to_string();
        assert_eq!(
            message,
            "Invalid argument error: Offset of 11 exceeds length of values 10"
        );

        let message = BinaryArray::try_new(past_end.clone(), ascii, None)
            .unwrap_err()
            .to_string();
        assert_eq!(
            message,
            "Invalid argument error: Maximum offset of 11 is larger than values of length 10"
        );

        // The same offsets are valid for the longer, non-ASCII data
        let non_ascii = Buffer::from_slice_ref("heìloworld");
        StringArray::new(past_end.clone(), non_ascii.clone(), None);
        BinaryArray::new(past_end, non_ascii.clone(), None);

        // An offset in the middle of a multi-byte character is rejected for strings only
        let mid_codepoint = OffsetBuffer::new(vec![0, 3, 10].into());
        let message = StringArray::try_new(mid_codepoint.clone(), non_ascii.clone(), None)
            .unwrap_err()
            .to_string();
        assert_eq!(
            message,
            "Invalid argument error: Split UTF-8 codepoint at offset 3"
        );

        BinaryArray::new(mid_codepoint, non_ascii, None);
    }

    /// `new_repeated` yields the requested number of identical values.
    #[test]
    fn create_repeated() {
        let binary = BinaryArray::new_repeated(b"hello", 3);
        assert_eq!(binary.len(), 3);
        for index in 0..3 {
            assert_eq!(binary.value(index), b"hello");
        }

        let strings = StringArray::new_repeated("world", 2);
        assert_eq!(strings.len(), 2);
        for index in 0..2 {
            assert_eq!(strings.value(index), "world");
        }
    }

    #[test]
    #[should_panic(expected = "usize overflow")]
    fn create_repeated_usize_overflow_1() {
        let repeat_count = (usize::MAX / "hello".len()) + 1;
        let _arr = BinaryArray::new_repeated(b"hello", repeat_count);
    }

    #[test]
    #[should_panic(expected = "usize overflow")]
    fn create_repeated_usize_overflow_2() {
        let repeat_count = usize::MAX;
        let _arr = BinaryArray::new_repeated(b"hello", repeat_count);
    }

    #[test]
    #[should_panic(expected = "offset overflow")]
    fn create_repeated_i32_offset_overflow_1() {
        let repeat_count = usize::MAX / "hello".len();
        let _arr = BinaryArray::new_repeated(b"hello", repeat_count);
    }

    #[test]
    #[should_panic(expected = "offset overflow")]
    fn create_repeated_i32_offset_overflow_2() {
        let repeat_count = ((i32::MAX as usize) / "hello".len()) + 1;
        let _arr = BinaryArray::new_repeated(b"hello", repeat_count);
    }
}
714 | | } |
715 | | } |