Coverage Report

Created: 2025-08-26 07:03

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/Users/andrewlamb/Software/arrow-rs/arrow-array/src/array/byte_array.rs
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
use crate::array::{get_offsets, print_long_array};
19
use crate::builder::GenericByteBuilder;
20
use crate::iterator::ArrayIter;
21
use crate::types::bytes::ByteArrayNativeType;
22
use crate::types::ByteArrayType;
23
use crate::{Array, ArrayAccessor, ArrayRef, OffsetSizeTrait, Scalar};
24
use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer};
25
use arrow_buffer::{NullBuffer, OffsetBuffer};
26
use arrow_data::{ArrayData, ArrayDataBuilder};
27
use arrow_schema::{ArrowError, DataType};
28
use std::any::Any;
29
use std::sync::Arc;
30
31
/// An array of [variable length byte arrays](https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-layout)
32
///
33
/// See [`StringArray`] and [`LargeStringArray`] for storing utf8 encoded string data
34
///
35
/// See [`BinaryArray`] and [`LargeBinaryArray`] for storing arbitrary bytes
36
///
37
/// # Example: From a Vec
38
///
39
/// ```
40
/// # use arrow_array::{Array, GenericByteArray, types::Utf8Type};
41
/// let arr: GenericByteArray<Utf8Type> = vec!["hello", "world", ""].into();
42
/// assert_eq!(arr.value_data(), b"helloworld");
43
/// assert_eq!(arr.value_offsets(), &[0, 5, 10, 10]);
44
/// let values: Vec<_> = arr.iter().collect();
45
/// assert_eq!(values, &[Some("hello"), Some("world"), Some("")]);
46
/// ```
47
///
48
/// # Example: From an optional Vec
49
///
50
/// ```
51
/// # use arrow_array::{Array, GenericByteArray, types::Utf8Type};
52
/// let arr: GenericByteArray<Utf8Type> = vec![Some("hello"), Some("world"), Some(""), None].into();
53
/// assert_eq!(arr.value_data(), b"helloworld");
54
/// assert_eq!(arr.value_offsets(), &[0, 5, 10, 10, 10]);
55
/// let values: Vec<_> = arr.iter().collect();
56
/// assert_eq!(values, &[Some("hello"), Some("world"), Some(""), None]);
57
/// ```
58
///
59
/// # Example: From an iterator of option
60
///
61
/// ```
62
/// # use arrow_array::{Array, GenericByteArray, types::Utf8Type};
63
/// let arr: GenericByteArray<Utf8Type> = (0..5).map(|x| (x % 2 == 0).then(|| x.to_string())).collect();
64
/// let values: Vec<_> = arr.iter().collect();
65
/// assert_eq!(values, &[Some("0"), None, Some("2"), None, Some("4")]);
66
/// ```
67
///
68
/// # Example: Using Builder
69
///
70
/// ```
71
/// # use arrow_array::Array;
72
/// # use arrow_array::builder::GenericByteBuilder;
73
/// # use arrow_array::types::Utf8Type;
74
/// let mut builder = GenericByteBuilder::<Utf8Type>::new();
75
/// builder.append_value("hello");
76
/// builder.append_null();
77
/// builder.append_value("world");
78
/// let array = builder.finish();
79
/// let values: Vec<_> = array.iter().collect();
80
/// assert_eq!(values, &[Some("hello"), None, Some("world")]);
81
/// ```
82
///
83
/// [`StringArray`]: crate::StringArray
84
/// [`LargeStringArray`]: crate::LargeStringArray
85
/// [`BinaryArray`]: crate::BinaryArray
86
/// [`LargeBinaryArray`]: crate::LargeBinaryArray
87
pub struct GenericByteArray<T: ByteArrayType> {
88
    data_type: DataType,
89
    value_offsets: OffsetBuffer<T::Offset>,
90
    value_data: Buffer,
91
    nulls: Option<NullBuffer>,
92
}
93
94
impl<T: ByteArrayType> Clone for GenericByteArray<T> {
    /// Returns a copy of this array made by cloning the offsets, the value
    /// buffer and the optional null buffer.
    fn clone(&self) -> Self {
        let value_offsets = self.value_offsets.clone();
        let value_data = self.value_data.clone();
        let nulls = self.nulls.clone();
        Self {
            data_type: T::DATA_TYPE,
            value_offsets,
            value_data,
            nulls,
        }
    }
}
104
105
impl<T: ByteArrayType> GenericByteArray<T> {
106
    /// Data type of the array.
107
    pub const DATA_TYPE: DataType = T::DATA_TYPE;
108
109
    /// Create a new [`GenericByteArray`] from the provided parts, panicking on failure
110
    ///
111
    /// # Panics
112
    ///
113
    /// Panics if [`GenericByteArray::try_new`] returns an error
114
145
    pub fn new(
115
145
        offsets: OffsetBuffer<T::Offset>,
116
145
        values: Buffer,
117
145
        nulls: Option<NullBuffer>,
118
145
    ) -> Self {
119
145
        Self::try_new(offsets, values, nulls).unwrap()
120
145
    }
121
122
    /// Create a new [`GenericByteArray`] from the provided parts, returning an error on failure
123
    ///
124
    /// # Errors
125
    ///
126
    /// * `offsets.len() - 1 != nulls.len()`
127
    /// * Any consecutive pair of `offsets` does not denote a valid slice of `values`
128
145
    pub fn try_new(
129
145
        offsets: OffsetBuffer<T::Offset>,
130
145
        values: Buffer,
131
145
        nulls: Option<NullBuffer>,
132
145
    ) -> Result<Self, ArrowError> {
133
145
        let len = offsets.len() - 1;
134
135
        // Verify that each pair of offsets is a valid slices of values
136
145
        T::validate(&offsets, &values)
?0
;
137
138
145
        if let Some(
n12
) = nulls.as_ref() {
139
12
            if n.len() != len {
140
0
                return Err(ArrowError::InvalidArgumentError(format!(
141
0
                    "Incorrect length of null buffer for {}{}Array, expected {len} got {}",
142
0
                    T::Offset::PREFIX,
143
0
                    T::PREFIX,
144
0
                    n.len(),
145
0
                )));
146
12
            }
147
133
        }
148
149
145
        Ok(Self {
150
145
            data_type: T::DATA_TYPE,
151
145
            value_offsets: offsets,
152
145
            value_data: values,
153
145
            nulls,
154
145
        })
155
145
    }
156
157
    /// Create a new [`GenericByteArray`] from the provided parts, without validation
158
    ///
159
    /// # Safety
160
    ///
161
    /// Safe if [`Self::try_new`] would not error
162
3
    pub unsafe fn new_unchecked(
163
3
        offsets: OffsetBuffer<T::Offset>,
164
3
        values: Buffer,
165
3
        nulls: Option<NullBuffer>,
166
3
    ) -> Self {
167
3
        if cfg!(feature = "force_validate") {
168
0
            return Self::new(offsets, values, nulls);
169
3
        }
170
3
        Self {
171
3
            data_type: T::DATA_TYPE,
172
3
            value_offsets: offsets,
173
3
            value_data: values,
174
3
            nulls,
175
3
        }
176
3
    }
177
178
    /// Create a new [`GenericByteArray`] of length `len` where all values are null
179
    pub fn new_null(len: usize) -> Self {
180
        Self {
181
            data_type: T::DATA_TYPE,
182
            value_offsets: OffsetBuffer::new_zeroed(len),
183
            value_data: MutableBuffer::new(0).into(),
184
            nulls: Some(NullBuffer::new_null(len)),
185
        }
186
    }
187
188
    /// Create a new [`Scalar`] from `v`
189
    pub fn new_scalar(value: impl AsRef<T::Native>) -> Scalar<Self> {
190
        Scalar::new(Self::from_iter_values(std::iter::once(value)))
191
    }
192
193
    /// Creates a [`GenericByteArray`] based on an iterator of values without nulls
194
58
    pub fn from_iter_values<Ptr, I>(iter: I) -> Self
195
58
    where
196
58
        Ptr: AsRef<T::Native>,
197
58
        I: IntoIterator<Item = Ptr>,
198
    {
199
58
        let iter = iter.into_iter();
200
58
        let (_, data_len) = iter.size_hint();
201
58
        let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound.
202
203
58
        let mut offsets = MutableBuffer::new((data_len + 1) * std::mem::size_of::<T::Offset>());
204
58
        offsets.push(T::Offset::usize_as(0));
205
206
58
        let mut values = MutableBuffer::new(0);
207
403
        for 
s345
in iter {
208
345
            let s: &[u8] = s.as_ref().as_ref();
209
345
            values.extend_from_slice(s);
210
345
            offsets.push(T::Offset::usize_as(values.len()));
211
345
        }
212
213
58
        T::Offset::from_usize(values.len()).expect("offset overflow");
214
58
        let offsets = Buffer::from(offsets);
215
216
        // Safety: valid by construction
217
58
        let value_offsets = unsafe { OffsetBuffer::new_unchecked(offsets.into()) };
218
219
58
        Self {
220
58
            data_type: T::DATA_TYPE,
221
58
            value_data: values.into(),
222
58
            value_offsets,
223
58
            nulls: None,
224
58
        }
225
58
    }
226
227
    /// Deconstruct this array into its constituent parts
228
0
    pub fn into_parts(self) -> (OffsetBuffer<T::Offset>, Buffer, Option<NullBuffer>) {
229
0
        (self.value_offsets, self.value_data, self.nulls)
230
0
    }
231
232
    /// Returns the length for value at index `i`.
233
    /// # Panics
234
    /// Panics if index `i` is out of bounds.
235
    #[inline]
236
    pub fn value_length(&self, i: usize) -> T::Offset {
237
        let offsets = self.value_offsets();
238
        offsets[i + 1] - offsets[i]
239
    }
240
241
    /// Returns a reference to the offsets of this array
242
    ///
243
    /// Unlike [`Self::value_offsets`] this returns the [`OffsetBuffer`]
244
    /// allowing for zero-copy cloning
245
    #[inline]
246
35
    pub fn offsets(&self) -> &OffsetBuffer<T::Offset> {
247
35
        &self.value_offsets
248
35
    }
249
250
    /// Returns the values of this array
251
    ///
252
    /// Unlike [`Self::value_data`] this returns the [`Buffer`]
253
    /// allowing for zero-copy cloning
254
    #[inline]
255
41
    pub fn values(&self) -> &Buffer {
256
41
        &self.value_data
257
41
    }
258
259
    /// Returns the raw value data
260
0
    pub fn value_data(&self) -> &[u8] {
261
0
        self.value_data.as_slice()
262
0
    }
263
264
    /// Returns true if all data within this array is ASCII
265
0
    pub fn is_ascii(&self) -> bool {
266
0
        let offsets = self.value_offsets();
267
0
        let start = offsets.first().unwrap();
268
0
        let end = offsets.last().unwrap();
269
0
        self.value_data()[start.as_usize()..end.as_usize()].is_ascii()
270
0
    }
271
272
    /// Returns the offset values in the offsets buffer
273
    #[inline]
274
316
    pub fn value_offsets(&self) -> &[T::Offset] {
275
316
        &self.value_offsets
276
316
    }
277
278
    /// Returns the element at index `i`
279
    ///
280
    /// Note: This method does not check for nulls and the value is arbitrary
281
    /// if [`is_null`](Self::is_null) returns true for the index.
282
    ///
283
    /// # Safety
284
    /// Caller is responsible for ensuring that the index is within the bounds of the array
285
134
    pub unsafe fn value_unchecked(&self, i: usize) -> &T::Native {
286
134
        let end = *self.value_offsets().get_unchecked(i + 1);
287
134
        let start = *self.value_offsets().get_unchecked(i);
288
289
        // Soundness
290
        // pointer alignment & location is ensured by RawPtrBox
291
        // buffer bounds/offset is ensured by the value_offset invariants
292
293
        // Safety of `to_isize().unwrap()`
294
        // `start` and `end` are &OffsetSize, which is a generic type that implements the
295
        // OffsetSizeTrait. Currently, only i32 and i64 implement OffsetSizeTrait,
296
        // both of which should cleanly cast to isize on an architecture that supports
297
        // 32/64-bit offsets
298
134
        let b = std::slice::from_raw_parts(
299
134
            self.value_data
300
134
                .as_ptr()
301
134
                .offset(start.to_isize().unwrap_unchecked()),
302
134
            (end - start).to_usize().unwrap_unchecked(),
303
        );
304
305
        // SAFETY:
306
        // ArrayData is valid
307
134
        T::Native::from_bytes_unchecked(b)
308
134
    }
309
310
    /// Returns the element at index `i`
311
    ///
312
    /// Note: This method does not check for nulls and the value is arbitrary
313
    /// (but still well-defined) if [`is_null`](Self::is_null) returns true for the index.
314
    ///
315
    /// # Panics
316
    /// Panics if index `i` is out of bounds.
317
134
    pub fn value(&self, i: usize) -> &T::Native {
318
134
        assert!(
319
134
            i < self.len(),
320
0
            "Trying to access an element at index {} from a {}{}Array of length {}",
321
            i,
322
            T::Offset::PREFIX,
323
            T::PREFIX,
324
0
            self.len()
325
        );
326
        // SAFETY:
327
        // Verified length above
328
134
        unsafe { self.value_unchecked(i) }
329
134
    }
330
331
    /// constructs a new iterator
332
0
    pub fn iter(&self) -> ArrayIter<&Self> {
333
0
        ArrayIter::new(self)
334
0
    }
335
336
    /// Returns a zero-copy slice of this array with the indicated offset and length.
337
77
    pub fn slice(&self, offset: usize, length: usize) -> Self {
338
        Self {
339
77
            data_type: T::DATA_TYPE,
340
77
            value_offsets: self.value_offsets.slice(offset, length),
341
77
            value_data: self.value_data.clone(),
342
77
            nulls: self.nulls.as_ref().map(|n| 
n6
.
slice6
(
offset6
,
length6
)),
343
        }
344
77
    }
345
346
    /// Returns `GenericByteBuilder` of this byte array for mutating its values if the underlying
347
    /// offset and data buffers are not shared by others.
348
    pub fn into_builder(self) -> Result<GenericByteBuilder<T>, Self> {
349
        let len = self.len();
350
        let value_len = T::Offset::as_usize(self.value_offsets()[len] - self.value_offsets()[0]);
351
352
        let data = self.into_data();
353
        let null_bit_buffer = data.nulls().map(|b| b.inner().sliced());
354
355
        let element_len = std::mem::size_of::<T::Offset>();
356
        let offset_buffer = data.buffers()[0]
357
            .slice_with_length(data.offset() * element_len, (len + 1) * element_len);
358
359
        let element_len = std::mem::size_of::<u8>();
360
        let value_buffer = data.buffers()[1]
361
            .slice_with_length(data.offset() * element_len, value_len * element_len);
362
363
        drop(data);
364
365
        let try_mutable_null_buffer = match null_bit_buffer {
366
            None => Ok(None),
367
            Some(null_buffer) => {
368
                // Null buffer exists, tries to make it mutable
369
                null_buffer.into_mutable().map(Some)
370
            }
371
        };
372
373
        let try_mutable_buffers = match try_mutable_null_buffer {
374
            Ok(mutable_null_buffer) => {
375
                // Got mutable null buffer, tries to get mutable value buffer
376
                let try_mutable_offset_buffer = offset_buffer.into_mutable();
377
                let try_mutable_value_buffer = value_buffer.into_mutable();
378
379
                // try_mutable_offset_buffer.map(...).map_err(...) doesn't work as the compiler complains
380
                // mutable_null_buffer is moved into map closure.
381
                match (try_mutable_offset_buffer, try_mutable_value_buffer) {
382
                    (Ok(mutable_offset_buffer), Ok(mutable_value_buffer)) => unsafe {
383
                        Ok(GenericByteBuilder::<T>::new_from_buffer(
384
                            mutable_offset_buffer,
385
                            mutable_value_buffer,
386
                            mutable_null_buffer,
387
                        ))
388
                    },
389
                    (Ok(mutable_offset_buffer), Err(value_buffer)) => Err((
390
                        mutable_offset_buffer.into(),
391
                        value_buffer,
392
                        mutable_null_buffer.map(|b| b.into()),
393
                    )),
394
                    (Err(offset_buffer), Ok(mutable_value_buffer)) => Err((
395
                        offset_buffer,
396
                        mutable_value_buffer.into(),
397
                        mutable_null_buffer.map(|b| b.into()),
398
                    )),
399
                    (Err(offset_buffer), Err(value_buffer)) => Err((
400
                        offset_buffer,
401
                        value_buffer,
402
                        mutable_null_buffer.map(|b| b.into()),
403
                    )),
404
                }
405
            }
406
            Err(mutable_null_buffer) => {
407
                // Unable to get mutable null buffer
408
                Err((offset_buffer, value_buffer, Some(mutable_null_buffer)))
409
            }
410
        };
411
412
        match try_mutable_buffers {
413
            Ok(builder) => Ok(builder),
414
            Err((offset_buffer, value_buffer, null_bit_buffer)) => {
415
                let builder = ArrayData::builder(T::DATA_TYPE)
416
                    .len(len)
417
                    .add_buffer(offset_buffer)
418
                    .add_buffer(value_buffer)
419
                    .null_bit_buffer(null_bit_buffer);
420
421
                let array_data = unsafe { builder.build_unchecked() };
422
                let array = GenericByteArray::<T>::from(array_data);
423
424
                Err(array)
425
            }
426
        }
427
    }
428
}
429
430
impl<T: ByteArrayType> std::fmt::Debug for GenericByteArray<T> {
431
0
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
432
0
        write!(f, "{}{}Array\n[\n", T::Offset::PREFIX, T::PREFIX)?;
433
0
        print_long_array(self, f, |array, index, f| {
434
0
            std::fmt::Debug::fmt(&array.value(index), f)
435
0
        })?;
436
0
        write!(f, "]")
437
0
    }
438
}
439
440
impl<T: ByteArrayType> Array for GenericByteArray<T> {
441
116
    fn as_any(&self) -> &dyn Any {
442
116
        self
443
116
    }
444
445
213
    fn to_data(&self) -> ArrayData {
446
213
        self.clone().into()
447
213
    }
448
449
1
    fn into_data(self) -> ArrayData {
450
1
        self.into()
451
1
    }
452
453
411
    fn data_type(&self) -> &DataType {
454
411
        &self.data_type
455
411
    }
456
457
77
    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
458
77
        Arc::new(self.slice(offset, length))
459
77
    }
460
461
939
    fn len(&self) -> usize {
462
939
        self.value_offsets.len() - 1
463
939
    }
464
465
0
    fn is_empty(&self) -> bool {
466
0
        self.value_offsets.len() <= 1
467
0
    }
468
469
0
    fn shrink_to_fit(&mut self) {
470
0
        self.value_offsets.shrink_to_fit();
471
0
        self.value_data.shrink_to_fit();
472
0
        if let Some(nulls) = &mut self.nulls {
473
0
            nulls.shrink_to_fit();
474
0
        }
475
0
    }
476
477
0
    fn offset(&self) -> usize {
478
0
        0
479
0
    }
480
481
115
    fn nulls(&self) -> Option<&NullBuffer> {
482
115
        self.nulls.as_ref()
483
115
    }
484
485
2
    fn logical_null_count(&self) -> usize {
486
        // More efficient that the default implementation
487
2
        self.null_count()
488
2
    }
489
490
0
    fn get_buffer_memory_size(&self) -> usize {
491
0
        let mut sum = self.value_offsets.inner().inner().capacity();
492
0
        sum += self.value_data.capacity();
493
0
        if let Some(x) = &self.nulls {
494
0
            sum += x.buffer().capacity()
495
0
        }
496
0
        sum
497
0
    }
498
499
0
    fn get_array_memory_size(&self) -> usize {
500
0
        std::mem::size_of::<Self>() + self.get_buffer_memory_size()
501
0
    }
502
}
503
504
impl<'a, T: ByteArrayType> ArrayAccessor for &'a GenericByteArray<T> {
505
    type Item = &'a T::Native;
506
507
0
    fn value(&self, index: usize) -> Self::Item {
508
0
        GenericByteArray::value(self, index)
509
0
    }
510
511
0
    unsafe fn value_unchecked(&self, index: usize) -> Self::Item {
512
0
        GenericByteArray::value_unchecked(self, index)
513
0
    }
514
}
515
516
impl<T: ByteArrayType> From<ArrayData> for GenericByteArray<T> {
517
36
    fn from(data: ArrayData) -> Self {
518
36
        assert_eq!(
519
36
            data.data_type(),
520
36
            &Self::DATA_TYPE,
521
0
            "{}{}Array expects DataType::{}",
522
            T::Offset::PREFIX,
523
            T::PREFIX,
524
0
            Self::DATA_TYPE
525
        );
526
36
        assert_eq!(
527
36
            data.buffers().len(),
528
            2,
529
0
            "{}{}Array data should contain 2 buffers only (offsets and values)",
530
            T::Offset::PREFIX,
531
            T::PREFIX,
532
        );
533
        // SAFETY:
534
        // ArrayData is valid, and verified type above
535
36
        let value_offsets = unsafe { get_offsets(&data) };
536
36
        let value_data = data.buffers()[1].clone();
537
36
        Self {
538
36
            value_offsets,
539
36
            value_data,
540
36
            data_type: T::DATA_TYPE,
541
36
            nulls: data.nulls().cloned(),
542
36
        }
543
36
    }
544
}
545
546
impl<T: ByteArrayType> From<GenericByteArray<T>> for ArrayData {
547
214
    fn from(array: GenericByteArray<T>) -> Self {
548
214
        let len = array.len();
549
550
214
        let offsets = array.value_offsets.into_inner().into_inner();
551
214
        let builder = ArrayDataBuilder::new(array.data_type)
552
214
            .len(len)
553
214
            .buffers(vec![offsets, array.value_data])
554
214
            .nulls(array.nulls);
555
556
214
        unsafe { builder.build_unchecked() }
557
214
    }
558
}
559
560
impl<'a, T: ByteArrayType> IntoIterator for &'a GenericByteArray<T> {
561
    type Item = Option<&'a T::Native>;
562
    type IntoIter = ArrayIter<Self>;
563
564
0
    fn into_iter(self) -> Self::IntoIter {
565
0
        ArrayIter::new(self)
566
0
    }
567
}
568
569
impl<'a, Ptr, T: ByteArrayType> FromIterator<&'a Option<Ptr>> for GenericByteArray<T>
570
where
571
    Ptr: AsRef<T::Native> + 'a,
572
{
573
    fn from_iter<I: IntoIterator<Item = &'a Option<Ptr>>>(iter: I) -> Self {
574
        iter.into_iter()
575
            .map(|o| o.as_ref().map(|p| p.as_ref()))
576
            .collect()
577
    }
578
}
579
580
impl<Ptr, T: ByteArrayType> FromIterator<Option<Ptr>> for GenericByteArray<T>
581
where
582
    Ptr: AsRef<T::Native>,
583
{
584
5
    fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self {
585
5
        let iter = iter.into_iter();
586
5
        let mut builder = GenericByteBuilder::with_capacity(iter.size_hint().0, 1024);
587
5
        builder.extend(iter);
588
5
        builder.finish()
589
5
    }
590
}
591
592
#[cfg(test)]
mod tests {
    use crate::{BinaryArray, StringArray};
    use arrow_buffer::{Buffer, NullBuffer, OffsetBuffer};

    /// Exercises `new` / `try_new` with valid parts and with each kind of
    /// invalid input, checking the exact error messages.
    #[test]
    fn try_new() {
        // Valid parts: two values covering all ten bytes
        let data = Buffer::from_slice_ref("helloworld");
        let offsets = OffsetBuffer::new(vec![0, 5, 10].into());
        StringArray::new(offsets.clone(), data.clone(), None);

        // A null buffer whose length differs from the number of values
        let nulls = NullBuffer::new_null(3);
        let string_err =
            StringArray::try_new(offsets.clone(), data.clone(), Some(nulls.clone())).unwrap_err();
        assert_eq!(string_err.to_string(), "Invalid argument error: Incorrect length of null buffer for StringArray, expected 2 got 3");

        let binary_err =
            BinaryArray::try_new(offsets.clone(), data.clone(), Some(nulls)).unwrap_err();
        assert_eq!(binary_err.to_string(), "Invalid argument error: Incorrect length of null buffer for BinaryArray, expected 2 got 3");

        // Bytes that are not UTF-8: rejected for strings, accepted for binary
        let non_utf8_data = Buffer::from_slice_ref(b"he\xFFloworld");
        let utf8_err =
            StringArray::try_new(offsets.clone(), non_utf8_data.clone(), None).unwrap_err();
        assert_eq!(utf8_err.to_string(), "Invalid argument error: Encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 2");

        BinaryArray::new(offsets, non_utf8_data, None);

        // A final offset beyond the end of the values
        let offsets = OffsetBuffer::new(vec![0, 5, 11].into());
        let string_err = StringArray::try_new(offsets.clone(), data.clone(), None).unwrap_err();
        assert_eq!(
            string_err.to_string(),
            "Invalid argument error: Offset of 11 exceeds length of values 10"
        );

        let binary_err = BinaryArray::try_new(offsets.clone(), data, None).unwrap_err();
        assert_eq!(
            binary_err.to_string(),
            "Invalid argument error: Maximum offset of 11 is larger than values of length 10"
        );

        // Non-ASCII data with offsets that fall on character boundaries
        let non_ascii_data = Buffer::from_slice_ref("heìloworld");
        StringArray::new(offsets.clone(), non_ascii_data.clone(), None);
        BinaryArray::new(offsets, non_ascii_data.clone(), None);

        // An offset that splits a multi-byte character: rejected for strings,
        // accepted for binary
        let offsets = OffsetBuffer::new(vec![0, 3, 10].into());
        let split_err =
            StringArray::try_new(offsets.clone(), non_ascii_data.clone(), None).unwrap_err();
        assert_eq!(
            split_err.to_string(),
            "Invalid argument error: Split UTF-8 codepoint at offset 3"
        );

        BinaryArray::new(offsets, non_ascii_data, None);
    }
}