Coverage Report

Created: 2025-08-26 07:03

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/Users/andrewlamb/Software/arrow-rs/arrow-array/src/array/string_array.rs
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
use crate::types::GenericStringType;
19
use crate::{GenericBinaryArray, GenericByteArray, GenericListArray, OffsetSizeTrait};
20
use arrow_schema::ArrowError;
21
22
/// A [`GenericByteArray`] for storing `str`
23
pub type GenericStringArray<OffsetSize> = GenericByteArray<GenericStringType<OffsetSize>>;
24
25
impl<OffsetSize: OffsetSizeTrait> GenericStringArray<OffsetSize> {
26
    /// Returns the number of `Unicode Scalar Value` in the string at index `i`.
27
    /// # Performance
28
    /// This function has `O(n)` time complexity where `n` is the string length.
29
    /// If you can make sure that all chars in the string are in the range `U+0x0000` ~ `U+0x007F`,
30
    /// please use the function [`value_length`](#method.value_length) which has O(1) time complexity.
31
    pub fn num_chars(&self, i: usize) -> usize {
32
        self.value(i).chars().count()
33
    }
34
35
    /// Returns an iterator that returns the values of `array.value(i)` for an iterator with each element `i`
36
    pub fn take_iter<'a>(
37
        &'a self,
38
        indexes: impl Iterator<Item = Option<usize>> + 'a,
39
    ) -> impl Iterator<Item = Option<&'a str>> {
40
        indexes.map(|opt_index| opt_index.map(|index| self.value(index)))
41
    }
42
43
    /// Returns an iterator that returns the values of `array.value(i)` for an iterator with each element `i`
44
    /// # Safety
45
    ///
46
    /// caller must ensure that the indexes in the iterator are less than the `array.len()`
47
    pub unsafe fn take_iter_unchecked<'a>(
48
        &'a self,
49
        indexes: impl Iterator<Item = Option<usize>> + 'a,
50
    ) -> impl Iterator<Item = Option<&'a str>> {
51
        indexes.map(|opt_index| opt_index.map(|index| self.value_unchecked(index)))
52
    }
53
54
    /// Fallibly creates a [`GenericStringArray`] from a [`GenericBinaryArray`] returning
55
    /// an error if [`GenericBinaryArray`] contains invalid UTF-8 data
56
0
    pub fn try_from_binary(v: GenericBinaryArray<OffsetSize>) -> Result<Self, ArrowError> {
57
0
        let (offsets, values, nulls) = v.into_parts();
58
0
        Self::try_new(offsets, values, nulls)
59
0
    }
60
}
61
62
impl<OffsetSize: OffsetSizeTrait> From<GenericListArray<OffsetSize>>
63
    for GenericStringArray<OffsetSize>
64
{
65
    fn from(v: GenericListArray<OffsetSize>) -> Self {
66
        GenericBinaryArray::<OffsetSize>::from(v).into()
67
    }
68
}
69
70
impl<OffsetSize: OffsetSizeTrait> From<GenericBinaryArray<OffsetSize>>
71
    for GenericStringArray<OffsetSize>
72
{
73
0
    fn from(v: GenericBinaryArray<OffsetSize>) -> Self {
74
0
        Self::try_from_binary(v).unwrap()
75
0
    }
76
}
77
78
impl<OffsetSize: OffsetSizeTrait> From<Vec<Option<&str>>> for GenericStringArray<OffsetSize> {
    /// Builds the array by collecting the optional string slices of `v` in order.
    fn from(v: Vec<Option<&str>>) -> Self {
        v.into_iter().collect::<Self>()
    }
}
83
84
impl<OffsetSize: OffsetSizeTrait> From<Vec<&str>> for GenericStringArray<OffsetSize> {
85
39
    fn from(v: Vec<&str>) -> Self {
86
39
        Self::from_iter_values(v)
87
39
    }
88
}
89
90
impl<OffsetSize: OffsetSizeTrait> From<Vec<Option<String>>> for GenericStringArray<OffsetSize> {
    /// Builds the array by collecting the optional owned strings of `v` in order.
    fn from(v: Vec<Option<String>>) -> Self {
        v.into_iter().collect::<Self>()
    }
}
95
96
impl<OffsetSize: OffsetSizeTrait> From<Vec<String>> for GenericStringArray<OffsetSize> {
97
    fn from(v: Vec<String>) -> Self {
98
        Self::from_iter_values(v)
99
    }
100
}
101
102
/// A [`GenericStringArray`] of `str` using `i32` offsets
103
///
104
/// # Examples
105
///
106
/// Construction
107
///
108
/// ```
109
/// # use arrow_array::StringArray;
110
/// // Create from Vec<Option<&str>>
111
/// let arr = StringArray::from(vec![Some("foo"), Some("bar"), None, Some("baz")]);
112
/// // Create from Vec<&str>
113
/// let arr = StringArray::from(vec!["foo", "bar", "baz"]);
114
/// // Create from iter/collect (requires Option<&str>)
115
/// let arr: StringArray = std::iter::repeat(Some("foo")).take(10).collect();
116
/// ```
117
///
118
/// Construction and Access
119
///
120
/// ```
121
/// # use arrow_array::StringArray;
122
/// let array = StringArray::from(vec![Some("foo"), None, Some("bar")]);
123
/// assert_eq!(array.value(0), "foo");
124
/// ```
125
///
126
/// See [`GenericByteArray`] for more information and examples
127
pub type StringArray = GenericStringArray<i32>;
128
129
/// A [`GenericStringArray`] of `str` using `i64` offsets
130
///
131
/// # Examples
132
///
133
/// Construction
134
///
135
/// ```
136
/// # use arrow_array::LargeStringArray;
137
/// // Create from Vec<Option<&str>>
138
/// let arr = LargeStringArray::from(vec![Some("foo"), Some("bar"), None, Some("baz")]);
139
/// // Create from Vec<&str>
140
/// let arr = LargeStringArray::from(vec!["foo", "bar", "baz"]);
141
/// // Create from iter/collect (requires Option<&str>)
142
/// let arr: LargeStringArray = std::iter::repeat(Some("foo")).take(10).collect();
143
/// ```
144
///
145
/// Construction and Access
146
///
147
/// ```
148
/// # use arrow_array::LargeStringArray;
149
/// let array = LargeStringArray::from(vec![Some("foo"), None, Some("bar")]);
150
/// assert_eq!(array.value(2), "bar");
151
/// ```
152
///
153
/// See [`GenericByteArray`] for more information and examples
154
pub type LargeStringArray = GenericStringArray<i64>;
155
156
// Unit tests for `StringArray` / `LargeStringArray`: construction, element access,
// conversions from list/binary arrays, and round-tripping through a builder.
#[cfg(test)]
157
mod tests {
158
    use super::*;
159
    use crate::builder::{ListBuilder, PrimitiveBuilder, StringBuilder};
160
    use crate::types::UInt8Type;
161
    use crate::Array;
162
    use arrow_buffer::Buffer;
163
    use arrow_data::ArrayData;
164
    use arrow_schema::{DataType, Field};
165
    use std::sync::Arc;
166
167
    // Builds a `StringArray` from a `Vec<&str>` containing an empty string and a string of
    // multi-byte characters, then checks length, null count, values, byte length and char count.
    #[test]
168
    fn test_string_array_from_u8_slice() {
169
        let values: Vec<&str> = vec!["hello", "", "A£ऀ𖼚𝌆৩ƐZ"];
170
171
        // Array data: ["hello", "", "A£ऀ𖼚𝌆৩ƐZ"]
172
        let string_array = StringArray::from(values);
173
174
        assert_eq!(3, string_array.len());
175
        assert_eq!(0, string_array.null_count());
176
        assert_eq!("hello", string_array.value(0));
177
        // The `value_unchecked` calls below only use indices 0..3, which are in bounds
        // for this 3-element array.
        assert_eq!("hello", unsafe { string_array.value_unchecked(0) });
178
        assert_eq!("", string_array.value(1));
179
        assert_eq!("", unsafe { string_array.value_unchecked(1) });
180
        assert_eq!("A£ऀ𖼚𝌆৩ƐZ", string_array.value(2));
181
        assert_eq!("A£ऀ𖼚𝌆৩ƐZ", unsafe {
182
            string_array.value_unchecked(2)
183
        });
184
        assert_eq!(20, string_array.value_length(2)); // 1 + 2 + 3 + 4 + 4 + 3 + 2 + 1
185
        assert_eq!(8, string_array.num_chars(2));
186
        for i in 0..3 {
187
            assert!(string_array.is_valid(i));
188
            assert!(!string_array.is_null(i));
189
        }
190
    }
191
192
    // Feeding the array data of a `LargeStringArray` into `StringArray::from` must panic
    // with the "expects DataType::Utf8" message.
    #[test]
193
    #[should_panic(expected = "StringArray expects DataType::Utf8")]
194
    fn test_string_array_from_int() {
195
        let array = LargeStringArray::from(vec!["a", "b"]);
196
        drop(StringArray::from(array.into_data()));
197
    }
198
199
    // Same checks as `test_string_array_from_u8_slice`, but for `LargeStringArray`,
    // plus a check of the third entry of `value_offsets()`.
    #[test]
200
    fn test_large_string_array_from_u8_slice() {
201
        let values: Vec<&str> = vec!["hello", "", "A£ऀ𖼚𝌆৩ƐZ"];
202
203
        // Array data: ["hello", "", "A£ऀ𖼚𝌆৩ƐZ"]
204
        let string_array = LargeStringArray::from(values);
205
206
        assert_eq!(3, string_array.len());
207
        assert_eq!(0, string_array.null_count());
208
        assert_eq!("hello", string_array.value(0));
209
        // The `value_unchecked` calls below only use indices 0..3, which are in bounds
        // for this 3-element array.
        assert_eq!("hello", unsafe { string_array.value_unchecked(0) });
210
        assert_eq!("", string_array.value(1));
211
        assert_eq!("", unsafe { string_array.value_unchecked(1) });
212
        assert_eq!("A£ऀ𖼚𝌆৩ƐZ", string_array.value(2));
213
        assert_eq!("A£ऀ𖼚𝌆৩ƐZ", unsafe {
214
            string_array.value_unchecked(2)
215
        });
216
        // "hello" (5 bytes) followed by "" means the third value starts at byte offset 5.
        assert_eq!(5, string_array.value_offsets()[2]);
217
        assert_eq!(20, string_array.value_length(2)); // 1 + 2 + 3 + 4 + 4 + 3 + 2 + 1
218
        assert_eq!(8, string_array.num_chars(2));
219
        for i in 0..3 {
220
            assert!(string_array.is_valid(i));
221
            assert!(!string_array.is_null(i));
222
        }
223
    }
224
225
    // Builds a list-of-strings array [["foo", "bar"], ["foobar"]] with a `ListBuilder`
    // and checks that each list slot downcasts to a `StringArray` with the expected values.
    #[test]
226
    fn test_nested_string_array() {
227
        let string_builder = StringBuilder::with_capacity(3, 10);
228
        let mut list_of_string_builder = ListBuilder::new(string_builder);
229
230
        list_of_string_builder.values().append_value("foo");
231
        list_of_string_builder.values().append_value("bar");
232
        list_of_string_builder.append(true);
233
234
        list_of_string_builder.values().append_value("foobar");
235
        list_of_string_builder.append(true);
236
        let list_of_strings = list_of_string_builder.finish();
237
238
        assert_eq!(list_of_strings.len(), 2);
239
240
        let first_slot = list_of_strings.value(0);
241
        let first_list = first_slot.as_any().downcast_ref::<StringArray>().unwrap();
242
        assert_eq!(first_list.len(), 2);
243
        assert_eq!(first_list.value(0), "foo");
244
        assert_eq!(unsafe { first_list.value_unchecked(0) }, "foo");
245
        assert_eq!(first_list.value(1), "bar");
246
        assert_eq!(unsafe { first_list.value_unchecked(1) }, "bar");
247
248
        let second_slot = list_of_strings.value(1);
249
        let second_list = second_slot.as_any().downcast_ref::<StringArray>().unwrap();
250
        assert_eq!(second_list.len(), 1);
251
        assert_eq!(second_list.value(0), "foobar");
252
        assert_eq!(unsafe { second_list.value_unchecked(0) }, "foobar");
253
    }
254
255
    // `value(4)` on a 3-element array must panic with the out-of-bounds message.
    #[test]
256
    #[should_panic(
257
        expected = "Trying to access an element at index 4 from a StringArray of length 3"
258
    )]
259
    fn test_string_array_get_value_index_out_of_bound() {
260
        let values: [u8; 12] = [
261
            b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', b't',
262
        ];
263
        // Offsets split the 12 bytes into "hello", "" and "parquet".
        let offsets: [i32; 4] = [0, 5, 5, 12];
264
        let array_data = ArrayData::builder(DataType::Utf8)
265
            .len(3)
266
            .add_buffer(Buffer::from_slice_ref(offsets))
267
            .add_buffer(Buffer::from_slice_ref(values))
268
            .build()
269
            .unwrap();
270
        let string_array = StringArray::from(array_data);
271
        string_array.value(4);
272
    }
273
274
    // Pins the exact `Debug` output of a two-element `StringArray`.
    #[test]
275
    fn test_string_array_fmt_debug() {
276
        let arr: StringArray = vec!["hello", "arrow"].into();
277
        assert_eq!(
278
            "StringArray\n[\n  \"hello\",\n  \"arrow\",\n]",
279
            format!("{arr:?}")
280
        );
281
    }
282
283
    // Pins the exact `Debug` output of a two-element `LargeStringArray`.
    #[test]
284
    fn test_large_string_array_fmt_debug() {
285
        let arr: LargeStringArray = vec!["hello", "arrow"].into();
286
        assert_eq!(
287
            "LargeStringArray\n[\n  \"hello\",\n  \"arrow\",\n]",
288
            format!("{arr:?}")
289
        );
290
    }
291
292
    // Four different construction paths from the same optional data must yield equal arrays.
    #[test]
293
    fn test_string_array_from_iter() {
294
        let data = [Some("hello"), None, Some("arrow")];
295
        let data_vec = data.to_vec();
296
        // from Vec<Option<&str>>
297
        let array1 = StringArray::from(data_vec.clone());
298
        // from Iterator<Option<&str>>
299
        let array2: StringArray = data_vec.clone().into_iter().collect();
300
        // from Iterator<Option<String>>
301
        let array3: StringArray = data_vec
302
            .into_iter()
303
            .map(|x| x.map(|s| s.to_string()))
304
            .collect();
305
        // from Iterator<&Option<&str>>
306
        let array4: StringArray = data.iter().collect::<StringArray>();
307
308
        assert_eq!(array1, array2);
309
        assert_eq!(array2, array3);
310
        assert_eq!(array3, array4);
311
    }
312
313
    // `from_iter_values` accepts iterators over both `&str` and `String` items.
    #[test]
314
    fn test_string_array_from_iter_values() {
315
        let data = ["hello", "hello2"];
316
        let array1 = StringArray::from_iter_values(data.iter());
317
318
        assert_eq!(array1.value(0), "hello");
319
        assert_eq!(array1.value(1), "hello2");
320
321
        // Also works with String types.
322
        let data2 = ["goodbye".to_string(), "goodbye2".to_string()];
323
        let array2 = StringArray::from_iter_values(data2.iter());
324
325
        assert_eq!(array2.value(0), "goodbye");
326
        assert_eq!(array2.value(1), "goodbye2");
327
    }
328
329
    // Collecting from an iterator whose `size_hint` upper bound (100) exceeds the number of
    // items it actually yields (10) must produce an array of the actual length.
    #[test]
330
    fn test_string_array_from_unbound_iter() {
331
        // iterator that doesn't declare (upper) size bound
332
        let string_iter = (0..)
333
            .scan(0usize, |pos, i| {
334
                if *pos < 10 {
335
                    *pos += 1;
336
                    Some(Some(format!("value {i}")))
337
                } else {
338
                    // actually returns up to 10 values
339
                    None
340
                }
341
            })
342
            // limited using take()
343
            .take(100);
344
345
        let (_, upper_size_bound) = string_iter.size_hint();
346
        // the upper bound, defined by take above, is 100
347
        assert_eq!(upper_size_bound, Some(100));
348
        let string_array: StringArray = string_iter.collect();
349
        // but the actual number of items in the array should be 10
350
        assert_eq!(string_array.len(), 10);
351
    }
352
353
    // A `StringArray` holding a single null must still pass full `ArrayData` validation.
    #[test]
354
    fn test_string_array_all_null() {
355
        let data: Vec<Option<&str>> = vec![None];
356
        let array = StringArray::from(data);
357
        array
358
            .into_data()
359
            .validate_full()
360
            .expect("All null array has valid array data");
361
    }
362
363
    // A `LargeStringArray` holding a single null must still pass full `ArrayData` validation.
    #[test]
364
    fn test_large_string_array_all_null() {
365
        let data: Vec<Option<&str>> = vec![None];
366
        let array = LargeStringArray::from(data);
367
        array
368
            .into_data()
369
            .validate_full()
370
            .expect("All null array has valid array data");
371
    }
372
373
    // Shared body for the list-array conversion tests: builds a sliced `List<UInt8>` array
    // (non-zero offsets on both the list and its child) and converts it to a string array.
    fn _test_generic_string_array_from_list_array<O: OffsetSizeTrait>() {
374
        let values = b"HelloArrowAndParquet";
375
        // "ArrowAndParquet"
376
        let child_data = ArrayData::builder(DataType::UInt8)
377
            .len(15)
378
            .offset(5)
379
            .add_buffer(Buffer::from(values))
380
            .build()
381
            .unwrap();
382
383
        let offsets = [0, 5, 8, 15].map(|n| O::from_usize(n).unwrap());
384
        // Validity bits 0b101: slots 0 and 2 valid, slot 1 null (before the list offset is applied).
        let null_buffer = Buffer::from_slice_ref([0b101]);
385
        let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Arc::new(
386
            Field::new_list_field(DataType::UInt8, false),
387
        ));
388
389
        // [None, Some("Parquet")]
390
        let array_data = ArrayData::builder(data_type)
391
            .len(2)
392
            .offset(1)
393
            .add_buffer(Buffer::from_slice_ref(offsets))
394
            .null_bit_buffer(Some(null_buffer))
395
            .add_child_data(child_data)
396
            .build()
397
            .unwrap();
398
        let list_array = GenericListArray::<O>::from(array_data);
399
        let string_array = GenericStringArray::<O>::from(list_array);
400
401
        assert_eq!(2, string_array.len());
402
        assert_eq!(1, string_array.null_count());
403
        assert!(string_array.is_null(0));
404
        assert!(string_array.is_valid(1));
405
        assert_eq!("Parquet", string_array.value(1));
406
    }
407
408
    // Runs the list-array conversion helper with `i32` offsets.
    #[test]
409
    fn test_string_array_from_list_array() {
410
        _test_generic_string_array_from_list_array::<i32>();
411
    }
412
413
    // Runs the list-array conversion helper with `i64` offsets.
    #[test]
414
    fn test_large_string_array_from_list_array() {
415
        _test_generic_string_array_from_list_array::<i64>();
416
    }
417
418
    // Shared body for the "child nulls" tests: builds a `List<UInt8>` array whose child
    // values carry a null bitmap and attempts the conversion; callers expect it to panic.
    fn _test_generic_string_array_from_list_array_with_child_nulls_failed<O: OffsetSizeTrait>() {
419
        let values = b"HelloArrow";
420
        let child_data = ArrayData::builder(DataType::UInt8)
421
            .len(10)
422
            .add_buffer(Buffer::from(values))
423
            .null_bit_buffer(Some(Buffer::from_slice_ref([0b1010101010])))
424
            .build()
425
            .unwrap();
426
427
        let offsets = [0, 5, 10].map(|n| O::from_usize(n).unwrap());
428
429
        // It is possible to create a null struct containing a non-nullable child
430
        // see https://github.com/apache/arrow-rs/pull/3244 for details
431
        let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Arc::new(
432
            Field::new_list_field(DataType::UInt8, true),
433
        ));
434
435
        // [b"Hello", b"Arrow"]: no list-level null bitmap, but the child values carry one
436
        let array_data = ArrayData::builder(data_type)
437
            .len(2)
438
            .add_buffer(Buffer::from_slice_ref(offsets))
439
            .add_child_data(child_data)
440
            .build()
441
            .unwrap();
442
        let list_array = GenericListArray::<O>::from(array_data);
443
        drop(GenericStringArray::<O>::from(list_array));
444
    }
445
446
    // Child-nulls conversion with `i32` offsets must panic with the expected message.
    #[test]
447
    #[should_panic(expected = "The child array cannot contain null values.")]
448
    fn test_string_array_from_list_array_with_child_nulls_failed() {
449
        _test_generic_string_array_from_list_array_with_child_nulls_failed::<i32>();
450
    }
451
452
    // Child-nulls conversion with `i64` offsets must panic with the expected message.
    #[test]
453
    #[should_panic(expected = "The child array cannot contain null values.")]
454
    fn test_large_string_array_from_list_array_with_child_nulls_failed() {
455
        _test_generic_string_array_from_list_array_with_child_nulls_failed::<i64>();
456
    }
457
458
    // Shared body for the "wrong type" tests: builds a `List<UInt16>` array and attempts
    // the conversion; callers expect it to panic because the child is not `UInt8`.
    fn _test_generic_string_array_from_list_array_wrong_type<O: OffsetSizeTrait>() {
459
        let values = b"HelloArrow";
460
        // 10 bytes interpreted as 5 `UInt16` values.
        let child_data = ArrayData::builder(DataType::UInt16)
461
            .len(5)
462
            .add_buffer(Buffer::from(values))
463
            .build()
464
            .unwrap();
465
466
        let offsets = [0, 2, 3].map(|n| O::from_usize(n).unwrap());
467
        let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Arc::new(
468
            Field::new_list_field(DataType::UInt16, false),
469
        ));
470
471
        let array_data = ArrayData::builder(data_type)
472
            .len(2)
473
            .add_buffer(Buffer::from_slice_ref(offsets))
474
            .add_child_data(child_data)
475
            .build()
476
            .unwrap();
477
        let list_array = GenericListArray::<O>::from(array_data);
478
        drop(GenericStringArray::<O>::from(list_array));
479
    }
480
481
    // Wrong-child-type conversion with `i32` offsets must panic with the expected message.
    #[test]
482
    #[should_panic(
483
        expected = "BinaryArray can only be created from List<u8> arrays, mismatched data types."
484
    )]
485
    fn test_string_array_from_list_array_wrong_type() {
486
        _test_generic_string_array_from_list_array_wrong_type::<i32>();
487
    }
488
489
    // Wrong-child-type conversion with `i64` offsets must panic with the expected message.
    #[test]
490
    #[should_panic(
491
        expected = "BinaryArray can only be created from List<u8> arrays, mismatched data types."
492
    )]
493
    fn test_large_string_array_from_list_array_wrong_type() {
494
        _test_generic_string_array_from_list_array_wrong_type::<i64>();
495
    }
496
497
    // Converting a list containing the single byte 0xFF into a `StringArray` must panic
    // with the non-UTF-8 error message.
    #[test]
498
    #[should_panic(
499
        expected = "Encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 0"
500
    )]
501
    fn test_list_array_utf8_validation() {
502
        let mut builder = ListBuilder::new(PrimitiveBuilder::<UInt8Type>::new());
503
        builder.values().append_value(0xFF);
504
        builder.append(true);
505
        let list = builder.finish();
506
        let _ = StringArray::from(list);
507
    }
508
509
    // Array data built from two empty buffers must yield a zero-length array whose
    // `value_offsets()` is `[0]`, for both `Utf8` and `LargeUtf8`.
    #[test]
510
    fn test_empty_offsets() {
511
        let string = StringArray::from(
512
            ArrayData::builder(DataType::Utf8)
513
                .buffers(vec![Buffer::from(&[]), Buffer::from(&[])])
514
                .build()
515
                .unwrap(),
516
        );
517
        assert_eq!(string.len(), 0);
518
        assert_eq!(string.value_offsets(), &[0]);
519
520
        let string = LargeStringArray::from(
521
            ArrayData::builder(DataType::LargeUtf8)
522
                .buffers(vec![Buffer::from(&[]), Buffer::from(&[])])
523
                .build()
524
                .unwrap(),
525
        );
526
        assert_eq!(string.len(), 0);
527
        assert_eq!(string.value_offsets(), &[0]);
528
    }
529
530
    // An array can be turned back into a builder, extended, and finished again.
    #[test]
531
    fn test_into_builder() {
532
        let array: StringArray = vec!["hello", "arrow"].into();
533
534
        // Append values
535
        let mut builder = array.into_builder().unwrap();
536
537
        builder.append_value("rust");
538
539
        let expected: StringArray = vec!["hello", "arrow", "rust"].into();
540
        let array = builder.finish();
541
        assert_eq!(expected, array);
542
    }
543
544
    // While a clone of the array is alive, `into_builder` must fail and hand back an
    // array equal to the original.
    #[test]
545
    fn test_into_builder_err() {
546
        let array: StringArray = vec!["hello", "arrow"].into();
547
548
        // Clone it, so we cannot get a mutable builder back
549
        let shared_array = array.clone();
550
551
        let err_return = array.into_builder().unwrap_err();
552
        assert_eq!(&err_return, &shared_array);
553
    }
554
}