Coverage Report

Created: 2025-08-26 07:03

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/Users/andrewlamb/Software/arrow-rs/arrow-array/src/builder/generic_bytes_dictionary_builder.rs
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
use crate::builder::{ArrayBuilder, GenericByteBuilder, PrimitiveBuilder};
19
use crate::types::{ArrowDictionaryKeyType, ByteArrayType, GenericBinaryType, GenericStringType};
20
use crate::{
21
    Array, ArrayRef, DictionaryArray, GenericByteArray, PrimitiveArray, TypedDictionaryArray,
22
};
23
use arrow_buffer::ArrowNativeType;
24
use arrow_schema::{ArrowError, DataType};
25
use hashbrown::HashTable;
26
use num::NumCast;
27
use std::any::Any;
28
use std::sync::Arc;
29
30
/// Builder for [`DictionaryArray`] of [`GenericByteArray`]
31
///
32
/// For example to map a set of byte indices to String values. Note that
33
/// the use of a `HashMap` here will not scale to very large arrays or
34
/// result in an ordered dictionary.
35
#[derive(Debug)]
36
pub struct GenericByteDictionaryBuilder<K, T>
37
where
38
    K: ArrowDictionaryKeyType,
39
    T: ByteArrayType,
40
{
41
    state: ahash::RandomState,
42
    dedup: HashTable<usize>,
43
44
    keys_builder: PrimitiveBuilder<K>,
45
    values_builder: GenericByteBuilder<T>,
46
}
47
48
impl<K, T> Default for GenericByteDictionaryBuilder<K, T>
49
where
50
    K: ArrowDictionaryKeyType,
51
    T: ByteArrayType,
52
{
53
    fn default() -> Self {
54
        Self::new()
55
    }
56
}
57
58
impl<K, T> GenericByteDictionaryBuilder<K, T>
59
where
60
    K: ArrowDictionaryKeyType,
61
    T: ByteArrayType,
62
{
63
    /// Creates a new `GenericByteDictionaryBuilder`
64
    pub fn new() -> Self {
65
        let keys_builder = PrimitiveBuilder::new();
66
        let values_builder = GenericByteBuilder::<T>::new();
67
        Self {
68
            state: Default::default(),
69
            dedup: HashTable::with_capacity(keys_builder.capacity()),
70
            keys_builder,
71
            values_builder,
72
        }
73
    }
74
75
    /// Creates a new `GenericByteDictionaryBuilder` with the provided capacities
76
    ///
77
    /// `keys_capacity`: the number of keys, i.e. length of array to build
78
    /// `value_capacity`: the number of distinct dictionary values, i.e. size of dictionary
79
    /// `data_capacity`: the total number of bytes of all distinct bytes in the dictionary
80
0
    pub fn with_capacity(
81
0
        keys_capacity: usize,
82
0
        value_capacity: usize,
83
0
        data_capacity: usize,
84
0
    ) -> Self {
85
0
        Self {
86
0
            state: Default::default(),
87
0
            dedup: Default::default(),
88
0
            keys_builder: PrimitiveBuilder::with_capacity(keys_capacity),
89
0
            values_builder: GenericByteBuilder::<T>::with_capacity(value_capacity, data_capacity),
90
0
        }
91
0
    }
92
93
    /// Creates a new `GenericByteDictionaryBuilder` from a keys capacity and a dictionary
94
    /// which is initialized with the given values.
95
    /// The indices of those dictionary values are used as keys.
96
    ///
97
    /// # Example
98
    ///
99
    /// ```
100
    /// # use arrow_array::builder::StringDictionaryBuilder;
101
    /// # use arrow_array::{Int16Array, StringArray};
102
    ///
103
    /// let dictionary_values = StringArray::from(vec![None, Some("abc"), Some("def")]);
104
    ///
105
    /// let mut builder = StringDictionaryBuilder::new_with_dictionary(3, &dictionary_values).unwrap();
106
    /// builder.append("def").unwrap();
107
    /// builder.append_null();
108
    /// builder.append("abc").unwrap();
109
    ///
110
    /// let dictionary_array = builder.finish();
111
    ///
112
    /// let keys = dictionary_array.keys();
113
    ///
114
    /// assert_eq!(keys, &Int16Array::from(vec![Some(2), None, Some(1)]));
115
    /// ```
116
    pub fn new_with_dictionary(
117
        keys_capacity: usize,
118
        dictionary_values: &GenericByteArray<T>,
119
    ) -> Result<Self, ArrowError> {
120
        let state = ahash::RandomState::default();
121
        let dict_len = dictionary_values.len();
122
123
        let mut dedup = HashTable::with_capacity(dict_len);
124
125
        let values_len = dictionary_values.value_data().len();
126
        let mut values_builder = GenericByteBuilder::<T>::with_capacity(dict_len, values_len);
127
128
        K::Native::from_usize(dictionary_values.len())
129
            .ok_or(ArrowError::DictionaryKeyOverflowError)?;
130
131
        for (idx, maybe_value) in dictionary_values.iter().enumerate() {
132
            match maybe_value {
133
                Some(value) => {
134
                    let value_bytes: &[u8] = value.as_ref();
135
                    let hash = state.hash_one(value_bytes);
136
137
                    dedup
138
                        .entry(
139
                            hash,
140
                            |idx: &usize| value_bytes == get_bytes(&values_builder, *idx),
141
                            |idx: &usize| state.hash_one(get_bytes(&values_builder, *idx)),
142
                        )
143
                        .or_insert(idx);
144
145
                    values_builder.append_value(value);
146
                }
147
                None => values_builder.append_null(),
148
            }
149
        }
150
151
        Ok(Self {
152
            state,
153
            dedup,
154
            keys_builder: PrimitiveBuilder::with_capacity(keys_capacity),
155
            values_builder,
156
        })
157
    }
158
159
    /// Creates a new `GenericByteDictionaryBuilder` from the existing builder with the same
160
    /// keys and values, but with a new data type for the keys.
161
    ///
162
    /// # Example
163
    /// ```
164
    /// #
165
    /// # use arrow_array::builder::StringDictionaryBuilder;
166
    /// # use arrow_array::types::{UInt8Type, UInt16Type};
167
    /// # use arrow_array::UInt16Array;
168
    /// # use arrow_schema::ArrowError;
169
    ///
170
    /// let mut u8_keyed_builder = StringDictionaryBuilder::<UInt8Type>::new();
171
    ///
172
    /// // appending too many values causes the dictionary to overflow
173
    /// for i in 0..256 {
174
    ///     u8_keyed_builder.append_value(format!("{}", i));
175
    /// }
176
    /// let result = u8_keyed_builder.append("256");
177
    /// assert!(matches!(result, Err(ArrowError::DictionaryKeyOverflowError{})));
178
    ///
179
    /// // we need to upgrade to a larger key type
180
    /// let mut u16_keyed_builder = StringDictionaryBuilder::<UInt16Type>::try_new_from_builder(u8_keyed_builder).unwrap();
181
    /// let dictionary_array = u16_keyed_builder.finish();
182
    /// let keys = dictionary_array.keys();
183
    ///
184
    /// assert_eq!(keys, &UInt16Array::from_iter(0..256));
185
    /// ```
186
    pub fn try_new_from_builder<K2>(
187
        mut source: GenericByteDictionaryBuilder<K2, T>,
188
    ) -> Result<Self, ArrowError>
189
    where
190
        K::Native: NumCast,
191
        K2: ArrowDictionaryKeyType,
192
        K2::Native: NumCast,
193
    {
194
        let state = source.state;
195
        let dedup = source.dedup;
196
        let values_builder = source.values_builder;
197
198
        let source_keys = source.keys_builder.finish();
199
        let new_keys: PrimitiveArray<K> = source_keys.try_unary(|value| {
200
            num::cast::cast::<K2::Native, K::Native>(value).ok_or_else(|| {
201
                ArrowError::CastError(format!(
202
                    "Can't cast dictionary keys from source type {:?} to type {:?}",
203
                    K2::DATA_TYPE,
204
                    K::DATA_TYPE
205
                ))
206
            })
207
        })?;
208
209
        // drop source key here because currently source_keys and new_keys are holding reference to
210
        // the same underlying null_buffer. Below we want to call new_keys.into_builder() it must
211
        // be the only reference holder.
212
        drop(source_keys);
213
214
        Ok(Self {
215
            state,
216
            dedup,
217
            keys_builder: new_keys
218
                .into_builder()
219
                .expect("underlying buffer has no references"),
220
            values_builder,
221
        })
222
    }
223
}
224
225
impl<K, T> ArrayBuilder for GenericByteDictionaryBuilder<K, T>
226
where
227
    K: ArrowDictionaryKeyType,
228
    T: ByteArrayType,
229
{
230
    /// Returns the builder as an non-mutable `Any` reference.
231
0
    fn as_any(&self) -> &dyn Any {
232
0
        self
233
0
    }
234
235
    /// Returns the builder as an mutable `Any` reference.
236
0
    fn as_any_mut(&mut self) -> &mut dyn Any {
237
0
        self
238
0
    }
239
240
    /// Returns the boxed builder as a box of `Any`.
241
0
    fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
242
0
        self
243
0
    }
244
245
    /// Returns the number of array slots in the builder
246
0
    fn len(&self) -> usize {
247
0
        self.keys_builder.len()
248
0
    }
249
250
    /// Builds the array and reset this builder.
251
0
    fn finish(&mut self) -> ArrayRef {
252
0
        Arc::new(self.finish())
253
0
    }
254
255
    /// Builds the array without resetting the builder.
256
0
    fn finish_cloned(&self) -> ArrayRef {
257
0
        Arc::new(self.finish_cloned())
258
0
    }
259
}
260
261
impl<K, T> GenericByteDictionaryBuilder<K, T>
262
where
263
    K: ArrowDictionaryKeyType,
264
    T: ByteArrayType,
265
{
266
0
    fn get_or_insert_key(&mut self, value: impl AsRef<T::Native>) -> Result<K::Native, ArrowError> {
267
0
        let value_native: &T::Native = value.as_ref();
268
0
        let value_bytes: &[u8] = value_native.as_ref();
269
270
0
        let state = &self.state;
271
0
        let storage = &mut self.values_builder;
272
0
        let hash = state.hash_one(value_bytes);
273
274
0
        let idx = *self
275
0
            .dedup
276
0
            .entry(
277
0
                hash,
278
0
                |idx| value_bytes == get_bytes(storage, *idx),
279
0
                |idx| state.hash_one(get_bytes(storage, *idx)),
280
            )
281
0
            .or_insert_with(|| {
282
0
                let idx = storage.len();
283
0
                storage.append_value(value);
284
0
                idx
285
0
            })
286
0
            .get();
287
288
0
        let key = K::Native::from_usize(idx).ok_or(ArrowError::DictionaryKeyOverflowError)?;
289
290
0
        Ok(key)
291
0
    }
292
293
    /// Append a value to the array. Return an existing index
294
    /// if already present in the values array or a new index if the
295
    /// value is appended to the values array.
296
    ///
297
    /// Returns an error if the new index would overflow the key type.
298
0
    pub fn append(&mut self, value: impl AsRef<T::Native>) -> Result<K::Native, ArrowError> {
299
0
        let key = self.get_or_insert_key(value)?;
300
0
        self.keys_builder.append_value(key);
301
0
        Ok(key)
302
0
    }
303
304
    /// Append a value multiple times to the array.
305
    /// This is the same as `append` but allows to append the same value multiple times without doing multiple lookups.
306
    ///
307
    /// Returns an error if the new index would overflow the key type.
308
    pub fn append_n(
309
        &mut self,
310
        value: impl AsRef<T::Native>,
311
        count: usize,
312
    ) -> Result<K::Native, ArrowError> {
313
        let key = self.get_or_insert_key(value)?;
314
        self.keys_builder.append_value_n(key, count);
315
        Ok(key)
316
    }
317
318
    /// Infallibly append a value to this builder
319
    ///
320
    /// # Panics
321
    ///
322
    /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
323
    pub fn append_value(&mut self, value: impl AsRef<T::Native>) {
324
        self.append(value).expect("dictionary key overflow");
325
    }
326
327
    /// Infallibly append a value to this builder repeatedly `count` times.
328
    /// This is the same as `append_value` but allows to append the same value multiple times without doing multiple lookups.
329
    ///
330
    /// # Panics
331
    ///
332
    /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
333
    pub fn append_values(&mut self, value: impl AsRef<T::Native>, count: usize) {
334
        self.append_n(value, count)
335
            .expect("dictionary key overflow");
336
    }
337
338
    /// Appends a null slot into the builder
339
    #[inline]
340
0
    pub fn append_null(&mut self) {
341
0
        self.keys_builder.append_null()
342
0
    }
343
344
    /// Infallibly append `n` null slots into the builder
345
    #[inline]
346
    pub fn append_nulls(&mut self, n: usize) {
347
        self.keys_builder.append_nulls(n)
348
    }
349
350
    /// Append an `Option` value into the builder
351
    ///
352
    /// # Panics
353
    ///
354
    /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
355
    #[inline]
356
    pub fn append_option(&mut self, value: Option<impl AsRef<T::Native>>) {
357
        match value {
358
            None => self.append_null(),
359
            Some(v) => self.append_value(v),
360
        };
361
    }
362
363
    /// Append an `Option` value into the builder repeatedly `count` times.
364
    /// This is the same as `append_option` but allows to append the same value multiple times without doing multiple lookups.
365
    ///
366
    /// # Panics
367
    ///
368
    /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
369
    pub fn append_options(&mut self, value: Option<impl AsRef<T::Native>>, count: usize) {
370
        match value {
371
            None => self.keys_builder.append_nulls(count),
372
            Some(v) => self.append_values(v, count),
373
        };
374
    }
375
376
    /// Extends builder with an existing dictionary array.
377
    ///
378
    /// This is the same as [`Self::extend`] but is faster as it translates
379
    /// the dictionary values once rather than doing a lookup for each item in the iterator
380
    ///
381
    /// when dictionary values are null (the actual mapped values) the keys are null
382
    ///
383
    pub fn extend_dictionary(
384
        &mut self,
385
        dictionary: &TypedDictionaryArray<K, GenericByteArray<T>>,
386
    ) -> Result<(), ArrowError> {
387
        let values = dictionary.values();
388
389
        let v_len = values.len();
390
        let k_len = dictionary.keys().len();
391
        if v_len == 0 && k_len == 0 {
392
            return Ok(());
393
        }
394
395
        // All nulls
396
        if v_len == 0 {
397
            self.append_nulls(k_len);
398
            return Ok(());
399
        }
400
401
        if k_len == 0 {
402
            return Err(ArrowError::InvalidArgumentError(
403
                "Dictionary keys should not be empty when values are not empty".to_string(),
404
            ));
405
        }
406
407
        // Orphan values will be carried over to the new dictionary
408
        let mapped_values = values
409
            .iter()
410
            // Dictionary values can technically be null, so we need to handle that
411
            .map(|dict_value| {
412
                dict_value
413
                    .map(|dict_value| self.get_or_insert_key(dict_value))
414
                    .transpose()
415
            })
416
            .collect::<Result<Vec<_>, _>>()?;
417
418
        // Just insert the keys without additional lookups
419
        dictionary.keys().iter().for_each(|key| match key {
420
            None => self.append_null(),
421
            Some(original_dict_index) => {
422
                let index = original_dict_index.as_usize().min(v_len - 1);
423
                match mapped_values[index] {
424
                    None => self.append_null(),
425
                    Some(mapped_value) => self.keys_builder.append_value(mapped_value),
426
                }
427
            }
428
        });
429
430
        Ok(())
431
    }
432
433
    /// Builds the `DictionaryArray` and reset this builder.
434
0
    pub fn finish(&mut self) -> DictionaryArray<K> {
435
0
        self.dedup.clear();
436
0
        let values = self.values_builder.finish();
437
0
        let keys = self.keys_builder.finish();
438
439
0
        let data_type = DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(T::DATA_TYPE));
440
441
0
        let builder = keys
442
0
            .into_data()
443
0
            .into_builder()
444
0
            .data_type(data_type)
445
0
            .child_data(vec![values.into_data()]);
446
447
0
        DictionaryArray::from(unsafe { builder.build_unchecked() })
448
0
    }
449
450
    /// Builds the `DictionaryArray` without resetting the builder.
451
0
    pub fn finish_cloned(&self) -> DictionaryArray<K> {
452
0
        let values = self.values_builder.finish_cloned();
453
0
        let keys = self.keys_builder.finish_cloned();
454
455
0
        let data_type = DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(T::DATA_TYPE));
456
457
0
        let builder = keys
458
0
            .into_data()
459
0
            .into_builder()
460
0
            .data_type(data_type)
461
0
            .child_data(vec![values.into_data()]);
462
463
0
        DictionaryArray::from(unsafe { builder.build_unchecked() })
464
0
    }
465
466
    /// Builds the `DictionaryArray` without resetting the values builder or
467
    /// the internal de-duplication map.
468
    ///
469
    /// The advantage of doing this is that the values will represent the entire
470
    /// set of what has been built so-far by this builder and ensures
471
    /// consistency in the assignment of keys to values across multiple calls
472
    /// to `finish_preserve_values`. This enables ipc writers to efficiently
473
    /// emit delta dictionaries.
474
    ///
475
    /// The downside to this is that building the record requires creating a
476
    /// copy of the values, which can become slowly more expensive if the
477
    /// dictionary grows.
478
    ///
479
    /// Additionally, if record batches from multiple different dictionary
480
    /// builders for the same column are fed into a single ipc writer, beware
481
    /// that entire dictionaries are likely to be re-sent frequently even when
482
    /// the majority of the values are not used by the current record batch.
483
    pub fn finish_preserve_values(&mut self) -> DictionaryArray<K> {
484
        let values = self.values_builder.finish_cloned();
485
        let keys = self.keys_builder.finish();
486
487
        let data_type = DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(T::DATA_TYPE));
488
489
        let builder = keys
490
            .into_data()
491
            .into_builder()
492
            .data_type(data_type)
493
            .child_data(vec![values.into_data()]);
494
495
        DictionaryArray::from(unsafe { builder.build_unchecked() })
496
    }
497
498
    /// Returns the current null buffer as a slice
499
    pub fn validity_slice(&self) -> Option<&[u8]> {
500
        self.keys_builder.validity_slice()
501
    }
502
}
503
504
impl<K: ArrowDictionaryKeyType, T: ByteArrayType, V: AsRef<T::Native>> Extend<Option<V>>
505
    for GenericByteDictionaryBuilder<K, T>
506
{
507
    #[inline]
508
    fn extend<I: IntoIterator<Item = Option<V>>>(&mut self, iter: I) {
509
        for v in iter {
510
            self.append_option(v)
511
        }
512
    }
513
}
514
515
0
/// Returns the bytes of the value at position `idx` of `values`, i.e. the slice
/// of the value data between offsets `idx` and `idx + 1`.
///
/// Panics if `idx` is out of bounds.
fn get_bytes<T: ByteArrayType>(values: &GenericByteBuilder<T>, idx: usize) -> &[u8] {
    let offsets = values.offsets_slice();
    let (end, start) = (offsets[idx + 1].as_usize(), offsets[idx].as_usize());

    let data = values.values_slice();
    &data[start..end]
}
524
525
/// Builder for [`DictionaryArray`] of [`StringArray`](crate::array::StringArray)
526
///
527
/// ```
528
/// // Create a dictionary array indexed by bytes whose values are Strings.
529
/// // With `Int8` keys (valid keys are 0..=127) it can hold up to 128 distinct string values.
530
///
531
/// # use arrow_array::builder::StringDictionaryBuilder;
532
/// # use arrow_array::{Int8Array, StringArray};
533
/// # use arrow_array::types::Int8Type;
534
///
535
/// let mut builder = StringDictionaryBuilder::<Int8Type>::new();
536
///
537
/// // The builder builds the dictionary value by value
538
/// builder.append("abc").unwrap();
539
/// builder.append_null();
540
/// builder.append_n("def", 2).unwrap();  // appends "def" twice with a single lookup
541
/// builder.append("abc").unwrap();
542
/// let array = builder.finish();
543
///
544
/// assert_eq!(
545
///   array.keys(),
546
///   &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)])
547
/// );
548
///
549
/// // Values are polymorphic and so require a downcast.
550
/// let av = array.values();
551
/// let ava: &StringArray = av.as_any().downcast_ref::<StringArray>().unwrap();
552
///
553
/// assert_eq!(ava.value(0), "abc");
554
/// assert_eq!(ava.value(1), "def");
555
///
556
/// ```
557
pub type StringDictionaryBuilder<K> = GenericByteDictionaryBuilder<K, GenericStringType<i32>>;
558
559
/// Builder for [`DictionaryArray`] of [`LargeStringArray`](crate::array::LargeStringArray)
560
pub type LargeStringDictionaryBuilder<K> = GenericByteDictionaryBuilder<K, GenericStringType<i64>>;
561
562
/// Builder for [`DictionaryArray`] of [`BinaryArray`](crate::array::BinaryArray)
563
///
564
/// ```
565
/// // Create a dictionary array indexed by bytes whose values are binary.
566
/// // With `Int8` keys (valid keys are 0..=127) it can hold up to 128 distinct binary values.
567
///
568
/// # use arrow_array::builder::BinaryDictionaryBuilder;
569
/// # use arrow_array::{BinaryArray, Int8Array};
570
/// # use arrow_array::types::Int8Type;
571
///
572
/// let mut builder = BinaryDictionaryBuilder::<Int8Type>::new();
573
///
574
/// // The builder builds the dictionary value by value
575
/// builder.append(b"abc").unwrap();
576
/// builder.append_null();
577
/// builder.append(b"def").unwrap();
578
/// builder.append(b"def").unwrap();
579
/// builder.append(b"abc").unwrap();
580
/// let array = builder.finish();
581
///
582
/// assert_eq!(
583
///   array.keys(),
584
///   &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)])
585
/// );
586
///
587
/// // Values are polymorphic and so require a downcast.
588
/// let av = array.values();
589
/// let ava: &BinaryArray = av.as_any().downcast_ref::<BinaryArray>().unwrap();
590
///
591
/// assert_eq!(ava.value(0), b"abc");
592
/// assert_eq!(ava.value(1), b"def");
593
///
594
/// ```
595
pub type BinaryDictionaryBuilder<K> = GenericByteDictionaryBuilder<K, GenericBinaryType<i32>>;
596
597
/// Builder for [`DictionaryArray`] of [`LargeBinaryArray`](crate::array::LargeBinaryArray)
598
pub type LargeBinaryDictionaryBuilder<K> = GenericByteDictionaryBuilder<K, GenericBinaryType<i64>>;
599
600
#[cfg(test)]
601
mod tests {
602
    use super::*;
603
604
    use crate::array::Int8Array;
605
    use crate::cast::AsArray;
606
    use crate::types::{Int16Type, Int32Type, Int8Type, UInt16Type, UInt8Type, Utf8Type};
607
    use crate::{ArrowPrimitiveType, BinaryArray, StringArray};
608
609
    fn test_bytes_dictionary_builder<T>(values: Vec<&T::Native>)
610
    where
611
        T: ByteArrayType,
612
        <T as ByteArrayType>::Native: PartialEq,
613
        <T as ByteArrayType>::Native: AsRef<<T as ByteArrayType>::Native>,
614
    {
615
        let mut builder = GenericByteDictionaryBuilder::<Int8Type, T>::new();
616
        builder.append(values[0]).unwrap();
617
        builder.append_null();
618
        builder.append(values[1]).unwrap();
619
        builder.append(values[1]).unwrap();
620
        builder.append(values[0]).unwrap();
621
        let array = builder.finish();
622
623
        assert_eq!(
624
            array.keys(),
625
            &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)])
626
        );
627
628
        // Values are polymorphic and so require a downcast.
629
        let av = array.values();
630
        let ava: &GenericByteArray<T> = av.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
631
632
        assert_eq!(*ava.value(0), *values[0]);
633
        assert_eq!(*ava.value(1), *values[1]);
634
    }
635
636
    /// Runs the basic builder test with string values
    #[test]
    fn test_string_dictionary_builder() {
        let values: Vec<&str> = vec!["abc", "def"];
        test_bytes_dictionary_builder::<GenericStringType<i32>>(values);
    }
640
641
    /// Runs the basic builder test with binary values
    #[test]
    fn test_binary_dictionary_builder() {
        let values: Vec<&[u8]> = vec![b"abc", b"def"];
        test_bytes_dictionary_builder::<GenericBinaryType<i32>>(values);
    }
645
646
    fn test_bytes_dictionary_builder_finish_cloned<T>(values: Vec<&T::Native>)
647
    where
648
        T: ByteArrayType,
649
        <T as ByteArrayType>::Native: PartialEq,
650
        <T as ByteArrayType>::Native: AsRef<<T as ByteArrayType>::Native>,
651
    {
652
        let mut builder = GenericByteDictionaryBuilder::<Int8Type, T>::new();
653
654
        builder.append(values[0]).unwrap();
655
        builder.append_null();
656
        builder.append(values[1]).unwrap();
657
        builder.append(values[1]).unwrap();
658
        builder.append(values[0]).unwrap();
659
        let mut array = builder.finish_cloned();
660
661
        assert_eq!(
662
            array.keys(),
663
            &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)])
664
        );
665
666
        // Values are polymorphic and so require a downcast.
667
        let av = array.values();
668
        let ava: &GenericByteArray<T> = av.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
669
670
        assert_eq!(ava.value(0), values[0]);
671
        assert_eq!(ava.value(1), values[1]);
672
673
        builder.append(values[0]).unwrap();
674
        builder.append(values[2]).unwrap();
675
        builder.append(values[1]).unwrap();
676
677
        array = builder.finish();
678
679
        assert_eq!(
680
            array.keys(),
681
            &Int8Array::from(vec![
682
                Some(0),
683
                None,
684
                Some(1),
685
                Some(1),
686
                Some(0),
687
                Some(0),
688
                Some(2),
689
                Some(1)
690
            ])
691
        );
692
693
        // Values are polymorphic and so require a downcast.
694
        let av2 = array.values();
695
        let ava2: &GenericByteArray<T> =
696
            av2.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
697
698
        assert_eq!(ava2.value(0), values[0]);
699
        assert_eq!(ava2.value(1), values[1]);
700
        assert_eq!(ava2.value(2), values[2]);
701
    }
702
703
    /// Runs the `finish_cloned` test with string values
    #[test]
    fn test_string_dictionary_builder_finish_cloned() {
        let values: Vec<&str> = vec!["abc", "def", "ghi"];
        test_bytes_dictionary_builder_finish_cloned::<GenericStringType<i32>>(values);
    }
709
710
    /// Runs the `finish_cloned` test with binary values
    #[test]
    fn test_binary_dictionary_builder_finish_cloned() {
        let values: Vec<&[u8]> = vec![b"abc", b"def", b"ghi"];
        test_bytes_dictionary_builder_finish_cloned::<GenericBinaryType<i32>>(values);
    }
716
717
    fn _test_try_new_from_builder_generic_for_key_types<K1, K2, T>(values: Vec<&T::Native>)
718
    where
719
        K1: ArrowDictionaryKeyType,
720
        K1::Native: NumCast,
721
        K2: ArrowDictionaryKeyType,
722
        K2::Native: NumCast + From<u8>,
723
        T: ByteArrayType,
724
        <T as ByteArrayType>::Native: PartialEq + AsRef<<T as ByteArrayType>::Native>,
725
    {
726
        let mut source = GenericByteDictionaryBuilder::<K1, T>::new();
727
        source.append(values[0]).unwrap();
728
        source.append(values[1]).unwrap();
729
        source.append_null();
730
        source.append(values[2]).unwrap();
731
732
        let mut result =
733
            GenericByteDictionaryBuilder::<K2, T>::try_new_from_builder(source).unwrap();
734
        let array = result.finish();
735
736
        let mut expected_keys_builder = PrimitiveBuilder::<K2>::new();
737
        expected_keys_builder
738
            .append_value(<<K2 as ArrowPrimitiveType>::Native as From<u8>>::from(0u8));
739
        expected_keys_builder
740
            .append_value(<<K2 as ArrowPrimitiveType>::Native as From<u8>>::from(1u8));
741
        expected_keys_builder.append_null();
742
        expected_keys_builder
743
            .append_value(<<K2 as ArrowPrimitiveType>::Native as From<u8>>::from(2u8));
744
        let expected_keys = expected_keys_builder.finish();
745
        assert_eq!(array.keys(), &expected_keys);
746
747
        let av = array.values();
748
        let ava: &GenericByteArray<T> = av.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
749
        assert_eq!(ava.value(0), values[0]);
750
        assert_eq!(ava.value(1), values[1]);
751
        assert_eq!(ava.value(2), values[2]);
752
    }
753
754
    fn test_try_new_from_builder<T>(values: Vec<&T::Native>)
755
    where
756
        T: ByteArrayType,
757
        <T as ByteArrayType>::Native: PartialEq + AsRef<<T as ByteArrayType>::Native>,
758
    {
759
        // test cast to bigger size unsigned
760
        _test_try_new_from_builder_generic_for_key_types::<UInt8Type, UInt16Type, T>(
761
            values.clone(),
762
        );
763
        // test cast going to smaller size unsigned
764
        _test_try_new_from_builder_generic_for_key_types::<UInt16Type, UInt8Type, T>(
765
            values.clone(),
766
        );
767
        // test cast going to bigger size signed
768
        _test_try_new_from_builder_generic_for_key_types::<Int8Type, Int16Type, T>(values.clone());
769
        // test cast going to smaller size signed
770
        _test_try_new_from_builder_generic_for_key_types::<Int32Type, Int16Type, T>(values.clone());
771
        // test going from signed to signed for different size changes
772
        _test_try_new_from_builder_generic_for_key_types::<UInt8Type, Int16Type, T>(values.clone());
773
        _test_try_new_from_builder_generic_for_key_types::<Int8Type, UInt8Type, T>(values.clone());
774
        _test_try_new_from_builder_generic_for_key_types::<Int8Type, UInt16Type, T>(values.clone());
775
        _test_try_new_from_builder_generic_for_key_types::<Int32Type, Int16Type, T>(values.clone());
776
    }
777
778
    /// Runs the key-type conversion tests with string values
    #[test]
    fn test_string_dictionary_builder_try_new_from_builder() {
        let values: Vec<&str> = vec!["abc", "def", "ghi"];
        test_try_new_from_builder::<GenericStringType<i32>>(values);
    }
782
783
    /// Runs the key-type conversion tests with binary values
    #[test]
    fn test_binary_dictionary_builder_try_new_from_builder() {
        let values: Vec<&[u8]> = vec![b"abc", b"def", b"ghi"];
        test_try_new_from_builder::<GenericBinaryType<i32>>(values);
    }
787
788
    /// Converting to a narrower key type must fail with a cast error when an
    /// existing key does not fit.
    #[test]
    fn test_try_new_from_builder_cast_fails() {
        let mut source_builder = StringDictionaryBuilder::<UInt16Type>::new();
        (0..257).for_each(|i| source_builder.append_value(format!("val{i}")));

        // 257 distinct values produce keys up to 256, which cannot be
        // represented by UInt8Type, so the downcast must be rejected
        let result = StringDictionaryBuilder::<UInt8Type>::try_new_from_builder(source_builder);
        assert!(result.is_err());
        if let Err(e) = result {
            assert!(matches!(e, ArrowError::CastError(_)));
            let message = e.to_string();
            assert_eq!(
                message,
                "Cast error: Can't cast dictionary keys from source type UInt16 to type UInt8"
            );
        }
    }
807
808
    /// Generic check that a builder created with `new_with_dictionary` reuses the entries
    /// of the supplied dictionary and adds unseen values after them.
    ///
    /// The assertions below only hold when `dictionary` is laid out as
    /// `[null, values[1], values[0]]` and `values[2]` is not part of it; the two callers
    /// visible in this module pass exactly that shape.
    fn test_bytes_dictionary_builder_with_existing_dictionary<T>(
809
        dictionary: GenericByteArray<T>,
810
        values: Vec<&T::Native>,
811
    ) where
812
        T: ByteArrayType,
813
        <T as ByteArrayType>::Native: PartialEq,
814
        <T as ByteArrayType>::Native: AsRef<<T as ByteArrayType>::Native>,
815
    {
816
        // NOTE(review): `6` equals the number of appends below; presumably a capacity
        // hint for the keys -- confirm against `new_with_dictionary`.
        let mut builder =
817
            GenericByteDictionaryBuilder::<Int8Type, T>::new_with_dictionary(6, &dictionary)
818
                .unwrap();
819
        builder.append(values[0]).unwrap();
820
        builder.append_null();
821
        builder.append(values[1]).unwrap();
822
        builder.append(values[1]).unwrap();
823
        builder.append(values[0]).unwrap();
824
        builder.append(values[2]).unwrap();
825
        let array = builder.finish();
826
827
        // `values[0]` and `values[1]` map to the pre-existing slots 2 and 1, the null
        // append yields a null key, and the previously unseen `values[2]` gets key 3.
        assert_eq!(
828
            array.keys(),
829
            &Int8Array::from(vec![Some(2), None, Some(1), Some(1), Some(2), Some(3)])
830
        );
831
832
        // Values are polymorphic and so require a downcast.
833
        let av = array.values();
834
        let ava: &GenericByteArray<T> = av.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
835
836
        // Slot 0 is still the null entry of the supplied dictionary; slots 1 and 2 keep
        // their original contents and slot 3 holds the newly added value.
        assert!(!ava.is_valid(0));
837
        assert_eq!(ava.value(1), values[1]);
838
        assert_eq!(ava.value(2), values[0]);
839
        assert_eq!(ava.value(3), values[2]);
840
    }
841
842
    /// String instantiation of `test_bytes_dictionary_builder_with_existing_dictionary`:
    /// the seed dictionary is `[null, "def", "abc"]` and the appended values are
    /// `"abc"`, `"def"`, `"ghi"`.
    #[test]
843
    fn test_string_dictionary_builder_with_existing_dictionary() {
844
        test_bytes_dictionary_builder_with_existing_dictionary::<GenericStringType<i32>>(
845
            StringArray::from(vec![None, Some("def"), Some("abc")]),
846
            vec!["abc", "def", "ghi"],
847
        );
848
    }
849
850
    /// Binary instantiation of `test_bytes_dictionary_builder_with_existing_dictionary`:
    /// the seed dictionary is `[null, b"def", b"abc"]` and the appended values are
    /// `b"abc"`, `b"def"`, `b"ghi"`.
    #[test]
851
    fn test_binary_dictionary_builder_with_existing_dictionary() {
852
        // Explicit element type so the literals are treated as `&[u8]` slices.
        let values: Vec<Option<&[u8]>> = vec![None, Some(b"def"), Some(b"abc")];
853
        test_bytes_dictionary_builder_with_existing_dictionary::<GenericBinaryType<i32>>(
854
            BinaryArray::from(values),
855
            vec![b"abc", b"def", b"ghi"],
856
        );
857
    }
858
859
    /// Generic check for a builder seeded with a dictionary whose slot 0 is a null entry.
    ///
    /// The asserted keys (`1` for `values[0]`, `2` for `values[1]`) only hold when
    /// `dictionary` contains nothing but that single null; the two callers visible in
    /// this module pass `[None]`.
    fn test_bytes_dictionary_builder_with_reserved_null_value<T>(
860
        dictionary: GenericByteArray<T>,
861
        values: Vec<&T::Native>,
862
    ) where
863
        T: ByteArrayType,
864
        <T as ByteArrayType>::Native: PartialEq,
865
        <T as ByteArrayType>::Native: AsRef<<T as ByteArrayType>::Native>,
866
    {
867
        // NOTE(review): `4` equals the number of appends below; presumably a capacity
        // hint for the keys -- confirm against `new_with_dictionary`.
        let mut builder =
868
            GenericByteDictionaryBuilder::<Int16Type, T>::new_with_dictionary(4, &dictionary)
869
                .unwrap();
870
        builder.append(values[0]).unwrap();
871
        builder.append_null();
872
        builder.append(values[1]).unwrap();
873
        builder.append(values[0]).unwrap();
874
        let array = builder.finish();
875
876
        // The second element was appended as null.
        assert!(array.is_null(1));
877
        assert!(!array.is_valid(1));
878
879
        let keys = array.keys();
880
881
        // New values are keyed from 1 upwards; the repeated `values[0]` reuses key 1.
        assert_eq!(keys.value(0), 1);
882
        assert!(keys.is_null(1));
883
        // zero initialization is currently guaranteed by Buffer allocation and resizing
884
        assert_eq!(keys.value(1), 0);
885
        assert_eq!(keys.value(2), 2);
886
        assert_eq!(keys.value(3), 1);
887
    }
888
889
    /// String instantiation of `test_bytes_dictionary_builder_with_reserved_null_value`
    /// with a seed dictionary that contains a single null.
    #[test]
890
    fn test_string_dictionary_builder_with_reserved_null_value() {
891
        let v: Vec<Option<&str>> = vec![None];
892
        test_bytes_dictionary_builder_with_reserved_null_value::<GenericStringType<i32>>(
893
            StringArray::from(v),
894
            vec!["abc", "def"],
895
        );
896
    }
897
898
    /// Binary instantiation of `test_bytes_dictionary_builder_with_reserved_null_value`
    /// with a seed dictionary that contains a single null.
    #[test]
899
    fn test_binary_dictionary_builder_with_reserved_null_value() {
900
        let values: Vec<Option<&[u8]>> = vec![None];
901
        test_bytes_dictionary_builder_with_reserved_null_value::<GenericBinaryType<i32>>(
902
            BinaryArray::from(values),
903
            vec![b"abc", b"def"],
904
        );
905
    }
906
907
    /// Checks that two consecutive `extend` calls share one dictionary: repeated strings
    /// reuse their first key and only four distinct values are stored.
    #[test]
908
    fn test_extend() {
909
        let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
910
        builder.extend(["a", "b", "c", "a", "b", "c"].into_iter().map(Some));
911
        // The second batch repeats "c" and "a" and introduces "d".
        builder.extend(["c", "d", "a"].into_iter().map(Some));
912
        let dict = builder.finish();
913
        // Keys follow first-seen order: a=0, b=1, c=2, d=3.
        assert_eq!(dict.keys().values(), &[0, 1, 2, 0, 1, 2, 2, 3, 0]);
914
        assert_eq!(dict.values().len(), 4);
915
    }
916
917
    /// Checks that `extend_dictionary` appends the logical contents of another dictionary
    /// array (nulls included) to a builder that already holds values, without storing
    /// values the builder already has a second time.
    #[test]
918
    fn test_extend_dictionary() {
919
        // Source dictionary: distinct values a, b, c, d plus two null elements.
        let some_dict = {
920
            let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
921
            builder.extend(["a", "b", "c", "a", "b", "c"].into_iter().map(Some));
922
            builder.extend([None::<&str>]);
923
            builder.extend(["c", "d", "a"].into_iter().map(Some));
924
            builder.append_null();
925
            builder.finish()
926
        };
927
928
        // Target builder already holds e, f and d before the source is appended.
        let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
929
        builder.extend(["e", "e", "f", "e", "d"].into_iter().map(Some));
930
        builder
931
            .extend_dictionary(&some_dict.downcast_dict().unwrap())
932
            .unwrap();
933
        let dict = builder.finish();
934
935
        // e, f, d from the target plus a, b, c from the source; "d" is shared.
        assert_eq!(dict.values().len(), 6);
936
937
        let values = dict
938
            .downcast_dict::<GenericByteArray<Utf8Type>>()
939
            .unwrap()
940
            .into_iter()
941
            .collect::<Vec<_>>();
942
943
        // The target's own five elements come first, followed by the source's elements
        // in their original order.
        assert_eq!(
944
            values,
945
            [
946
                Some("e"),
947
                Some("e"),
948
                Some("f"),
949
                Some("e"),
950
                Some("d"),
951
                Some("a"),
952
                Some("b"),
953
                Some("c"),
954
                Some("a"),
955
                Some("b"),
956
                Some("c"),
957
                None,
958
                Some("c"),
959
                Some("d"),
960
                Some("a"),
961
                None
962
            ]
963
        );
964
    }
965
    /// Checks `extend_dictionary` on a source whose keys are all valid but whose values
    /// array contains a null: the null entry must not be copied into the new dictionary's
    /// values, while the element that pointed at it still reads back as `None`.
    #[test]
966
    fn test_extend_dictionary_with_null_in_mapped_value() {
967
        let some_dict = {
968
            let mut values_builder = GenericByteBuilder::<Utf8Type>::new();
969
            let mut keys_builder = PrimitiveBuilder::<Int32Type>::new();
970
971
            // Manually build a dictionary whose values array contains a null, so that
            // the valid key 0 maps to a null value.
972
            values_builder.append_null();
973
            keys_builder.append_value(0);
974
            values_builder.append_value("I like worm hugs");
975
            keys_builder.append_value(1);
976
977
            let values = values_builder.finish();
978
            let keys = keys_builder.finish();
979
980
            let data_type = DataType::Dictionary(
981
                Box::new(Int32Type::DATA_TYPE),
982
                Box::new(Utf8Type::DATA_TYPE),
983
            );
984
985
            // Re-type the keys' array data as a dictionary and attach the values as its
            // child.
            let builder = keys
986
                .into_data()
987
                .into_builder()
988
                .data_type(data_type)
989
                .child_data(vec![values.into_data()]);
990
991
            // SAFETY (as far as visible here): keys 0 and 1 both index into the
            // two-element values array, and `data_type` pairs Int32 keys with Utf8
            // values, matching the two builders above.
            // NOTE(review): confirm `build_unchecked` has no further requirements.
            DictionaryArray::from(unsafe { builder.build_unchecked() })
992
        };
993
994
        // Sanity-check the hand-built source before using it.
        let some_dict_values = some_dict.values().as_string::<i32>();
995
        assert_eq!(
996
            some_dict_values.into_iter().collect::<Vec<_>>(),
997
            &[None, Some("I like worm hugs")]
998
        );
999
1000
        let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
1001
        builder
1002
            .extend_dictionary(&some_dict.downcast_dict().unwrap())
1003
            .unwrap();
1004
        let dict = builder.finish();
1005
1006
        // Only the non-null string ends up in the new dictionary's values.
        assert_eq!(dict.values().len(), 1);
1007
1008
        let values = dict
1009
            .downcast_dict::<GenericByteArray<Utf8Type>>()
1010
            .unwrap()
1011
            .into_iter()
1012
            .collect::<Vec<_>>();
1013
1014
        assert_eq!(values, [None, Some("I like worm hugs")]);
1015
    }
1016
1017
    /// Checks that extending a builder with a dictionary array made only of nulls adds
    /// two null elements and leaves the values array empty.
    #[test]
1018
    fn test_extend_all_null_dictionary() {
1019
        // Source dictionary: two nulls and no values.
        let some_dict = {
1020
            let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
1021
            builder.append_nulls(2);
1022
            builder.finish()
1023
        };
1024
1025
        let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
1026
        builder
1027
            .extend_dictionary(&some_dict.downcast_dict().unwrap())
1028
            .unwrap();
1029
        let dict = builder.finish();
1030
1031
        assert_eq!(dict.values().len(), 0);
1032
1033
        let values = dict
1034
            .downcast_dict::<GenericByteArray<Utf8Type>>()
1035
            .unwrap()
1036
            .into_iter()
1037
            .collect::<Vec<_>>();
1038
1039
        assert_eq!(values, [None, None]);
1040
    }
1041
1042
    /// Checks that `finish_preserve_values` keeps the accumulated dictionary values in
    /// the builder: values appended afterwards get keys that continue after the earlier
    /// ones, and the second array's values contain both batches.
    #[test]
1043
    fn test_finish_preserve_values() {
1044
        // Create the first dictionary
1045
        let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
1046
        builder.append("a").unwrap();
1047
        builder.append("b").unwrap();
1048
        builder.append("c").unwrap();
1049
        let dict = builder.finish_preserve_values();
1050
        assert_eq!(dict.keys().values(), &[0, 1, 2]);
1051
        assert_eq!(dict.values().len(), 3);
1052
        let values = dict
1053
            .downcast_dict::<GenericByteArray<Utf8Type>>()
1054
            .unwrap()
1055
            .into_iter()
1056
            .collect::<Vec<_>>();
1057
        assert_eq!(values, [Some("a"), Some("b"), Some("c")]);
1058
1059
        // Create a new dictionary
1060
        builder.append("d").unwrap();
1061
        builder.append("e").unwrap();
1062
        let dict2 = builder.finish_preserve_values();
1063
1064
        // Make sure the keys are assigned after the old ones and we have the
1065
        // right values
1066
        assert_eq!(dict2.keys().values(), &[3, 4]);
1067
        let values = dict2
1068
            .downcast_dict::<GenericByteArray<Utf8Type>>()
1069
            .unwrap()
1070
            .into_iter()
1071
            .collect::<Vec<_>>();
1072
        // Only the two elements appended since the previous finish are present.
        assert_eq!(values, [Some("d"), Some("e")]);
1073
1074
        // Check that we have all of the expected values
1075
        assert_eq!(dict2.values().len(), 5);
1076
        let all_values = dict2
1077
            .values()
1078
            .as_any()
1079
            .downcast_ref::<StringArray>()
1080
            .unwrap()
1081
            .into_iter()
1082
            .collect::<Vec<_>>();
1083
        assert_eq!(
1084
            all_values,
1085
            [Some("a"), Some("b"), Some("c"), Some("d"), Some("e"),]
1086
        );
1087
    }
1088
}