Coverage Report

Created: 2025-08-26 07:03

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/Users/andrewlamb/Software/arrow-rs/arrow-array/src/builder/fixed_size_binary_dictionary_builder.rs
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
use crate::builder::{ArrayBuilder, FixedSizeBinaryBuilder, PrimitiveBuilder};
19
use crate::types::ArrowDictionaryKeyType;
20
use crate::{Array, ArrayRef, DictionaryArray, PrimitiveArray};
21
use arrow_buffer::ArrowNativeType;
22
use arrow_schema::DataType::FixedSizeBinary;
23
use arrow_schema::{ArrowError, DataType};
24
use hashbrown::HashTable;
25
use num::NumCast;
26
use std::any::Any;
27
use std::sync::Arc;
28
29
/// Builder for [`DictionaryArray`] of [`FixedSizeBinaryArray`]
30
///
31
/// The output array has a dictionary of unique, fixed-size binary values. The
32
/// builder handles deduplication.
33
///
34
/// # Example
35
/// ```
36
/// # use arrow_array::builder::{FixedSizeBinaryDictionaryBuilder};
37
/// # use arrow_array::array::{Array, FixedSizeBinaryArray};
38
/// # use arrow_array::DictionaryArray;
39
/// # use arrow_array::types::Int8Type;
40
/// // Build 3 byte FixedBinaryArrays
41
/// let byte_width = 3;
42
/// let mut builder = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);
43
/// builder.append("abc").unwrap();
44
/// builder.append_null();
45
/// builder.append(b"def").unwrap();
46
/// builder.append(b"def").unwrap(); // duplicate value
47
/// // Result is a Dictionary Array
48
/// let array = builder.finish();
49
/// let dict_array = array.as_any().downcast_ref::<DictionaryArray<Int8Type>>().unwrap();
50
/// // The array represents "abc", null, "def", "def"
51
/// assert_eq!(array.keys().len(), 4);
52
/// // but there are only 2 unique values
53
/// assert_eq!(array.values().len(), 2);
54
/// let values = dict_array.values().as_any().downcast_ref::<FixedSizeBinaryArray>().unwrap();
55
/// assert_eq!(values.value(0), "abc".as_bytes());
56
/// assert_eq!(values.value(1), "def".as_bytes());
57
/// ```
58
///
59
/// [`FixedSizeBinaryArray`]: crate::FixedSizeBinaryArray
60
#[derive(Debug)]
61
pub struct FixedSizeBinaryDictionaryBuilder<K>
62
where
63
    K: ArrowDictionaryKeyType,
64
{
65
    state: ahash::RandomState,
66
    dedup: HashTable<usize>,
67
68
    keys_builder: PrimitiveBuilder<K>,
69
    values_builder: FixedSizeBinaryBuilder,
70
    byte_width: i32,
71
}
72
73
impl<K> FixedSizeBinaryDictionaryBuilder<K>
74
where
75
    K: ArrowDictionaryKeyType,
76
{
77
    /// Creates a new `FixedSizeBinaryDictionaryBuilder`
78
    pub fn new(byte_width: i32) -> Self {
79
        let keys_builder = PrimitiveBuilder::new();
80
        let values_builder = FixedSizeBinaryBuilder::new(byte_width);
81
        Self {
82
            state: Default::default(),
83
            dedup: HashTable::with_capacity(keys_builder.capacity()),
84
            keys_builder,
85
            values_builder,
86
            byte_width,
87
        }
88
    }
89
90
    /// Creates a new `FixedSizeBinaryDictionaryBuilder` with the provided capacities
91
    ///
92
    /// `keys_capacity`: the number of keys, i.e. length of array to build
93
    /// `value_capacity`: the number of distinct dictionary values, i.e. size of dictionary
94
    /// `byte_width`: the byte width for individual values in the values array
95
0
    pub fn with_capacity(keys_capacity: usize, value_capacity: usize, byte_width: i32) -> Self {
96
0
        Self {
97
0
            state: Default::default(),
98
0
            dedup: Default::default(),
99
0
            keys_builder: PrimitiveBuilder::with_capacity(keys_capacity),
100
0
            values_builder: FixedSizeBinaryBuilder::with_capacity(value_capacity, byte_width),
101
0
            byte_width,
102
0
        }
103
0
    }
104
105
    /// Creates a new `FixedSizeBinaryDictionaryBuilder` from the existing builder with the same
106
    /// keys and values, but with a new data type for the keys.
107
    ///
108
    /// # Example
109
    /// ```
110
    /// # use arrow_array::builder::FixedSizeBinaryDictionaryBuilder;
111
    /// # use arrow_array::types::{UInt8Type, UInt16Type, UInt64Type};
112
    /// # use arrow_array::UInt16Array;
113
    /// # use arrow_schema::ArrowError;
114
    ///
115
    /// let mut u8_keyed_builder = FixedSizeBinaryDictionaryBuilder::<UInt8Type>::new(2);
116
    /// // appending too many values causes the dictionary to overflow
117
    /// for i in 0..=255 {
118
    ///     u8_keyed_builder.append_value(vec![0, i]);
119
    /// }
120
    /// let result = u8_keyed_builder.append(vec![1, 0]);
121
    /// assert!(matches!(result, Err(ArrowError::DictionaryKeyOverflowError{})));
122
    ///
123
    /// // we need to upgrade to a larger key type
124
    /// let mut u16_keyed_builder = FixedSizeBinaryDictionaryBuilder::<UInt16Type>::try_new_from_builder(u8_keyed_builder).unwrap();
125
    /// let dictionary_array = u16_keyed_builder.finish();
126
    /// let keys = dictionary_array.keys();
127
    ///
128
    /// assert_eq!(keys, &UInt16Array::from_iter(0..256));
129
    /// ```
130
    pub fn try_new_from_builder<K2>(
131
        mut source: FixedSizeBinaryDictionaryBuilder<K2>,
132
    ) -> Result<Self, ArrowError>
133
    where
134
        K::Native: NumCast,
135
        K2: ArrowDictionaryKeyType,
136
        K2::Native: NumCast,
137
    {
138
        let state = source.state;
139
        let dedup = source.dedup;
140
        let values_builder = source.values_builder;
141
        let byte_width = source.byte_width;
142
143
        let source_keys = source.keys_builder.finish();
144
        let new_keys: PrimitiveArray<K> = source_keys.try_unary(|value| {
145
            num::cast::cast::<K2::Native, K::Native>(value).ok_or_else(|| {
146
                ArrowError::CastError(format!(
147
                    "Can't cast dictionary keys from source type {:?} to type {:?}",
148
                    K2::DATA_TYPE,
149
                    K::DATA_TYPE
150
                ))
151
            })
152
        })?;
153
154
        // drop source key here because currently source_keys and new_keys are holding reference to
155
        // the same underlying null_buffer. Below we want to call new_keys.into_builder() it must
156
        // be the only reference holder.
157
        drop(source_keys);
158
159
        Ok(Self {
160
            state,
161
            dedup,
162
            keys_builder: new_keys
163
                .into_builder()
164
                .expect("underlying buffer has no references"),
165
            values_builder,
166
            byte_width,
167
        })
168
    }
169
}
170
171
impl<K> ArrayBuilder for FixedSizeBinaryDictionaryBuilder<K>
172
where
173
    K: ArrowDictionaryKeyType,
174
{
175
    /// Returns the builder as an non-mutable `Any` reference.
176
    fn as_any(&self) -> &dyn Any {
177
        self
178
    }
179
180
    /// Returns the builder as an mutable `Any` reference.
181
    fn as_any_mut(&mut self) -> &mut dyn Any {
182
        self
183
    }
184
185
    /// Returns the boxed builder as a box of `Any`.
186
    fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
187
        self
188
    }
189
190
    /// Returns the number of array slots in the builder
191
    fn len(&self) -> usize {
192
        self.keys_builder.len()
193
    }
194
195
    /// Builds the array and reset this builder.
196
    fn finish(&mut self) -> ArrayRef {
197
        Arc::new(self.finish())
198
    }
199
200
    /// Builds the array without resetting the builder.
201
    fn finish_cloned(&self) -> ArrayRef {
202
        Arc::new(self.finish_cloned())
203
    }
204
}
205
206
impl<K> FixedSizeBinaryDictionaryBuilder<K>
207
where
208
    K: ArrowDictionaryKeyType,
209
{
210
0
    fn get_or_insert_key(&mut self, value: impl AsRef<[u8]>) -> Result<K::Native, ArrowError> {
211
0
        let value_bytes: &[u8] = value.as_ref();
212
213
0
        let state = &self.state;
214
0
        let storage = &mut self.values_builder;
215
0
        let hash = state.hash_one(value_bytes);
216
217
0
        let idx = *self
218
0
            .dedup
219
0
            .entry(
220
0
                hash,
221
0
                |idx| value_bytes == get_bytes(storage, self.byte_width, *idx),
222
0
                |idx| state.hash_one(get_bytes(storage, self.byte_width, *idx)),
223
            )
224
0
            .or_insert_with(|| {
225
0
                let idx = storage.len();
226
0
                let _ = storage.append_value(value);
227
0
                idx
228
0
            })
229
0
            .get();
230
231
0
        let key = K::Native::from_usize(idx).ok_or(ArrowError::DictionaryKeyOverflowError)?;
232
233
0
        Ok(key)
234
0
    }
235
236
    /// Append a value to the array. Return an existing index
237
    /// if already present in the values array or a new index if the
238
    /// value is appended to the values array.
239
    ///
240
    /// Returns an error if the new index would overflow the key type.
241
0
    pub fn append(&mut self, value: impl AsRef<[u8]>) -> Result<K::Native, ArrowError> {
242
0
        if self.byte_width != value.as_ref().len() as i32 {
243
0
            Err(ArrowError::InvalidArgumentError(format!(
244
0
                "Invalid input length passed to FixedSizeBinaryBuilder. Expected {} got {}",
245
0
                self.byte_width,
246
0
                value.as_ref().len()
247
0
            )))
248
        } else {
249
0
            let key = self.get_or_insert_key(value)?;
250
0
            self.keys_builder.append_value(key);
251
0
            Ok(key)
252
        }
253
0
    }
254
255
    /// Appends a null slot into the builder
256
    #[inline]
257
0
    pub fn append_null(&mut self) {
258
0
        self.keys_builder.append_null()
259
0
    }
260
261
    /// Appends `n` `null`s into the builder.
262
    #[inline]
263
    pub fn append_nulls(&mut self, n: usize) {
264
        self.keys_builder.append_nulls(n);
265
    }
266
267
    /// Infallibly append a value to this builder
268
    ///
269
    /// # Panics
270
    ///
271
    /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
272
    pub fn append_value(&mut self, value: impl AsRef<[u8]>) {
273
        self.append(value).expect("dictionary key overflow");
274
    }
275
276
    /// Builds the `DictionaryArray` and reset this builder.
277
0
    pub fn finish(&mut self) -> DictionaryArray<K> {
278
0
        self.dedup.clear();
279
0
        let values = self.values_builder.finish();
280
0
        let keys = self.keys_builder.finish();
281
282
0
        let data_type = DataType::Dictionary(
283
0
            Box::new(K::DATA_TYPE),
284
0
            Box::new(FixedSizeBinary(self.byte_width)),
285
0
        );
286
287
0
        let builder = keys
288
0
            .into_data()
289
0
            .into_builder()
290
0
            .data_type(data_type)
291
0
            .child_data(vec![values.into_data()]);
292
293
0
        DictionaryArray::from(unsafe { builder.build_unchecked() })
294
0
    }
295
296
    /// Builds the `DictionaryArray` without resetting the builder.
297
    pub fn finish_cloned(&self) -> DictionaryArray<K> {
298
        let values = self.values_builder.finish_cloned();
299
        let keys = self.keys_builder.finish_cloned();
300
301
        let data_type = DataType::Dictionary(
302
            Box::new(K::DATA_TYPE),
303
            Box::new(FixedSizeBinary(self.byte_width)),
304
        );
305
306
        let builder = keys
307
            .into_data()
308
            .into_builder()
309
            .data_type(data_type)
310
            .child_data(vec![values.into_data()]);
311
312
        DictionaryArray::from(unsafe { builder.build_unchecked() })
313
    }
314
315
    /// Builds the `DictionaryArray` without resetting the values builder or
316
    /// the internal de-duplication map.
317
    ///
318
    /// The advantage of doing this is that the values will represent the entire
319
    /// set of what has been built so-far by this builder and ensures
320
    /// consistency in the assignment of keys to values across multiple calls
321
    /// to `finish_preserve_values`. This enables ipc writers to efficiently
322
    /// emit delta dictionaries.
323
    ///
324
    /// The downside to this is that building the record requires creating a
325
    /// copy of the values, which can become slowly more expensive if the
326
    /// dictionary grows.
327
    ///
328
    /// Additionally, if record batches from multiple different dictionary
329
    /// builders for the same column are fed into a single ipc writer, beware
330
    /// that entire dictionaries are likely to be re-sent frequently even when
331
    /// the majority of the values are not used by the current record batch.
332
    pub fn finish_preserve_values(&mut self) -> DictionaryArray<K> {
333
        let values = self.values_builder.finish_cloned();
334
        let keys = self.keys_builder.finish();
335
336
        let data_type = DataType::Dictionary(
337
            Box::new(K::DATA_TYPE),
338
            Box::new(FixedSizeBinary(self.byte_width)),
339
        );
340
341
        let builder = keys
342
            .into_data()
343
            .into_builder()
344
            .data_type(data_type)
345
            .child_data(vec![values.into_data()]);
346
347
        DictionaryArray::from(unsafe { builder.build_unchecked() })
348
    }
349
}
350
351
0
fn get_bytes(values: &FixedSizeBinaryBuilder, byte_width: i32, idx: usize) -> &[u8] {
352
0
    let values = values.values_slice();
353
0
    let start = idx * byte_width.as_usize();
354
0
    let end = idx * byte_width.as_usize() + byte_width.as_usize();
355
0
    &values[start..end]
356
0
}
357
358
#[cfg(test)]
359
mod tests {
360
    use super::*;
361
362
    use crate::types::{Int16Type, Int32Type, Int8Type, UInt16Type, UInt8Type};
363
    use crate::{ArrowPrimitiveType, FixedSizeBinaryArray, Int8Array};
364
365
    /// Appending values and nulls yields de-duplicated keys and values.
    #[test]
    fn test_fixed_size_dictionary_builder() {
        let (abc, def) = ("abc", "def");

        let mut builder = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);
        assert_eq!(builder.append(abc).unwrap(), 0);
        builder.append_null();
        assert_eq!(builder.append(def).unwrap(), 1);
        assert_eq!(builder.append(def).unwrap(), 1);
        assert_eq!(builder.append(abc).unwrap(), 0);
        builder.append_nulls(2);
        assert_eq!(builder.append(abc).unwrap(), 0);
        let array = builder.finish();

        let expected_keys = Int8Array::from(vec![
            Some(0),
            None,
            Some(1),
            Some(1),
            Some(0),
            None,
            None,
            Some(0),
        ]);
        assert_eq!(array.keys(), &expected_keys);

        // The values are type-erased, so downcast before inspecting them.
        let dictionary = array
            .values()
            .as_any()
            .downcast_ref::<FixedSizeBinaryArray>()
            .unwrap();

        assert_eq!(dictionary.value(0), abc.as_bytes());
        assert_eq!(dictionary.value(1), def.as_bytes());
    }
403
404
    /// Values that are longer or shorter than the byte width are rejected.
    #[test]
    fn test_fixed_size_dictionary_builder_wrong_size() {
        let mut builder = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);

        let too_long = builder.append(b"too long").unwrap_err();
        assert_eq!(too_long.to_string(), "Invalid argument error: Invalid input length passed to FixedSizeBinaryBuilder. Expected 3 got 8");

        let too_short = builder.append("").unwrap_err();
        assert_eq!(too_short.to_string(), "Invalid argument error: Invalid input length passed to FixedSizeBinaryBuilder. Expected 3 got 0");
    }
412
413
    /// `finish_cloned` snapshots the builder without resetting it, so later
    /// appends continue from the same keys and values.
    #[test]
    fn test_fixed_size_dictionary_builder_finish_cloned() {
        let values = ["abc", "def", "ghi"];

        let mut builder = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);

        builder.append(values[0]).unwrap();
        builder.append_null();
        builder.append(values[1]).unwrap();
        builder.append(values[1]).unwrap();
        builder.append(values[0]).unwrap();
        let snapshot = builder.finish_cloned();

        let snapshot_keys = Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)]);
        assert_eq!(snapshot.keys(), &snapshot_keys);

        // The values are type-erased, so downcast before inspecting them.
        let snapshot_values = snapshot
            .values()
            .as_any()
            .downcast_ref::<FixedSizeBinaryArray>()
            .unwrap();

        assert_eq!(snapshot_values.value(0), values[0].as_bytes());
        assert_eq!(snapshot_values.value(1), values[1].as_bytes());

        builder.append(values[0]).unwrap();
        builder.append(values[2]).unwrap();
        builder.append(values[1]).unwrap();

        let finished = builder.finish();

        let finished_keys = Int8Array::from(vec![
            Some(0),
            None,
            Some(1),
            Some(1),
            Some(0),
            Some(0),
            Some(2),
            Some(1),
        ]);
        assert_eq!(finished.keys(), &finished_keys);

        // The values are type-erased, so downcast before inspecting them.
        let finished_values = finished
            .values()
            .as_any()
            .downcast_ref::<FixedSizeBinaryArray>()
            .unwrap();

        for (i, expected) in values.iter().enumerate() {
            assert_eq!(finished_values.value(i), expected.as_bytes());
        }
    }
472
473
    fn _test_try_new_from_builder_generic_for_key_types<K1, K2>(values: Vec<[u8; 3]>)
474
    where
475
        K1: ArrowDictionaryKeyType,
476
        K1::Native: NumCast,
477
        K2: ArrowDictionaryKeyType,
478
        K2::Native: NumCast + From<u8>,
479
    {
480
        let mut source = FixedSizeBinaryDictionaryBuilder::<K1>::new(3);
481
        source.append_value(values[0]);
482
        source.append_null();
483
        source.append_value(values[1]);
484
        source.append_value(values[2]);
485
486
        let mut result =
487
            FixedSizeBinaryDictionaryBuilder::<K2>::try_new_from_builder(source).unwrap();
488
        let array = result.finish();
489
490
        let mut expected_keys_builder = PrimitiveBuilder::<K2>::new();
491
        expected_keys_builder
492
            .append_value(<<K2 as ArrowPrimitiveType>::Native as From<u8>>::from(0u8));
493
        expected_keys_builder.append_null();
494
        expected_keys_builder
495
            .append_value(<<K2 as ArrowPrimitiveType>::Native as From<u8>>::from(1u8));
496
        expected_keys_builder
497
            .append_value(<<K2 as ArrowPrimitiveType>::Native as From<u8>>::from(2u8));
498
        let expected_keys = expected_keys_builder.finish();
499
        assert_eq!(array.keys(), &expected_keys);
500
501
        let av = array.values();
502
        let ava = av.as_any().downcast_ref::<FixedSizeBinaryArray>().unwrap();
503
        assert_eq!(ava.value(0), values[0]);
504
        assert_eq!(ava.value(1), values[1]);
505
        assert_eq!(ava.value(2), values[2]);
506
    }
507
508
    /// Key-type conversion works for widening, narrowing and mixed
    /// signed/unsigned combinations as long as the keys fit.
    #[test]
    fn test_try_new_from_builder() {
        let values = vec![[1, 2, 3], [5, 6, 7], [6, 7, 8]];
        // test cast to bigger size unsigned
        _test_try_new_from_builder_generic_for_key_types::<UInt8Type, UInt16Type>(values.clone());
        // test cast going to smaller size unsigned
        _test_try_new_from_builder_generic_for_key_types::<UInt16Type, UInt8Type>(values.clone());
        // test cast going to bigger size signed
        _test_try_new_from_builder_generic_for_key_types::<Int8Type, Int16Type>(values.clone());
        // test cast going to smaller size signed
        _test_try_new_from_builder_generic_for_key_types::<Int32Type, Int16Type>(values.clone());
        // test going between signed and unsigned for different size changes
        _test_try_new_from_builder_generic_for_key_types::<UInt8Type, Int16Type>(values.clone());
        _test_try_new_from_builder_generic_for_key_types::<Int8Type, UInt8Type>(values.clone());
        _test_try_new_from_builder_generic_for_key_types::<Int8Type, UInt16Type>(values.clone());
        // signed to a smaller unsigned type (previously a repeat of Int32 -> Int16)
        _test_try_new_from_builder_generic_for_key_types::<Int32Type, UInt8Type>(values);
    }
525
526
    /// Converting to a key type that cannot hold every existing key fails
    /// with a cast error.
    #[test]
    fn test_try_new_from_builder_cast_fails() {
        let mut source_builder = FixedSizeBinaryDictionaryBuilder::<UInt16Type>::new(2);
        // 257 distinct values: the last key (256) does not fit in a `u8`.
        for i in 0u16..257u16 {
            source_builder.append_value(vec![(i >> 8) as u8, i as u8]);
        }

        let result =
            FixedSizeBinaryDictionaryBuilder::<UInt8Type>::try_new_from_builder(source_builder);

        let Err(e) = result else {
            panic!("casting keys to UInt8Type should have failed");
        };
        assert!(matches!(e, ArrowError::CastError(_)));
        assert_eq!(
            e.to_string(),
            "Cast error: Can't cast dictionary keys from source type UInt16 to type UInt8"
        );
    }
546
547
    /// `finish_preserve_values` keeps the dictionary across calls, so new
    /// values get keys that follow on from the earlier ones.
    #[test]
    fn test_finish_preserve_values() {
        // First batch
        let mut builder = FixedSizeBinaryDictionaryBuilder::<Int32Type>::new(3);
        for value in ["aaa", "bbb", "ccc"] {
            builder.append_value(value);
        }
        let first = builder.finish_preserve_values();
        assert_eq!(first.keys().values(), &[0, 1, 2]);
        let first_values = first
            .downcast_dict::<FixedSizeBinaryArray>()
            .unwrap()
            .into_iter()
            .collect::<Vec<_>>();
        assert_eq!(
            first_values,
            vec![
                Some("aaa".as_bytes()),
                Some("bbb".as_bytes()),
                Some("ccc".as_bytes())
            ]
        );

        // Second batch
        for value in ["ddd", "eee"] {
            builder.append_value(value);
        }
        let second = builder.finish_preserve_values();

        // The new keys continue after the old ones and resolve to the new values
        assert_eq!(second.keys().values(), &[3, 4]);
        let second_values = second
            .downcast_dict::<FixedSizeBinaryArray>()
            .unwrap()
            .into_iter()
            .collect::<Vec<_>>();
        assert_eq!(
            second_values,
            [Some("ddd".as_bytes()), Some("eee".as_bytes())]
        );

        // The dictionary itself holds everything appended so far
        let whole_dictionary = second
            .values()
            .as_any()
            .downcast_ref::<FixedSizeBinaryArray>()
            .unwrap()
            .into_iter()
            .collect::<Vec<_>>();
        assert_eq!(
            whole_dictionary,
            [
                Some("aaa".as_bytes()),
                Some("bbb".as_bytes()),
                Some("ccc".as_bytes()),
                Some("ddd".as_bytes()),
                Some("eee".as_bytes())
            ]
        );
    }
604
}