Coverage Report

Created: 2025-08-26 07:03

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/Users/andrewlamb/Software/arrow-rs/arrow-select/src/dictionary.rs
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
//! Dictionary utilities for Arrow arrays
19
20
use std::sync::Arc;
21
22
use crate::filter::filter;
23
use crate::interleave::interleave;
24
use ahash::RandomState;
25
use arrow_array::builder::BooleanBufferBuilder;
26
use arrow_array::types::{
27
    ArrowDictionaryKeyType, ArrowPrimitiveType, BinaryType, ByteArrayType, LargeBinaryType,
28
    LargeUtf8Type, Utf8Type,
29
};
30
use arrow_array::{cast::AsArray, downcast_primitive};
31
use arrow_array::{
32
    downcast_dictionary_array, AnyDictionaryArray, Array, ArrayRef, ArrowNativeTypeOp,
33
    BooleanArray, DictionaryArray, GenericByteArray, PrimitiveArray,
34
};
35
use arrow_buffer::{ArrowNativeType, BooleanBuffer, ScalarBuffer, ToByteSlice};
36
use arrow_schema::{ArrowError, DataType};
37
38
/// Garbage collects a [DictionaryArray] by removing unreferenced values.
39
///
40
/// Returns a new [DictionaryArray] such that there are no values
41
/// that are not referenced by at least one key. There may still be duplicate
42
/// values.
43
///
44
/// See also [`garbage_collect_any_dictionary`] if you need to handle multiple dictionary types
45
0
pub fn garbage_collect_dictionary<K: ArrowDictionaryKeyType>(
46
0
    dictionary: &DictionaryArray<K>,
47
0
) -> Result<DictionaryArray<K>, ArrowError> {
48
0
    let keys = dictionary.keys();
49
0
    let values = dictionary.values();
50
51
0
    let mask = dictionary.occupancy();
52
53
    // If no work to do, return the original dictionary
54
0
    if mask.count_set_bits() == values.len() {
55
0
        return Ok(dictionary.clone());
56
0
    }
57
58
    // Create a mapping from the old keys to the new keys, use a Vec for easy indexing
59
0
    let mut key_remap = vec![K::Native::ZERO; values.len()];
60
0
    for (new_idx, old_idx) in mask.set_indices().enumerate() {
61
0
        key_remap[old_idx] = K::Native::from_usize(new_idx)
62
0
            .expect("new index should fit in K::Native, as old index was in range");
63
0
    }
64
65
    // ... and then build the new keys array
66
0
    let new_keys = keys.unary(|key| {
67
0
        key_remap
68
0
            .get(key.as_usize())
69
0
            .copied()
70
            // nulls may be present in the keys, and they will have arbitrary value; we don't care
71
            // and can safely return zero
72
0
            .unwrap_or(K::Native::ZERO)
73
0
    });
74
75
    // Create a new values array by filtering using the mask
76
0
    let values = filter(dictionary.values(), &BooleanArray::new(mask, None))?;
77
78
0
    Ok(DictionaryArray::new(new_keys, values))
79
0
}
80
81
/// Equivalent to [`garbage_collect_dictionary`] but without requiring casting to a specific key type.
82
0
pub fn garbage_collect_any_dictionary(
83
0
    dictionary: &dyn AnyDictionaryArray,
84
0
) -> Result<ArrayRef, ArrowError> {
85
    // FIXME: this is a workaround for MSRV Rust versions below 1.86 where trait upcasting is not stable.
86
    // From 1.86 onward, `&dyn AnyDictionaryArray` can be directly passed to `downcast_dictionary_array!`.
87
0
    let dictionary = &*dictionary.slice(0, dictionary.len());
88
0
    downcast_dictionary_array!(
89
0
        dictionary => garbage_collect_dictionary(dictionary).map(|dict| Arc::new(dict) as ArrayRef),
90
0
        _ => unreachable!("have a dictionary array")
91
    )
92
0
}
93
94
/// A best effort interner that maintains a fixed number of buckets
95
/// and interns keys based on their hash value
96
///
97
/// Hash collisions will result in replacement
98
struct Interner<'a, V> {
99
    state: RandomState,
100
    buckets: Vec<Option<InternerBucket<'a, V>>>,
101
    shift: u32,
102
}
103
104
/// A single bucket in [`Interner`].
105
type InternerBucket<'a, V> = (Option<&'a [u8]>, V);
106
107
impl<'a, V> Interner<'a, V> {
108
    /// Capacity controls the number of unique buckets allocated within the Interner
109
    ///
110
    /// A larger capacity reduces the probability of hash collisions, and should be set
111
    /// based on an approximation of the upper bound of unique values
112
3
    fn new(capacity: usize) -> Self {
113
        // Add additional buckets to help reduce collisions
114
3
        let shift = (capacity as u64 + 128).leading_zeros();
115
3
        let num_buckets = (u64::MAX >> shift) as usize;
116
3
        let buckets = (0..num_buckets.saturating_add(1)).map(|_| None).collect();
117
3
        Self {
118
3
            // A fixed seed to ensure deterministic behaviour
119
3
            state: RandomState::with_seeds(0, 0, 0, 0),
120
3
            buckets,
121
3
            shift,
122
3
        }
123
3
    }
124
125
11
    fn intern<F: FnOnce() -> Result<V, E>, E>(
126
11
        &mut self,
127
11
        new: Option<&'a [u8]>,
128
11
        f: F,
129
11
    ) -> Result<&V, E> {
130
11
        let hash = self.state.hash_one(new);
131
11
        let bucket_idx = hash >> self.shift;
132
11
        Ok(match &mut self.buckets[bucket_idx as usize] {
133
0
            Some((current, v)) => {
134
0
                if *current != new {
135
0
                    *v = f()?;
136
0
                    *current = new;
137
0
                }
138
0
                v
139
            }
140
11
            slot => &slot.insert((new, f()
?0
)).1,
141
        })
142
11
    }
143
}
144
145
pub(crate) struct MergedDictionaries<K: ArrowDictionaryKeyType> {
146
    /// Provides `key_mappings[`array_idx`][`old_key`] -> new_key`
147
    pub key_mappings: Vec<Vec<K::Native>>,
148
    /// The new values
149
    pub values: ArrayRef,
150
}
151
152
/// Performs a cheap, pointer-based comparison of two byte array
153
///
154
/// See [`ScalarBuffer::ptr_eq`]
155
3
fn bytes_ptr_eq<T: ByteArrayType>(a: &dyn Array, b: &dyn Array) -> bool {
156
3
    match (a.as_bytes_opt::<T>(), b.as_bytes_opt::<T>()) {
157
3
        (Some(a), Some(b)) => {
158
3
            let values_eq = a.values().ptr_eq(b.values()) && 
a.offsets()0
.
ptr_eq0
(
b.offsets()0
);
159
3
            match (a.nulls(), b.nulls()) {
160
0
                (Some(a), Some(b)) => values_eq && a.inner().ptr_eq(b.inner()),
161
3
                (None, None) => values_eq,
162
0
                _ => false,
163
            }
164
        }
165
0
        _ => false,
166
    }
167
3
}
168
169
/// A type-erased function that compares two array for pointer equality
170
type PtrEq = fn(&dyn Array, &dyn Array) -> bool;
171
172
/// A weak heuristic of whether to merge dictionary values that aims to only
173
/// perform the expensive merge computation when it is likely to yield at least
174
/// some return over the naive approach used by MutableArrayData
175
///
176
/// `len` is the total length of the merged output
177
3
pub(crate) fn should_merge_dictionary_values<K: ArrowDictionaryKeyType>(
178
3
    dictionaries: &[&DictionaryArray<K>],
179
3
    len: usize,
180
3
) -> bool {
181
    use DataType::*;
182
3
    let first_values = dictionaries[0].values().as_ref();
183
3
    let ptr_eq: PtrEq = match first_values.data_type() {
184
3
        Utf8 => bytes_ptr_eq::<Utf8Type>,
185
0
        LargeUtf8 => bytes_ptr_eq::<LargeUtf8Type>,
186
0
        Binary => bytes_ptr_eq::<BinaryType>,
187
0
        LargeBinary => bytes_ptr_eq::<LargeBinaryType>,
188
0
        dt => {
189
0
            if !dt.is_primitive() {
190
0
                return false;
191
0
            }
192
0
            |a, b| a.to_data().ptr_eq(&b.to_data())
193
        }
194
    };
195
196
3
    let mut single_dictionary = true;
197
3
    let mut total_values = first_values.len();
198
3
    for dict in dictionaries.iter().skip(1) {
199
3
        let values = dict.values().as_ref();
200
3
        total_values += values.len();
201
3
        if single_dictionary {
202
3
            single_dictionary = ptr_eq(first_values, values)
203
0
        }
204
    }
205
206
3
    let overflow = K::Native::from_usize(total_values).is_none();
207
3
    let values_exceed_length = total_values >= len;
208
209
3
    !single_dictionary && (overflow || values_exceed_length)
210
3
}
211
212
/// Given an array of dictionaries and an optional key mask compute a values array
213
/// containing referenced values, along with mappings from the [`DictionaryArray`]
214
/// keys to the new keys within this values array. Best-effort will be made to ensure
215
/// that the dictionary values are unique
216
///
217
/// This method is meant to be very fast and the output dictionary values
218
/// may not be unique, unlike `GenericByteDictionaryBuilder` which is slower
219
/// but produces unique values
220
3
pub(crate) fn merge_dictionary_values<K: ArrowDictionaryKeyType>(
221
3
    dictionaries: &[&DictionaryArray<K>],
222
3
    masks: Option<&[BooleanBuffer]>,
223
3
) -> Result<MergedDictionaries<K>, ArrowError> {
224
3
    let mut num_values = 0;
225
226
3
    let mut values_arrays = Vec::with_capacity(dictionaries.len());
227
3
    let mut value_slices = Vec::with_capacity(dictionaries.len());
228
229
6
    for (idx, dictionary) in 
dictionaries3
.
iter3
().
enumerate3
() {
230
6
        let mask = masks.and_then(|m| 
m0
.
get0
(
idx0
));
231
        let key_mask_owned;
232
6
        let key_mask = match (dictionary.nulls(), mask) {
233
1
            (Some(n), None) => Some(n.inner()),
234
0
            (None, Some(n)) => Some(n),
235
0
            (Some(n), Some(m)) => {
236
0
                key_mask_owned = n.inner() & m;
237
0
                Some(&key_mask_owned)
238
            }
239
5
            (None, None) => None,
240
        };
241
6
        let keys = dictionary.keys().values();
242
6
        let values = dictionary.values().as_ref();
243
6
        let values_mask = compute_values_mask(keys, key_mask, values.len());
244
245
6
        let masked_values = get_masked_values(values, &values_mask);
246
6
        num_values += masked_values.len();
247
6
        value_slices.push(masked_values);
248
6
        values_arrays.push(values)
249
    }
250
251
    // Map from value to new index
252
3
    let mut interner = Interner::new(num_values);
253
    // Interleave indices for new values array
254
3
    let mut indices = Vec::with_capacity(num_values);
255
256
    // Compute the mapping for each dictionary
257
3
    let key_mappings = dictionaries
258
3
        .iter()
259
3
        .enumerate()
260
3
        .zip(value_slices)
261
6
        .
map3
(|((dictionary_idx, dictionary), values)| {
262
6
            let zero = K::Native::from_usize(0).unwrap();
263
6
            let mut mapping = vec![zero; dictionary.values().len()];
264
265
17
            for (
value_idx11
,
value11
) in values {
266
11
                mapping[value_idx] =
267
11
                    *interner.intern(value, || match K::Native::from_usize(indices.len()) {
268
11
                        Some(idx) => {
269
11
                            indices.push((dictionary_idx, value_idx));
270
11
                            Ok(idx)
271
                        }
272
0
                        None => Err(ArrowError::DictionaryKeyOverflowError),
273
11
                    })
?0
;
274
            }
275
6
            Ok(mapping)
276
6
        })
277
3
        .collect::<Result<Vec<_>, ArrowError>>()
?0
;
278
279
    Ok(MergedDictionaries {
280
3
        key_mappings,
281
3
        values: interleave(&values_arrays, &indices)
?0
,
282
    })
283
3
}
284
285
/// Return a mask identifying the values that are referenced by keys in `dictionary`
286
/// at the positions indicated by `selection`
287
6
fn compute_values_mask<K: ArrowNativeType>(
288
6
    keys: &ScalarBuffer<K>,
289
6
    mask: Option<&BooleanBuffer>,
290
6
    max_key: usize,
291
6
) -> BooleanBuffer {
292
6
    let mut builder = BooleanBufferBuilder::new(max_key);
293
6
    builder.advance(max_key);
294
295
6
    match mask {
296
1
        Some(n) => n
297
1
            .set_indices()
298
1
            .for_each(|idx| builder.set_bit(keys[idx].as_usize(), true)),
299
5
        None => keys
300
5
            .iter()
301
10
            .
for_each5
(|k| builder.set_bit(k.as_usize(), true)),
302
    }
303
6
    builder.finish()
304
6
}
305
306
/// Process primitive array values to bytes
307
0
fn masked_primitives_to_bytes<'a, T: ArrowPrimitiveType>(
308
0
    array: &'a PrimitiveArray<T>,
309
0
    mask: &BooleanBuffer,
310
0
) -> Vec<(usize, Option<&'a [u8]>)>
311
0
where
312
0
    T::Native: ToByteSlice,
313
{
314
0
    let mut out = Vec::with_capacity(mask.count_set_bits());
315
0
    let values = array.values();
316
0
    for idx in mask.set_indices() {
317
0
        out.push((
318
0
            idx,
319
0
            array.is_valid(idx).then_some(values[idx].to_byte_slice()),
320
0
        ))
321
    }
322
0
    out
323
0
}
324
325
macro_rules! masked_primitive_to_bytes_helper {
326
    ($t:ty, $array:expr, $mask:expr) => {
327
        masked_primitives_to_bytes::<$t>($array.as_primitive(), $mask)
328
    };
329
}
330
331
/// Return a Vec containing for each set index in `mask`, the index and byte value of that index
332
6
fn get_masked_values<'a>(
333
6
    array: &'a dyn Array,
334
6
    mask: &BooleanBuffer,
335
6
) -> Vec<(usize, Option<&'a [u8]>)> {
336
0
    downcast_primitive! {
337
6
        array.data_type() => (masked_primitive_to_bytes_helper, 
array0
,
mask0
),
338
6
        DataType::Utf8 => masked_bytes(array.as_string::<i32>(), mask),
339
0
        DataType::LargeUtf8 => masked_bytes(array.as_string::<i64>(), mask),
340
0
        DataType::Binary => masked_bytes(array.as_binary::<i32>(), mask),
341
0
        DataType::LargeBinary => masked_bytes(array.as_binary::<i64>(), mask),
342
0
        _ => unimplemented!("Dictionary merging for type {} is not implemented", array.data_type()),
343
    }
344
6
}
345
346
/// Compute [`get_masked_values`] for a [`GenericByteArray`]
347
///
348
/// Note: this does not check the null mask and will return values contained in null slots
349
6
fn masked_bytes<'a, T: ByteArrayType>(
350
6
    array: &'a GenericByteArray<T>,
351
6
    mask: &BooleanBuffer,
352
6
) -> Vec<(usize, Option<&'a [u8]>)> {
353
6
    let mut out = Vec::with_capacity(mask.count_set_bits());
354
11
    for idx in 
mask6
.
set_indices6
() {
355
11
        out.push((
356
11
            idx,
357
11
            array.is_valid(idx).then_some(array.value(idx).as_ref()),
358
11
        ))
359
    }
360
6
    out
361
6
}
362
363
#[cfg(test)]
364
mod tests {
365
    use super::*;
366
367
    use arrow_array::cast::as_string_array;
368
    use arrow_array::types::Int32Type;
369
    use arrow_array::types::Int8Type;
370
    use arrow_array::{DictionaryArray, Int32Array, Int8Array, StringArray};
371
    use arrow_buffer::{BooleanBuffer, Buffer, NullBuffer, OffsetBuffer};
372
    use std::sync::Arc;
373
374
    #[test]
375
    fn test_garbage_collect_i32_dictionary() {
376
        let values = StringArray::from_iter_values(["a", "b", "c", "d"]);
377
        let keys = Int32Array::from_iter_values([0, 1, 1, 3, 0, 0, 1]);
378
        let dict = DictionaryArray::<Int32Type>::new(keys, Arc::new(values));
379
380
        // Only "a", "b", "d" are referenced, "c" is not
381
        let gc = garbage_collect_dictionary(&dict).unwrap();
382
383
        let expected_values = StringArray::from_iter_values(["a", "b", "d"]);
384
        let expected_keys = Int32Array::from_iter_values([0, 1, 1, 2, 0, 0, 1]);
385
        let expected = DictionaryArray::<Int32Type>::new(expected_keys, Arc::new(expected_values));
386
387
        assert_eq!(gc, expected);
388
    }
389
390
    #[test]
391
    fn test_garbage_collect_any_dictionary() {
392
        let values = StringArray::from_iter_values(["a", "b", "c", "d"]);
393
        let keys = Int32Array::from_iter_values([0, 1, 1, 3, 0, 0, 1]);
394
        let dict = DictionaryArray::<Int32Type>::new(keys, Arc::new(values));
395
396
        let gc = garbage_collect_any_dictionary(&dict).unwrap();
397
398
        let expected_values = StringArray::from_iter_values(["a", "b", "d"]);
399
        let expected_keys = Int32Array::from_iter_values([0, 1, 1, 2, 0, 0, 1]);
400
        let expected = DictionaryArray::<Int32Type>::new(expected_keys, Arc::new(expected_values));
401
402
        assert_eq!(gc.as_ref(), &expected);
403
    }
404
405
    #[test]
406
    fn test_garbage_collect_with_nulls() {
407
        let values = StringArray::from_iter_values(["a", "b", "c"]);
408
        let keys = Int8Array::from(vec![Some(2), None, Some(0)]);
409
        let dict = DictionaryArray::<Int8Type>::new(keys, Arc::new(values));
410
411
        let gc = garbage_collect_dictionary(&dict).unwrap();
412
413
        let expected_values = StringArray::from_iter_values(["a", "c"]);
414
        let expected_keys = Int8Array::from(vec![Some(1), None, Some(0)]);
415
        let expected = DictionaryArray::<Int8Type>::new(expected_keys, Arc::new(expected_values));
416
417
        assert_eq!(gc, expected);
418
    }
419
420
    #[test]
421
    fn test_garbage_collect_empty_dictionary() {
422
        let values = StringArray::from_iter_values::<&str, _>([]);
423
        let keys = Int32Array::from_iter_values([]);
424
        let dict = DictionaryArray::<Int32Type>::new(keys, Arc::new(values));
425
426
        let gc = garbage_collect_dictionary(&dict).unwrap();
427
428
        assert_eq!(gc, dict);
429
    }
430
431
    #[test]
432
    fn test_garbage_collect_dictionary_all_unreferenced() {
433
        let values = StringArray::from_iter_values(["a", "b", "c"]);
434
        let keys = Int32Array::from(vec![None, None, None]);
435
        let dict = DictionaryArray::<Int32Type>::new(keys, Arc::new(values));
436
437
        let gc = garbage_collect_dictionary(&dict).unwrap();
438
439
        // All keys are null, so dictionary values can be empty
440
        let expected_values = StringArray::from_iter_values::<&str, _>([]);
441
        let expected_keys = Int32Array::from(vec![None, None, None]);
442
        let expected = DictionaryArray::<Int32Type>::new(expected_keys, Arc::new(expected_values));
443
444
        assert_eq!(gc, expected);
445
    }
446
447
    #[test]
448
    fn test_merge_strings() {
449
        let a = DictionaryArray::<Int32Type>::from_iter(["a", "b", "a", "b", "d", "c", "e"]);
450
        let b = DictionaryArray::<Int32Type>::from_iter(["c", "f", "c", "d", "a", "d"]);
451
        let merged = merge_dictionary_values(&[&a, &b], None).unwrap();
452
453
        let values = as_string_array(merged.values.as_ref());
454
        let actual: Vec<_> = values.iter().map(Option::unwrap).collect();
455
        assert_eq!(&actual, &["a", "b", "d", "c", "e", "f"]);
456
457
        assert_eq!(merged.key_mappings.len(), 2);
458
        assert_eq!(&merged.key_mappings[0], &[0, 1, 2, 3, 4]);
459
        assert_eq!(&merged.key_mappings[1], &[3, 5, 2, 0]);
460
461
        let a_slice = a.slice(1, 4);
462
        let merged = merge_dictionary_values(&[&a_slice, &b], None).unwrap();
463
464
        let values = as_string_array(merged.values.as_ref());
465
        let actual: Vec<_> = values.iter().map(Option::unwrap).collect();
466
        assert_eq!(&actual, &["a", "b", "d", "c", "f"]);
467
468
        assert_eq!(merged.key_mappings.len(), 2);
469
        assert_eq!(&merged.key_mappings[0], &[0, 1, 2, 0, 0]);
470
        assert_eq!(&merged.key_mappings[1], &[3, 4, 2, 0]);
471
472
        // Mask out only ["b", "b", "d"] from a
473
        let a_mask = BooleanBuffer::from_iter([false, true, false, true, true, false, false]);
474
        let b_mask = BooleanBuffer::new_set(b.len());
475
        let merged = merge_dictionary_values(&[&a, &b], Some(&[a_mask, b_mask])).unwrap();
476
477
        let values = as_string_array(merged.values.as_ref());
478
        let actual: Vec<_> = values.iter().map(Option::unwrap).collect();
479
        assert_eq!(&actual, &["b", "d", "c", "f", "a"]);
480
481
        assert_eq!(merged.key_mappings.len(), 2);
482
        assert_eq!(&merged.key_mappings[0], &[0, 0, 1, 0, 0]);
483
        assert_eq!(&merged.key_mappings[1], &[2, 3, 1, 4]);
484
    }
485
486
    #[test]
487
    fn test_merge_nulls() {
488
        let buffer = Buffer::from(b"helloworldbingohelloworld");
489
        let offsets = OffsetBuffer::from_lengths([5, 5, 5, 5, 5]);
490
        let nulls = NullBuffer::from(vec![true, false, true, true, true]);
491
        let values = StringArray::new(offsets, buffer, Some(nulls));
492
493
        let key_values = vec![1, 2, 3, 1, 8, 2, 3];
494
        let key_nulls = NullBuffer::from(vec![true, true, false, true, false, true, true]);
495
        let keys = Int32Array::new(key_values.into(), Some(key_nulls));
496
        let a = DictionaryArray::new(keys, Arc::new(values));
497
        // [NULL, "bingo", NULL, NULL, NULL, "bingo", "hello"]
498
499
        let b = DictionaryArray::new(Int32Array::new_null(10), Arc::new(StringArray::new_null(0)));
500
501
        let merged = merge_dictionary_values(&[&a, &b], None).unwrap();
502
        let expected = StringArray::from(vec![None, Some("bingo"), Some("hello")]);
503
        assert_eq!(merged.values.as_ref(), &expected);
504
        assert_eq!(merged.key_mappings.len(), 2);
505
        assert_eq!(&merged.key_mappings[0], &[0, 0, 1, 2, 0]);
506
        assert_eq!(&merged.key_mappings[1], &[] as &[i32; 0]);
507
    }
508
509
    #[test]
510
    fn test_merge_keys_smaller() {
511
        let values = StringArray::from_iter_values(["a", "b"]);
512
        let keys = Int32Array::from_iter_values([1]);
513
        let a = DictionaryArray::new(keys, Arc::new(values));
514
515
        let merged = merge_dictionary_values(&[&a], None).unwrap();
516
        let expected = StringArray::from(vec!["b"]);
517
        assert_eq!(merged.values.as_ref(), &expected);
518
    }
519
}