Coverage Report

Created: 2025-08-26 07:03

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/Users/andrewlamb/Software/arrow-rs/arrow-array/src/array/dictionary_array.rs
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
use crate::builder::{PrimitiveDictionaryBuilder, StringDictionaryBuilder};
19
use crate::cast::AsArray;
20
use crate::iterator::ArrayIter;
21
use crate::types::*;
22
use crate::{
23
    make_array, Array, ArrayAccessor, ArrayRef, ArrowNativeTypeOp, PrimitiveArray, Scalar,
24
    StringArray,
25
};
26
use arrow_buffer::bit_util::set_bit;
27
use arrow_buffer::buffer::NullBuffer;
28
use arrow_buffer::{ArrowNativeType, BooleanBuffer, BooleanBufferBuilder};
29
use arrow_data::ArrayData;
30
use arrow_schema::{ArrowError, DataType};
31
use std::any::Any;
32
use std::sync::Arc;
33
34
/// A [`DictionaryArray`] indexed by `i8`
35
///
36
/// # Example: Using `collect`
37
/// ```
38
/// # use arrow_array::{Array, Int8DictionaryArray, Int8Array, StringArray};
39
/// # use std::sync::Arc;
40
///
41
/// let array: Int8DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect();
42
/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
43
/// assert_eq!(array.keys(), &Int8Array::from(vec![0, 0, 1, 2]));
44
/// assert_eq!(array.values(), &values);
45
/// ```
46
///
47
/// Alias of `DictionaryArray<Int8Type>`; see [`DictionaryArray`] for more information and examples
48
pub type Int8DictionaryArray = DictionaryArray<Int8Type>;
49
50
/// A [`DictionaryArray`] indexed by `i16`
51
///
52
/// # Example: Using `collect`
53
/// ```
54
/// # use arrow_array::{Array, Int16DictionaryArray, Int16Array, StringArray};
55
/// # use std::sync::Arc;
56
///
57
/// let array: Int16DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect();
58
/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
59
/// assert_eq!(array.keys(), &Int16Array::from(vec![0, 0, 1, 2]));
60
/// assert_eq!(array.values(), &values);
61
/// ```
62
///
63
/// Alias of `DictionaryArray<Int16Type>`; see [`DictionaryArray`] for more information and examples
64
pub type Int16DictionaryArray = DictionaryArray<Int16Type>;
65
66
/// A [`DictionaryArray`] indexed by `i32`
67
///
68
/// # Example: Using `collect`
69
/// ```
70
/// # use arrow_array::{Array, Int32DictionaryArray, Int32Array, StringArray};
71
/// # use std::sync::Arc;
72
///
73
/// let array: Int32DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect();
74
/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
75
/// assert_eq!(array.keys(), &Int32Array::from(vec![0, 0, 1, 2]));
76
/// assert_eq!(array.values(), &values);
77
/// ```
78
///
79
/// Alias of `DictionaryArray<Int32Type>`; see [`DictionaryArray`] for more information and examples
80
pub type Int32DictionaryArray = DictionaryArray<Int32Type>;
81
82
/// A [`DictionaryArray`] indexed by `i64`
83
///
84
/// # Example: Using `collect`
85
/// ```
86
/// # use arrow_array::{Array, Int64DictionaryArray, Int64Array, StringArray};
87
/// # use std::sync::Arc;
88
///
89
/// let array: Int64DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect();
90
/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
91
/// assert_eq!(array.keys(), &Int64Array::from(vec![0, 0, 1, 2]));
92
/// assert_eq!(array.values(), &values);
93
/// ```
94
///
95
/// Alias of `DictionaryArray<Int64Type>`; see [`DictionaryArray`] for more information and examples
96
pub type Int64DictionaryArray = DictionaryArray<Int64Type>;
97
98
/// A [`DictionaryArray`] indexed by `u8`
99
///
100
/// # Example: Using `collect`
101
/// ```
102
/// # use arrow_array::{Array, UInt8DictionaryArray, UInt8Array, StringArray};
103
/// # use std::sync::Arc;
104
///
105
/// let array: UInt8DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect();
106
/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
107
/// assert_eq!(array.keys(), &UInt8Array::from(vec![0, 0, 1, 2]));
108
/// assert_eq!(array.values(), &values);
109
/// ```
110
///
111
/// Alias of `DictionaryArray<UInt8Type>`; see [`DictionaryArray`] for more information and examples
112
pub type UInt8DictionaryArray = DictionaryArray<UInt8Type>;
113
114
/// A [`DictionaryArray`] indexed by `u16`
115
///
116
/// # Example: Using `collect`
117
/// ```
118
/// # use arrow_array::{Array, UInt16DictionaryArray, UInt16Array, StringArray};
119
/// # use std::sync::Arc;
120
///
121
/// let array: UInt16DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect();
122
/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
123
/// assert_eq!(array.keys(), &UInt16Array::from(vec![0, 0, 1, 2]));
124
/// assert_eq!(array.values(), &values);
125
/// ```
126
///
127
/// Alias of `DictionaryArray<UInt16Type>`; see [`DictionaryArray`] for more information and examples
128
pub type UInt16DictionaryArray = DictionaryArray<UInt16Type>;
129
130
/// A [`DictionaryArray`] indexed by `u32`
131
///
132
/// # Example: Using `collect`
133
/// ```
134
/// # use arrow_array::{Array, UInt32DictionaryArray, UInt32Array, StringArray};
135
/// # use std::sync::Arc;
136
///
137
/// let array: UInt32DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect();
138
/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
139
/// assert_eq!(array.keys(), &UInt32Array::from(vec![0, 0, 1, 2]));
140
/// assert_eq!(array.values(), &values);
141
/// ```
142
///
143
/// Alias of `DictionaryArray<UInt32Type>`; see [`DictionaryArray`] for more information and examples
144
pub type UInt32DictionaryArray = DictionaryArray<UInt32Type>;
145
146
/// A [`DictionaryArray`] indexed by `u64`
147
///
148
/// # Example: Using `collect`
149
/// ```
150
/// # use arrow_array::{Array, UInt64DictionaryArray, UInt64Array, StringArray};
151
/// # use std::sync::Arc;
152
///
153
/// let array: UInt64DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect();
154
/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
155
/// assert_eq!(array.keys(), &UInt64Array::from(vec![0, 0, 1, 2]));
156
/// assert_eq!(array.values(), &values);
157
/// ```
158
///
159
/// Alias of `DictionaryArray<UInt64Type>`; see [`DictionaryArray`] for more information and examples
160
pub type UInt64DictionaryArray = DictionaryArray<UInt64Type>;
161
162
/// An array of [dictionary encoded values](https://arrow.apache.org/docs/format/Columnar.html#dictionary-encoded-layout)
163
///
164
/// This is mostly used to represent strings or a limited set of primitive types as integers,
165
/// for example when doing NLP analysis or representing chromosomes by name.
166
///
167
/// [`DictionaryArray`] are represented using a `keys` array and a
168
/// `values` array, which may be different lengths. The `keys` array
169
/// stores indexes in the `values` array which holds
170
/// the corresponding logical value, as shown here:
171
///
172
/// ```text
173
/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
174
///   ┌─────────────────┐  ┌─────────┐ │     ┌─────────────────┐
175
/// │ │        A        │  │    0    │       │        A        │     values[keys[0]]
176
///   ├─────────────────┤  ├─────────┤ │     ├─────────────────┤
177
/// │ │        D        │  │    2    │       │        B        │     values[keys[1]]
178
///   ├─────────────────┤  ├─────────┤ │     ├─────────────────┤
179
/// │ │        B        │  │    2    │       │        B        │     values[keys[2]]
180
///   └─────────────────┘  ├─────────┤ │     ├─────────────────┤
181
/// │                      │    1    │       │        D        │     values[keys[3]]
182
///                        ├─────────┤ │     ├─────────────────┤
183
/// │                      │    1    │       │        D        │     values[keys[4]]
184
///                        ├─────────┤ │     ├─────────────────┤
185
/// │                      │    0    │       │        A        │     values[keys[5]]
186
///                        └─────────┘ │     └─────────────────┘
187
/// │       values            keys
188
///  ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘
189
///                                             Logical array
190
///                                                Contents
191
///           DictionaryArray
192
///              length = 6
193
/// ```
194
///
195
/// # Example: From Nullable Data
196
///
197
/// ```
198
/// # use arrow_array::{DictionaryArray, Int8Array, types::Int8Type};
199
/// let test = vec!["a", "a", "b", "c"];
200
/// let array : DictionaryArray<Int8Type> = test.iter().map(|&x| if x == "b" {None} else {Some(x)}).collect();
201
/// assert_eq!(array.keys(), &Int8Array::from(vec![Some(0), Some(0), None, Some(1)]));
202
/// ```
203
///
204
/// # Example: From Non-Nullable Data
205
///
206
/// ```
207
/// # use arrow_array::{DictionaryArray, Int8Array, types::Int8Type};
208
/// let test = vec!["a", "a", "b", "c"];
209
/// let array : DictionaryArray<Int8Type> = test.into_iter().collect();
210
/// assert_eq!(array.keys(), &Int8Array::from(vec![0, 0, 1, 2]));
211
/// ```
212
///
213
/// # Example: From Existing Arrays
214
///
215
/// ```
216
/// # use std::sync::Arc;
217
/// # use arrow_array::{DictionaryArray, Int8Array, StringArray, types::Int8Type};
218
/// // You can form your own DictionaryArray by providing the
219
/// // values (dictionary) and keys (indexes into the dictionary):
220
/// let values = StringArray::from_iter_values(["a", "b", "c"]);
221
/// let keys = Int8Array::from_iter_values([0, 0, 1, 2]);
222
/// let array = DictionaryArray::<Int8Type>::try_new(keys, Arc::new(values)).unwrap();
223
/// let expected: DictionaryArray::<Int8Type> = vec!["a", "a", "b", "c"].into_iter().collect();
224
/// assert_eq!(&array, &expected);
225
/// ```
226
///
227
/// # Example: Using Builder
228
///
229
/// ```
230
/// # use arrow_array::{Array, StringArray};
231
/// # use arrow_array::builder::StringDictionaryBuilder;
232
/// # use arrow_array::types::Int32Type;
233
/// let mut builder = StringDictionaryBuilder::<Int32Type>::new();
234
/// builder.append_value("a");
235
/// builder.append_null();
236
/// builder.append_value("a");
237
/// builder.append_value("b");
238
/// let array = builder.finish();
239
///
240
/// let values: Vec<_> = array.downcast_dict::<StringArray>().unwrap().into_iter().collect();
241
/// assert_eq!(&values, &[Some("a"), None, Some("a"), Some("b")]);
242
/// ```
243
pub struct DictionaryArray<K: ArrowDictionaryKeyType> {
244
    /// The `DataType::Dictionary(key type, value type)` describing this array.
    data_type: DataType,
245
246
    /// The keys of this dictionary, one per logical element of the
247
    /// array. Note that these are not the logical values of this
248
    /// array. Rather, each valid key is an index into `values`,
249
    /// which holds the real values.
250
    keys: PrimitiveArray<K>,
251
252
    /// Array of dictionary values (can be any DataType).
253
    values: ArrayRef,
254
255
    /// Whether the values are ordered. The constructors visible in this file initialise it to `false`.
256
    is_ordered: bool,
257
}
258
259
impl<K: ArrowDictionaryKeyType> Clone for DictionaryArray<K> {
    /// Produces a copy of this array by cloning each of its fields.
    fn clone(&self) -> Self {
        let Self {
            data_type,
            keys,
            values,
            is_ordered,
        } = self;

        Self {
            data_type: data_type.clone(),
            keys: keys.clone(),
            values: values.clone(),
            is_ordered: *is_ordered,
        }
    }
}
269
270
impl<K: ArrowDictionaryKeyType> DictionaryArray<K> {
271
    /// Create a new DictionaryArray from the specified keys
272
    /// (indexes into the dictionary) and values (dictionary)
273
    /// array, validating the keys via [`Self::try_new`].
274
    ///
275
    /// # Panics
276
    ///
277
    /// Panics if [`Self::try_new`] returns an error
278
0
    pub fn new(keys: PrimitiveArray<K>, values: ArrayRef) -> Self {
279
0
        Self::try_new(keys, values).unwrap()
280
0
    }
281
282
    /// Attempt to create a new DictionaryArray with a specified keys
283
    /// (indexes into the dictionary) and values (dictionary)
284
    /// array.
285
    ///
286
    /// # Errors
287
    ///
288
    /// Returns an error if any `keys[i] >= values.len() || keys[i] < 0`
289
17
    pub fn try_new(keys: PrimitiveArray<K>, values: ArrayRef) -> Result<Self, ArrowError> {
290
17
        let data_type = DataType::Dictionary(
291
17
            Box::new(keys.data_type().clone()),
292
17
            Box::new(values.data_type().clone()),
293
17
        );
294
295
17
        let zero = K::Native::usize_as(0);
296
17
        let values_len = values.len();
297
298
0
        if let Some((idx, v)) =
299
51
            
keys.values().iter().enumerate()17
.
find17
(|(idx, v)| {
300
51
                (v.is_lt(zero) || v.as_usize() >= values_len) && 
keys0
.
is_valid0
(
*idx0
)
301
51
            })
302
        {
303
0
            return Err(ArrowError::InvalidArgumentError(format!(
304
0
                "Invalid dictionary key {v:?} at index {idx}, expected 0 <= key < {values_len}",
305
0
            )));
306
17
        }
307
308
17
        Ok(Self {
309
17
            data_type,
310
17
            keys,
311
17
            values,
312
17
            is_ordered: false,
313
17
        })
314
17
    }
315
316
    /// Create a new [`Scalar`] from `value`
317
    pub fn new_scalar<T: Array + 'static>(value: Scalar<T>) -> Scalar<Self> {
318
        Scalar::new(Self::new(
319
            PrimitiveArray::new(vec![K::Native::usize_as(0)].into(), None),
320
            Arc::new(value.into_inner()),
321
        ))
322
    }
323
324
    /// Create a new [`DictionaryArray`] without performing validation
325
    ///
326
    /// # Safety
327
    ///
328
    /// Safe provided [`Self::try_new`] would not return an error
329
3
    pub unsafe fn new_unchecked(keys: PrimitiveArray<K>, values: ArrayRef) -> Self {
330
3
        if cfg!(feature = "force_validate") {
331
0
            return Self::new(keys, values);
332
3
        }
333
334
3
        let data_type = DataType::Dictionary(
335
3
            Box::new(keys.data_type().clone()),
336
3
            Box::new(values.data_type().clone()),
337
3
        );
338
339
3
        Self {
340
3
            data_type,
341
3
            keys,
342
3
            values,
343
3
            is_ordered: false,
344
3
        }
345
3
    }
346
347
    /// Deconstruct this array into its constituent parts
348
    pub fn into_parts(self) -> (PrimitiveArray<K>, ArrayRef) {
349
        (self.keys, self.values)
350
    }
351
352
    /// Returns a reference to the keys of this dictionary as a [`PrimitiveArray`].
    ///
    /// Each valid (non-null) key is an index into [`Self::values`].
353
14
    pub fn keys(&self) -> &PrimitiveArray<K> {
354
14
        &self.keys
355
14
    }
356
357
    /// If `value` is present in `values` (aka the dictionary),
358
    /// returns the corresponding key (index into the `values`
359
    /// array). Otherwise returns `None`.
360
    ///
361
    /// Panics if `values` is not a [`StringArray`].
362
    pub fn lookup_key(&self, value: &str) -> Option<K::Native> {
363
        let rd_buf: &StringArray = self.values.as_any().downcast_ref::<StringArray>().unwrap();
364
365
        (0..rd_buf.len())
366
            .position(|i| rd_buf.value(i) == value)
367
            .and_then(K::Native::from_usize)
368
    }
369
370
    /// Returns a reference to the dictionary values array
    ///
    /// Its length is independent of [`Self::len`], which reports the number of keys.
371
23
    pub fn values(&self) -> &ArrayRef {
372
23
        &self.values
373
23
    }
374
375
    /// Returns a clone of the data type of the values of this dictionary.
376
    pub fn value_type(&self) -> DataType {
377
        self.values.data_type().clone()
378
    }
379
380
    /// The length of the dictionary is the length of the keys array,
    /// not the number of entries in the values array.
381
3
    pub fn len(&self) -> usize {
382
3
        self.keys.len()
383
3
    }
384
385
    /// Whether this dictionary is empty, i.e. whether it has no keys.
    /// The values array is not consulted.
386
1
    pub fn is_empty(&self) -> bool {
387
1
        self.keys.is_empty()
388
1
    }
389
390
    /// Returns the `is_ordered` flag stored on this dictionary.
    ///
    /// Currently exists for compatibility purposes with Arrow IPC.
391
    pub fn is_ordered(&self) -> bool {
392
        self.is_ordered
393
    }
394
395
    /// Return an iterator over the keys (indexes into the dictionary)
396
    pub fn keys_iter(&self) -> impl Iterator<Item = Option<usize>> + '_ {
397
        self.keys.iter().map(|key| key.map(|k| k.as_usize()))
398
    }
399
400
    /// Return the value of `keys` (the dictionary key) at index `i`,
401
    /// cast to `usize`, `None` if the value at `i` is `NULL`.
402
7
    pub fn key(&self, i: usize) -> Option<usize> {
403
7
        self.keys.is_valid(i).then(|| self.keys.value(i).as_usize())
404
7
    }
405
406
    /// Returns a zero-copy slice of this array with the indicated offset and length.
407
6
    pub fn slice(&self, offset: usize, length: usize) -> Self {
408
6
        Self {
409
6
            data_type: self.data_type.clone(),
410
6
            keys: self.keys.slice(offset, length),
411
6
            values: self.values.clone(),
412
6
            is_ordered: self.is_ordered,
413
6
        }
414
6
    }
415
416
    /// Downcast this dictionary to a [`TypedDictionaryArray`]
417
    ///
418
    /// ```
419
    /// use arrow_array::{Array, ArrayAccessor, DictionaryArray, StringArray, types::Int32Type};
420
    ///
421
    /// let orig = [Some("a"), Some("b"), None];
422
    /// let dictionary = DictionaryArray::<Int32Type>::from_iter(orig);
423
    /// let typed = dictionary.downcast_dict::<StringArray>().unwrap();
424
    /// assert_eq!(typed.value(0), "a");
425
    /// assert_eq!(typed.value(1), "b");
426
    /// assert!(typed.is_null(2));
427
    /// ```
428
    ///
429
0
    pub fn downcast_dict<V: 'static>(&self) -> Option<TypedDictionaryArray<'_, K, V>> {
430
0
        let values = self.values.as_any().downcast_ref()?;
431
0
        Some(TypedDictionaryArray {
432
0
            dictionary: self,
433
0
            values,
434
0
        })
435
0
    }
436
437
    /// Returns a new dictionary with the same keys as the current instance
438
    /// but with a different set of dictionary values
439
    ///
440
    /// This can be used to perform an operation on the values of a dictionary
441
    ///
442
    /// # Panics
443
    ///
444
    /// Panics if `values` has a length less than the current values
445
    ///
446
    /// ```
447
    /// # use std::sync::Arc;
448
    /// # use arrow_array::builder::PrimitiveDictionaryBuilder;
449
    /// # use arrow_array::{Int8Array, Int64Array, ArrayAccessor};
450
    /// # use arrow_array::types::{Int32Type, Int8Type};
451
    ///
452
    /// // Construct a Dict(Int32, Int8)
453
    /// let mut builder = PrimitiveDictionaryBuilder::<Int32Type, Int8Type>::with_capacity(2, 200);
454
    /// for i in 0..100 {
455
    ///     builder.append(i % 2).unwrap();
456
    /// }
457
    ///
458
    /// let dictionary = builder.finish();
459
    ///
460
    /// // Perform a widening cast of dictionary values
461
    /// let typed_dictionary = dictionary.downcast_dict::<Int8Array>().unwrap();
462
    /// let values: Int64Array = typed_dictionary.values().unary(|x| x as i64);
463
    ///
464
    /// // Create a Dict(Int32,
465
    /// let new = dictionary.with_values(Arc::new(values));
466
    ///
467
    /// // Verify values are as expected
468
    /// let new_typed = new.downcast_dict::<Int64Array>().unwrap();
469
    /// for i in 0..100 {
470
    ///     assert_eq!(new_typed.value(i), (i % 2) as i64)
471
    /// }
472
    /// ```
473
    ///
474
0
    pub fn with_values(&self, values: ArrayRef) -> Self {
475
0
        assert!(values.len() >= self.values.len());
476
0
        let data_type =
477
0
            DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(values.data_type().clone()));
478
0
        Self {
479
0
            data_type,
480
0
            keys: self.keys.clone(),
481
0
            values,
482
0
            is_ordered: false,
483
0
        }
484
0
    }
485
486
    /// Returns `PrimitiveDictionaryBuilder` of this dictionary array for mutating
487
    /// its keys and values if the underlying data buffer is not shared by others.
488
    #[allow(clippy::result_large_err)]
489
    pub fn into_primitive_dict_builder<V>(self) -> Result<PrimitiveDictionaryBuilder<K, V>, Self>
490
    where
491
        V: ArrowPrimitiveType,
492
    {
493
        if !self.value_type().is_primitive() {
494
            return Err(self);
495
        }
496
497
        let key_array = self.keys().clone();
498
        let value_array = self.values().as_primitive::<V>().clone();
499
500
        drop(self.keys);
501
        drop(self.values);
502
503
        let key_builder = key_array.into_builder();
504
        let value_builder = value_array.into_builder();
505
506
        match (key_builder, value_builder) {
507
            (Ok(key_builder), Ok(value_builder)) => Ok(unsafe {
508
                PrimitiveDictionaryBuilder::new_from_builders(key_builder, value_builder)
509
            }),
510
            (Err(key_array), Ok(mut value_builder)) => {
511
                Err(Self::try_new(key_array, Arc::new(value_builder.finish())).unwrap())
512
            }
513
            (Ok(mut key_builder), Err(value_array)) => {
514
                Err(Self::try_new(key_builder.finish(), Arc::new(value_array)).unwrap())
515
            }
516
            (Err(key_array), Err(value_array)) => {
517
                Err(Self::try_new(key_array, Arc::new(value_array)).unwrap())
518
            }
519
        }
520
    }
521
522
    /// Applies an unary and infallible function to a mutable dictionary array.
523
    /// Mutable dictionary array means that the buffers are not shared with other arrays.
524
    /// As a result, this mutates the buffers directly without allocating new buffers.
525
    ///
526
    /// # Implementation
527
    ///
528
    /// This will apply the function for all dictionary values, including those on null slots.
529
    /// This implies that the operation must be infallible for any value of the corresponding type
530
    /// or this function may panic.
531
    /// # Example
532
    /// ```
533
    /// # use std::sync::Arc;
534
    /// # use arrow_array::{Array, ArrayAccessor, DictionaryArray, StringArray, types::{Int8Type, Int32Type}};
535
    /// # use arrow_array::{Int8Array, Int32Array};
536
    /// let values = Int32Array::from(vec![Some(10), Some(20), None]);
537
    /// let keys = Int8Array::from_iter_values([0, 0, 1, 2]);
538
    /// let dictionary = DictionaryArray::<Int8Type>::try_new(keys, Arc::new(values)).unwrap();
539
    /// let c = dictionary.unary_mut::<_, Int32Type>(|x| x + 1).unwrap();
540
    /// let typed = c.downcast_dict::<Int32Array>().unwrap();
541
    /// assert_eq!(typed.value(0), 11);
542
    /// assert_eq!(typed.value(1), 11);
543
    /// assert_eq!(typed.value(2), 21);
544
    /// ```
545
    #[allow(clippy::result_large_err)]
546
    pub fn unary_mut<F, V>(self, op: F) -> Result<DictionaryArray<K>, DictionaryArray<K>>
547
    where
548
        V: ArrowPrimitiveType,
549
        F: Fn(V::Native) -> V::Native,
550
    {
551
        let mut builder: PrimitiveDictionaryBuilder<K, V> = self.into_primitive_dict_builder()?;
552
        builder
553
            .values_slice_mut()
554
            .iter_mut()
555
            .for_each(|v| *v = op(*v));
556
        Ok(builder.finish())
557
    }
558
559
    /// Computes an occupancy mask for this dictionary's values
560
    ///
561
    /// For each value in [`Self::values`] the corresponding bit will be set in the
562
    /// returned mask if it is referenced by a key in this [`DictionaryArray`]
563
0
    pub fn occupancy(&self) -> BooleanBuffer {
564
0
        let len = self.values.len();
565
0
        let mut builder = BooleanBufferBuilder::new(len);
566
0
        builder.resize(len);
567
0
        let slice = builder.as_slice_mut();
568
0
        match self.keys.nulls().filter(|n| n.null_count() > 0) {
569
0
            Some(n) => {
570
0
                let v = self.keys.values();
571
0
                n.valid_indices()
572
0
                    .for_each(|idx| set_bit(slice, v[idx].as_usize()))
573
            }
574
            None => {
575
0
                let v = self.keys.values();
576
0
                v.iter().for_each(|v| set_bit(slice, v.as_usize()))
577
            }
578
        }
579
0
        builder.finish()
580
0
    }
581
}
582
583
/// Constructs a `DictionaryArray` from an array data reference.
584
impl<T: ArrowDictionaryKeyType> From<ArrayData> for DictionaryArray<T> {
585
0
    fn from(data: ArrayData) -> Self {
586
0
        assert_eq!(
587
0
            data.buffers().len(),
588
            1,
589
0
            "DictionaryArray data should contain a single buffer only (keys)."
590
        );
591
0
        assert_eq!(
592
0
            data.child_data().len(),
593
            1,
594
0
            "DictionaryArray should contain a single child array (values)."
595
        );
596
597
0
        if let DataType::Dictionary(key_data_type, _) = data.data_type() {
598
0
            assert_eq!(
599
0
                &T::DATA_TYPE,
600
0
                key_data_type.as_ref(),
601
0
                "DictionaryArray's data type must match, expected {} got {}",
602
0
                T::DATA_TYPE,
603
                key_data_type
604
            );
605
606
0
            let values = make_array(data.child_data()[0].clone());
607
0
            let data_type = data.data_type().clone();
608
609
            // create a zero-copy of the keys' data
610
            // SAFETY:
611
            // ArrayData is valid and verified type above
612
613
0
            let keys = PrimitiveArray::<T>::from(unsafe {
614
0
                data.into_builder()
615
0
                    .data_type(T::DATA_TYPE)
616
0
                    .child_data(vec![])
617
0
                    .build_unchecked()
618
            });
619
620
0
            Self {
621
0
                data_type,
622
0
                keys,
623
0
                values,
624
0
                is_ordered: false,
625
0
            }
626
        } else {
627
0
            panic!("DictionaryArray must have Dictionary data type.")
628
        }
629
0
    }
630
}
631
632
impl<T: ArrowDictionaryKeyType> From<DictionaryArray<T>> for ArrayData {
633
12
    fn from(array: DictionaryArray<T>) -> Self {
634
12
        let builder = array
635
12
            .keys
636
12
            .into_data()
637
12
            .into_builder()
638
12
            .data_type(array.data_type)
639
12
            .child_data(vec![array.values.to_data()]);
640
641
12
        unsafe { builder.build_unchecked() }
642
12
    }
643
}
644
645
/// Constructs a `DictionaryArray` from an iterator of optional strings.
646
///
647
/// # Example:
648
/// ```
649
/// use arrow_array::{DictionaryArray, PrimitiveArray, StringArray, types::Int8Type};
650
///
651
/// let test = vec!["a", "a", "b", "c"];
652
/// let array: DictionaryArray<Int8Type> = test
653
///     .iter()
654
///     .map(|&x| if x == "b" { None } else { Some(x) })
655
///     .collect();
656
/// assert_eq!(
657
///     "DictionaryArray {keys: PrimitiveArray<Int8>\n[\n  0,\n  0,\n  null,\n  1,\n] values: StringArray\n[\n  \"a\",\n  \"c\",\n]}\n",
658
///     format!("{:?}", array)
659
/// );
660
/// ```
661
impl<'a, T: ArrowDictionaryKeyType> FromIterator<Option<&'a str>> for DictionaryArray<T> {
662
    fn from_iter<I: IntoIterator<Item = Option<&'a str>>>(iter: I) -> Self {
663
        let it = iter.into_iter();
664
        let (lower, _) = it.size_hint();
665
        let mut builder = StringDictionaryBuilder::with_capacity(lower, 256, 1024);
666
        builder.extend(it);
667
        builder.finish()
668
    }
669
}
670
671
/// Constructs a `DictionaryArray` from an iterator of strings.
672
///
673
/// # Example:
674
///
675
/// ```
676
/// use arrow_array::{DictionaryArray, PrimitiveArray, StringArray, types::Int8Type};
677
///
678
/// let test = vec!["a", "a", "b", "c"];
679
/// let array: DictionaryArray<Int8Type> = test.into_iter().collect();
680
/// assert_eq!(
681
///     "DictionaryArray {keys: PrimitiveArray<Int8>\n[\n  0,\n  0,\n  1,\n  2,\n] values: StringArray\n[\n  \"a\",\n  \"b\",\n  \"c\",\n]}\n",
682
///     format!("{:?}", array)
683
/// );
684
/// ```
685
impl<'a, T: ArrowDictionaryKeyType> FromIterator<&'a str> for DictionaryArray<T> {
686
    fn from_iter<I: IntoIterator<Item = &'a str>>(iter: I) -> Self {
687
        let it = iter.into_iter();
688
        let (lower, _) = it.size_hint();
689
        let mut builder = StringDictionaryBuilder::with_capacity(lower, 256, 1024);
690
        it.for_each(|i| {
691
            builder
692
                .append(i)
693
                .expect("Unable to append a value to a dictionary array.");
694
        });
695
696
        builder.finish()
697
    }
698
}
699
700
impl<T: ArrowDictionaryKeyType> Array for DictionaryArray<T> {
701
11
    fn as_any(&self) -> &dyn Any {
702
11
        self
703
11
    }
704
705
12
    fn to_data(&self) -> ArrayData {
706
12
        self.clone().into()
707
12
    }
708
709
0
    fn into_data(self) -> ArrayData {
710
0
        self.into()
711
0
    }
712
713
29
    fn data_type(&self) -> &DataType {
714
29
        &self.data_type
715
29
    }
716
717
6
    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
718
6
        Arc::new(self.slice(offset, length))
719
6
    }
720
721
43
    fn len(&self) -> usize {
722
43
        self.keys.len()
723
43
    }
724
725
0
    fn is_empty(&self) -> bool {
726
0
        self.keys.is_empty()
727
0
    }
728
729
0
    fn shrink_to_fit(&mut self) {
730
0
        self.keys.shrink_to_fit();
731
0
        self.values.shrink_to_fit();
732
0
    }
733
734
0
    fn offset(&self) -> usize {
735
0
        self.keys.offset()
736
0
    }
737
738
34
    fn nulls(&self) -> Option<&NullBuffer> {
739
34
        self.keys.nulls()
740
34
    }
741
742
0
    fn logical_nulls(&self) -> Option<NullBuffer> {
743
0
        match self.values.logical_nulls() {
744
0
            None => self.nulls().cloned(),
745
0
            Some(value_nulls) => {
746
0
                let mut builder = BooleanBufferBuilder::new(self.len());
747
0
                match self.keys.nulls() {
748
0
                    Some(n) => builder.append_buffer(n.inner()),
749
0
                    None => builder.append_n(self.len(), true),
750
                }
751
0
                for (idx, k) in self.keys.values().iter().enumerate() {
752
0
                    let k = k.as_usize();
753
                    // Check range to allow for nulls
754
0
                    if k < value_nulls.len() && value_nulls.is_null(k) {
755
0
                        builder.set_bit(idx, false);
756
0
                    }
757
                }
758
0
                Some(builder.finish().into())
759
            }
760
        }
761
0
    }
762
763
0
    fn logical_null_count(&self) -> usize {
764
0
        match (self.keys.nulls(), self.values.logical_nulls()) {
765
0
            (None, None) => 0,
766
0
            (Some(key_nulls), None) => key_nulls.null_count(),
767
0
            (None, Some(value_nulls)) => self
768
0
                .keys
769
0
                .values()
770
0
                .iter()
771
0
                .filter(|k| value_nulls.is_null(k.as_usize()))
772
0
                .count(),
773
0
            (Some(key_nulls), Some(value_nulls)) => self
774
0
                .keys
775
0
                .values()
776
0
                .iter()
777
0
                .enumerate()
778
0
                .filter(|(idx, k)| key_nulls.is_null(*idx) || value_nulls.is_null(k.as_usize()))
779
0
                .count(),
780
        }
781
0
    }
782
783
1
    fn is_nullable(&self) -> bool {
784
1
        !self.is_empty() && (self.nulls().is_some() || self.values.is_nullable())
785
1
    }
786
787
0
    fn get_buffer_memory_size(&self) -> usize {
788
0
        self.keys.get_buffer_memory_size() + self.values.get_buffer_memory_size()
789
0
    }
790
791
0
    fn get_array_memory_size(&self) -> usize {
792
0
        std::mem::size_of::<Self>()
793
0
            + self.keys.get_buffer_memory_size()
794
0
            + self.values.get_array_memory_size()
795
0
    }
796
}
797
798
/// Formats the keys followed by the values; the output ends with a newline.
impl<T: ArrowDictionaryKeyType> std::fmt::Debug for DictionaryArray<T> {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        let Self { keys, values, .. } = self;
        writeln!(
            f,
            "DictionaryArray {{keys: {:?} values: {:?}}}",
            keys, values
        )
    }
}
807
808
/// A [`DictionaryArray`] typed on its child values array
809
///
810
/// Implements [`ArrayAccessor`] allowing fast access to its elements
811
///
812
/// ```
813
/// use arrow_array::{DictionaryArray, StringArray, types::Int32Type};
814
///
815
/// let orig = ["a", "b", "a", "b"];
816
/// let dictionary = DictionaryArray::<Int32Type>::from_iter(orig);
817
///
818
/// // `TypedDictionaryArray` allows you to access the values directly
819
/// let typed = dictionary.downcast_dict::<StringArray>().unwrap();
820
///
821
/// for (maybe_val, orig) in typed.into_iter().zip(orig) {
822
///     assert_eq!(maybe_val.unwrap(), orig)
823
/// }
824
/// ```
825
pub struct TypedDictionaryArray<'a, K: ArrowDictionaryKeyType, V> {
826
    /// The dictionary array
827
    dictionary: &'a DictionaryArray<K>,
828
    /// The values of the dictionary
829
    values: &'a V,
830
}
831
832
// Manually implement `Clone` to avoid `V: Clone` type constraint
// (the struct only holds references, so it can be copied for any `V`)
impl<K: ArrowDictionaryKeyType, V> Clone for TypedDictionaryArray<'_, K, V> {
    /// Returns a bitwise copy of this view; the referenced arrays are not cloned.
    fn clone(&self) -> Self {
        *self
    }
}
838
839
// Both fields are shared references, so the view is `Copy` without requiring `V: Copy`
impl<K: ArrowDictionaryKeyType, V> Copy for TypedDictionaryArray<'_, K, V> {}
840
841
impl<K: ArrowDictionaryKeyType, V> std::fmt::Debug for TypedDictionaryArray<'_, K, V> {
    /// Writes the debug form of the wrapped dictionary, ending with a newline.
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        let inner = self.dictionary;
        writeln!(f, "TypedDictionaryArray({inner:?})")
    }
}
846
847
impl<'a, K: ArrowDictionaryKeyType, V> TypedDictionaryArray<'a, K, V> {
848
    /// Returns the keys of this [`TypedDictionaryArray`]
849
0
    pub fn keys(&self) -> &'a PrimitiveArray<K> {
850
0
        self.dictionary.keys()
851
0
    }
852
853
    /// Returns the values of this [`TypedDictionaryArray`]
854
0
    pub fn values(&self) -> &'a V {
855
0
        self.values
856
0
    }
857
}
858
859
impl<K: ArrowDictionaryKeyType, V: Sync> Array for TypedDictionaryArray<'_, K, V> {
860
    fn as_any(&self) -> &dyn Any {
861
        self.dictionary
862
    }
863
864
    fn to_data(&self) -> ArrayData {
865
        self.dictionary.to_data()
866
    }
867
868
    fn into_data(self) -> ArrayData {
869
        self.dictionary.into_data()
870
    }
871
872
    fn data_type(&self) -> &DataType {
873
        self.dictionary.data_type()
874
    }
875
876
    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
877
        Arc::new(self.dictionary.slice(offset, length))
878
    }
879
880
    fn len(&self) -> usize {
881
        self.dictionary.len()
882
    }
883
884
    fn is_empty(&self) -> bool {
885
        self.dictionary.is_empty()
886
    }
887
888
    fn offset(&self) -> usize {
889
        self.dictionary.offset()
890
    }
891
892
    fn nulls(&self) -> Option<&NullBuffer> {
893
        self.dictionary.nulls()
894
    }
895
896
    fn logical_nulls(&self) -> Option<NullBuffer> {
897
        self.dictionary.logical_nulls()
898
    }
899
900
    fn logical_null_count(&self) -> usize {
901
        self.dictionary.logical_null_count()
902
    }
903
904
    fn is_nullable(&self) -> bool {
905
        self.dictionary.is_nullable()
906
    }
907
908
    fn get_buffer_memory_size(&self) -> usize {
909
        self.dictionary.get_buffer_memory_size()
910
    }
911
912
    fn get_array_memory_size(&self) -> usize {
913
        self.dictionary.get_array_memory_size()
914
    }
915
}
916
917
impl<K, V> IntoIterator for TypedDictionaryArray<'_, K, V>
918
where
919
    K: ArrowDictionaryKeyType,
920
    Self: ArrayAccessor,
921
{
922
    type Item = Option<<Self as ArrayAccessor>::Item>;
923
    type IntoIter = ArrayIter<Self>;
924
925
    fn into_iter(self) -> Self::IntoIter {
926
        ArrayIter::new(self)
927
    }
928
}
929
930
impl<'a, K, V> ArrayAccessor for TypedDictionaryArray<'a, K, V>
931
where
932
    K: ArrowDictionaryKeyType,
933
    V: Sync + Send,
934
    &'a V: ArrayAccessor,
935
    <&'a V as ArrayAccessor>::Item: Default,
936
{
937
    type Item = <&'a V as ArrayAccessor>::Item;
938
939
    fn value(&self, index: usize) -> Self::Item {
940
        assert!(
941
            index < self.len(),
942
            "Trying to access an element at index {} from a TypedDictionaryArray of length {}",
943
            index,
944
            self.len()
945
        );
946
        unsafe { self.value_unchecked(index) }
947
    }
948
949
    unsafe fn value_unchecked(&self, index: usize) -> Self::Item {
950
        let val = self.dictionary.keys.value_unchecked(index);
951
        let value_idx = val.as_usize();
952
953
        // As dictionary keys are only verified for non-null indexes
954
        // we must check the value is within bounds
955
        match value_idx < self.values.len() {
956
            true => self.values.value_unchecked(value_idx),
957
            false => Default::default(),
958
        }
959
    }
960
}
961
962
/// A [`DictionaryArray`] with the key type erased
963
///
964
/// This can be used to efficiently implement kernels for all possible dictionary
965
/// keys without needing to create specialized implementations for each key type
966
///
967
/// For example
968
///
969
/// ```
970
/// # use arrow_array::*;
971
/// # use arrow_array::cast::AsArray;
972
/// # use arrow_array::builder::PrimitiveDictionaryBuilder;
973
/// # use arrow_array::types::*;
974
/// # use arrow_schema::ArrowError;
975
/// # use std::sync::Arc;
976
///
977
/// fn to_string(a: &dyn Array) -> Result<ArrayRef, ArrowError> {
978
///     if let Some(d) = a.as_any_dictionary_opt() {
979
///         // Recursively handle dictionary input
980
///         let r = to_string(d.values().as_ref())?;
981
///         return Ok(d.with_values(r));
982
///     }
983
///     downcast_primitive_array! {
984
///         a => Ok(Arc::new(a.iter().map(|x| x.map(|x| format!("{x:?}"))).collect::<StringArray>())),
985
///         d => Err(ArrowError::InvalidArgumentError(format!("{d:?} not supported")))
986
///     }
987
/// }
988
///
989
/// let result = to_string(&Int32Array::from(vec![1, 2, 3])).unwrap();
990
/// let actual = result.as_string::<i32>().iter().map(Option::unwrap).collect::<Vec<_>>();
991
/// assert_eq!(actual, &["1", "2", "3"]);
992
///
993
/// let mut dict = PrimitiveDictionaryBuilder::<Int32Type, UInt16Type>::new();
994
/// dict.extend([Some(1), Some(1), Some(2), Some(3), Some(2)]);
995
/// let dict = dict.finish();
996
///
997
/// let r = to_string(&dict).unwrap();
998
/// let r = r.as_dictionary::<Int32Type>().downcast_dict::<StringArray>().unwrap();
999
/// assert_eq!(r.keys(), dict.keys()); // Keys are the same
1000
///
1001
/// let actual = r.into_iter().map(Option::unwrap).collect::<Vec<_>>();
1002
/// assert_eq!(actual, &["1", "1", "2", "3", "2"]);
1003
/// ```
1004
///
1005
/// See [`AsArray::as_any_dictionary_opt`] and [`AsArray::as_any_dictionary`]
1006
pub trait AnyDictionaryArray: Array {
1007
    /// Returns the primitive keys of this dictionary as an [`Array`]
1008
    fn keys(&self) -> &dyn Array;
1009
1010
    /// Returns the values of this dictionary
1011
    fn values(&self) -> &ArrayRef;
1012
1013
    /// Returns the keys of this dictionary as usize
1014
    ///
1015
    /// The values for nulls will be arbitrary, but are guaranteed
1016
    /// to be in the range `0..self.values.len()`
1017
    ///
1018
    /// # Panic
1019
    ///
1020
    /// Panics if `values.len() == 0`
1021
    fn normalized_keys(&self) -> Vec<usize>;
1022
1023
    /// Create a new [`DictionaryArray`] replacing `values` with the new values
1024
    ///
1025
    /// See [`DictionaryArray::with_values`]
1026
    fn with_values(&self, values: ArrayRef) -> ArrayRef;
1027
}
1028
1029
impl<K: ArrowDictionaryKeyType> AnyDictionaryArray for DictionaryArray<K> {
1030
0
    fn keys(&self) -> &dyn Array {
1031
0
        &self.keys
1032
0
    }
1033
1034
0
    fn values(&self) -> &ArrayRef {
1035
0
        self.values()
1036
0
    }
1037
1038
0
    fn normalized_keys(&self) -> Vec<usize> {
1039
0
        let v_len = self.values().len();
1040
0
        assert_ne!(v_len, 0);
1041
0
        let iter = self.keys().values().iter();
1042
0
        iter.map(|x| x.as_usize().min(v_len - 1)).collect()
1043
0
    }
1044
1045
0
    fn with_values(&self, values: ArrayRef) -> ArrayRef {
1046
0
        Arc::new(self.with_values(values))
1047
0
    }
1048
}
1049
1050
#[cfg(test)]
mod tests {
    use super::*;
    use crate::cast::as_dictionary_array;
    use crate::{Int16Array, Int32Array, Int8Array, RunArray};
    use arrow_buffer::{Buffer, ToByteSlice};

    /// Builds dictionaries from raw `ArrayData`, with and without an offset.
    #[test]
    fn test_dictionary_array() {
        // Child array holding the eight `i8` dictionary values
        let raw_values = [10_i8, 11, 12, 13, 14, 15, 16, 17];
        let value_data = ArrayData::builder(DataType::Int8)
            .len(8)
            .add_buffer(Buffer::from(raw_values.to_byte_slice()))
            .build()
            .unwrap();

        // Raw `i16` key buffer shared by both dictionaries below
        let key_buffer = Buffer::from([2_i16, 3, 4].to_byte_slice());

        let dict_type = DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Int8));

        // Dictionary covering all three keys
        let full = Int16DictionaryArray::from(
            ArrayData::builder(dict_type.clone())
                .len(3)
                .add_buffer(key_buffer.clone())
                .add_child_data(value_data.clone())
                .build()
                .unwrap(),
        );

        assert_eq!(value_data, full.values().to_data());
        assert_eq!(DataType::Int8, full.value_type());
        assert_eq!(3, full.len());

        // Null count only makes sense in terms of the component arrays.
        assert_eq!(0, full.null_count());
        assert_eq!(0, full.values().null_count());
        assert_eq!(full.keys(), &Int16Array::from(vec![2_i16, 3, 4]));

        // Same buffers, viewed through a non-zero offset
        let shifted = Int16DictionaryArray::from(
            ArrayData::builder(dict_type)
                .len(2)
                .offset(1)
                .add_buffer(key_buffer)
                .add_child_data(value_data.clone())
                .build()
                .unwrap(),
        );

        assert_eq!(value_data, shifted.values().to_data());
        assert_eq!(DataType::Int8, shifted.value_type());
        assert_eq!(2, shifted.len());
        assert_eq!(shifted.keys(), &Int16Array::from(vec![3_i16, 4]));
    }

    /// Bulk append methods of the primitive dictionary builder.
    #[test]
    fn test_dictionary_builder_append_many() {
        let mut builder = PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::new();

        builder.append(1).unwrap();
        builder.append_n(2, 2).unwrap();
        builder.append_options(None, 2);
        builder.append_options(Some(3), 3);

        let array = builder.finish();

        let primitive_values = array.values().as_primitive::<UInt32Type>();
        let values: Vec<_> = primitive_values.iter().map(Option::unwrap).collect();
        assert_eq!(values, &[1, 2, 3]);

        let keys: Vec<_> = array.keys().iter().collect();
        assert_eq!(
            keys,
            &[
                Some(0),
                Some(1),
                Some(1),
                None,
                None,
                Some(2),
                Some(2),
                Some(2)
            ]
        );
    }

    /// Bulk append methods of the string dictionary builder.
    #[test]
    fn test_string_dictionary_builder_append_many() {
        let mut builder = StringDictionaryBuilder::<Int8Type>::new();

        builder.append("a").unwrap();
        builder.append_n("b", 2).unwrap();
        builder.append_options(None::<&str>, 2);
        builder.append_options(Some("c"), 3);

        let array = builder.finish();

        let string_values = array.values().as_string::<i32>();
        let values: Vec<_> = string_values.iter().map(Option::unwrap).collect();
        assert_eq!(values, &["a", "b", "c"]);

        let keys: Vec<_> = array.keys().iter().collect();
        assert_eq!(
            keys,
            &[
                Some(0),
                Some(1),
                Some(1),
                None,
                None,
                Some(2),
                Some(2),
                Some(2)
            ]
        );
    }

    /// `Debug` output for a short dictionary and for one with twenty keys.
    #[test]
    fn test_dictionary_array_fmt_debug() {
        let mut short = PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::with_capacity(3, 2);
        short.append(12345678).unwrap();
        short.append_null();
        short.append(22345678).unwrap();
        let array = short.finish();
        let rendered = format!("{array:?}");
        assert_eq!(
            "DictionaryArray {keys: PrimitiveArray<UInt8>\n[\n  0,\n  null,\n  1,\n] values: PrimitiveArray<UInt32>\n[\n  12345678,\n  22345678,\n]}\n",
            rendered
        );

        let mut repeated =
            PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::with_capacity(20, 2);
        (0..20).for_each(|_| {
            repeated.append(1).unwrap();
        });
        let array = repeated.finish();
        let rendered = format!("{array:?}");
        assert_eq!(
            "DictionaryArray {keys: PrimitiveArray<UInt8>\n[\n  0,\n  0,\n  0,\n  0,\n  0,\n  0,\n  0,\n  0,\n  0,\n  0,\n  0,\n  0,\n  0,\n  0,\n  0,\n  0,\n  0,\n  0,\n  0,\n  0,\n] values: PrimitiveArray<UInt32>\n[\n  1,\n]}\n",
            rendered
        );
    }

    /// Collecting into a dictionary from optional and non-optional strings.
    #[test]
    fn test_dictionary_array_from_iter() {
        let words = vec!["a", "a", "b", "c"];

        // Replace every "b" by a null
        let with_null: DictionaryArray<Int8Type> = words
            .iter()
            .copied()
            .map(|word| (word != "b").then_some(word))
            .collect();
        assert_eq!(
            "DictionaryArray {keys: PrimitiveArray<Int8>\n[\n  0,\n  0,\n  null,\n  1,\n] values: StringArray\n[\n  \"a\",\n  \"c\",\n]}\n",
            format!("{with_null:?}")
        );

        let without_null: DictionaryArray<Int8Type> = words.into_iter().collect();
        assert_eq!(
            "DictionaryArray {keys: PrimitiveArray<Int8>\n[\n  0,\n  0,\n  1,\n  2,\n] values: StringArray\n[\n  \"a\",\n  \"b\",\n  \"c\",\n]}\n",
            format!("{without_null:?}")
        );
    }

    /// `lookup_key` returns the key assigned to a value, or `None` when absent.
    #[test]
    fn test_dictionary_array_reverse_lookup_key() {
        let array: DictionaryArray<Int8Type> = vec!["a", "a", "b", "c"].into_iter().collect();
        assert_eq!(array.lookup_key("c"), Some(2));

        // Direction of building a dictionary is the iterator direction
        let inputs = vec!["t3", "t3", "t2", "t2", "t1", "t3", "t4", "t1", "t0"];
        let array: DictionaryArray<Int8Type> = inputs.into_iter().collect();

        assert_eq!(array.lookup_key("t1"), Some(2));
        assert_eq!(array.lookup_key("non-existent"), None);
    }

    /// Keys of a null-free dictionary are exposed as a primitive array.
    #[test]
    fn test_dictionary_keys_as_primitive_array() {
        let array: DictionaryArray<Int8Type> = vec!["a", "b", "c", "a"].into_iter().collect();

        let keys = array.keys();
        assert_eq!(&DataType::Int8, keys.data_type());
        assert_eq!(0, keys.null_count());
        assert_eq!(&[0, 1, 2, 0], keys.values());
    }

    /// Keys of a dictionary with nulls carry the validity of the input.
    #[test]
    fn test_dictionary_keys_as_primitive_array_with_null() {
        let inputs = vec![Some("a"), None, Some("b"), None, None, Some("a")];
        let array: DictionaryArray<Int32Type> = inputs.into_iter().collect();

        let keys = array.keys();
        assert_eq!(&DataType::Int32, keys.data_type());
        assert_eq!(3, keys.null_count());

        let expected_validity = [true, false, true, false, false, true];
        for (idx, expected) in expected_validity.iter().enumerate() {
            assert_eq!(keys.is_valid(idx), *expected);
        }

        // Only the valid positions carry meaningful keys
        for (idx, expected_key) in [(0, 0), (2, 1), (5, 0)] {
            assert_eq!(expected_key, keys.value(idx));
        }
    }

    /// A dictionary built only from nulls still passes full validation.
    #[test]
    fn test_dictionary_all_nulls() {
        let inputs = vec![None, None, None];
        let array: DictionaryArray<Int32Type> = inputs.into_iter().collect();
        let data = array.into_data();
        data.validate_full()
            .expect("All null array has valid array data");
    }

    /// `keys_iter` combined with `take_iter` resolves keys to values.
    #[test]
    fn test_dictionary_iter() {
        let values = Int8Array::from_iter_values([10_i8, 11, 12, 13, 14, 15, 16, 17]);
        let keys = Int16Array::from_iter_values([2_i16, 3, 4]);

        let dict_array = DictionaryArray::new(keys, Arc::new(values));

        let seen_keys: Vec<_> = dict_array.keys_iter().collect();
        assert_eq!(seen_keys, vec![Some(2), Some(3), Some(4)]);

        let typed_values = dict_array
            .values()
            .as_any()
            .downcast_ref::<Int8Array>()
            .unwrap();
        let resolved: Vec<_> = typed_values.take_iter(dict_array.keys_iter()).collect();
        assert_eq!(resolved, vec![Some(12), Some(13), Some(14)]);
    }

    /// Null keys resolve to `None` when iterating values through `take_iter`.
    #[test]
    fn test_dictionary_iter_with_null() {
        let inputs = vec![Some("a"), None, Some("b"), None, None, Some("a")];
        let array: DictionaryArray<Int32Type> = inputs.into_iter().collect();

        let typed_values = array
            .values()
            .as_any()
            .downcast_ref::<StringArray>()
            .unwrap();
        let resolved: Vec<_> = typed_values.take_iter(array.keys_iter()).collect();

        assert_eq!(
            resolved,
            vec![Some("a"), None, Some("b"), None, None, Some("a")]
        );
    }

    /// `key` returns the key at a position, or `None` for a null slot.
    #[test]
    fn test_dictionary_key() {
        let keys = Int8Array::from(vec![Some(2), None, Some(1)]);
        let values = StringArray::from(vec!["foo", "bar", "baz", "blarg"]);

        let array = DictionaryArray::new(keys, Arc::new(values));
        let observed = [array.key(0), array.key(1), array.key(2)];
        assert_eq!(observed, [Some(2), None, Some(1)]);
    }

    /// Construction from separate key and value arrays.
    #[test]
    fn test_try_new() {
        let values: StringArray = [Some("foo"), Some("bar"), Some("baz")]
            .into_iter()
            .collect();
        let keys: Int32Array = [Some(0), Some(2), None, Some(1)].into_iter().collect();

        let array = DictionaryArray::new(keys, Arc::new(values));
        assert_eq!(array.keys().data_type(), &DataType::Int32);
        assert_eq!(array.values().data_type(), &DataType::Utf8);

        assert_eq!(array.null_count(), 1);
        assert_eq!(array.logical_null_count(), 1);

        let dict_keys = array.keys();
        assert!(dict_keys.is_valid(0));
        assert!(dict_keys.is_valid(1));
        assert!(dict_keys.is_null(2));
        assert!(dict_keys.is_valid(3));

        assert_eq!(dict_keys.value(0), 0);
        assert_eq!(dict_keys.value(1), 2);
        assert_eq!(dict_keys.value(3), 1);

        assert_eq!(
            "DictionaryArray {keys: PrimitiveArray<Int32>\n[\n  0,\n  2,\n  null,\n  1,\n] values: StringArray\n[\n  \"foo\",\n  \"bar\",\n  \"baz\",\n]}\n",
            format!("{array:?}")
        );
    }

    /// A key past the end of the values array is rejected.
    #[test]
    #[should_panic(expected = "Invalid dictionary key 3 at index 1, expected 0 <= key < 2")]
    fn test_try_new_index_too_large() {
        let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect();
        // Only two values exist, so key 3 is out of bounds
        let keys: Int32Array = [Some(0), Some(3)].into_iter().collect();
        DictionaryArray::new(keys, Arc::new(values));
    }

    /// A negative key is rejected.
    #[test]
    #[should_panic(expected = "Invalid dictionary key -100 at index 0, expected 0 <= key < 2")]
    fn test_try_new_index_too_small() {
        let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect();
        let keys: Int32Array = [Some(-100)].into_iter().collect();
        DictionaryArray::new(keys, Arc::new(values));
    }

    /// Converting `ArrayData` into a dictionary with a different key type panics.
    #[test]
    #[should_panic(expected = "DictionaryArray's data type must match, expected Int64 got Int32")]
    fn test_from_array_data_validation() {
        let int32_keyed = DictionaryArray::<Int32Type>::from_iter(["32"]);
        let _ = DictionaryArray::<Int64Type>::from(int32_keyed.into_data());
    }

    /// Converting into a builder succeeds once the other handle is dropped.
    #[test]
    fn test_into_primitive_dict_builder() {
        let values = Int32Array::from_iter_values([10_i32, 12, 15]);
        let keys = Int8Array::from_iter_values([1_i8, 0, 2, 0]);

        let boxed: ArrayRef = Arc::new(DictionaryArray::new(keys, Arc::new(values)));
        let col: DictionaryArray<Int8Type> = as_dictionary_array(&boxed).clone();

        // Release the other handle before converting `col` into a builder
        drop(boxed);

        let mut builder = col.into_primitive_dict_builder::<Int32Type>().unwrap();

        let slice = builder.values_slice_mut();
        assert_eq!(slice, &[10, 12, 15]);

        // Overwrite the dictionary values in place
        slice[0] = 4;
        slice[1] = 2;
        slice[2] = 1;

        let expected = DictionaryArray::new(
            Int8Array::from_iter_values([1_i8, 0, 2, 0]),
            Arc::new(Int32Array::from_iter_values([4_i32, 2, 1])),
        );

        let new_array = builder.finish();
        assert_eq!(expected, new_array);
    }

    /// Converting into a builder fails, returning the array, while `boxed` is alive.
    #[test]
    fn test_into_primitive_dict_builder_cloned_array() {
        let values = Int32Array::from_iter_values([10_i32, 12, 15]);
        let keys = Int8Array::from_iter_values([1_i8, 0, 2, 0]);

        let boxed: ArrayRef = Arc::new(DictionaryArray::new(keys, Arc::new(values)));

        let col: DictionaryArray<Int8Type> = DictionaryArray::<Int8Type>::from(boxed.to_data());
        let returned = col
            .into_primitive_dict_builder::<Int32Type>()
            .unwrap_err();

        let expected = DictionaryArray::new(
            Int8Array::from_iter_values([1_i8, 0, 2, 0]),
            Arc::new(Int32Array::from_iter_values([10_i32, 12, 15])),
        );
        assert_eq!(expected, returned);
    }

    /// `occupancy` flags exactly the value slots referenced by a valid key.
    #[test]
    fn test_occupancy() {
        // Keys without nulls: values 100..200 are referenced
        let keys = Int32Array::new((100..200).collect(), None);
        let values = Int32Array::from(vec![0; 1024]);
        let dict = DictionaryArray::new(keys, Arc::new(values));
        for (idx, occupied) in dict.occupancy().iter().enumerate() {
            let expected = (100..200).contains(&idx);
            assert_eq!(occupied, expected, "{idx}");
        }

        // Keys 0..100 where only every fourth key is valid
        let validity = (0..100).map(|x| x % 4 == 0).collect();
        let keys = Int32Array::new((0..100).collect(), Some(validity));
        let values = Int32Array::from(vec![0; 1024]);
        let dict = DictionaryArray::new(keys, Arc::new(values));
        for (idx, occupied) in dict.occupancy().iter().enumerate() {
            let expected = idx % 4 == 0 && idx < 100;
            assert_eq!(occupied, expected, "{idx}");
        }
    }

    /// Iteration yields `None` for both null keys and null values.
    #[test]
    fn test_iterator_nulls() {
        // The key at index 1 is null and holds an out-of-range value (700)
        let key_validity = NullBuffer::from(vec![true, false, true, true]);
        let keys = Int32Array::new(vec![0, 700, 1, 2].into(), Some(key_validity));
        let values = Int32Array::from(vec![Some(50), None, Some(2)]);
        let dict = DictionaryArray::new(keys, Arc::new(values));

        let typed = dict.downcast_dict::<Int32Array>().unwrap();
        let collected: Vec<_> = typed.into_iter().collect();
        assert_eq!(collected, &[Some(50), None, None, Some(2)])
    }

    /// Logical nulls combine key nulls with the logical nulls of the values.
    #[test]
    fn test_logical_nulls() -> Result<(), ArrowError> {
        let run_ends = Int32Array::from(vec![1, 3, 7]);
        let run_values = Int32Array::from(vec![Some(1), None, Some(3)]);
        let values = Arc::new(RunArray::try_new(&run_ends, &run_values)?) as ArrayRef;

        // For this test to be meaningful, the values array need to have different nulls and logical nulls
        assert_eq!(values.null_count(), 0);
        assert_eq!(values.logical_null_count(), 2);

        // Trivial dictionary mapping 1-1 onto the underlying array
        let identity_keys: Vec<i8> = (0..values.len()).map(|i| i as i8).collect();
        let dictionary =
            DictionaryArray::<Int8Type>::try_new(Int8Array::from(identity_keys), Arc::clone(&values))?;

        // No keys are null
        assert_eq!(dictionary.null_count(), 0);
        // Dictionary array values are logically nullable
        assert_eq!(dictionary.logical_null_count(), values.logical_null_count());
        assert_eq!(dictionary.logical_nulls(), values.logical_nulls());
        assert!(dictionary.is_nullable());

        // Same 1-1 mapping, except that key 0 is nulled out
        let masked_keys: Vec<Option<i8>> = (0..values.len())
            .map(|i| match i as i8 {
                0 => None,
                key => Some(key),
            })
            .collect();
        let dictionary =
            DictionaryArray::<Int8Type>::try_new(Int8Array::from(masked_keys), Arc::clone(&values))?;

        // One key is null
        assert_eq!(dictionary.null_count(), 1);

        // Dictionary array values are logically nullable
        assert_eq!(
            dictionary.logical_null_count(),
            values.logical_null_count() + 1
        );
        assert!(dictionary.is_nullable());

        Ok(())
    }

    /// `normalized_keys` clamps out-of-range keys stored in null slots.
    #[test]
    fn test_normalized_keys() {
        let validity = NullBuffer::from(vec![false, true, true]);
        let keys = Int32Array::new(vec![132, 0, 1].into(), Some(validity));
        let dictionary = DictionaryArray::new(keys, Arc::new(Int32Array::new_null(2)));
        assert_eq!(&dictionary.normalized_keys(), &[1, 0, 1])
    }
}