Coverage Report

Created: 2025-08-26 07:03

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/Users/andrewlamb/Software/arrow-rs/arrow-cast/src/cast/dictionary.rs
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
use crate::cast::*;
19
20
/// Attempts to cast an `ArrayDictionary` with index type K into
21
/// `to_type` for supported types.
22
///
23
/// K is the key type
24
0
pub(crate) fn dictionary_cast<K: ArrowDictionaryKeyType>(
25
0
    array: &dyn Array,
26
0
    to_type: &DataType,
27
0
    cast_options: &CastOptions,
28
0
) -> Result<ArrayRef, ArrowError> {
29
    use DataType::*;
30
31
0
    match to_type {
32
0
        Dictionary(to_index_type, to_value_type) => {
33
0
            let dict_array = array
34
0
                .as_any()
35
0
                .downcast_ref::<DictionaryArray<K>>()
36
0
                .ok_or_else(|| {
37
0
                    ArrowError::ComputeError(
38
0
                        "Internal Error: Cannot cast dictionary to DictionaryArray of expected type".to_string(),
39
0
                    )
40
0
                })?;
41
42
0
            let keys_array: ArrayRef =
43
0
                Arc::new(PrimitiveArray::<K>::from(dict_array.keys().to_data()));
44
0
            let values_array = dict_array.values();
45
0
            let cast_keys = cast_with_options(&keys_array, to_index_type, cast_options)?;
46
0
            let cast_values = cast_with_options(values_array, to_value_type, cast_options)?;
47
48
            // Failure to cast keys (because they don't fit in the
49
            // target type) results in NULL values;
50
0
            if cast_keys.null_count() > keys_array.null_count() {
51
0
                return Err(ArrowError::ComputeError(format!(
52
0
                    "Could not convert {} dictionary indexes from {:?} to {:?}",
53
0
                    cast_keys.null_count() - keys_array.null_count(),
54
0
                    keys_array.data_type(),
55
0
                    to_index_type
56
0
                )));
57
0
            }
58
59
0
            let data = cast_keys.into_data();
60
0
            let builder = data
61
0
                .into_builder()
62
0
                .data_type(to_type.clone())
63
0
                .child_data(vec![cast_values.into_data()]);
64
65
            // Safety
66
            // Cast keys are still valid
67
0
            let data = unsafe { builder.build_unchecked() };
68
69
            // create the appropriate array type
70
0
            let new_array: ArrayRef = match **to_index_type {
71
0
                Int8 => Arc::new(DictionaryArray::<Int8Type>::from(data)),
72
0
                Int16 => Arc::new(DictionaryArray::<Int16Type>::from(data)),
73
0
                Int32 => Arc::new(DictionaryArray::<Int32Type>::from(data)),
74
0
                Int64 => Arc::new(DictionaryArray::<Int64Type>::from(data)),
75
0
                UInt8 => Arc::new(DictionaryArray::<UInt8Type>::from(data)),
76
0
                UInt16 => Arc::new(DictionaryArray::<UInt16Type>::from(data)),
77
0
                UInt32 => Arc::new(DictionaryArray::<UInt32Type>::from(data)),
78
0
                UInt64 => Arc::new(DictionaryArray::<UInt64Type>::from(data)),
79
                _ => {
80
0
                    return Err(ArrowError::CastError(format!(
81
0
                        "Unsupported type {to_index_type:?} for dictionary index"
82
0
                    )));
83
                }
84
            };
85
86
0
            Ok(new_array)
87
        }
88
        Utf8View => {
89
            // `unpack_dictionary` can handle Utf8View/BinaryView types, but incurs unnecessary data copy of the value buffer.
90
            // we handle it here to avoid the copy.
91
0
            let dict_array = array
92
0
                .as_dictionary::<K>()
93
0
                .downcast_dict::<StringArray>()
94
0
                .ok_or_else(|| {
95
0
                    ArrowError::ComputeError(
96
0
                        "Internal Error: Cannot cast Utf8View to StringArray of expected type"
97
0
                            .to_string(),
98
0
                    )
99
0
                })?;
100
101
0
            let string_view = view_from_dict_values::<K, StringViewType, GenericStringType<i32>>(
102
0
                dict_array.values(),
103
0
                dict_array.keys(),
104
0
            )?;
105
0
            Ok(Arc::new(string_view))
106
        }
107
        BinaryView => {
108
            // `unpack_dictionary` can handle Utf8View/BinaryView types, but incurs unnecessary data copy of the value buffer.
109
            // we handle it here to avoid the copy.
110
0
            let dict_array = array
111
0
                .as_dictionary::<K>()
112
0
                .downcast_dict::<BinaryArray>()
113
0
                .ok_or_else(|| {
114
0
                    ArrowError::ComputeError(
115
0
                        "Internal Error: Cannot cast BinaryView to BinaryArray of expected type"
116
0
                            .to_string(),
117
0
                    )
118
0
                })?;
119
120
0
            let binary_view = view_from_dict_values::<K, BinaryViewType, BinaryType>(
121
0
                dict_array.values(),
122
0
                dict_array.keys(),
123
0
            )?;
124
0
            Ok(Arc::new(binary_view))
125
        }
126
0
        _ => unpack_dictionary::<K>(array, to_type, cast_options),
127
    }
128
0
}
129
130
0
fn view_from_dict_values<K: ArrowDictionaryKeyType, T: ByteViewType, V: ByteArrayType>(
131
0
    array: &GenericByteArray<V>,
132
0
    keys: &PrimitiveArray<K>,
133
0
) -> Result<GenericByteViewArray<T>, ArrowError> {
134
0
    let value_buffer = array.values();
135
0
    let value_offsets = array.value_offsets();
136
0
    let mut builder = GenericByteViewBuilder::<T>::with_capacity(keys.len());
137
0
    builder.append_block(value_buffer.clone());
138
0
    for i in keys.iter() {
139
0
        match i {
140
0
            Some(v) => {
141
0
                let idx = v.to_usize().ok_or_else(|| {
142
0
                    ArrowError::ComputeError("Invalid dictionary index".to_string())
143
0
                })?;
144
145
                // Safety
146
                // (1) The index is within bounds as they are offsets
147
                // (2) The append_view is safe
148
                unsafe {
149
0
                    let offset = value_offsets.get_unchecked(idx).as_usize();
150
0
                    let end = value_offsets.get_unchecked(idx + 1).as_usize();
151
0
                    let length = end - offset;
152
0
                    builder.append_view_unchecked(0, offset as u32, length as u32)
153
                }
154
            }
155
0
            None => {
156
0
                builder.append_null();
157
0
            }
158
        }
159
    }
160
0
    Ok(builder.finish())
161
0
}
162
163
// Unpack a dictionary where the keys are of type <K> into a flattened array of type to_type
164
0
pub(crate) fn unpack_dictionary<K>(
165
0
    array: &dyn Array,
166
0
    to_type: &DataType,
167
0
    cast_options: &CastOptions,
168
0
) -> Result<ArrayRef, ArrowError>
169
0
where
170
0
    K: ArrowDictionaryKeyType,
171
{
172
0
    let dict_array = array.as_dictionary::<K>();
173
0
    let cast_dict_values = cast_with_options(dict_array.values(), to_type, cast_options)?;
174
0
    take(cast_dict_values.as_ref(), dict_array.keys(), None)
175
0
}
176
177
/// Pack a data type into a dictionary array passing the values through a primitive array
178
0
pub(crate) fn pack_array_to_dictionary_via_primitive<K: ArrowDictionaryKeyType>(
179
0
    array: &dyn Array,
180
0
    primitive_type: DataType,
181
0
    dict_value_type: &DataType,
182
0
    cast_options: &CastOptions,
183
0
) -> Result<ArrayRef, ArrowError> {
184
0
    let primitive = cast_with_options(array, &primitive_type, cast_options)?;
185
0
    let dict = cast_with_options(
186
0
        primitive.as_ref(),
187
0
        &DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(primitive_type)),
188
0
        cast_options,
189
0
    )?;
190
0
    cast_with_options(
191
0
        dict.as_ref(),
192
0
        &DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(dict_value_type.clone())),
193
0
        cast_options,
194
    )
195
0
}
196
197
/// Attempts to encode an array into an `ArrayDictionary` with index
198
/// type K and value (dictionary) type value_type
199
///
200
/// K is the key type
201
0
pub(crate) fn cast_to_dictionary<K: ArrowDictionaryKeyType>(
202
0
    array: &dyn Array,
203
0
    dict_value_type: &DataType,
204
0
    cast_options: &CastOptions,
205
0
) -> Result<ArrayRef, ArrowError> {
206
    use DataType::*;
207
208
0
    match *dict_value_type {
209
0
        Int8 => pack_numeric_to_dictionary::<K, Int8Type>(array, dict_value_type, cast_options),
210
0
        Int16 => pack_numeric_to_dictionary::<K, Int16Type>(array, dict_value_type, cast_options),
211
0
        Int32 => pack_numeric_to_dictionary::<K, Int32Type>(array, dict_value_type, cast_options),
212
0
        Int64 => pack_numeric_to_dictionary::<K, Int64Type>(array, dict_value_type, cast_options),
213
0
        UInt8 => pack_numeric_to_dictionary::<K, UInt8Type>(array, dict_value_type, cast_options),
214
0
        UInt16 => pack_numeric_to_dictionary::<K, UInt16Type>(array, dict_value_type, cast_options),
215
0
        UInt32 => pack_numeric_to_dictionary::<K, UInt32Type>(array, dict_value_type, cast_options),
216
0
        UInt64 => pack_numeric_to_dictionary::<K, UInt64Type>(array, dict_value_type, cast_options),
217
0
        Decimal32(p, s) => pack_decimal_to_dictionary::<K, Decimal32Type>(
218
0
            array,
219
0
            dict_value_type,
220
0
            p,
221
0
            s,
222
0
            cast_options,
223
        ),
224
0
        Decimal64(p, s) => pack_decimal_to_dictionary::<K, Decimal64Type>(
225
0
            array,
226
0
            dict_value_type,
227
0
            p,
228
0
            s,
229
0
            cast_options,
230
        ),
231
0
        Decimal128(p, s) => pack_decimal_to_dictionary::<K, Decimal128Type>(
232
0
            array,
233
0
            dict_value_type,
234
0
            p,
235
0
            s,
236
0
            cast_options,
237
        ),
238
0
        Decimal256(p, s) => pack_decimal_to_dictionary::<K, Decimal256Type>(
239
0
            array,
240
0
            dict_value_type,
241
0
            p,
242
0
            s,
243
0
            cast_options,
244
        ),
245
        Float16 => {
246
0
            pack_numeric_to_dictionary::<K, Float16Type>(array, dict_value_type, cast_options)
247
        }
248
        Float32 => {
249
0
            pack_numeric_to_dictionary::<K, Float32Type>(array, dict_value_type, cast_options)
250
        }
251
        Float64 => {
252
0
            pack_numeric_to_dictionary::<K, Float64Type>(array, dict_value_type, cast_options)
253
        }
254
0
        Date32 => pack_array_to_dictionary_via_primitive::<K>(
255
0
            array,
256
0
            DataType::Int32,
257
0
            dict_value_type,
258
0
            cast_options,
259
        ),
260
0
        Date64 => pack_array_to_dictionary_via_primitive::<K>(
261
0
            array,
262
0
            DataType::Int64,
263
0
            dict_value_type,
264
0
            cast_options,
265
        ),
266
0
        Time32(_) => pack_array_to_dictionary_via_primitive::<K>(
267
0
            array,
268
0
            DataType::Int32,
269
0
            dict_value_type,
270
0
            cast_options,
271
        ),
272
0
        Time64(_) => pack_array_to_dictionary_via_primitive::<K>(
273
0
            array,
274
0
            DataType::Int64,
275
0
            dict_value_type,
276
0
            cast_options,
277
        ),
278
0
        Timestamp(_, _) => pack_array_to_dictionary_via_primitive::<K>(
279
0
            array,
280
0
            DataType::Int64,
281
0
            dict_value_type,
282
0
            cast_options,
283
        ),
284
        Utf8 => {
285
            // If the input is a view type, we can avoid casting (thus copying) the data
286
0
            if array.data_type() == &DataType::Utf8View {
287
0
                return string_view_to_dictionary::<K, i32>(array);
288
0
            }
289
0
            pack_byte_to_dictionary::<K, GenericStringType<i32>>(array, cast_options)
290
        }
291
        LargeUtf8 => {
292
            // If the input is a view type, we can avoid casting (thus copying) the data
293
0
            if array.data_type() == &DataType::Utf8View {
294
0
                return string_view_to_dictionary::<K, i64>(array);
295
0
            }
296
0
            pack_byte_to_dictionary::<K, GenericStringType<i64>>(array, cast_options)
297
        }
298
        Binary => {
299
            // If the input is a view type, we can avoid casting (thus copying) the data
300
0
            if array.data_type() == &DataType::BinaryView {
301
0
                return binary_view_to_dictionary::<K, i32>(array);
302
0
            }
303
0
            pack_byte_to_dictionary::<K, GenericBinaryType<i32>>(array, cast_options)
304
        }
305
        LargeBinary => {
306
            // If the input is a view type, we can avoid casting (thus copying) the data
307
0
            if array.data_type() == &DataType::BinaryView {
308
0
                return binary_view_to_dictionary::<K, i64>(array);
309
0
            }
310
0
            pack_byte_to_dictionary::<K, GenericBinaryType<i64>>(array, cast_options)
311
        }
312
0
        FixedSizeBinary(byte_size) => {
313
0
            pack_byte_to_fixed_size_dictionary::<K>(array, cast_options, byte_size)
314
        }
315
0
        _ => Err(ArrowError::CastError(format!(
316
0
            "Unsupported output type for dictionary packing: {dict_value_type:?}"
317
0
        ))),
318
    }
319
0
}
320
321
// Packs the data from the primitive array of type <V> to a
322
// DictionaryArray with keys of type K and values of value_type V
323
0
pub(crate) fn pack_numeric_to_dictionary<K, V>(
324
0
    array: &dyn Array,
325
0
    dict_value_type: &DataType,
326
0
    cast_options: &CastOptions,
327
0
) -> Result<ArrayRef, ArrowError>
328
0
where
329
0
    K: ArrowDictionaryKeyType,
330
0
    V: ArrowPrimitiveType,
331
{
332
    // attempt to cast the source array values to the target value type (the dictionary values type)
333
0
    let cast_values = cast_with_options(array, dict_value_type, cast_options)?;
334
0
    let values = cast_values.as_primitive::<V>();
335
336
0
    let mut b = PrimitiveDictionaryBuilder::<K, V>::with_capacity(values.len(), values.len());
337
338
    // copy each element one at a time
339
0
    for i in 0..values.len() {
340
0
        if values.is_null(i) {
341
0
            b.append_null();
342
0
        } else {
343
0
            b.append(values.value(i))?;
344
        }
345
    }
346
0
    Ok(Arc::new(b.finish()))
347
0
}
348
349
0
pub(crate) fn pack_decimal_to_dictionary<K, D>(
350
0
    array: &dyn Array,
351
0
    dict_value_type: &DataType,
352
0
    precision: u8,
353
0
    scale: i8,
354
0
    cast_options: &CastOptions,
355
0
) -> Result<ArrayRef, ArrowError>
356
0
where
357
0
    K: ArrowDictionaryKeyType,
358
0
    D: DecimalType + ArrowPrimitiveType,
359
{
360
0
    let dict = pack_numeric_to_dictionary::<K, D>(array, dict_value_type, cast_options)?;
361
0
    let dict = dict
362
0
        .as_dictionary::<K>()
363
0
        .downcast_dict::<PrimitiveArray<D>>()
364
0
        .ok_or_else(|| {
365
0
            ArrowError::ComputeError(format!(
366
0
                "Internal Error: Cannot cast dict to {}Array",
367
0
                D::PREFIX
368
0
            ))
369
0
        })?;
370
0
    let value = dict.values().clone();
371
    // Set correct precision/scale
372
0
    let value = value.with_precision_and_scale(precision, scale)?;
373
0
    Ok(Arc::new(DictionaryArray::<K>::try_new(
374
0
        dict.keys().clone(),
375
0
        Arc::new(value),
376
0
    )?))
377
0
}
378
379
0
pub(crate) fn string_view_to_dictionary<K, O: OffsetSizeTrait>(
380
0
    array: &dyn Array,
381
0
) -> Result<ArrayRef, ArrowError>
382
0
where
383
0
    K: ArrowDictionaryKeyType,
384
{
385
0
    let mut b = GenericByteDictionaryBuilder::<K, GenericStringType<O>>::with_capacity(
386
0
        array.len(),
387
        1024,
388
        1024,
389
    );
390
0
    let string_view = array
391
0
        .as_any()
392
0
        .downcast_ref::<StringViewArray>()
393
0
        .ok_or_else(|| {
394
0
            ArrowError::ComputeError("Internal Error: Cannot cast to StringViewArray".to_string())
395
0
        })?;
396
0
    for v in string_view.iter() {
397
0
        match v {
398
0
            Some(v) => {
399
0
                b.append(v)?;
400
            }
401
0
            None => {
402
0
                b.append_null();
403
0
            }
404
        }
405
    }
406
407
0
    Ok(Arc::new(b.finish()))
408
0
}
409
410
0
pub(crate) fn binary_view_to_dictionary<K, O: OffsetSizeTrait>(
411
0
    array: &dyn Array,
412
0
) -> Result<ArrayRef, ArrowError>
413
0
where
414
0
    K: ArrowDictionaryKeyType,
415
{
416
0
    let mut b = GenericByteDictionaryBuilder::<K, GenericBinaryType<O>>::with_capacity(
417
0
        array.len(),
418
        1024,
419
        1024,
420
    );
421
0
    let binary_view = array
422
0
        .as_any()
423
0
        .downcast_ref::<BinaryViewArray>()
424
0
        .ok_or_else(|| {
425
0
            ArrowError::ComputeError("Internal Error: Cannot cast to BinaryViewArray".to_string())
426
0
        })?;
427
0
    for v in binary_view.iter() {
428
0
        match v {
429
0
            Some(v) => {
430
0
                b.append(v)?;
431
            }
432
0
            None => {
433
0
                b.append_null();
434
0
            }
435
        }
436
    }
437
438
0
    Ok(Arc::new(b.finish()))
439
0
}
440
441
// Packs the data as a GenericByteDictionaryBuilder, if possible, with the
442
// key types of K
443
0
pub(crate) fn pack_byte_to_dictionary<K, T>(
444
0
    array: &dyn Array,
445
0
    cast_options: &CastOptions,
446
0
) -> Result<ArrayRef, ArrowError>
447
0
where
448
0
    K: ArrowDictionaryKeyType,
449
0
    T: ByteArrayType,
450
{
451
0
    let cast_values = cast_with_options(array, &T::DATA_TYPE, cast_options)?;
452
0
    let values = cast_values
453
0
        .as_any()
454
0
        .downcast_ref::<GenericByteArray<T>>()
455
0
        .ok_or_else(|| {
456
0
            ArrowError::ComputeError("Internal Error: Cannot cast to GenericByteArray".to_string())
457
0
        })?;
458
0
    let mut b = GenericByteDictionaryBuilder::<K, T>::with_capacity(values.len(), 1024, 1024);
459
460
    // copy each element one at a time
461
0
    for i in 0..values.len() {
462
0
        if values.is_null(i) {
463
0
            b.append_null();
464
0
        } else {
465
0
            b.append(values.value(i))?;
466
        }
467
    }
468
0
    Ok(Arc::new(b.finish()))
469
0
}
470
471
// Packs the data as a GenericByteDictionaryBuilder, if possible, with the
472
// key types of K
473
0
pub(crate) fn pack_byte_to_fixed_size_dictionary<K>(
474
0
    array: &dyn Array,
475
0
    cast_options: &CastOptions,
476
0
    byte_width: i32,
477
0
) -> Result<ArrayRef, ArrowError>
478
0
where
479
0
    K: ArrowDictionaryKeyType,
480
{
481
0
    let cast_values =
482
0
        cast_with_options(array, &DataType::FixedSizeBinary(byte_width), cast_options)?;
483
0
    let values = cast_values
484
0
        .as_any()
485
0
        .downcast_ref::<FixedSizeBinaryArray>()
486
0
        .ok_or_else(|| {
487
0
            ArrowError::ComputeError("Internal Error: Cannot cast to GenericByteArray".to_string())
488
0
        })?;
489
0
    let mut b = FixedSizeBinaryDictionaryBuilder::<K>::with_capacity(1024, 1024, byte_width);
490
491
    // copy each element one at a time
492
0
    for i in 0..values.len() {
493
0
        if values.is_null(i) {
494
0
            b.append_null();
495
0
        } else {
496
0
            b.append(values.value(i))?;
497
        }
498
    }
499
0
    Ok(Arc::new(b.finish()))
500
0
}