/Users/andrewlamb/Software/arrow-rs/arrow-array/src/builder/fixed_size_binary_dictionary_builder.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use crate::builder::{ArrayBuilder, FixedSizeBinaryBuilder, PrimitiveBuilder}; |
19 | | use crate::types::ArrowDictionaryKeyType; |
20 | | use crate::{Array, ArrayRef, DictionaryArray, PrimitiveArray}; |
21 | | use arrow_buffer::ArrowNativeType; |
22 | | use arrow_schema::DataType::FixedSizeBinary; |
23 | | use arrow_schema::{ArrowError, DataType}; |
24 | | use hashbrown::HashTable; |
25 | | use num::NumCast; |
26 | | use std::any::Any; |
27 | | use std::sync::Arc; |
28 | | |
29 | | /// Builder for [`DictionaryArray`] of [`FixedSizeBinaryArray`] |
30 | | /// |
31 | | /// The output array has a dictionary of unique, fixed-size binary values. The |
32 | | /// builder handles deduplication. |
33 | | /// |
34 | | /// # Example |
35 | | /// ``` |
36 | | /// # use arrow_array::builder::{FixedSizeBinaryDictionaryBuilder}; |
37 | | /// # use arrow_array::array::{Array, FixedSizeBinaryArray}; |
38 | | /// # use arrow_array::DictionaryArray; |
39 | | /// # use arrow_array::types::Int8Type; |
40 | | /// // Build 3 byte FixedBinaryArrays |
41 | | /// let byte_width = 3; |
42 | | /// let mut builder = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3); |
43 | | /// builder.append("abc").unwrap(); |
44 | | /// builder.append_null(); |
45 | | /// builder.append(b"def").unwrap(); |
46 | | /// builder.append(b"def").unwrap(); // duplicate value |
47 | | /// // Result is a Dictionary Array |
48 | | /// let array = builder.finish(); |
49 | | /// let dict_array = array.as_any().downcast_ref::<DictionaryArray<Int8Type>>().unwrap(); |
50 | | /// // The array represents "abc", null, "def", "def" |
51 | | /// assert_eq!(array.keys().len(), 4); |
52 | | /// // but there are only 2 unique values |
53 | | /// assert_eq!(array.values().len(), 2); |
54 | | /// let values = dict_array.values().as_any().downcast_ref::<FixedSizeBinaryArray>().unwrap(); |
55 | | /// assert_eq!(values.value(0), "abc".as_bytes()); |
56 | | /// assert_eq!(values.value(1), "def".as_bytes()); |
57 | | /// ``` |
58 | | /// |
59 | | /// [`FixedSizeBinaryArray`]: crate::FixedSizeBinaryArray |
60 | | #[derive(Debug)] |
61 | | pub struct FixedSizeBinaryDictionaryBuilder<K> |
62 | | where |
63 | | K: ArrowDictionaryKeyType, |
64 | | { |
65 | | state: ahash::RandomState, |
66 | | dedup: HashTable<usize>, |
67 | | |
68 | | keys_builder: PrimitiveBuilder<K>, |
69 | | values_builder: FixedSizeBinaryBuilder, |
70 | | byte_width: i32, |
71 | | } |
72 | | |
73 | | impl<K> FixedSizeBinaryDictionaryBuilder<K> |
74 | | where |
75 | | K: ArrowDictionaryKeyType, |
76 | | { |
77 | | /// Creates a new `FixedSizeBinaryDictionaryBuilder` |
78 | | pub fn new(byte_width: i32) -> Self { |
79 | | let keys_builder = PrimitiveBuilder::new(); |
80 | | let values_builder = FixedSizeBinaryBuilder::new(byte_width); |
81 | | Self { |
82 | | state: Default::default(), |
83 | | dedup: HashTable::with_capacity(keys_builder.capacity()), |
84 | | keys_builder, |
85 | | values_builder, |
86 | | byte_width, |
87 | | } |
88 | | } |
89 | | |
90 | | /// Creates a new `FixedSizeBinaryDictionaryBuilder` with the provided capacities |
91 | | /// |
92 | | /// `keys_capacity`: the number of keys, i.e. length of array to build |
93 | | /// `value_capacity`: the number of distinct dictionary values, i.e. size of dictionary |
94 | | /// `byte_width`: the byte width for individual values in the values array |
95 | 0 | pub fn with_capacity(keys_capacity: usize, value_capacity: usize, byte_width: i32) -> Self { |
96 | 0 | Self { |
97 | 0 | state: Default::default(), |
98 | 0 | dedup: Default::default(), |
99 | 0 | keys_builder: PrimitiveBuilder::with_capacity(keys_capacity), |
100 | 0 | values_builder: FixedSizeBinaryBuilder::with_capacity(value_capacity, byte_width), |
101 | 0 | byte_width, |
102 | 0 | } |
103 | 0 | } |
104 | | |
105 | | /// Creates a new `FixedSizeBinaryDictionaryBuilder` from the existing builder with the same |
106 | | /// keys and values, but with a new data type for the keys. |
107 | | /// |
108 | | /// # Example |
109 | | /// ``` |
110 | | /// # use arrow_array::builder::FixedSizeBinaryDictionaryBuilder; |
111 | | /// # use arrow_array::types::{UInt8Type, UInt16Type, UInt64Type}; |
112 | | /// # use arrow_array::UInt16Array; |
113 | | /// # use arrow_schema::ArrowError; |
114 | | /// |
115 | | /// let mut u8_keyed_builder = FixedSizeBinaryDictionaryBuilder::<UInt8Type>::new(2); |
116 | | /// // appending too many values causes the dictionary to overflow |
117 | | /// for i in 0..=255 { |
118 | | /// u8_keyed_builder.append_value(vec![0, i]); |
119 | | /// } |
120 | | /// let result = u8_keyed_builder.append(vec![1, 0]); |
121 | | /// assert!(matches!(result, Err(ArrowError::DictionaryKeyOverflowError{}))); |
122 | | /// |
123 | | /// // we need to upgrade to a larger key type |
124 | | /// let mut u16_keyed_builder = FixedSizeBinaryDictionaryBuilder::<UInt16Type>::try_new_from_builder(u8_keyed_builder).unwrap(); |
125 | | /// let dictionary_array = u16_keyed_builder.finish(); |
126 | | /// let keys = dictionary_array.keys(); |
127 | | /// |
128 | | /// assert_eq!(keys, &UInt16Array::from_iter(0..256)); |
129 | | /// ``` |
130 | | pub fn try_new_from_builder<K2>( |
131 | | mut source: FixedSizeBinaryDictionaryBuilder<K2>, |
132 | | ) -> Result<Self, ArrowError> |
133 | | where |
134 | | K::Native: NumCast, |
135 | | K2: ArrowDictionaryKeyType, |
136 | | K2::Native: NumCast, |
137 | | { |
138 | | let state = source.state; |
139 | | let dedup = source.dedup; |
140 | | let values_builder = source.values_builder; |
141 | | let byte_width = source.byte_width; |
142 | | |
143 | | let source_keys = source.keys_builder.finish(); |
144 | | let new_keys: PrimitiveArray<K> = source_keys.try_unary(|value| { |
145 | | num::cast::cast::<K2::Native, K::Native>(value).ok_or_else(|| { |
146 | | ArrowError::CastError(format!( |
147 | | "Can't cast dictionary keys from source type {:?} to type {:?}", |
148 | | K2::DATA_TYPE, |
149 | | K::DATA_TYPE |
150 | | )) |
151 | | }) |
152 | | })?; |
153 | | |
154 | | // drop source key here because currently source_keys and new_keys are holding reference to |
155 | | // the same underlying null_buffer. Below we want to call new_keys.into_builder() it must |
156 | | // be the only reference holder. |
157 | | drop(source_keys); |
158 | | |
159 | | Ok(Self { |
160 | | state, |
161 | | dedup, |
162 | | keys_builder: new_keys |
163 | | .into_builder() |
164 | | .expect("underlying buffer has no references"), |
165 | | values_builder, |
166 | | byte_width, |
167 | | }) |
168 | | } |
169 | | } |
170 | | |
171 | | impl<K> ArrayBuilder for FixedSizeBinaryDictionaryBuilder<K> |
172 | | where |
173 | | K: ArrowDictionaryKeyType, |
174 | | { |
175 | | /// Returns the builder as an non-mutable `Any` reference. |
176 | | fn as_any(&self) -> &dyn Any { |
177 | | self |
178 | | } |
179 | | |
180 | | /// Returns the builder as an mutable `Any` reference. |
181 | | fn as_any_mut(&mut self) -> &mut dyn Any { |
182 | | self |
183 | | } |
184 | | |
185 | | /// Returns the boxed builder as a box of `Any`. |
186 | | fn into_box_any(self: Box<Self>) -> Box<dyn Any> { |
187 | | self |
188 | | } |
189 | | |
190 | | /// Returns the number of array slots in the builder |
191 | | fn len(&self) -> usize { |
192 | | self.keys_builder.len() |
193 | | } |
194 | | |
195 | | /// Builds the array and reset this builder. |
196 | | fn finish(&mut self) -> ArrayRef { |
197 | | Arc::new(self.finish()) |
198 | | } |
199 | | |
200 | | /// Builds the array without resetting the builder. |
201 | | fn finish_cloned(&self) -> ArrayRef { |
202 | | Arc::new(self.finish_cloned()) |
203 | | } |
204 | | } |
205 | | |
206 | | impl<K> FixedSizeBinaryDictionaryBuilder<K> |
207 | | where |
208 | | K: ArrowDictionaryKeyType, |
209 | | { |
210 | 0 | fn get_or_insert_key(&mut self, value: impl AsRef<[u8]>) -> Result<K::Native, ArrowError> { |
211 | 0 | let value_bytes: &[u8] = value.as_ref(); |
212 | | |
213 | 0 | let state = &self.state; |
214 | 0 | let storage = &mut self.values_builder; |
215 | 0 | let hash = state.hash_one(value_bytes); |
216 | | |
217 | 0 | let idx = *self |
218 | 0 | .dedup |
219 | 0 | .entry( |
220 | 0 | hash, |
221 | 0 | |idx| value_bytes == get_bytes(storage, self.byte_width, *idx), |
222 | 0 | |idx| state.hash_one(get_bytes(storage, self.byte_width, *idx)), |
223 | | ) |
224 | 0 | .or_insert_with(|| { |
225 | 0 | let idx = storage.len(); |
226 | 0 | let _ = storage.append_value(value); |
227 | 0 | idx |
228 | 0 | }) |
229 | 0 | .get(); |
230 | | |
231 | 0 | let key = K::Native::from_usize(idx).ok_or(ArrowError::DictionaryKeyOverflowError)?; |
232 | | |
233 | 0 | Ok(key) |
234 | 0 | } |
235 | | |
236 | | /// Append a value to the array. Return an existing index |
237 | | /// if already present in the values array or a new index if the |
238 | | /// value is appended to the values array. |
239 | | /// |
240 | | /// Returns an error if the new index would overflow the key type. |
241 | 0 | pub fn append(&mut self, value: impl AsRef<[u8]>) -> Result<K::Native, ArrowError> { |
242 | 0 | if self.byte_width != value.as_ref().len() as i32 { |
243 | 0 | Err(ArrowError::InvalidArgumentError(format!( |
244 | 0 | "Invalid input length passed to FixedSizeBinaryBuilder. Expected {} got {}", |
245 | 0 | self.byte_width, |
246 | 0 | value.as_ref().len() |
247 | 0 | ))) |
248 | | } else { |
249 | 0 | let key = self.get_or_insert_key(value)?; |
250 | 0 | self.keys_builder.append_value(key); |
251 | 0 | Ok(key) |
252 | | } |
253 | 0 | } |
254 | | |
255 | | /// Appends a null slot into the builder |
256 | | #[inline] |
257 | 0 | pub fn append_null(&mut self) { |
258 | 0 | self.keys_builder.append_null() |
259 | 0 | } |
260 | | |
261 | | /// Appends `n` `null`s into the builder. |
262 | | #[inline] |
263 | | pub fn append_nulls(&mut self, n: usize) { |
264 | | self.keys_builder.append_nulls(n); |
265 | | } |
266 | | |
267 | | /// Infallibly append a value to this builder |
268 | | /// |
269 | | /// # Panics |
270 | | /// |
271 | | /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX` |
272 | | pub fn append_value(&mut self, value: impl AsRef<[u8]>) { |
273 | | self.append(value).expect("dictionary key overflow"); |
274 | | } |
275 | | |
276 | | /// Builds the `DictionaryArray` and reset this builder. |
277 | 0 | pub fn finish(&mut self) -> DictionaryArray<K> { |
278 | 0 | self.dedup.clear(); |
279 | 0 | let values = self.values_builder.finish(); |
280 | 0 | let keys = self.keys_builder.finish(); |
281 | | |
282 | 0 | let data_type = DataType::Dictionary( |
283 | 0 | Box::new(K::DATA_TYPE), |
284 | 0 | Box::new(FixedSizeBinary(self.byte_width)), |
285 | 0 | ); |
286 | | |
287 | 0 | let builder = keys |
288 | 0 | .into_data() |
289 | 0 | .into_builder() |
290 | 0 | .data_type(data_type) |
291 | 0 | .child_data(vec![values.into_data()]); |
292 | | |
293 | 0 | DictionaryArray::from(unsafe { builder.build_unchecked() }) |
294 | 0 | } |
295 | | |
296 | | /// Builds the `DictionaryArray` without resetting the builder. |
297 | | pub fn finish_cloned(&self) -> DictionaryArray<K> { |
298 | | let values = self.values_builder.finish_cloned(); |
299 | | let keys = self.keys_builder.finish_cloned(); |
300 | | |
301 | | let data_type = DataType::Dictionary( |
302 | | Box::new(K::DATA_TYPE), |
303 | | Box::new(FixedSizeBinary(self.byte_width)), |
304 | | ); |
305 | | |
306 | | let builder = keys |
307 | | .into_data() |
308 | | .into_builder() |
309 | | .data_type(data_type) |
310 | | .child_data(vec![values.into_data()]); |
311 | | |
312 | | DictionaryArray::from(unsafe { builder.build_unchecked() }) |
313 | | } |
314 | | |
315 | | /// Builds the `DictionaryArray` without resetting the values builder or |
316 | | /// the internal de-duplication map. |
317 | | /// |
318 | | /// The advantage of doing this is that the values will represent the entire |
319 | | /// set of what has been built so-far by this builder and ensures |
320 | | /// consistency in the assignment of keys to values across multiple calls |
321 | | /// to `finish_preserve_values`. This enables ipc writers to efficiently |
322 | | /// emit delta dictionaries. |
323 | | /// |
324 | | /// The downside to this is that building the record requires creating a |
325 | | /// copy of the values, which can become slowly more expensive if the |
326 | | /// dictionary grows. |
327 | | /// |
328 | | /// Additionally, if record batches from multiple different dictionary |
329 | | /// builders for the same column are fed into a single ipc writer, beware |
330 | | /// that entire dictionaries are likely to be re-sent frequently even when |
331 | | /// the majority of the values are not used by the current record batch. |
332 | | pub fn finish_preserve_values(&mut self) -> DictionaryArray<K> { |
333 | | let values = self.values_builder.finish_cloned(); |
334 | | let keys = self.keys_builder.finish(); |
335 | | |
336 | | let data_type = DataType::Dictionary( |
337 | | Box::new(K::DATA_TYPE), |
338 | | Box::new(FixedSizeBinary(self.byte_width)), |
339 | | ); |
340 | | |
341 | | let builder = keys |
342 | | .into_data() |
343 | | .into_builder() |
344 | | .data_type(data_type) |
345 | | .child_data(vec![values.into_data()]); |
346 | | |
347 | | DictionaryArray::from(unsafe { builder.build_unchecked() }) |
348 | | } |
349 | | } |
350 | | |
351 | 0 | fn get_bytes(values: &FixedSizeBinaryBuilder, byte_width: i32, idx: usize) -> &[u8] { |
352 | 0 | let values = values.values_slice(); |
353 | 0 | let start = idx * byte_width.as_usize(); |
354 | 0 | let end = idx * byte_width.as_usize() + byte_width.as_usize(); |
355 | 0 | &values[start..end] |
356 | 0 | } |
357 | | |
#[cfg(test)]
mod tests {
    use super::*;

    use crate::types::{Int16Type, Int32Type, Int8Type, UInt16Type, UInt8Type};
    use crate::{ArrowPrimitiveType, FixedSizeBinaryArray, Int8Array};

    /// Appending values and nulls yields de-duplicated values and matching keys.
    #[test]
    fn test_fixed_size_dictionary_builder() {
        let values = ["abc", "def"];

        let mut b = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);
        assert_eq!(b.append(values[0]).unwrap(), 0);
        b.append_null();
        assert_eq!(b.append(values[1]).unwrap(), 1);
        assert_eq!(b.append(values[1]).unwrap(), 1);
        assert_eq!(b.append(values[0]).unwrap(), 0);
        b.append_nulls(2);
        assert_eq!(b.append(values[0]).unwrap(), 0);
        let array = b.finish();

        assert_eq!(
            array.keys(),
            &Int8Array::from(vec![
                Some(0),
                None,
                Some(1),
                Some(1),
                Some(0),
                None,
                None,
                Some(0)
            ]),
        );

        // Values are polymorphic and so require a downcast.
        let ava = array
            .values()
            .as_any()
            .downcast_ref::<FixedSizeBinaryArray>()
            .unwrap();

        assert_eq!(ava.value(0), values[0].as_bytes());
        assert_eq!(ava.value(1), values[1].as_bytes());
    }

    /// Values whose length differs from the byte width are rejected with an error.
    #[test]
    fn test_fixed_size_dictionary_builder_wrong_size() {
        let mut b = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);
        let err = b.append(b"too long").unwrap_err().to_string();
        assert_eq!(err, "Invalid argument error: Invalid input length passed to FixedSizeBinaryBuilder. Expected 3 got 8");
        let err = b.append("").unwrap_err().to_string();
        assert_eq!(err, "Invalid argument error: Invalid input length passed to FixedSizeBinaryBuilder. Expected 3 got 0");
    }

    /// `finish_cloned` leaves the builder usable; a later `finish` sees all data.
    #[test]
    fn test_fixed_size_dictionary_builder_finish_cloned() {
        let values = ["abc", "def", "ghi"];

        let mut builder = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);

        builder.append(values[0]).unwrap();
        builder.append_null();
        builder.append(values[1]).unwrap();
        builder.append(values[1]).unwrap();
        builder.append(values[0]).unwrap();
        let mut array = builder.finish_cloned();

        assert_eq!(
            array.keys(),
            &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)])
        );

        // Values are polymorphic and so require a downcast.
        let ava = array
            .values()
            .as_any()
            .downcast_ref::<FixedSizeBinaryArray>()
            .unwrap();

        assert_eq!(ava.value(0), values[0].as_bytes());
        assert_eq!(ava.value(1), values[1].as_bytes());

        builder.append(values[0]).unwrap();
        builder.append(values[2]).unwrap();
        builder.append(values[1]).unwrap();

        array = builder.finish();

        assert_eq!(
            array.keys(),
            &Int8Array::from(vec![
                Some(0),
                None,
                Some(1),
                Some(1),
                Some(0),
                Some(0),
                Some(2),
                Some(1)
            ])
        );

        // Values are polymorphic and so require a downcast.
        let ava2 = array
            .values()
            .as_any()
            .downcast_ref::<FixedSizeBinaryArray>()
            .unwrap();

        assert_eq!(ava2.value(0), values[0].as_bytes());
        assert_eq!(ava2.value(1), values[1].as_bytes());
        assert_eq!(ava2.value(2), values[2].as_bytes());
    }

    /// Builds a `K1`-keyed builder from the first three `values` (plus a null),
    /// converts it to a `K2`-keyed builder and checks keys and values survive.
    fn _test_try_new_from_builder_generic_for_key_types<K1, K2>(values: Vec<[u8; 3]>)
    where
        K1: ArrowDictionaryKeyType,
        K1::Native: NumCast,
        K2: ArrowDictionaryKeyType,
        K2::Native: NumCast + From<u8>,
    {
        let mut source = FixedSizeBinaryDictionaryBuilder::<K1>::new(3);
        source.append_value(values[0]);
        source.append_null();
        source.append_value(values[1]);
        source.append_value(values[2]);

        let mut result =
            FixedSizeBinaryDictionaryBuilder::<K2>::try_new_from_builder(source).unwrap();
        let array = result.finish();

        let mut expected_keys_builder = PrimitiveBuilder::<K2>::new();
        expected_keys_builder
            .append_value(<<K2 as ArrowPrimitiveType>::Native as From<u8>>::from(0u8));
        expected_keys_builder.append_null();
        expected_keys_builder
            .append_value(<<K2 as ArrowPrimitiveType>::Native as From<u8>>::from(1u8));
        expected_keys_builder
            .append_value(<<K2 as ArrowPrimitiveType>::Native as From<u8>>::from(2u8));
        let expected_keys = expected_keys_builder.finish();
        assert_eq!(array.keys(), &expected_keys);

        let av = array.values();
        let ava = av.as_any().downcast_ref::<FixedSizeBinaryArray>().unwrap();
        assert_eq!(ava.value(0), values[0]);
        assert_eq!(ava.value(1), values[1]);
        assert_eq!(ava.value(2), values[2]);
    }

    #[test]
    fn test_try_new_from_builder() {
        let values = vec![[1, 2, 3], [5, 6, 7], [6, 7, 8]];
        // test cast to bigger size unsigned
        _test_try_new_from_builder_generic_for_key_types::<UInt8Type, UInt16Type>(values.clone());
        // test cast going to smaller size unsigned
        _test_try_new_from_builder_generic_for_key_types::<UInt16Type, UInt8Type>(values.clone());
        // test cast going to bigger size signed
        _test_try_new_from_builder_generic_for_key_types::<Int8Type, Int16Type>(values.clone());
        // test cast going to smaller size signed
        _test_try_new_from_builder_generic_for_key_types::<Int32Type, Int16Type>(values.clone());
        // test going between signed and unsigned for different size changes
        _test_try_new_from_builder_generic_for_key_types::<UInt8Type, Int16Type>(values.clone());
        _test_try_new_from_builder_generic_for_key_types::<Int8Type, UInt8Type>(values.clone());
        _test_try_new_from_builder_generic_for_key_types::<Int8Type, UInt16Type>(values.clone());
        // signed to a smaller unsigned type (previously a repeat of Int32 -> Int16)
        _test_try_new_from_builder_generic_for_key_types::<Int32Type, UInt16Type>(values.clone());
    }

    #[test]
    fn test_try_new_from_builder_cast_fails() {
        let mut source_builder = FixedSizeBinaryDictionaryBuilder::<UInt16Type>::new(2);
        for i in 0u16..257u16 {
            source_builder.append_value(vec![(i >> 8) as u8, i as u8]);
        }

        // there should be too many values that we can't downcast to the underlying type
        // we have keys that wouldn't fit into UInt8Type
        let result =
            FixedSizeBinaryDictionaryBuilder::<UInt8Type>::try_new_from_builder(source_builder);
        match result {
            Err(e) => {
                assert!(matches!(e, ArrowError::CastError(_)));
                assert_eq!(
                    e.to_string(),
                    "Cast error: Can't cast dictionary keys from source type UInt16 to type UInt8"
                );
            }
            Ok(_) => panic!("expected the key cast from UInt16 to UInt8 to fail"),
        }
    }

    #[test]
    fn test_finish_preserve_values() {
        // Create the first dictionary
        let mut builder = FixedSizeBinaryDictionaryBuilder::<Int32Type>::new(3);
        builder.append_value("aaa");
        builder.append_value("bbb");
        builder.append_value("ccc");
        let dict = builder.finish_preserve_values();
        assert_eq!(dict.keys().values(), &[0, 1, 2]);
        let values = dict
            .downcast_dict::<FixedSizeBinaryArray>()
            .unwrap()
            .into_iter()
            .collect::<Vec<_>>();
        assert_eq!(
            values,
            vec![
                Some("aaa".as_bytes()),
                Some("bbb".as_bytes()),
                Some("ccc".as_bytes())
            ]
        );

        // Create a new dictionary
        builder.append_value("ddd");
        builder.append_value("eee");
        let dict2 = builder.finish_preserve_values();

        // Make sure the keys are assigned after the old ones and we have the
        // right values
        assert_eq!(dict2.keys().values(), &[3, 4]);
        let values = dict2
            .downcast_dict::<FixedSizeBinaryArray>()
            .unwrap()
            .into_iter()
            .collect::<Vec<_>>();
        assert_eq!(values, [Some("ddd".as_bytes()), Some("eee".as_bytes())]);

        // Check that we have all of the expected values
        let all_values = dict2
            .values()
            .as_any()
            .downcast_ref::<FixedSizeBinaryArray>()
            .unwrap()
            .into_iter()
            .collect::<Vec<_>>();
        assert_eq!(
            all_values,
            [
                Some("aaa".as_bytes()),
                Some("bbb".as_bytes()),
                Some("ccc".as_bytes()),
                Some("ddd".as_bytes()),
                Some("eee".as_bytes())
            ]
        );
    }
}