/Users/andrewlamb/Software/arrow-rs/arrow-data/src/transform/mod.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! Low-level array data abstractions. |
19 | | //! |
20 | | //! Provides utilities for creating, manipulating, and converting Arrow arrays |
21 | | //! made of primitive types, strings, and nested types. |
22 | | |
23 | | use super::{data::new_buffers, ArrayData, ArrayDataBuilder, ByteView}; |
24 | | use crate::bit_mask::set_bits; |
25 | | use arrow_buffer::buffer::{BooleanBuffer, NullBuffer}; |
26 | | use arrow_buffer::{bit_util, i256, ArrowNativeType, Buffer, MutableBuffer}; |
27 | | use arrow_schema::{ArrowError, DataType, IntervalUnit, UnionMode}; |
28 | | use half::f16; |
29 | | use num::Integer; |
30 | | use std::mem; |
31 | | |
32 | | mod boolean; |
33 | | mod fixed_binary; |
34 | | mod fixed_size_list; |
35 | | mod list; |
36 | | mod null; |
37 | | mod primitive; |
38 | | mod run; |
39 | | mod structure; |
40 | | mod union; |
41 | | mod utils; |
42 | | mod variable_size; |
43 | | |
44 | | type ExtendNullBits<'a> = Box<dyn Fn(&mut _MutableArrayData, usize, usize) + 'a>; |
45 | | // function that extends `[start..start+len]` to the mutable array. |
46 | | // this is dynamic because different data_types influence how buffers and children are extended. |
47 | | type Extend<'a> = Box<dyn Fn(&mut _MutableArrayData, usize, usize, usize) + 'a>; |
48 | | |
49 | | type ExtendNulls = Box<dyn Fn(&mut _MutableArrayData, usize)>; |
50 | | |
51 | | /// A mutable [ArrayData] that knows how to freeze itself into an [ArrayData]. |
52 | | /// This is just a data container. |
53 | | #[derive(Debug)] |
54 | | struct _MutableArrayData<'a> { |
55 | | pub data_type: DataType, |
56 | | pub null_count: usize, |
57 | | |
58 | | pub len: usize, |
59 | | pub null_buffer: Option<MutableBuffer>, |
60 | | |
61 | | // arrow specification only allows up to 3 buffers (2 ignoring the nulls above). |
62 | | // Thus, we place them in the stack to avoid bounds checks and to improve data locality. |
63 | | pub buffer1: MutableBuffer, |
64 | | pub buffer2: MutableBuffer, |
65 | | pub child_data: Vec<MutableArrayData<'a>>, |
66 | | } |
67 | | |
68 | | impl _MutableArrayData<'_> { |
69 | 47 | fn null_buffer(&mut self) -> &mut MutableBuffer { |
70 | 47 | self.null_buffer |
71 | 47 | .as_mut() |
72 | 47 | .expect("MutableArrayData not nullable") |
73 | 47 | } |
74 | | } |
75 | | |
76 | 51 | fn build_extend_null_bits(array: &ArrayData, use_nulls: bool) -> ExtendNullBits<'_> { |
77 | 51 | if let Some(nulls12 ) = array.nulls() { |
78 | 12 | let bytes = nulls.validity(); |
79 | 12 | Box::new(move |mutable, start, len| { |
80 | 12 | let mutable_len = mutable.len; |
81 | 12 | let out = mutable.null_buffer(); |
82 | 12 | utils::resize_for_bits(out, mutable_len + len); |
83 | 12 | mutable.null_count += set_bits( |
84 | 12 | out.as_slice_mut(), |
85 | 12 | bytes, |
86 | 12 | mutable_len, |
87 | 12 | nulls.offset() + start, |
88 | 12 | len, |
89 | 12 | ); |
90 | 12 | }) |
91 | 39 | } else if use_nulls { |
92 | 35 | Box::new(|mutable, _, len| { |
93 | 35 | let mutable_len = mutable.len; |
94 | 35 | let out = mutable.null_buffer(); |
95 | 35 | utils::resize_for_bits(out, mutable_len + len); |
96 | 35 | let write_data = out.as_slice_mut(); |
97 | 45 | (0..len)35 .for_each35 (|i| { |
98 | 45 | bit_util::set_bit(write_data, mutable_len + i); |
99 | 45 | }); |
100 | 35 | }) |
101 | | } else { |
102 | 4 | Box::new(|_, _, _| {}) |
103 | | } |
104 | 51 | } |
105 | | |
106 | | /// Efficiently create an [ArrayData] from one or more existing [ArrayData]s by |
107 | | /// copying chunks. |
108 | | /// |
109 | | /// The main use case of this struct is to perform unary operations to arrays of |
110 | | /// arbitrary types, such as `filter` and `take`. |
111 | | /// |
112 | | /// # Example |
113 | | /// ``` |
114 | | /// use arrow_buffer::Buffer; |
115 | | /// use arrow_data::ArrayData; |
116 | | /// use arrow_data::transform::MutableArrayData; |
117 | | /// use arrow_schema::DataType; |
118 | | /// fn i32_array(values: &[i32]) -> ArrayData { |
119 | | /// ArrayData::try_new(DataType::Int32, 5, None, 0, vec![Buffer::from_slice_ref(values)], vec![]).unwrap() |
120 | | /// } |
121 | | /// let arr1 = i32_array(&[1, 2, 3, 4, 5]); |
122 | | /// let arr2 = i32_array(&[6, 7, 8, 9, 10]); |
123 | | /// // Create a mutable array for copying values from arr1 and arr2, with a capacity for 6 elements |
124 | | /// let capacity = 3 * std::mem::size_of::<i32>(); |
125 | | /// let mut mutable = MutableArrayData::new(vec![&arr1, &arr2], false, 10); |
126 | | /// // Copy the first 3 elements from arr1 |
127 | | /// mutable.extend(0, 0, 3); |
128 | | /// // Copy the last 3 elements from arr2 |
129 | | /// mutable.extend(1, 2, 4); |
130 | | /// // Complete the MutableArrayData into a new ArrayData |
131 | | /// let frozen = mutable.freeze(); |
132 | | /// assert_eq!(frozen, i32_array(&[1, 2, 3, 8, 9, 10])); |
133 | | /// ``` |
134 | | pub struct MutableArrayData<'a> { |
135 | | /// Input arrays: the data being read FROM. |
136 | | /// |
137 | | /// Note this is "dead code" because all actual references to the arrays are |
138 | | /// stored in closures for extending values and nulls. |
139 | | #[allow(dead_code)] |
140 | | arrays: Vec<&'a ArrayData>, |
141 | | |
142 | | /// In progress output array: The data being written TO |
143 | | /// |
144 | | /// Note these fields are in a separate struct, [_MutableArrayData], as they |
145 | | /// cannot be in [MutableArrayData] itself due to mutability invariants (interior |
146 | | /// mutability): [MutableArrayData] contains a function that can only mutate |
147 | | /// [_MutableArrayData], not [MutableArrayData] itself |
148 | | data: _MutableArrayData<'a>, |
149 | | |
150 | | /// The child data of the `Array` in Dictionary arrays. |
151 | | /// |
152 | | /// This is not stored in `_MutableArrayData` because these values are |
153 | | /// constant and only needed at the end, when freezing [_MutableArrayData]. |
154 | | dictionary: Option<ArrayData>, |
155 | | |
156 | | /// Variadic data buffers referenced by views. |
157 | | /// |
158 | | /// Note that this is not stored in `_MutableArrayData` because these values
159 | | /// are constant and only needed at the end, when freezing |
160 | | /// [_MutableArrayData] |
161 | | variadic_data_buffers: Vec<Buffer>, |
162 | | |
163 | | /// function used to extend output array with values from input arrays. |
164 | | /// |
165 | | /// This function's lifetime is bound to the input arrays because it reads |
166 | | /// values from them. |
167 | | extend_values: Vec<Extend<'a>>, |
168 | | |
169 | | /// function used to extend the output array with nulls from input arrays. |
170 | | /// |
171 | | /// This function's lifetime is bound to the input arrays because it reads |
172 | | /// nulls from it. |
173 | | extend_null_bits: Vec<ExtendNullBits<'a>>, |
174 | | |
175 | | /// function used to extend the output array with null elements. |
176 | | /// |
177 | | /// This function is independent of the arrays and therefore has no lifetime. |
178 | | extend_nulls: ExtendNulls, |
179 | | } |
180 | | |
181 | | impl std::fmt::Debug for MutableArrayData<'_> { |
182 | 0 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { |
183 | | // ignores the closures. |
184 | 0 | f.debug_struct("MutableArrayData") |
185 | 0 | .field("data", &self.data) |
186 | 0 | .finish() |
187 | 0 | } |
188 | | } |
189 | | |
190 | | /// Builds an extend that adds `offset` to the source primitive |
191 | | /// Additionally validates that `max` fits into the |
192 | | /// underlying primitive, returning `None` if it does not
193 | 0 | fn build_extend_dictionary(array: &ArrayData, offset: usize, max: usize) -> Option<Extend<'_>> { |
194 | | macro_rules! validate_and_build { |
195 | | ($dt: ty) => {{ |
196 | | let _: $dt = max.try_into().ok()?; |
197 | | let offset: $dt = offset.try_into().ok()?; |
198 | | Some(primitive::build_extend_with_offset(array, offset)) |
199 | | }}; |
200 | | } |
201 | 0 | match array.data_type() { |
202 | 0 | DataType::Dictionary(child_data_type, _) => match child_data_type.as_ref() { |
203 | 0 | DataType::UInt8 => validate_and_build!(u8), |
204 | 0 | DataType::UInt16 => validate_and_build!(u16), |
205 | 0 | DataType::UInt32 => validate_and_build!(u32), |
206 | 0 | DataType::UInt64 => validate_and_build!(u64), |
207 | 0 | DataType::Int8 => validate_and_build!(i8), |
208 | 0 | DataType::Int16 => validate_and_build!(i16), |
209 | 0 | DataType::Int32 => validate_and_build!(i32), |
210 | 0 | DataType::Int64 => validate_and_build!(i64), |
211 | 0 | _ => unreachable!(), |
212 | | }, |
213 | 0 | _ => None, |
214 | | } |
215 | 0 | } |
216 | | |
217 | | /// Builds an extend that adds `buffer_offset` to any buffer indices encountered |
218 | 0 | fn build_extend_view(array: &ArrayData, buffer_offset: u32) -> Extend<'_> { |
219 | 0 | let views = array.buffer::<u128>(0); |
220 | 0 | Box::new( |
221 | 0 | move |mutable: &mut _MutableArrayData, _, start: usize, len: usize| { |
222 | 0 | mutable |
223 | 0 | .buffer1 |
224 | 0 | .extend(views[start..start + len].iter().map(|v| { |
225 | 0 | let len = *v as u32; |
226 | 0 | if len <= 12 { |
227 | 0 | return *v; // Stored inline |
228 | 0 | } |
229 | 0 | let mut view = ByteView::from(*v); |
230 | 0 | view.buffer_index += buffer_offset; |
231 | 0 | view.into() |
232 | 0 | })) |
233 | 0 | }, |
234 | | ) |
235 | 0 | } |
236 | | |
237 | 51 | fn build_extend(array: &ArrayData) -> Extend<'_> { |
238 | 51 | match array.data_type() { |
239 | 0 | DataType::Null => null::build_extend(array), |
240 | 0 | DataType::Boolean => boolean::build_extend(array), |
241 | 0 | DataType::UInt8 => primitive::build_extend::<u8>(array), |
242 | 0 | DataType::UInt16 => primitive::build_extend::<u16>(array), |
243 | 0 | DataType::UInt32 => primitive::build_extend::<u32>(array), |
244 | 0 | DataType::UInt64 => primitive::build_extend::<u64>(array), |
245 | 0 | DataType::Int8 => primitive::build_extend::<i8>(array), |
246 | 0 | DataType::Int16 => primitive::build_extend::<i16>(array), |
247 | 6 | DataType::Int32 => primitive::build_extend::<i32>(array), |
248 | 0 | DataType::Int64 => primitive::build_extend::<i64>(array), |
249 | 0 | DataType::Float32 => primitive::build_extend::<f32>(array), |
250 | 3 | DataType::Float64 => primitive::build_extend::<f64>(array), |
251 | | DataType::Date32 | DataType::Time32(_) | DataType::Interval(IntervalUnit::YearMonth) => { |
252 | 0 | primitive::build_extend::<i32>(array) |
253 | | } |
254 | | DataType::Date64 |
255 | | | DataType::Time64(_) |
256 | | | DataType::Timestamp(_, _) |
257 | | | DataType::Duration(_) |
258 | 0 | | DataType::Interval(IntervalUnit::DayTime) => primitive::build_extend::<i64>(array), |
259 | 0 | DataType::Interval(IntervalUnit::MonthDayNano) => primitive::build_extend::<i128>(array), |
260 | 0 | DataType::Decimal32(_, _) => primitive::build_extend::<i32>(array), |
261 | 0 | DataType::Decimal64(_, _) => primitive::build_extend::<i64>(array), |
262 | 0 | DataType::Decimal128(_, _) => primitive::build_extend::<i128>(array), |
263 | 0 | DataType::Decimal256(_, _) => primitive::build_extend::<i256>(array), |
264 | 9 | DataType::Utf8 | DataType::Binary => variable_size::build_extend::<i32>(array), |
265 | 0 | DataType::LargeUtf8 | DataType::LargeBinary => variable_size::build_extend::<i64>(array), |
266 | 0 | DataType::BinaryView | DataType::Utf8View => unreachable!("should use build_extend_view"), |
267 | 12 | DataType::Map(_, _) | DataType::List(_) => list::build_extend::<i32>(array), |
268 | | DataType::ListView(_) | DataType::LargeListView(_) => { |
269 | 0 | unimplemented!("ListView/LargeListView not implemented") |
270 | | } |
271 | 0 | DataType::LargeList(_) => list::build_extend::<i64>(array), |
272 | 0 | DataType::Dictionary(_, _) => unreachable!("should use build_extend_dictionary"), |
273 | 15 | DataType::Struct(_) => structure::build_extend(array), |
274 | 6 | DataType::FixedSizeBinary(_) => fixed_binary::build_extend(array), |
275 | 0 | DataType::Float16 => primitive::build_extend::<f16>(array), |
276 | 0 | DataType::FixedSizeList(_, _) => fixed_size_list::build_extend(array), |
277 | 0 | DataType::Union(_, mode) => match mode { |
278 | 0 | UnionMode::Sparse => union::build_extend_sparse(array), |
279 | 0 | UnionMode::Dense => union::build_extend_dense(array), |
280 | | }, |
281 | 0 | DataType::RunEndEncoded(_, _) => run::build_extend(array), |
282 | | } |
283 | 51 | } |
284 | | |
285 | 18 | fn build_extend_nulls(data_type: &DataType) -> ExtendNulls { |
286 | 18 | Box::new(match data_type0 { |
287 | 0 | DataType::Null => null::extend_nulls, |
288 | 0 | DataType::Boolean => boolean::extend_nulls, |
289 | 0 | DataType::UInt8 => primitive::extend_nulls::<u8>, |
290 | 0 | DataType::UInt16 => primitive::extend_nulls::<u16>, |
291 | 0 | DataType::UInt32 => primitive::extend_nulls::<u32>, |
292 | 0 | DataType::UInt64 => primitive::extend_nulls::<u64>, |
293 | 0 | DataType::Int8 => primitive::extend_nulls::<i8>, |
294 | 0 | DataType::Int16 => primitive::extend_nulls::<i16>, |
295 | 2 | DataType::Int32 => primitive::extend_nulls::<i32>, |
296 | 0 | DataType::Int64 => primitive::extend_nulls::<i64>, |
297 | 0 | DataType::Float32 => primitive::extend_nulls::<f32>, |
298 | 1 | DataType::Float64 => primitive::extend_nulls::<f64>, |
299 | 0 | DataType::Date32 | DataType::Time32(_) | DataType::Interval(IntervalUnit::YearMonth) => { |
300 | 0 | primitive::extend_nulls::<i32> |
301 | 0 | } |
302 | | DataType::Date64 |
303 | | | DataType::Time64(_) |
304 | | | DataType::Timestamp(_, _) |
305 | | | DataType::Duration(_) |
306 | 0 | | DataType::Interval(IntervalUnit::DayTime) => primitive::extend_nulls::<i64>, |
307 | 0 | DataType::Interval(IntervalUnit::MonthDayNano) => primitive::extend_nulls::<i128>, |
308 | 0 | DataType::Decimal32(_, _) => primitive::extend_nulls::<i32>, |
309 | 0 | DataType::Decimal64(_, _) => primitive::extend_nulls::<i64>, |
310 | 0 | DataType::Decimal128(_, _) => primitive::extend_nulls::<i128>, |
311 | 0 | DataType::Decimal256(_, _) => primitive::extend_nulls::<i256>, |
312 | 3 | DataType::Utf8 | DataType::Binary => variable_size::extend_nulls::<i32>, |
313 | 0 | DataType::LargeUtf8 | DataType::LargeBinary => variable_size::extend_nulls::<i64>, |
314 | 0 | DataType::BinaryView | DataType::Utf8View => primitive::extend_nulls::<u128>, |
315 | 4 | DataType::Map(_, _) | DataType::List(_) => list::extend_nulls::<i32>, |
316 | | DataType::ListView(_) | DataType::LargeListView(_) => { |
317 | 0 | unimplemented!("ListView/LargeListView not implemented") |
318 | | } |
319 | 0 | DataType::LargeList(_) => list::extend_nulls::<i64>, |
320 | 0 | DataType::Dictionary(child_data_type, _) => match child_data_type.as_ref() { |
321 | 0 | DataType::UInt8 => primitive::extend_nulls::<u8>, |
322 | 0 | DataType::UInt16 => primitive::extend_nulls::<u16>, |
323 | 0 | DataType::UInt32 => primitive::extend_nulls::<u32>, |
324 | 0 | DataType::UInt64 => primitive::extend_nulls::<u64>, |
325 | 0 | DataType::Int8 => primitive::extend_nulls::<i8>, |
326 | 0 | DataType::Int16 => primitive::extend_nulls::<i16>, |
327 | 0 | DataType::Int32 => primitive::extend_nulls::<i32>, |
328 | 0 | DataType::Int64 => primitive::extend_nulls::<i64>, |
329 | 0 | _ => unreachable!(), |
330 | | }, |
331 | 5 | DataType::Struct(_) => structure::extend_nulls, |
332 | 3 | DataType::FixedSizeBinary(_) => fixed_binary::extend_nulls, |
333 | 0 | DataType::Float16 => primitive::extend_nulls::<f16>, |
334 | 0 | DataType::FixedSizeList(_, _) => fixed_size_list::extend_nulls, |
335 | 0 | DataType::Union(_, mode) => match mode { |
336 | 0 | UnionMode::Sparse => union::extend_nulls_sparse, |
337 | 0 | UnionMode::Dense => union::extend_nulls_dense, |
338 | | }, |
339 | 0 | DataType::RunEndEncoded(_, _) => run::extend_nulls, |
340 | | }) |
341 | 18 | } |
342 | | |
343 | 0 | fn preallocate_offset_and_binary_buffer<Offset: ArrowNativeType + Integer>( |
344 | 0 | capacity: usize, |
345 | 0 | binary_size: usize, |
346 | 0 | ) -> [MutableBuffer; 2] { |
347 | | // offsets |
348 | 0 | let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<Offset>()); |
349 | | // safety: `unsafe` code assumes that this buffer is initialized with one element |
350 | 0 | buffer.push(Offset::zero()); |
351 | | |
352 | 0 | [ |
353 | 0 | buffer, |
354 | 0 | MutableBuffer::new(binary_size * mem::size_of::<u8>()), |
355 | 0 | ] |
356 | 0 | } |
357 | | |
358 | | /// Define capacities to pre-allocate for child data or data buffers. |
359 | | #[derive(Debug, Clone)] |
360 | | pub enum Capacities { |
361 | | /// Binary, Utf8 and LargeUtf8 data types |
362 | | /// |
363 | | /// Defines |
364 | | /// * the capacity of the array offsets |
365 | | /// * the capacity of the binary/ str buffer |
366 | | Binary(usize, Option<usize>), |
367 | | /// List and LargeList data types |
368 | | /// |
369 | | /// Defines |
370 | | /// * the capacity of the array offsets |
371 | | /// * the capacity of the child data |
372 | | List(usize, Option<Box<Capacities>>), |
373 | | /// Struct type |
374 | | /// |
375 | | /// Defines |
376 | | /// * the capacity of the array |
377 | | /// * the capacities of the fields |
378 | | Struct(usize, Option<Vec<Capacities>>), |
379 | | /// Dictionary type |
380 | | /// |
381 | | /// Defines |
382 | | /// * the capacity of the array/keys |
383 | | /// * the capacity of the values |
384 | | Dictionary(usize, Option<Box<Capacities>>), |
385 | | /// Don't preallocate inner buffers and rely on array growth strategy |
386 | | Array(usize), |
387 | | } |
388 | | |
389 | | impl<'a> MutableArrayData<'a> { |
390 | | /// Returns a new [MutableArrayData] with capacity to `capacity` slots and |
391 | | /// specialized to create an [ArrayData] from multiple `arrays`. |
392 | | /// |
393 | | /// # Arguments |
394 | | /// * `arrays` - the source arrays to copy from |
395 | | /// * `use_nulls` - a flag used to optimize insertions |
396 | | /// - `false` if the only source of nulls are the arrays themselves |
397 | | /// - `true` if the user plans to call [MutableArrayData::extend_nulls]. |
398 | | /// * `capacity` - the preallocated capacity of the output array, in number of elements (slots)
399 | | /// |
400 | | /// Thus, if `use_nulls` is `false`,
401 | | /// [MutableArrayData::extend_nulls] should not be called.
402 | 8 | pub fn new(arrays: Vec<&'a ArrayData>, use_nulls: bool, capacity: usize) -> Self { |
403 | 8 | Self::with_capacities(arrays, use_nulls, Capacities::Array(capacity)) |
404 | 8 | } |
405 | | |
406 | | /// Similar to [MutableArrayData::new], but lets users define the |
407 | | /// preallocated capacities of the array with more granularity. |
408 | | /// |
409 | | /// See [MutableArrayData::new] for more information on the arguments. |
410 | | /// |
411 | | /// # Panics |
412 | | /// |
413 | | /// This function panics if the given `capacities` don't match the data type
414 | | /// of `arrays`, or if a [Capacities] variant is not yet supported.
415 | 18 | pub fn with_capacities( |
416 | 18 | arrays: Vec<&'a ArrayData>, |
417 | 18 | use_nulls: bool, |
418 | 18 | capacities: Capacities, |
419 | 18 | ) -> Self { |
420 | 18 | let data_type = arrays[0].data_type(); |
421 | | |
422 | 33 | for a in arrays.iter()18 .skip18 (1) { |
423 | 33 | assert_eq!( |
424 | | data_type, |
425 | 33 | a.data_type(), |
426 | 0 | "Arrays with inconsistent types passed to MutableArrayData" |
427 | | ) |
428 | | } |
429 | | |
430 | | // if any of the arrays has nulls, insertions from any array require setting bits
431 | | // as there is at least one array with nulls. |
432 | 35 | let use_nulls18 = use_nulls18 | arrays.iter()18 .any18 (|array| array.null_count() > 0); |
433 | | |
434 | | let mut array_capacity; |
435 | | |
436 | 18 | let [buffer1, buffer2] = match (data_type, &capacities) { |
437 | | ( |
438 | | DataType::LargeUtf8 | DataType::LargeBinary, |
439 | 0 | Capacities::Binary(capacity, Some(value_cap)), |
440 | | ) => { |
441 | 0 | array_capacity = *capacity; |
442 | 0 | preallocate_offset_and_binary_buffer::<i64>(*capacity, *value_cap) |
443 | | } |
444 | 0 | (DataType::Utf8 | DataType::Binary, Capacities::Binary(capacity, Some(value_cap))) => { |
445 | 0 | array_capacity = *capacity; |
446 | 0 | preallocate_offset_and_binary_buffer::<i32>(*capacity, *value_cap) |
447 | | } |
448 | 18 | (_, Capacities::Array(capacity)) => { |
449 | 18 | array_capacity = *capacity; |
450 | 18 | new_buffers(data_type, *capacity) |
451 | | } |
452 | | ( |
453 | | DataType::List(_) | DataType::LargeList(_) | DataType::FixedSizeList(_, _), |
454 | 0 | Capacities::List(capacity, _), |
455 | | ) => { |
456 | 0 | array_capacity = *capacity; |
457 | 0 | new_buffers(data_type, *capacity) |
458 | | } |
459 | 0 | _ => panic!("Capacities: {capacities:?} not yet supported"), |
460 | | }; |
461 | | |
462 | 18 | let child_data = match &data_type { |
463 | | DataType::Decimal32(_, _) |
464 | | | DataType::Decimal64(_, _) |
465 | | | DataType::Decimal128(_, _) |
466 | | | DataType::Decimal256(_, _) |
467 | | | DataType::Null |
468 | | | DataType::Boolean |
469 | | | DataType::UInt8 |
470 | | | DataType::UInt16 |
471 | | | DataType::UInt32 |
472 | | | DataType::UInt64 |
473 | | | DataType::Int8 |
474 | | | DataType::Int16 |
475 | | | DataType::Int32 |
476 | | | DataType::Int64 |
477 | | | DataType::Float16 |
478 | | | DataType::Float32 |
479 | | | DataType::Float64 |
480 | | | DataType::Date32 |
481 | | | DataType::Date64 |
482 | | | DataType::Time32(_) |
483 | | | DataType::Time64(_) |
484 | | | DataType::Duration(_) |
485 | | | DataType::Timestamp(_, _) |
486 | | | DataType::Utf8 |
487 | | | DataType::Binary |
488 | | | DataType::LargeUtf8 |
489 | | | DataType::LargeBinary |
490 | | | DataType::BinaryView |
491 | | | DataType::Utf8View |
492 | | | DataType::Interval(_) |
493 | 9 | | DataType::FixedSizeBinary(_) => vec![], |
494 | | DataType::ListView(_) | DataType::LargeListView(_) => { |
495 | 0 | unimplemented!("ListView/LargeListView not implemented") |
496 | | } |
497 | | DataType::Map(_, _) | DataType::List(_) | DataType::LargeList(_) => { |
498 | 4 | let children = arrays |
499 | 4 | .iter() |
500 | 12 | .map4 (|array| &array.child_data()[0]) |
501 | 4 | .collect::<Vec<_>>(); |
502 | | |
503 | 4 | let capacities = |
504 | 4 | if let Capacities::List(capacity0 , ref child_capacities0 ) = capacities { |
505 | 0 | child_capacities |
506 | 0 | .clone() |
507 | 0 | .map(|c| *c) |
508 | 0 | .unwrap_or(Capacities::Array(capacity)) |
509 | | } else { |
510 | 4 | Capacities::Array(array_capacity) |
511 | | }; |
512 | | |
513 | 4 | vec![MutableArrayData::with_capacities( |
514 | 4 | children, use_nulls, capacities, |
515 | | )] |
516 | | } |
517 | | // the dictionary type just appends keys and clones the values. |
518 | 0 | DataType::Dictionary(_, _) => vec![], |
519 | 5 | DataType::Struct(fields) => match capacities0 { |
520 | 0 | Capacities::Struct(capacity, Some(ref child_capacities)) => { |
521 | 0 | array_capacity = capacity; |
522 | 0 | (0..fields.len()) |
523 | 0 | .zip(child_capacities) |
524 | 0 | .map(|(i, child_cap)| { |
525 | 0 | let child_arrays = arrays |
526 | 0 | .iter() |
527 | 0 | .map(|array| &array.child_data()[i]) |
528 | 0 | .collect::<Vec<_>>(); |
529 | 0 | MutableArrayData::with_capacities( |
530 | 0 | child_arrays, |
531 | 0 | use_nulls, |
532 | 0 | child_cap.clone(), |
533 | | ) |
534 | 0 | }) |
535 | 0 | .collect::<Vec<_>>() |
536 | | } |
537 | 0 | Capacities::Struct(capacity, None) => { |
538 | 0 | array_capacity = capacity; |
539 | 0 | (0..fields.len()) |
540 | 0 | .map(|i| { |
541 | 0 | let child_arrays = arrays |
542 | 0 | .iter() |
543 | 0 | .map(|array| &array.child_data()[i]) |
544 | 0 | .collect::<Vec<_>>(); |
545 | 0 | MutableArrayData::new(child_arrays, use_nulls, capacity) |
546 | 0 | }) |
547 | 0 | .collect::<Vec<_>>() |
548 | | } |
549 | 5 | _ => (0..fields.len()) |
550 | 8 | .map5 (|i| { |
551 | 8 | let child_arrays = arrays |
552 | 8 | .iter() |
553 | 24 | .map8 (|array| &array.child_data()[i]) |
554 | 8 | .collect::<Vec<_>>(); |
555 | 8 | MutableArrayData::new(child_arrays, use_nulls, array_capacity) |
556 | 8 | }) |
557 | 5 | .collect::<Vec<_>>(), |
558 | | }, |
559 | | DataType::RunEndEncoded(_, _) => { |
560 | 0 | let run_ends_child = arrays |
561 | 0 | .iter() |
562 | 0 | .map(|array| &array.child_data()[0]) |
563 | 0 | .collect::<Vec<_>>(); |
564 | 0 | let value_child = arrays |
565 | 0 | .iter() |
566 | 0 | .map(|array| &array.child_data()[1]) |
567 | 0 | .collect::<Vec<_>>(); |
568 | 0 | vec![ |
569 | 0 | MutableArrayData::new(run_ends_child, false, array_capacity), |
570 | 0 | MutableArrayData::new(value_child, use_nulls, array_capacity), |
571 | | ] |
572 | | } |
573 | 0 | DataType::FixedSizeList(_, size) => { |
574 | 0 | let children = arrays |
575 | 0 | .iter() |
576 | 0 | .map(|array| &array.child_data()[0]) |
577 | 0 | .collect::<Vec<_>>(); |
578 | 0 | let capacities = |
579 | 0 | if let Capacities::List(capacity, ref child_capacities) = capacities { |
580 | 0 | child_capacities |
581 | 0 | .clone() |
582 | 0 | .map(|c| *c) |
583 | 0 | .unwrap_or(Capacities::Array(capacity * *size as usize)) |
584 | | } else { |
585 | 0 | Capacities::Array(array_capacity * *size as usize) |
586 | | }; |
587 | 0 | vec![MutableArrayData::with_capacities( |
588 | 0 | children, use_nulls, capacities, |
589 | | )] |
590 | | } |
591 | 0 | DataType::Union(fields, _) => (0..fields.len()) |
592 | 0 | .map(|i| { |
593 | 0 | let child_arrays = arrays |
594 | 0 | .iter() |
595 | 0 | .map(|array| &array.child_data()[i]) |
596 | 0 | .collect::<Vec<_>>(); |
597 | 0 | MutableArrayData::new(child_arrays, use_nulls, array_capacity) |
598 | 0 | }) |
599 | 0 | .collect::<Vec<_>>(), |
600 | | }; |
601 | | |
602 | | // Get the dictionary, if any, and whether it is a concatenation of multiple dictionaries
603 | 18 | let (dictionary, dict_concat) = match &data_type { |
604 | | DataType::Dictionary(_, _) => { |
605 | | // If more than one dictionary, concatenate dictionaries together |
606 | 0 | let dict_concat = !arrays |
607 | 0 | .windows(2) |
608 | 0 | .all(|a| a[0].child_data()[0].ptr_eq(&a[1].child_data()[0])); |
609 | | |
610 | 0 | match dict_concat { |
611 | 0 | false => (Some(arrays[0].child_data()[0].clone()), false), |
612 | | true => { |
613 | 0 | if let Capacities::Dictionary(_, _) = capacities { |
614 | 0 | panic!("dictionary capacity not yet supported") |
615 | 0 | } |
616 | 0 | let dictionaries: Vec<_> = |
617 | 0 | arrays.iter().map(|array| &array.child_data()[0]).collect(); |
618 | 0 | let lengths: Vec<_> = dictionaries |
619 | 0 | .iter() |
620 | 0 | .map(|dictionary| dictionary.len()) |
621 | 0 | .collect(); |
622 | 0 | let capacity = lengths.iter().sum(); |
623 | | |
624 | 0 | let mut mutable = MutableArrayData::new(dictionaries, false, capacity); |
625 | | |
626 | 0 | for (i, len) in lengths.iter().enumerate() { |
627 | 0 | mutable.extend(i, 0, *len) |
628 | | } |
629 | | |
630 | 0 | (Some(mutable.freeze()), true) |
631 | | } |
632 | | } |
633 | | } |
634 | 18 | _ => (None, false), |
635 | | }; |
636 | | |
637 | 18 | let variadic_data_buffers = match &data_type { |
638 | 0 | DataType::BinaryView | DataType::Utf8View => arrays |
639 | 0 | .iter() |
640 | 0 | .flat_map(|x| x.buffers().iter().skip(1)) |
641 | 0 | .map(Buffer::clone) |
642 | 0 | .collect(), |
643 | 18 | _ => vec![], |
644 | | }; |
645 | | |
646 | 18 | let extend_nulls = build_extend_nulls(data_type); |
647 | | |
648 | 18 | let extend_null_bits = arrays |
649 | 18 | .iter() |
650 | 51 | .map18 (|array| build_extend_null_bits(array, use_nulls)) |
651 | 18 | .collect(); |
652 | | |
653 | 18 | let null_buffer = use_nulls.then(|| {16 |
654 | 16 | let null_bytes = bit_util::ceil(array_capacity, 8); |
655 | 16 | MutableBuffer::from_len_zeroed(null_bytes) |
656 | 16 | }); |
657 | | |
658 | 18 | let extend_values = match &data_type { |
659 | | DataType::Dictionary(_, _) => { |
660 | 0 | let mut next_offset = 0; |
661 | 0 | let extend_values: Result<Vec<_>, _> = arrays |
662 | 0 | .iter() |
663 | 0 | .map(|array| { |
664 | 0 | let offset = next_offset; |
665 | 0 | let dict_len = array.child_data()[0].len(); |
666 | | |
667 | 0 | if dict_concat { |
668 | 0 | next_offset += dict_len; |
669 | 0 | } |
670 | | |
671 | 0 | build_extend_dictionary(array, offset, offset + dict_len) |
672 | 0 | .ok_or(ArrowError::DictionaryKeyOverflowError) |
673 | 0 | }) |
674 | 0 | .collect(); |
675 | | |
676 | 0 | extend_values.expect("MutableArrayData::new is infallible") |
677 | | } |
678 | | DataType::BinaryView | DataType::Utf8View => { |
679 | 0 | let mut next_offset = 0u32; |
680 | 0 | arrays |
681 | 0 | .iter() |
682 | 0 | .map(|arr| { |
683 | 0 | let num_data_buffers = (arr.buffers().len() - 1) as u32; |
684 | 0 | let offset = next_offset; |
685 | 0 | next_offset = next_offset |
686 | 0 | .checked_add(num_data_buffers) |
687 | 0 | .expect("view buffer index overflow"); |
688 | 0 | build_extend_view(arr, offset) |
689 | 0 | }) |
690 | 0 | .collect() |
691 | | } |
692 | 51 | _ => arrays.iter()18 .map18 (|array| build_extend(array)).collect18 (), |
693 | | }; |
694 | | |
695 | 18 | let data = _MutableArrayData { |
696 | 18 | data_type: data_type.clone(), |
697 | 18 | len: 0, |
698 | 18 | null_count: 0, |
699 | 18 | null_buffer, |
700 | 18 | buffer1, |
701 | 18 | buffer2, |
702 | 18 | child_data, |
703 | 18 | }; |
704 | 18 | Self { |
705 | 18 | arrays, |
706 | 18 | data, |
707 | 18 | dictionary, |
708 | 18 | variadic_data_buffers, |
709 | 18 | extend_values, |
710 | 18 | extend_null_bits, |
711 | 18 | extend_nulls, |
712 | 18 | } |
713 | 18 | } |
714 | | |
715 | | /// Extends the in progress array with a region of the input arrays |
716 | | /// |
717 | | /// # Arguments |
718 | | /// * `index` - the index of the array that you want to copy values from
719 | | /// * `start` - the start index of the chunk (inclusive) |
720 | | /// * `end` - the end index of the chunk (exclusive) |
721 | | /// |
722 | | /// # Panic |
723 | | /// This function panics if there is an invalid index, |
724 | | /// i.e. `index` >= the number of source arrays |
725 | | /// or `end` > the length of the `index`th array |
726 | 51 | pub fn extend(&mut self, index: usize, start: usize, end: usize) { |
727 | 51 | let len = end - start; |
728 | 51 | (self.extend_null_bits[index])(&mut self.data, start, len); |
729 | 51 | (self.extend_values[index])(&mut self.data, index, start, len); |
730 | 51 | self.data.len += len; |
731 | 51 | } |
732 | | |
733 | | /// Extends the in progress array with null elements, ignoring the input arrays. |
734 | | /// |
735 | | /// # Panics |
736 | | /// |
737 | | /// Panics if [`MutableArrayData`] was not created with `use_nulls` or nullable source arrays
738 | 0 | pub fn extend_nulls(&mut self, len: usize) { |
739 | 0 | self.data.len += len; |
740 | 0 | let bit_len = bit_util::ceil(self.data.len, 8); |
741 | 0 | let nulls = self.data.null_buffer(); |
742 | 0 | nulls.resize(bit_len, 0); |
743 | 0 | self.data.null_count += len; |
744 | 0 | (self.extend_nulls)(&mut self.data, len); |
745 | 0 | } |
746 | | |
747 | | /// Returns the current length |
748 | | #[inline] |
749 | 0 | pub fn len(&self) -> usize { |
750 | 0 | self.data.len |
751 | 0 | } |
752 | | |
753 | | /// Returns true if len is 0 |
754 | | #[inline] |
755 | | pub fn is_empty(&self) -> bool { |
756 | | self.data.len == 0 |
757 | | } |
758 | | |
759 | | /// Returns the current null count |
760 | | #[inline] |
761 | | pub fn null_count(&self) -> usize { |
762 | | self.data.null_count |
763 | | } |
764 | | |
765 | | /// Creates an [ArrayData] from the in progress array, consuming `self`.
766 | 18 | pub fn freeze(self) -> ArrayData { |
767 | 18 | unsafe { self.into_builder().build_unchecked() } |
768 | 18 | } |
769 | | |
770 | | /// Consumes `self` and returns the in progress array as [`ArrayDataBuilder`].
771 | | /// |
772 | | /// This is useful for extending the default behavior of MutableArrayData. |
773 | 18 | pub fn into_builder(self) -> ArrayDataBuilder { |
774 | 18 | let data = self.data; |
775 | | |
776 | 18 | let buffers = match data.data_type { |
777 | | DataType::Null |
778 | | | DataType::Struct(_) |
779 | | | DataType::FixedSizeList(_, _) |
780 | | | DataType::RunEndEncoded(_, _) => { |
781 | 5 | vec![] |
782 | | } |
783 | | DataType::BinaryView | DataType::Utf8View => { |
784 | 0 | let mut b = self.variadic_data_buffers; |
785 | 0 | b.insert(0, data.buffer1.into()); |
786 | 0 | b |
787 | | } |
788 | | DataType::Utf8 | DataType::Binary | DataType::LargeUtf8 | DataType::LargeBinary => { |
789 | 3 | vec![data.buffer1.into(), data.buffer2.into()] |
790 | | } |
791 | 0 | DataType::Union(_, mode) => { |
792 | 0 | match mode { |
793 | | // Based on Union's DataTypeLayout |
794 | 0 | UnionMode::Sparse => vec![data.buffer1.into()], |
795 | 0 | UnionMode::Dense => vec![data.buffer1.into(), data.buffer2.into()], |
796 | | } |
797 | | } |
798 | 10 | _ => vec![data.buffer1.into()], |
799 | | }; |
800 | | |
801 | 18 | let child_data = match data.data_type { |
802 | 0 | DataType::Dictionary(_, _) => vec![self.dictionary.unwrap()], |
803 | 18 | _ => data.child_data.into_iter().map(|x| x12 .freeze12 ()).collect(), |
804 | | }; |
805 | | |
806 | 18 | let nulls = match data.data_type { |
807 | | // RunEndEncoded and Null arrays cannot have top-level null bitmasks |
808 | 0 | DataType::RunEndEncoded(_, _) | DataType::Null => None, |
809 | 18 | _ => data |
810 | 18 | .null_buffer |
811 | 18 | .map(|nulls| {16 |
812 | 16 | let bools = BooleanBuffer::new(nulls.into(), 0, data.len); |
813 | 16 | unsafe { NullBuffer::new_unchecked(bools, data.null_count) } |
814 | 16 | }) |
815 | 18 | .filter(|n| n16 .null_count16 () > 0), |
816 | | }; |
817 | | |
818 | 18 | ArrayDataBuilder::new(data.data_type) |
819 | 18 | .offset(0) |
820 | 18 | .len(data.len) |
821 | 18 | .nulls(nulls) |
822 | 18 | .buffers(buffers) |
823 | 18 | .child_data(child_data) |
824 | 18 | } |
825 | | } |
826 | | |
827 | | // See arrow/tests/array_transform.rs for tests of transform functionality |
828 | | |
829 | | #[cfg(test)] |
830 | | mod test { |
831 | | use super::*; |
832 | | use arrow_schema::Field; |
833 | | use std::sync::Arc; |
834 | | |
835 | | #[test] |
836 | | fn test_list_append_with_capacities() { |
837 | | let array = ArrayData::new_empty(&DataType::List(Arc::new(Field::new( |
838 | | "element", |
839 | | DataType::Int64, |
840 | | false, |
841 | | )))); |
842 | | |
843 | | let mutable = MutableArrayData::with_capacities( |
844 | | vec![&array], |
845 | | false, |
846 | | Capacities::List(6, Some(Box::new(Capacities::Array(17)))), |
847 | | ); |
848 | | |
849 | | // capacities are rounded up to multiples of 64 by MutableBuffer |
850 | | assert_eq!(mutable.data.buffer1.capacity(), 64); |
851 | | assert_eq!(mutable.data.child_data[0].data.buffer1.capacity(), 192); |
852 | | } |
853 | | } |