/Users/andrewlamb/Software/arrow-rs/arrow-data/src/transform/mod.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! Low-level array data abstractions. |
19 | | //! |
20 | | //! Provides utilities for creating, manipulating, and converting Arrow arrays |
21 | | //! made of primitive types, strings, and nested types. |
22 | | |
23 | | use super::{ArrayData, ArrayDataBuilder, ByteView, data::new_buffers}; |
24 | | use crate::bit_mask::set_bits; |
25 | | use arrow_buffer::buffer::{BooleanBuffer, NullBuffer}; |
26 | | use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer, bit_util, i256}; |
27 | | use arrow_schema::{ArrowError, DataType, IntervalUnit, UnionMode}; |
28 | | use half::f16; |
29 | | use num_integer::Integer; |
30 | | use std::mem; |
31 | | |
32 | | mod boolean; |
33 | | mod fixed_binary; |
34 | | mod fixed_size_list; |
35 | | mod list; |
36 | | mod list_view; |
37 | | mod null; |
38 | | mod primitive; |
39 | | mod run; |
40 | | mod structure; |
41 | | mod union; |
42 | | mod utils; |
43 | | mod variable_size; |
44 | | |
45 | | type ExtendNullBits<'a> = Box<dyn Fn(&mut _MutableArrayData, usize, usize) + 'a>; |
46 | | // function that extends `[start..start+len]` to the mutable array. |
47 | | // this is dynamic because different data_types influence how buffers and children are extended. |
48 | | type Extend<'a> = Box<dyn Fn(&mut _MutableArrayData, usize, usize, usize) + 'a>; |
49 | | |
50 | | type ExtendNulls = Box<dyn Fn(&mut _MutableArrayData, usize)>; |
51 | | |
52 | | /// A mutable [ArrayData] that knows how to freeze itself into an [ArrayData]. |
53 | | /// This is just a data container. |
54 | | #[derive(Debug)] |
55 | | struct _MutableArrayData<'a> { |
56 | | pub data_type: DataType, |
57 | | pub null_count: usize, |
58 | | |
59 | | pub len: usize, |
60 | | pub null_buffer: Option<MutableBuffer>, |
61 | | |
62 | | // arrow specification only allows up to 3 buffers (2 ignoring the nulls above). |
63 | | // Thus, we place them in the stack to avoid bounds checks and gain greater data locality. |
64 | | pub buffer1: MutableBuffer, |
65 | | pub buffer2: MutableBuffer, |
66 | | pub child_data: Vec<MutableArrayData<'a>>, |
67 | | } |
68 | | |
69 | | impl _MutableArrayData<'_> { |
70 | 112 | fn null_buffer(&mut self) -> &mut MutableBuffer { |
71 | 112 | self.null_buffer |
72 | 112 | .as_mut() |
73 | 112 | .expect("MutableArrayData not nullable") |
74 | 112 | } |
75 | | } |
76 | | |
77 | 200 | fn build_extend_null_bits(array: &ArrayData, use_nulls: bool) -> ExtendNullBits<'_> { |
78 | 200 | if let Some(nulls47 ) = array.nulls() { |
79 | 47 | let bytes = nulls.validity(); |
80 | 73 | Box::new47 (move |mutable, start, len| { |
81 | 73 | let mutable_len = mutable.len; |
82 | 73 | let out = mutable.null_buffer(); |
83 | 73 | utils::resize_for_bits(out, mutable_len + len); |
84 | 73 | mutable.null_count += set_bits( |
85 | 73 | out.as_slice_mut(), |
86 | 73 | bytes, |
87 | 73 | mutable_len, |
88 | 73 | nulls.offset() + start, |
89 | 73 | len, |
90 | 73 | ); |
91 | 73 | }) |
92 | 153 | } else if use_nulls { |
93 | 37 | Box::new28 (|mutable, _, len| { |
94 | 37 | let mutable_len = mutable.len; |
95 | 37 | let out = mutable.null_buffer(); |
96 | 37 | utils::resize_for_bits(out, mutable_len + len); |
97 | 37 | let write_data = out.as_slice_mut(); |
98 | 47 | (0..len)37 .for_each37 (|i| { |
99 | 47 | bit_util::set_bit(write_data, mutable_len + i); |
100 | 47 | }); |
101 | 37 | }) |
102 | | } else { |
103 | 149 | Box::new125 (|_, _, _| {}) |
104 | | } |
105 | 200 | } |
106 | | |
107 | | /// Efficiently create an [ArrayData] from one or more existing [ArrayData]s by |
108 | | /// copying chunks. |
109 | | /// |
110 | | /// The main use case of this struct is to perform unary operations to arrays of |
111 | | /// arbitrary types, such as `filter` and `take`. |
112 | | /// |
113 | | /// # Example |
114 | | /// ``` |
115 | | /// use arrow_buffer::Buffer; |
116 | | /// use arrow_data::ArrayData; |
117 | | /// use arrow_data::transform::MutableArrayData; |
118 | | /// use arrow_schema::DataType; |
119 | | /// fn i32_array(values: &[i32]) -> ArrayData { |
120 | | /// ArrayData::try_new(DataType::Int32, 5, None, 0, vec![Buffer::from_slice_ref(values)], vec![]).unwrap() |
121 | | /// } |
122 | | /// let arr1 = i32_array(&[1, 2, 3, 4, 5]); |
123 | | /// let arr2 = i32_array(&[6, 7, 8, 9, 10]); |
124 | | /// // Create a mutable array for copying values from arr1 and arr2, with a capacity for 10 elements |
125 | | /// let capacity = 3 * std::mem::size_of::<i32>(); |
126 | | /// let mut mutable = MutableArrayData::new(vec![&arr1, &arr2], false, 10); |
127 | | /// // Copy the first 3 elements from arr1 |
128 | | /// mutable.extend(0, 0, 3); |
129 | | /// // Copy the last 3 elements from arr2 |
130 | | /// mutable.extend(1, 2, 4); |
131 | | /// // Complete the MutableArrayData into a new ArrayData |
132 | | /// let frozen = mutable.freeze(); |
133 | | /// assert_eq!(frozen, i32_array(&[1, 2, 3, 8, 9, 10])); |
134 | | /// ``` |
135 | | pub struct MutableArrayData<'a> { |
136 | | /// Input arrays: the data being read FROM. |
137 | | /// |
138 | | /// Note this is "dead code" because all actual references to the arrays are |
139 | | /// stored in closures for extending values and nulls. |
140 | | #[allow(dead_code)] |
141 | | arrays: Vec<&'a ArrayData>, |
142 | | |
143 | | /// In progress output array: The data being written TO |
144 | | /// |
145 | | /// Note these fields are in a separate struct, [_MutableArrayData], as they |
146 | | /// cannot be in [MutableArrayData] itself due to mutability invariants (interior |
147 | | /// mutability): [MutableArrayData] contains a function that can only mutate |
148 | | /// [_MutableArrayData], not [MutableArrayData] itself |
149 | | data: _MutableArrayData<'a>, |
150 | | |
151 | | /// The child data of the `Array` in Dictionary arrays. |
152 | | /// |
153 | | /// This is not stored in `_MutableArrayData` because these values are |
154 | | /// constant and only needed at the end, when freezing [_MutableArrayData]. |
155 | | dictionary: Option<ArrayData>, |
156 | | |
157 | | /// Variadic data buffers referenced by views. |
158 | | /// |
159 | | /// Note that this is not stored in `_MutableArrayData` because these values |
160 | | /// are constant and only needed at the end, when freezing |
161 | | /// [_MutableArrayData] |
162 | | variadic_data_buffers: Vec<Buffer>, |
163 | | |
164 | | /// function used to extend output array with values from input arrays. |
165 | | /// |
166 | | /// This function's lifetime is bound to the input arrays because it reads |
167 | | /// values from them. |
168 | | extend_values: Vec<Extend<'a>>, |
169 | | |
170 | | /// function used to extend the output array with nulls from input arrays. |
171 | | /// |
172 | | /// This function's lifetime is bound to the input arrays because it reads |
173 | | /// nulls from it. |
174 | | extend_null_bits: Vec<ExtendNullBits<'a>>, |
175 | | |
176 | | /// function used to extend the output array with null elements. |
177 | | /// |
178 | | /// This function is independent of the arrays and therefore has no lifetime. |
179 | | extend_nulls: ExtendNulls, |
180 | | } |
181 | | |
182 | | impl std::fmt::Debug for MutableArrayData<'_> { |
183 | 0 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { |
184 | | // ignores the closures. |
185 | 0 | f.debug_struct("MutableArrayData") |
186 | 0 | .field("data", &self.data) |
187 | 0 | .finish() |
188 | 0 | } |
189 | | } |
190 | | |
191 | | /// Builds an extend that adds `offset` to the source primitive |
192 | | /// Additionally validates that `max` fits into the |
193 | | /// underlying primitive, returning `None` if not |
194 | 17 | fn build_extend_dictionary(array: &ArrayData, offset: usize, max: usize) -> Option<Extend<'_>> { |
195 | | macro_rules! validate_and_build { |
196 | | ($dt: ty) => {{ |
197 | | let _: $dt = max.try_into().ok()?; |
198 | | let offset: $dt = offset.try_into().ok()?; |
199 | | Some(primitive::build_extend_with_offset(array, offset)) |
200 | | }}; |
201 | | } |
202 | 17 | match array.data_type() { |
203 | 17 | DataType::Dictionary(child_data_type, _) => match child_data_type.as_ref() { |
204 | 0 | DataType::UInt8 => validate_and_build!(u8), |
205 | 0 | DataType::UInt16 => validate_and_build!(u16), |
206 | 0 | DataType::UInt32 => validate_and_build!(u32), |
207 | 0 | DataType::UInt64 => validate_and_build!(u64), |
208 | 7 | DataType::Int8 => validate_and_build!(i8), |
209 | 0 | DataType::Int16 => validate_and_build!(i16), |
210 | 10 | DataType::Int32 => validate_and_build!(i32), |
211 | 0 | DataType::Int64 => validate_and_build!(i64), |
212 | 0 | _ => unreachable!(), |
213 | | }, |
214 | 0 | _ => None, |
215 | | } |
216 | 17 | } |
217 | | |
218 | | /// Builds an extend that adds `buffer_offset` to any buffer indices encountered |
219 | 4 | fn build_extend_view(array: &ArrayData, buffer_offset: u32) -> Extend<'_> { |
220 | 4 | let views = array.buffer::<u128>(0); |
221 | 4 | Box::new( |
222 | 10 | move |mutable: &mut _MutableArrayData, _, start: usize, len: usize| { |
223 | 10 | mutable |
224 | 10 | .buffer1 |
225 | 10 | .extend(views[start..start + len].iter().map(|v| { |
226 | 10 | let len = *v as u32; |
227 | 10 | if len <= 12 { |
228 | 8 | return *v; // Stored inline |
229 | 2 | } |
230 | 2 | let mut view = ByteView::from(*v); |
231 | 2 | view.buffer_index += buffer_offset; |
232 | 2 | view.into() |
233 | 10 | })) |
234 | 10 | }, |
235 | | ) |
236 | 4 | } |
237 | | |
238 | 179 | fn build_extend(array: &ArrayData) -> Extend<'_> { |
239 | 179 | match array.data_type() { |
240 | 0 | DataType::Null => null::build_extend(array), |
241 | 0 | DataType::Boolean => boolean::build_extend(array), |
242 | 0 | DataType::UInt8 => primitive::build_extend::<u8>(array), |
243 | 0 | DataType::UInt16 => primitive::build_extend::<u16>(array), |
244 | 0 | DataType::UInt32 => primitive::build_extend::<u32>(array), |
245 | 0 | DataType::UInt64 => primitive::build_extend::<u64>(array), |
246 | 0 | DataType::Int8 => primitive::build_extend::<i8>(array), |
247 | 2 | DataType::Int16 => primitive::build_extend::<i16>(array), |
248 | 44 | DataType::Int32 => primitive::build_extend::<i32>(array), |
249 | 6 | DataType::Int64 => primitive::build_extend::<i64>(array), |
250 | 0 | DataType::Float32 => primitive::build_extend::<f32>(array), |
251 | 5 | DataType::Float64 => primitive::build_extend::<f64>(array), |
252 | | DataType::Date32 | DataType::Time32(_) | DataType::Interval(IntervalUnit::YearMonth) => { |
253 | 0 | primitive::build_extend::<i32>(array) |
254 | | } |
255 | | DataType::Date64 |
256 | | | DataType::Time64(_) |
257 | | | DataType::Timestamp(_, _) |
258 | | | DataType::Duration(_) |
259 | 30 | | DataType::Interval(IntervalUnit::DayTime) => primitive::build_extend::<i64>(array), |
260 | 0 | DataType::Interval(IntervalUnit::MonthDayNano) => primitive::build_extend::<i128>(array), |
261 | 0 | DataType::Decimal32(_, _) => primitive::build_extend::<i32>(array), |
262 | 0 | DataType::Decimal64(_, _) => primitive::build_extend::<i64>(array), |
263 | 30 | DataType::Decimal128(_, _) => primitive::build_extend::<i128>(array), |
264 | 0 | DataType::Decimal256(_, _) => primitive::build_extend::<i256>(array), |
265 | 29 | DataType::Utf8 | DataType::Binary => variable_size::build_extend::<i32>(array), |
266 | 0 | DataType::LargeUtf8 | DataType::LargeBinary => variable_size::build_extend::<i64>(array), |
267 | 0 | DataType::BinaryView | DataType::Utf8View => unreachable!("should use build_extend_view"), |
268 | 5 | DataType::Map(_, _) | DataType::List(_) => list::build_extend::<i32>(array), |
269 | 1 | DataType::LargeList(_) => list::build_extend::<i64>(array), |
270 | 0 | DataType::ListView(_) => list_view::build_extend::<i32>(array), |
271 | 0 | DataType::LargeListView(_) => list_view::build_extend::<i64>(array), |
272 | 0 | DataType::Dictionary(_, _) => unreachable!("should use build_extend_dictionary"), |
273 | 1 | DataType::Struct(_) => structure::build_extend(array), |
274 | 0 | DataType::FixedSizeBinary(_) => fixed_binary::build_extend(array), |
275 | 0 | DataType::Float16 => primitive::build_extend::<f16>(array), |
276 | 6 | DataType::FixedSizeList(_, _) => fixed_size_list::build_extend(array), |
277 | 8 | DataType::Union(_, mode) => match mode { |
278 | 2 | UnionMode::Sparse => union::build_extend_sparse(array), |
279 | 6 | UnionMode::Dense => union::build_extend_dense(array), |
280 | | }, |
281 | 12 | DataType::RunEndEncoded(_, _) => run::build_extend(array), |
282 | | } |
283 | 179 | } |
284 | | |
285 | 112 | fn build_extend_nulls(data_type: &DataType) -> ExtendNulls { |
286 | 112 | Box::new(match data_type0 { |
287 | 0 | DataType::Null => null::extend_nulls, |
288 | 0 | DataType::Boolean => boolean::extend_nulls, |
289 | 0 | DataType::UInt8 => primitive::extend_nulls::<u8>, |
290 | 0 | DataType::UInt16 => primitive::extend_nulls::<u16>, |
291 | 0 | DataType::UInt32 => primitive::extend_nulls::<u32>, |
292 | 0 | DataType::UInt64 => primitive::extend_nulls::<u64>, |
293 | 0 | DataType::Int8 => primitive::extend_nulls::<i8>, |
294 | 1 | DataType::Int16 => primitive::extend_nulls::<i16>, |
295 | 27 | DataType::Int32 => primitive::extend_nulls::<i32>, |
296 | 3 | DataType::Int64 => primitive::extend_nulls::<i64>, |
297 | 0 | DataType::Float32 => primitive::extend_nulls::<f32>, |
298 | 5 | DataType::Float64 => primitive::extend_nulls::<f64>, |
299 | 0 | DataType::Date32 | DataType::Time32(_) | DataType::Interval(IntervalUnit::YearMonth) => { |
300 | 0 | primitive::extend_nulls::<i32> |
301 | 0 | } |
302 | | DataType::Date64 |
303 | | | DataType::Time64(_) |
304 | | | DataType::Timestamp(_, _) |
305 | | | DataType::Duration(_) |
306 | 15 | | DataType::Interval(IntervalUnit::DayTime) => primitive::extend_nulls::<i64>, |
307 | 0 | DataType::Interval(IntervalUnit::MonthDayNano) => primitive::extend_nulls::<i128>, |
308 | 0 | DataType::Decimal32(_, _) => primitive::extend_nulls::<i32>, |
309 | 0 | DataType::Decimal64(_, _) => primitive::extend_nulls::<i64>, |
310 | 15 | DataType::Decimal128(_, _) => primitive::extend_nulls::<i128>, |
311 | 0 | DataType::Decimal256(_, _) => primitive::extend_nulls::<i256>, |
312 | 14 | DataType::Utf8 | DataType::Binary => variable_size::extend_nulls::<i32>, |
313 | 0 | DataType::LargeUtf8 | DataType::LargeBinary => variable_size::extend_nulls::<i64>, |
314 | 2 | DataType::BinaryView | DataType::Utf8View => primitive::extend_nulls::<u128>, |
315 | 3 | DataType::Map(_, _) | DataType::List(_) => list::extend_nulls::<i32>, |
316 | 1 | DataType::LargeList(_) => list::extend_nulls::<i64>, |
317 | 0 | DataType::ListView(_) => list_view::extend_nulls::<i32>, |
318 | 0 | DataType::LargeListView(_) => list_view::extend_nulls::<i64>, |
319 | 8 | DataType::Dictionary(child_data_type, _) => match child_data_type.as_ref() { |
320 | 0 | DataType::UInt8 => primitive::extend_nulls::<u8>, |
321 | 0 | DataType::UInt16 => primitive::extend_nulls::<u16>, |
322 | 0 | DataType::UInt32 => primitive::extend_nulls::<u32>, |
323 | 0 | DataType::UInt64 => primitive::extend_nulls::<u64>, |
324 | 3 | DataType::Int8 => primitive::extend_nulls::<i8>, |
325 | 0 | DataType::Int16 => primitive::extend_nulls::<i16>, |
326 | 5 | DataType::Int32 => primitive::extend_nulls::<i32>, |
327 | 0 | DataType::Int64 => primitive::extend_nulls::<i64>, |
328 | 0 | _ => unreachable!(), |
329 | | }, |
330 | 1 | DataType::Struct(_) => structure::extend_nulls, |
331 | 0 | DataType::FixedSizeBinary(_) => fixed_binary::extend_nulls, |
332 | 0 | DataType::Float16 => primitive::extend_nulls::<f16>, |
333 | 4 | DataType::FixedSizeList(_, _) => fixed_size_list::extend_nulls, |
334 | 7 | DataType::Union(_, mode) => match mode { |
335 | 1 | UnionMode::Sparse => union::extend_nulls_sparse, |
336 | 6 | UnionMode::Dense => union::extend_nulls_dense, |
337 | | }, |
338 | 6 | DataType::RunEndEncoded(_, _) => run::extend_nulls, |
339 | | }) |
340 | 112 | } |
341 | | |
342 | 0 | fn preallocate_offset_and_binary_buffer<Offset: ArrowNativeType + Integer>( |
343 | 0 | capacity: usize, |
344 | 0 | binary_size: usize, |
345 | 0 | ) -> [MutableBuffer; 2] { |
346 | | // offsets |
347 | 0 | let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<Offset>()); |
348 | | // safety: `unsafe` code assumes that this buffer is initialized with one element |
349 | 0 | buffer.push(Offset::zero()); |
350 | | |
351 | 0 | [ |
352 | 0 | buffer, |
353 | 0 | MutableBuffer::new(binary_size * mem::size_of::<u8>()), |
354 | 0 | ] |
355 | 0 | } |
356 | | |
357 | | /// Define capacities to pre-allocate for child data or data buffers. |
358 | | #[derive(Debug, Clone)] |
359 | | pub enum Capacities { |
360 | | /// Binary, LargeBinary, Utf8 and LargeUtf8 data types |
361 | | /// |
362 | | /// Defines |
363 | | /// * the capacity of the array offsets |
364 | | /// * the capacity of the binary/ str buffer |
365 | | Binary(usize, Option<usize>), |
366 | | /// List, LargeList, ListView, LargeListView and FixedSizeList data types |
367 | | /// |
368 | | /// Defines |
369 | | /// * the capacity of the array offsets |
370 | | /// * the capacity of the child data |
371 | | List(usize, Option<Box<Capacities>>), |
372 | | /// Struct type |
373 | | /// |
374 | | /// Defines |
375 | | /// * the capacity of the array |
376 | | /// * the capacities of the fields |
377 | | Struct(usize, Option<Vec<Capacities>>), |
378 | | /// Dictionary type |
379 | | /// |
380 | | /// Defines |
381 | | /// * the capacity of the array/keys |
382 | | /// * the capacity of the values |
383 | | Dictionary(usize, Option<Box<Capacities>>), |
384 | | /// Don't preallocate inner buffers and rely on array growth strategy |
385 | | Array(usize), |
386 | | } |
387 | | |
388 | | impl<'a> MutableArrayData<'a> { |
389 | | /// Returns a new [MutableArrayData] with capacity to `capacity` slots and |
390 | | /// specialized to create an [ArrayData] from multiple `arrays`. |
391 | | /// |
392 | | /// # Arguments |
393 | | /// * `arrays` - the source arrays to copy from |
394 | | /// * `use_nulls` - a flag used to optimize insertions |
395 | | /// - `false` if the only source of nulls are the arrays themselves |
396 | | /// - `true` if the user plans to call [MutableArrayData::extend_nulls]. |
397 | | /// * `capacity` - the preallocated capacity of the output array, in slots (elements) |
398 | | /// |
399 | | /// Thus, if `use_nulls` is `false`, |
400 | | /// [MutableArrayData::extend_nulls] should not be called. |
401 | 96 | pub fn new(arrays: Vec<&'a ArrayData>, use_nulls: bool, capacity: usize) -> Self { |
402 | 96 | Self::with_capacities(arrays, use_nulls, Capacities::Array(capacity)) |
403 | 96 | } |
404 | | |
405 | | /// Similar to [MutableArrayData::new], but lets users define the |
406 | | /// preallocated capacities of the array with more granularity. |
407 | | /// |
408 | | /// See [MutableArrayData::new] for more information on the arguments. |
409 | | /// |
410 | | /// # Panics |
411 | | /// |
412 | | /// This function panics if the given `capacities` don't match the data type |
413 | | /// of `arrays`, or when a [Capacities] variant is not yet supported. |
414 | 112 | pub fn with_capacities( |
415 | 112 | arrays: Vec<&'a ArrayData>, |
416 | 112 | use_nulls: bool, |
417 | 112 | capacities: Capacities, |
418 | 112 | ) -> Self { |
419 | 112 | let data_type = arrays[0].data_type(); |
420 | | |
421 | 112 | for a88 in arrays.iter().skip(1) { |
422 | 88 | assert_eq!( |
423 | | data_type, |
424 | 88 | a.data_type(), |
425 | 0 | "Arrays with inconsistent types passed to MutableArrayData" |
426 | | ) |
427 | | } |
428 | | |
429 | | // if any of the arrays has nulls, insertions from any array requires setting bits |
430 | | // as there is at least one array with nulls. |
431 | 177 | let use_nulls112 = use_nulls112 | arrays.iter()112 .any112 (|array| array.null_count() > 0); |
432 | | |
433 | | let mut array_capacity; |
434 | | |
435 | 112 | let [buffer1, buffer2] = match (data_type, &capacities) { |
436 | | ( |
437 | | DataType::LargeUtf8 | DataType::LargeBinary, |
438 | 0 | Capacities::Binary(capacity, Some(value_cap)), |
439 | | ) => { |
440 | 0 | array_capacity = *capacity; |
441 | 0 | preallocate_offset_and_binary_buffer::<i64>(*capacity, *value_cap) |
442 | | } |
443 | 0 | (DataType::Utf8 | DataType::Binary, Capacities::Binary(capacity, Some(value_cap))) => { |
444 | 0 | array_capacity = *capacity; |
445 | 0 | preallocate_offset_and_binary_buffer::<i32>(*capacity, *value_cap) |
446 | | } |
447 | 112 | (_, Capacities::Array(capacity)) => { |
448 | 112 | array_capacity = *capacity; |
449 | 112 | new_buffers(data_type, *capacity) |
450 | | } |
451 | | ( |
452 | | DataType::List(_) |
453 | | | DataType::LargeList(_) |
454 | | | DataType::ListView(_) |
455 | | | DataType::LargeListView(_) |
456 | | | DataType::FixedSizeList(_, _), |
457 | 0 | Capacities::List(capacity, _), |
458 | | ) => { |
459 | 0 | array_capacity = *capacity; |
460 | 0 | new_buffers(data_type, *capacity) |
461 | | } |
462 | 0 | _ => panic!("Capacities: {capacities:?} not yet supported"), |
463 | | }; |
464 | | |
465 | 112 | let child_data = match &data_type { |
466 | | DataType::Decimal32(_, _) |
467 | | | DataType::Decimal64(_, _) |
468 | | | DataType::Decimal128(_, _) |
469 | | | DataType::Decimal256(_, _) |
470 | | | DataType::Null |
471 | | | DataType::Boolean |
472 | | | DataType::UInt8 |
473 | | | DataType::UInt16 |
474 | | | DataType::UInt32 |
475 | | | DataType::UInt64 |
476 | | | DataType::Int8 |
477 | | | DataType::Int16 |
478 | | | DataType::Int32 |
479 | | | DataType::Int64 |
480 | | | DataType::Float16 |
481 | | | DataType::Float32 |
482 | | | DataType::Float64 |
483 | | | DataType::Date32 |
484 | | | DataType::Date64 |
485 | | | DataType::Time32(_) |
486 | | | DataType::Time64(_) |
487 | | | DataType::Duration(_) |
488 | | | DataType::Timestamp(_, _) |
489 | | | DataType::Utf8 |
490 | | | DataType::Binary |
491 | | | DataType::LargeUtf8 |
492 | | | DataType::LargeBinary |
493 | | | DataType::BinaryView |
494 | | | DataType::Utf8View |
495 | | | DataType::Interval(_) |
496 | 82 | | DataType::FixedSizeBinary(_) => vec![], |
497 | | DataType::Map(_, _) |
498 | | | DataType::List(_) |
499 | | | DataType::LargeList(_) |
500 | | | DataType::ListView(_) |
501 | | | DataType::LargeListView(_) => { |
502 | 4 | let children = arrays |
503 | 4 | .iter() |
504 | 6 | .map4 (|array| &array.child_data()[0]) |
505 | 4 | .collect::<Vec<_>>(); |
506 | | |
507 | 4 | let capacities = |
508 | 4 | if let Capacities::List(capacity0 , ref child_capacities0 ) = capacities { |
509 | 0 | child_capacities |
510 | 0 | .clone() |
511 | 0 | .map(|c| *c) |
512 | 0 | .unwrap_or(Capacities::Array(capacity)) |
513 | | } else { |
514 | 4 | Capacities::Array(array_capacity) |
515 | | }; |
516 | | |
517 | 4 | vec![MutableArrayData::with_capacities( |
518 | 4 | children, use_nulls, capacities, |
519 | | )] |
520 | | } |
521 | | // the dictionary type just appends keys and clones the values. |
522 | 8 | DataType::Dictionary(_, _) => vec![], |
523 | 1 | DataType::Struct(fields) => match capacities0 { |
524 | 0 | Capacities::Struct(capacity, Some(ref child_capacities)) => { |
525 | 0 | array_capacity = capacity; |
526 | 0 | (0..fields.len()) |
527 | 0 | .zip(child_capacities) |
528 | 0 | .map(|(i, child_cap)| { |
529 | 0 | let child_arrays = arrays |
530 | 0 | .iter() |
531 | 0 | .map(|array| &array.child_data()[i]) |
532 | 0 | .collect::<Vec<_>>(); |
533 | 0 | MutableArrayData::with_capacities( |
534 | 0 | child_arrays, |
535 | 0 | use_nulls, |
536 | 0 | child_cap.clone(), |
537 | | ) |
538 | 0 | }) |
539 | 0 | .collect::<Vec<_>>() |
540 | | } |
541 | 0 | Capacities::Struct(capacity, None) => { |
542 | 0 | array_capacity = capacity; |
543 | 0 | (0..fields.len()) |
544 | 0 | .map(|i| { |
545 | 0 | let child_arrays = arrays |
546 | 0 | .iter() |
547 | 0 | .map(|array| &array.child_data()[i]) |
548 | 0 | .collect::<Vec<_>>(); |
549 | 0 | MutableArrayData::new(child_arrays, use_nulls, capacity) |
550 | 0 | }) |
551 | 0 | .collect::<Vec<_>>() |
552 | | } |
553 | 1 | _ => (0..fields.len()) |
554 | 2 | .map1 (|i| { |
555 | 2 | let child_arrays = arrays |
556 | 2 | .iter() |
557 | 2 | .map(|array| &array.child_data()[i]) |
558 | 2 | .collect::<Vec<_>>(); |
559 | 2 | MutableArrayData::new(child_arrays, use_nulls, array_capacity) |
560 | 2 | }) |
561 | 1 | .collect::<Vec<_>>(), |
562 | | }, |
563 | | DataType::RunEndEncoded(_, _) => { |
564 | 6 | let run_ends_child = arrays |
565 | 6 | .iter() |
566 | 12 | .map6 (|array| &array.child_data()[0]) |
567 | 6 | .collect::<Vec<_>>(); |
568 | 6 | let value_child = arrays |
569 | 6 | .iter() |
570 | 12 | .map6 (|array| &array.child_data()[1]) |
571 | 6 | .collect::<Vec<_>>(); |
572 | 6 | vec![ |
573 | 6 | MutableArrayData::new(run_ends_child, false, array_capacity), |
574 | 6 | MutableArrayData::new(value_child, use_nulls, array_capacity), |
575 | | ] |
576 | | } |
577 | 4 | DataType::FixedSizeList(_, size) => { |
578 | 4 | let children = arrays |
579 | 4 | .iter() |
580 | 6 | .map4 (|array| &array.child_data()[0]) |
581 | 4 | .collect::<Vec<_>>(); |
582 | 4 | let capacities = |
583 | 4 | if let Capacities::List(capacity0 , ref child_capacities0 ) = capacities { |
584 | 0 | child_capacities |
585 | 0 | .clone() |
586 | 0 | .map(|c| *c) |
587 | 0 | .unwrap_or(Capacities::Array(capacity * *size as usize)) |
588 | | } else { |
589 | 4 | Capacities::Array(array_capacity * *size as usize) |
590 | | }; |
591 | 4 | vec![MutableArrayData::with_capacities( |
592 | 4 | children, use_nulls, capacities, |
593 | | )] |
594 | | } |
595 | 7 | DataType::Union(fields, _) => (0..fields.len()) |
596 | 12 | .map7 (|i| { |
597 | 12 | let child_arrays = arrays |
598 | 12 | .iter() |
599 | 13 | .map12 (|array| &array.child_data()[i]) |
600 | 12 | .collect::<Vec<_>>(); |
601 | 12 | MutableArrayData::new(child_arrays, use_nulls, array_capacity) |
602 | 12 | }) |
603 | 7 | .collect::<Vec<_>>(), |
604 | | }; |
605 | | |
606 | | // Get the dictionary, if any, and whether it is a concatenation of multiple dictionaries |
607 | 112 | let (dictionary, dict_concat) = match &data_type { |
608 | | DataType::Dictionary(_, _) => { |
609 | | // If more than one dictionary, concatenate dictionaries together |
610 | 8 | let dict_concat = !arrays |
611 | 8 | .windows(2) |
612 | 9 | .all8 (|a| a[0].child_data()[0].ptr_eq(&a[1].child_data()[0])); |
613 | | |
614 | 8 | match dict_concat { |
615 | 2 | false => (Some(arrays[0].child_data()[0].clone()), false), |
616 | | true => { |
617 | 6 | if let Capacities::Dictionary(_, _) = capacities { |
618 | 0 | panic!("dictionary capacity not yet supported") |
619 | 6 | } |
620 | 6 | let dictionaries: Vec<_> = |
621 | 13 | arrays.iter()6 .map6 (|array| &array.child_data()[0]).collect6 (); |
622 | 6 | let lengths: Vec<_> = dictionaries |
623 | 6 | .iter() |
624 | 13 | .map6 (|dictionary| dictionary.len()) |
625 | 6 | .collect(); |
626 | 6 | let capacity = lengths.iter().sum(); |
627 | | |
628 | 6 | let mut mutable = MutableArrayData::new(dictionaries, false, capacity); |
629 | | |
630 | 13 | for (i, len) in lengths.iter()6 .enumerate6 () { |
631 | 13 | mutable.extend(i, 0, *len) |
632 | | } |
633 | | |
634 | 6 | (Some(mutable.freeze()), true) |
635 | | } |
636 | | } |
637 | | } |
638 | 104 | _ => (None, false), |
639 | | }; |
640 | | |
641 | 112 | let variadic_data_buffers = match &data_type { |
642 | 2 | DataType::BinaryView | DataType::Utf8View => arrays |
643 | 2 | .iter() |
644 | 4 | .flat_map2 (|x| x.buffers().iter().skip(1)) |
645 | 2 | .map(Buffer::clone) |
646 | 2 | .collect(), |
647 | 110 | _ => vec![], |
648 | | }; |
649 | | |
650 | 112 | let extend_nulls = build_extend_nulls(data_type); |
651 | | |
652 | 112 | let extend_null_bits = arrays |
653 | 112 | .iter() |
654 | 200 | .map112 (|array| build_extend_null_bits(array, use_nulls)) |
655 | 112 | .collect(); |
656 | | |
657 | 112 | let null_buffer = use_nulls.then(|| {41 |
658 | 41 | let null_bytes = bit_util::ceil(array_capacity, 8); |
659 | 41 | MutableBuffer::from_len_zeroed(null_bytes) |
660 | 41 | }); |
661 | | |
662 | 112 | let extend_values = match &data_type { |
663 | | DataType::Dictionary(_, _) => { |
664 | 8 | let mut next_offset = 0; |
665 | 8 | let extend_values: Result<Vec<_>, _> = arrays |
666 | 8 | .iter() |
667 | 17 | .map8 (|array| { |
668 | 17 | let offset = next_offset; |
669 | 17 | let dict_len = array.child_data()[0].len(); |
670 | | |
671 | 17 | if dict_concat { |
672 | 13 | next_offset += dict_len; |
673 | 13 | }4 |
674 | | |
675 | 17 | build_extend_dictionary(array, offset, offset + dict_len) |
676 | 17 | .ok_or(ArrowError::DictionaryKeyOverflowError) |
677 | 17 | }) |
678 | 8 | .collect(); |
679 | | |
680 | 8 | extend_values.expect("MutableArrayData::new is infallible") |
681 | | } |
682 | | DataType::BinaryView | DataType::Utf8View => { |
683 | 2 | let mut next_offset = 0u32; |
684 | 2 | arrays |
685 | 2 | .iter() |
686 | 4 | .map2 (|arr| { |
687 | 4 | let num_data_buffers = (arr.buffers().len() - 1) as u32; |
688 | 4 | let offset = next_offset; |
689 | 4 | next_offset = next_offset |
690 | 4 | .checked_add(num_data_buffers) |
691 | 4 | .expect("view buffer index overflow"); |
692 | 4 | build_extend_view(arr, offset) |
693 | 4 | }) |
694 | 2 | .collect() |
695 | | } |
696 | 179 | _ => arrays.iter()102 .map102 (|array| build_extend(array)).collect102 (), |
697 | | }; |
698 | | |
699 | 112 | let data = _MutableArrayData { |
700 | 112 | data_type: data_type.clone(), |
701 | 112 | len: 0, |
702 | 112 | null_count: 0, |
703 | 112 | null_buffer, |
704 | 112 | buffer1, |
705 | 112 | buffer2, |
706 | 112 | child_data, |
707 | 112 | }; |
708 | 112 | Self { |
709 | 112 | arrays, |
710 | 112 | data, |
711 | 112 | dictionary, |
712 | 112 | variadic_data_buffers, |
713 | 112 | extend_values, |
714 | 112 | extend_null_bits, |
715 | 112 | extend_nulls, |
716 | 112 | } |
717 | 112 | } |
718 | | |
719 | | /// Extends the in progress array with a region of the input arrays |
720 | | /// |
721 | | /// # Arguments |
722 | | /// * `index` - the index of the array that you want to copy values from |
723 | | /// * `start` - the start index of the chunk (inclusive) |
724 | | /// * `end` - the end index of the chunk (exclusive) |
725 | | /// |
726 | | /// # Panics |
727 | | /// This function panics if there is an invalid index, |
728 | | /// i.e. `index` >= the number of source arrays |
729 | | /// or `end` > the length of the `index`th array |
730 | 259 | pub fn extend(&mut self, index: usize, start: usize, end: usize) { |
731 | 259 | let len = end - start; |
732 | 259 | (self.extend_null_bits[index])(&mut self.data, start, len); |
733 | 259 | (self.extend_values[index])(&mut self.data, index, start, len); |
734 | 259 | self.data.len += len; |
735 | 259 | } |
736 | | |
737 | | /// Extends the in progress array with null elements, ignoring the input arrays. |
738 | | /// |
739 | | /// # Panics |
740 | | /// |
741 | | /// Panics if [`MutableArrayData`] was not created with `use_nulls` or nullable source arrays |
742 | 2 | pub fn extend_nulls(&mut self, len: usize) { |
743 | 2 | self.data.len += len; |
744 | 2 | let bit_len = bit_util::ceil(self.data.len, 8); |
745 | 2 | let nulls = self.data.null_buffer(); |
746 | 2 | nulls.resize(bit_len, 0); |
747 | 2 | self.data.null_count += len; |
748 | 2 | (self.extend_nulls)(&mut self.data, len); |
749 | 2 | } |
750 | | |
751 | | /// Returns the current length |
752 | | #[inline] |
753 | 11 | pub fn len(&self) -> usize { |
754 | 11 | self.data.len |
755 | 11 | } |
756 | | |
757 | | /// Returns true if len is 0 |
758 | | #[inline] |
759 | | pub fn is_empty(&self) -> bool { |
760 | | self.data.len == 0 |
761 | | } |
762 | | |
763 | | /// Returns the current null count |
764 | | #[inline] |
765 | | pub fn null_count(&self) -> usize { |
766 | | self.data.null_count |
767 | | } |
768 | | |
769 | | /// Creates a [ArrayData] from the in progress array, consuming `self`. |
770 | 112 | pub fn freeze(self) -> ArrayData { |
771 | 112 | unsafe { self.into_builder().build_unchecked() } |
772 | 112 | } |
773 | | |
774 | | /// Consume self and returns the in progress array as [`ArrayDataBuilder`]. |
775 | | /// |
776 | | /// This is useful for extending the default behavior of MutableArrayData. |
777 | 112 | pub fn into_builder(self) -> ArrayDataBuilder { |
778 | 112 | let data = self.data; |
779 | | |
780 | 112 | let buffers = match data.data_type { |
781 | | DataType::Null |
782 | | | DataType::Struct(_) |
783 | | | DataType::FixedSizeList(_, _) |
784 | | | DataType::RunEndEncoded(_, _) => { |
785 | 11 | vec![] |
786 | | } |
787 | | DataType::BinaryView | DataType::Utf8View => { |
788 | 2 | let mut b = self.variadic_data_buffers; |
789 | 2 | b.insert(0, data.buffer1.into()); |
790 | 2 | b |
791 | | } |
792 | | DataType::Utf8 |
793 | | | DataType::Binary |
794 | | | DataType::LargeUtf8 |
795 | | | DataType::LargeBinary |
796 | | | DataType::ListView(_) |
797 | | | DataType::LargeListView(_) => { |
798 | 14 | vec![data.buffer1.into(), data.buffer2.into()] |
799 | | } |
800 | 7 | DataType::Union(_, mode) => { |
801 | 7 | match mode { |
802 | | // Based on Union's DataTypeLayout |
803 | 1 | UnionMode::Sparse => vec![data.buffer1.into()], |
804 | 6 | UnionMode::Dense => vec![data.buffer1.into(), data.buffer2.into()], |
805 | | } |
806 | | } |
807 | 78 | _ => vec![data.buffer1.into()], |
808 | | }; |
809 | | |
810 | 112 | let child_data = match data.data_type { |
811 | 8 | DataType::Dictionary(_, _) => vec![self.dictionary.unwrap()], |
812 | 104 | _ => data.child_data.into_iter().map(|x| x34 .freeze34 ()).collect(), |
813 | | }; |
814 | | |
815 | 112 | let nulls = match data.data_type { |
816 | | // RunEndEncoded and Null arrays cannot have top-level null bitmasks |
817 | 6 | DataType::RunEndEncoded(_, _) | DataType::Null => None, |
818 | 106 | _ => data |
819 | 106 | .null_buffer |
820 | 106 | .map(|nulls| {41 |
821 | 41 | let bools = BooleanBuffer::new(nulls.into(), 0, data.len); |
822 | 41 | unsafe { NullBuffer::new_unchecked(bools, data.null_count) } |
823 | 41 | }) |
824 | 106 | .filter(|n| n41 .null_count41 () > 0), |
825 | | }; |
826 | | |
827 | 112 | ArrayDataBuilder::new(data.data_type) |
828 | 112 | .offset(0) |
829 | 112 | .len(data.len) |
830 | 112 | .nulls(nulls) |
831 | 112 | .buffers(buffers) |
832 | 112 | .child_data(child_data) |
833 | 112 | } |
834 | | } |
835 | | |
836 | | // See arrow/tests/array_transform.rs for tests of transform functionality |
837 | | |
#[cfg(test)]
mod test {
    use super::*;
    use arrow_schema::Field;
    use std::sync::Arc;

    /// Checks the buffer capacities that result from passing explicit
    /// [`Capacities`] for a list array and its child values.
    #[test]
    fn test_list_append_with_capacities() {
        let element = Field::new("element", DataType::Int64, false);
        let list_type = DataType::List(Arc::new(element));
        let empty_list = ArrayData::new_empty(&list_type);

        let capacities = Capacities::List(6, Some(Box::new(Capacities::Array(17))));
        let mutable = MutableArrayData::with_capacities(vec![&empty_list], false, capacities);

        // capacities are rounded up to multiples of 64 by MutableBuffer
        let list_buffer = &mutable.data.buffer1;
        let child_buffer = &mutable.data.child_data[0].data.buffer1;
        assert_eq!(list_buffer.capacity(), 64);
        assert_eq!(child_buffer.capacity(), 192);
    }
}