/Users/andrewlamb/Software/arrow-rs/arrow-row/src/list.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use crate::{fixed, null_sentinel, LengthTracker, RowConverter, Rows, SortField}; |
19 | | use arrow_array::{new_null_array, Array, FixedSizeListArray, GenericListArray, OffsetSizeTrait}; |
20 | | use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer}; |
21 | | use arrow_data::ArrayDataBuilder; |
22 | | use arrow_schema::{ArrowError, DataType, SortOptions}; |
23 | | use std::{ops::Range, sync::Arc}; |
24 | | |
25 | 0 | pub fn compute_lengths<O: OffsetSizeTrait>( |
26 | 0 | lengths: &mut [usize], |
27 | 0 | rows: &Rows, |
28 | 0 | array: &GenericListArray<O>, |
29 | 0 | ) { |
30 | 0 | let shift = array.value_offsets()[0].as_usize(); |
31 | | |
32 | 0 | let offsets = array.value_offsets().windows(2); |
33 | 0 | lengths |
34 | 0 | .iter_mut() |
35 | 0 | .zip(offsets) |
36 | 0 | .enumerate() |
37 | 0 | .for_each(|(idx, (length, offsets))| { |
38 | 0 | let start = offsets[0].as_usize() - shift; |
39 | 0 | let end = offsets[1].as_usize() - shift; |
40 | 0 | let range = array.is_valid(idx).then_some(start..end); |
41 | 0 | *length += encoded_len(rows, range); |
42 | 0 | }); |
43 | 0 | } |
44 | | |
45 | 0 | fn encoded_len(rows: &Rows, range: Option<Range<usize>>) -> usize { |
46 | 0 | match range { |
47 | 0 | None => 1, |
48 | 0 | Some(range) => { |
49 | 0 | 1 + range |
50 | 0 | .map(|i| super::variable::padded_length(Some(rows.row(i).as_ref().len()))) |
51 | 0 | .sum::<usize>() |
52 | | } |
53 | | } |
54 | 0 | } |
55 | | |
56 | | /// Encodes the provided `GenericListArray` to `out` with the provided `SortOptions` |
57 | | /// |
58 | | /// `rows` should contain the encoded child elements |
59 | 0 | pub fn encode<O: OffsetSizeTrait>( |
60 | 0 | data: &mut [u8], |
61 | 0 | offsets: &mut [usize], |
62 | 0 | rows: &Rows, |
63 | 0 | opts: SortOptions, |
64 | 0 | array: &GenericListArray<O>, |
65 | 0 | ) { |
66 | 0 | let shift = array.value_offsets()[0].as_usize(); |
67 | | |
68 | 0 | offsets |
69 | 0 | .iter_mut() |
70 | 0 | .skip(1) |
71 | 0 | .zip(array.value_offsets().windows(2)) |
72 | 0 | .enumerate() |
73 | 0 | .for_each(|(idx, (offset, offsets))| { |
74 | 0 | let start = offsets[0].as_usize() - shift; |
75 | 0 | let end = offsets[1].as_usize() - shift; |
76 | 0 | let range = array.is_valid(idx).then_some(start..end); |
77 | 0 | let out = &mut data[*offset..]; |
78 | 0 | *offset += encode_one(out, rows, range, opts) |
79 | 0 | }); |
80 | 0 | } |
81 | | |
82 | | #[inline] |
83 | 0 | fn encode_one( |
84 | 0 | out: &mut [u8], |
85 | 0 | rows: &Rows, |
86 | 0 | range: Option<Range<usize>>, |
87 | 0 | opts: SortOptions, |
88 | 0 | ) -> usize { |
89 | 0 | match range { |
90 | 0 | None => super::variable::encode_null(out, opts), |
91 | 0 | Some(range) if range.start == range.end => super::variable::encode_empty(out, opts), |
92 | 0 | Some(range) => { |
93 | 0 | let mut offset = 0; |
94 | 0 | for i in range { |
95 | 0 | let row = rows.row(i); |
96 | 0 | offset += super::variable::encode_one(&mut out[offset..], Some(row.data), opts); |
97 | 0 | } |
98 | 0 | offset += super::variable::encode_empty(&mut out[offset..], opts); |
99 | 0 | offset |
100 | | } |
101 | | } |
102 | 0 | } |
103 | | |
/// Decodes an array from `rows` with the provided `options`
///
/// Performs two passes over `rows`: the first measures the total child
/// payload and builds the list offsets, the second copies the child bytes
/// out so they can be handed to `converter.convert_raw`.
///
/// # Safety
///
/// `rows` must contain valid data for the provided `converter`
pub unsafe fn decode<O: OffsetSizeTrait>(
    converter: &RowConverter,
    rows: &mut [&[u8]],
    field: &SortField,
    validate_utf8: bool,
) -> Result<GenericListArray<O>, ArrowError> {
    let opts = field.options;

    // Pass 1: total number of child payload bytes across all lists
    let mut values_bytes = 0;

    // `offset` counts child values; `offsets` becomes the list offset buffer
    let mut offset = 0;
    let mut offsets = Vec::with_capacity(rows.len() + 1);
    offsets.push(O::usize_as(0));

    for row in rows.iter_mut() {
        let mut row_offset = 0;
        loop {
            let decoded = super::variable::decode_blocks(&row[row_offset..], opts, |x| {
                values_bytes += x.len();
            });
            // A decoded length <= 1 is the null/empty terminator block that
            // ends this list's encoding (see the matching encoder above)
            if decoded <= 1 {
                offsets.push(O::usize_as(offset));
                break;
            }
            row_offset += decoded;
            offset += 1;
        }
    }
    // Panic early if the child count does not fit in the offset type
    O::from_usize(offset).expect("overflow");

    // Validity: the first byte of each row is the null sentinel for null lists
    let mut null_count = 0;
    let nulls = MutableBuffer::collect_bool(rows.len(), |x| {
        let valid = rows[x][0] != null_sentinel(opts);
        null_count += !valid as usize;
        valid
    });

    // Pass 2: copy child bytes and record the end offset of each child value
    let mut values_offsets = Vec::with_capacity(offset);
    let mut values_bytes = Vec::with_capacity(values_bytes);
    for row in rows.iter_mut() {
        let mut row_offset = 0;
        loop {
            let decoded = super::variable::decode_blocks(&row[row_offset..], opts, |x| {
                values_bytes.extend_from_slice(x)
            });
            row_offset += decoded;
            if decoded <= 1 {
                break;
            }
            values_offsets.push(values_bytes.len());
        }
        // Advance the caller's slice past this list's encoding so the next
        // column decoder starts at the right position
        *row = &row[row_offset..];
    }

    // Descending order is encoded by inverting every byte; undo it before
    // handing the bytes back to the child converter
    if opts.descending {
        values_bytes.iter_mut().for_each(|o| *o = !*o);
    }

    // Re-slice the contiguous copy into one row per child value
    let mut last_value_offset = 0;
    let mut child_rows: Vec<_> = values_offsets
        .into_iter()
        .map(|offset| {
            let v = &values_bytes[last_value_offset..offset];
            last_value_offset = offset;
            v
        })
        .collect();

    let child = converter.convert_raw(&mut child_rows, validate_utf8)?;
    assert_eq!(child.len(), 1);

    let child_data = child[0].to_data();

    // Since RowConverter flattens certain data types (i.e. Dictionary),
    // we need to use updated data type instead of original field
    let corrected_type = match &field.data_type {
        DataType::List(inner_field) => DataType::List(Arc::new(
            inner_field
                .as_ref()
                .clone()
                .with_data_type(child_data.data_type().clone()),
        )),
        DataType::LargeList(inner_field) => DataType::LargeList(Arc::new(
            inner_field
                .as_ref()
                .clone()
                .with_data_type(child_data.data_type().clone()),
        )),
        // Callers only reach this decoder for (Large)List fields
        _ => unreachable!(),
    };

    let builder = ArrayDataBuilder::new(corrected_type)
        .len(rows.len())
        .null_count(null_count)
        .null_bit_buffer(Some(nulls.into()))
        .add_buffer(Buffer::from_vec(offsets))
        .add_child_data(child_data);

    // SAFETY: offsets, validity and child data were constructed above to be
    // mutually consistent for the list layout
    Ok(GenericListArray::from(unsafe { builder.build_unchecked() }))
}
209 | | |
210 | 0 | pub fn compute_lengths_fixed_size_list( |
211 | 0 | tracker: &mut LengthTracker, |
212 | 0 | rows: &Rows, |
213 | 0 | array: &FixedSizeListArray, |
214 | 0 | ) { |
215 | 0 | let value_length = array.value_length().as_usize(); |
216 | 0 | tracker.push_variable((0..array.len()).map(|idx| { |
217 | 0 | match array.is_valid(idx) { |
218 | | true => { |
219 | 0 | 1 + ((idx * value_length)..(idx + 1) * value_length) |
220 | 0 | .map(|child_idx| rows.row(child_idx).as_ref().len()) |
221 | 0 | .sum::<usize>() |
222 | | } |
223 | 0 | false => 1, |
224 | | } |
225 | 0 | })) |
226 | 0 | } |
227 | | |
228 | | /// Encodes the provided `FixedSizeListArray` to `out` with the provided `SortOptions` |
229 | | /// |
230 | | /// `rows` should contain the encoded child elements |
231 | 0 | pub fn encode_fixed_size_list( |
232 | 0 | data: &mut [u8], |
233 | 0 | offsets: &mut [usize], |
234 | 0 | rows: &Rows, |
235 | 0 | opts: SortOptions, |
236 | 0 | array: &FixedSizeListArray, |
237 | 0 | ) { |
238 | 0 | let null_sentinel = null_sentinel(opts); |
239 | 0 | offsets |
240 | 0 | .iter_mut() |
241 | 0 | .skip(1) |
242 | 0 | .enumerate() |
243 | 0 | .for_each(|(idx, offset)| { |
244 | 0 | let value_length = array.value_length().as_usize(); |
245 | 0 | match array.is_valid(idx) { |
246 | | true => { |
247 | 0 | data[*offset] = 0x01; |
248 | 0 | *offset += 1; |
249 | 0 | for child_idx in (idx * value_length)..(idx + 1) * value_length { |
250 | 0 | let row = rows.row(child_idx); |
251 | 0 | let end_offset = *offset + row.as_ref().len(); |
252 | 0 | data[*offset..end_offset].copy_from_slice(row.as_ref()); |
253 | 0 | *offset = end_offset; |
254 | 0 | } |
255 | | } |
256 | 0 | false => { |
257 | 0 | data[*offset] = null_sentinel; |
258 | 0 | *offset += 1; |
259 | 0 | } |
260 | | }; |
261 | 0 | }) |
262 | 0 | } |
263 | | |
/// Decodes a fixed size list array from `rows` with the provided `options`
///
/// # Safety
///
/// `rows` must contain valid data for the provided `converter`
pub unsafe fn decode_fixed_size_list(
    converter: &RowConverter,
    rows: &mut [&[u8]],
    field: &SortField,
    validate_utf8: bool,
    value_length: usize,
) -> Result<FixedSizeListArray, ArrowError> {
    let list_type = &field.data_type;
    let element_type = match list_type {
        DataType::FixedSizeList(element_field, _) => element_field.data_type(),
        _ => {
            return Err(ArrowError::InvalidArgumentError(format!(
                "Expected FixedSizeListArray, found: {list_type:?}",
            )))
        }
    };

    let len = rows.len();
    let (null_count, nulls) = fixed::decode_nulls(rows);

    // Null lists carry no child bytes in the encoding, but the child array
    // still needs `value_length` entries per list — use one encoded null
    // element as the placeholder row for each of them.
    let null_element_encoded = converter.convert_columns(&[new_null_array(element_type, 1)])?;
    let null_element_encoded = null_element_encoded.row(0);
    let null_element_slice = null_element_encoded.as_ref();

    let mut child_rows = Vec::new();
    for row in rows {
        // First byte is the validity marker written by the encoder (1 == valid)
        let valid = row[0] == 1;
        let mut row_offset = 1;
        if !valid {
            for _ in 0..value_length {
                child_rows.push(null_element_slice);
            }
        } else {
            for _ in 0..value_length {
                // Decode one child solely to learn how many bytes it occupies:
                // convert_raw shrinks the slice in temp_child_rows, so the
                // consumed size is the difference in remaining length.
                // NOTE(review): each child is therefore decoded twice (here
                // and in the batch convert_raw below) — potential optimisation.
                let mut temp_child_rows = vec![&row[row_offset..]];
                converter.convert_raw(&mut temp_child_rows, validate_utf8)?;
                let decoded_bytes = row.len() - row_offset - temp_child_rows[0].len();
                let next_offset = row_offset + decoded_bytes;
                child_rows.push(&row[row_offset..next_offset]);
                row_offset = next_offset;
            }
        }
        *row = &row[row_offset..]; // Update row for the next decoder
    }

    let children = converter.convert_raw(&mut child_rows, validate_utf8)?;
    let child_data = children.iter().map(|c| c.to_data()).collect();
    let builder = ArrayDataBuilder::new(list_type.clone())
        .len(len)
        .null_count(null_count)
        .null_bit_buffer(Some(nulls))
        .child_data(child_data);

    // SAFETY (unsafe fn body): `len`, validity and child data were built to
    // be mutually consistent for the fixed-size-list layout
    Ok(FixedSizeListArray::from(builder.build_unchecked()))
}