Coverage Report

Created: 2025-08-26 07:03

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/Users/andrewlamb/Software/arrow-rs/arrow-row/src/list.rs
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
use crate::{fixed, null_sentinel, LengthTracker, RowConverter, Rows, SortField};
19
use arrow_array::{new_null_array, Array, FixedSizeListArray, GenericListArray, OffsetSizeTrait};
20
use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer};
21
use arrow_data::ArrayDataBuilder;
22
use arrow_schema::{ArrowError, DataType, SortOptions};
23
use std::{ops::Range, sync::Arc};
24
25
0
pub fn compute_lengths<O: OffsetSizeTrait>(
26
0
    lengths: &mut [usize],
27
0
    rows: &Rows,
28
0
    array: &GenericListArray<O>,
29
0
) {
30
0
    let shift = array.value_offsets()[0].as_usize();
31
32
0
    let offsets = array.value_offsets().windows(2);
33
0
    lengths
34
0
        .iter_mut()
35
0
        .zip(offsets)
36
0
        .enumerate()
37
0
        .for_each(|(idx, (length, offsets))| {
38
0
            let start = offsets[0].as_usize() - shift;
39
0
            let end = offsets[1].as_usize() - shift;
40
0
            let range = array.is_valid(idx).then_some(start..end);
41
0
            *length += encoded_len(rows, range);
42
0
        });
43
0
}
44
45
0
fn encoded_len(rows: &Rows, range: Option<Range<usize>>) -> usize {
46
0
    match range {
47
0
        None => 1,
48
0
        Some(range) => {
49
0
            1 + range
50
0
                .map(|i| super::variable::padded_length(Some(rows.row(i).as_ref().len())))
51
0
                .sum::<usize>()
52
        }
53
    }
54
0
}
55
56
/// Encodes the provided `GenericListArray` to `out` with the provided `SortOptions`
57
///
58
/// `rows` should contain the encoded child elements
59
0
pub fn encode<O: OffsetSizeTrait>(
60
0
    data: &mut [u8],
61
0
    offsets: &mut [usize],
62
0
    rows: &Rows,
63
0
    opts: SortOptions,
64
0
    array: &GenericListArray<O>,
65
0
) {
66
0
    let shift = array.value_offsets()[0].as_usize();
67
68
0
    offsets
69
0
        .iter_mut()
70
0
        .skip(1)
71
0
        .zip(array.value_offsets().windows(2))
72
0
        .enumerate()
73
0
        .for_each(|(idx, (offset, offsets))| {
74
0
            let start = offsets[0].as_usize() - shift;
75
0
            let end = offsets[1].as_usize() - shift;
76
0
            let range = array.is_valid(idx).then_some(start..end);
77
0
            let out = &mut data[*offset..];
78
0
            *offset += encode_one(out, rows, range, opts)
79
0
        });
80
0
}
81
82
#[inline]
83
0
fn encode_one(
84
0
    out: &mut [u8],
85
0
    rows: &Rows,
86
0
    range: Option<Range<usize>>,
87
0
    opts: SortOptions,
88
0
) -> usize {
89
0
    match range {
90
0
        None => super::variable::encode_null(out, opts),
91
0
        Some(range) if range.start == range.end => super::variable::encode_empty(out, opts),
92
0
        Some(range) => {
93
0
            let mut offset = 0;
94
0
            for i in range {
95
0
                let row = rows.row(i);
96
0
                offset += super::variable::encode_one(&mut out[offset..], Some(row.data), opts);
97
0
            }
98
0
            offset += super::variable::encode_empty(&mut out[offset..], opts);
99
0
            offset
100
        }
101
    }
102
0
}
103
104
/// Decodes an array from `rows` with the provided `options`
105
///
106
/// # Safety
107
///
108
/// `rows` must contain valid data for the provided `converter`
109
0
pub unsafe fn decode<O: OffsetSizeTrait>(
110
0
    converter: &RowConverter,
111
0
    rows: &mut [&[u8]],
112
0
    field: &SortField,
113
0
    validate_utf8: bool,
114
0
) -> Result<GenericListArray<O>, ArrowError> {
115
0
    let opts = field.options;
116
117
0
    let mut values_bytes = 0;
118
119
0
    let mut offset = 0;
120
0
    let mut offsets = Vec::with_capacity(rows.len() + 1);
121
0
    offsets.push(O::usize_as(0));
122
123
0
    for row in rows.iter_mut() {
124
0
        let mut row_offset = 0;
125
        loop {
126
0
            let decoded = super::variable::decode_blocks(&row[row_offset..], opts, |x| {
127
0
                values_bytes += x.len();
128
0
            });
129
0
            if decoded <= 1 {
130
0
                offsets.push(O::usize_as(offset));
131
0
                break;
132
0
            }
133
0
            row_offset += decoded;
134
0
            offset += 1;
135
        }
136
    }
137
0
    O::from_usize(offset).expect("overflow");
138
139
0
    let mut null_count = 0;
140
0
    let nulls = MutableBuffer::collect_bool(rows.len(), |x| {
141
0
        let valid = rows[x][0] != null_sentinel(opts);
142
0
        null_count += !valid as usize;
143
0
        valid
144
0
    });
145
146
0
    let mut values_offsets = Vec::with_capacity(offset);
147
0
    let mut values_bytes = Vec::with_capacity(values_bytes);
148
0
    for row in rows.iter_mut() {
149
0
        let mut row_offset = 0;
150
        loop {
151
0
            let decoded = super::variable::decode_blocks(&row[row_offset..], opts, |x| {
152
0
                values_bytes.extend_from_slice(x)
153
0
            });
154
0
            row_offset += decoded;
155
0
            if decoded <= 1 {
156
0
                break;
157
0
            }
158
0
            values_offsets.push(values_bytes.len());
159
        }
160
0
        *row = &row[row_offset..];
161
    }
162
163
0
    if opts.descending {
164
0
        values_bytes.iter_mut().for_each(|o| *o = !*o);
165
0
    }
166
167
0
    let mut last_value_offset = 0;
168
0
    let mut child_rows: Vec<_> = values_offsets
169
0
        .into_iter()
170
0
        .map(|offset| {
171
0
            let v = &values_bytes[last_value_offset..offset];
172
0
            last_value_offset = offset;
173
0
            v
174
0
        })
175
0
        .collect();
176
177
0
    let child = converter.convert_raw(&mut child_rows, validate_utf8)?;
178
0
    assert_eq!(child.len(), 1);
179
180
0
    let child_data = child[0].to_data();
181
182
    // Since RowConverter flattens certain data types (i.e. Dictionary),
183
    // we need to use updated data type instead of original field
184
0
    let corrected_type = match &field.data_type {
185
0
        DataType::List(inner_field) => DataType::List(Arc::new(
186
0
            inner_field
187
0
                .as_ref()
188
0
                .clone()
189
0
                .with_data_type(child_data.data_type().clone()),
190
0
        )),
191
0
        DataType::LargeList(inner_field) => DataType::LargeList(Arc::new(
192
0
            inner_field
193
0
                .as_ref()
194
0
                .clone()
195
0
                .with_data_type(child_data.data_type().clone()),
196
0
        )),
197
0
        _ => unreachable!(),
198
    };
199
200
0
    let builder = ArrayDataBuilder::new(corrected_type)
201
0
        .len(rows.len())
202
0
        .null_count(null_count)
203
0
        .null_bit_buffer(Some(nulls.into()))
204
0
        .add_buffer(Buffer::from_vec(offsets))
205
0
        .add_child_data(child_data);
206
207
0
    Ok(GenericListArray::from(unsafe { builder.build_unchecked() }))
208
0
}
209
210
0
pub fn compute_lengths_fixed_size_list(
211
0
    tracker: &mut LengthTracker,
212
0
    rows: &Rows,
213
0
    array: &FixedSizeListArray,
214
0
) {
215
0
    let value_length = array.value_length().as_usize();
216
0
    tracker.push_variable((0..array.len()).map(|idx| {
217
0
        match array.is_valid(idx) {
218
            true => {
219
0
                1 + ((idx * value_length)..(idx + 1) * value_length)
220
0
                    .map(|child_idx| rows.row(child_idx).as_ref().len())
221
0
                    .sum::<usize>()
222
            }
223
0
            false => 1,
224
        }
225
0
    }))
226
0
}
227
228
/// Encodes the provided `FixedSizeListArray` to `out` with the provided `SortOptions`
229
///
230
/// `rows` should contain the encoded child elements
231
0
pub fn encode_fixed_size_list(
232
0
    data: &mut [u8],
233
0
    offsets: &mut [usize],
234
0
    rows: &Rows,
235
0
    opts: SortOptions,
236
0
    array: &FixedSizeListArray,
237
0
) {
238
0
    let null_sentinel = null_sentinel(opts);
239
0
    offsets
240
0
        .iter_mut()
241
0
        .skip(1)
242
0
        .enumerate()
243
0
        .for_each(|(idx, offset)| {
244
0
            let value_length = array.value_length().as_usize();
245
0
            match array.is_valid(idx) {
246
                true => {
247
0
                    data[*offset] = 0x01;
248
0
                    *offset += 1;
249
0
                    for child_idx in (idx * value_length)..(idx + 1) * value_length {
250
0
                        let row = rows.row(child_idx);
251
0
                        let end_offset = *offset + row.as_ref().len();
252
0
                        data[*offset..end_offset].copy_from_slice(row.as_ref());
253
0
                        *offset = end_offset;
254
0
                    }
255
                }
256
0
                false => {
257
0
                    data[*offset] = null_sentinel;
258
0
                    *offset += 1;
259
0
                }
260
            };
261
0
        })
262
0
}
263
264
/// Decodes a fixed size list array from `rows` with the provided `options`
265
///
266
/// # Safety
267
///
268
/// `rows` must contain valid data for the provided `converter`
269
0
pub unsafe fn decode_fixed_size_list(
270
0
    converter: &RowConverter,
271
0
    rows: &mut [&[u8]],
272
0
    field: &SortField,
273
0
    validate_utf8: bool,
274
0
    value_length: usize,
275
0
) -> Result<FixedSizeListArray, ArrowError> {
276
0
    let list_type = &field.data_type;
277
0
    let element_type = match list_type {
278
0
        DataType::FixedSizeList(element_field, _) => element_field.data_type(),
279
        _ => {
280
0
            return Err(ArrowError::InvalidArgumentError(format!(
281
0
                "Expected FixedSizeListArray, found: {list_type:?}",
282
0
            )))
283
        }
284
    };
285
286
0
    let len = rows.len();
287
0
    let (null_count, nulls) = fixed::decode_nulls(rows);
288
289
0
    let null_element_encoded = converter.convert_columns(&[new_null_array(element_type, 1)])?;
290
0
    let null_element_encoded = null_element_encoded.row(0);
291
0
    let null_element_slice = null_element_encoded.as_ref();
292
293
0
    let mut child_rows = Vec::new();
294
0
    for row in rows {
295
0
        let valid = row[0] == 1;
296
0
        let mut row_offset = 1;
297
0
        if !valid {
298
0
            for _ in 0..value_length {
299
0
                child_rows.push(null_element_slice);
300
0
            }
301
        } else {
302
0
            for _ in 0..value_length {
303
0
                let mut temp_child_rows = vec![&row[row_offset..]];
304
0
                converter.convert_raw(&mut temp_child_rows, validate_utf8)?;
305
0
                let decoded_bytes = row.len() - row_offset - temp_child_rows[0].len();
306
0
                let next_offset = row_offset + decoded_bytes;
307
0
                child_rows.push(&row[row_offset..next_offset]);
308
0
                row_offset = next_offset;
309
            }
310
        }
311
0
        *row = &row[row_offset..]; // Update row for the next decoder
312
    }
313
314
0
    let children = converter.convert_raw(&mut child_rows, validate_utf8)?;
315
0
    let child_data = children.iter().map(|c| c.to_data()).collect();
316
0
    let builder = ArrayDataBuilder::new(list_type.clone())
317
0
        .len(len)
318
0
        .null_count(null_count)
319
0
        .null_bit_buffer(Some(nulls))
320
0
        .child_data(child_data);
321
322
0
    Ok(FixedSizeListArray::from(builder.build_unchecked()))
323
0
}