Coverage Report

Created: 2025-08-26 07:03

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/Users/andrewlamb/Software/arrow-rs/arrow-row/src/variable.rs
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
use crate::null_sentinel;
19
use arrow_array::builder::BufferBuilder;
20
use arrow_array::*;
21
use arrow_buffer::bit_util::ceil;
22
use arrow_buffer::MutableBuffer;
23
use arrow_data::{ArrayDataBuilder, MAX_INLINE_VIEW_LEN};
24
use arrow_schema::{DataType, SortOptions};
25
use builder::make_view;
26
27
/// Number of data bytes carried by a full-size block of the variable length encoding
pub const BLOCK_SIZE: usize = 32;

/// Number of mini-blocks that the first block is divided into
///
/// Splitting the first block keeps the encoding of short values compact,
/// which reduces the space amplification for small strings
pub const MINI_BLOCK_COUNT: usize = 4;

/// Number of data bytes carried by a single mini-block
pub const MINI_BLOCK_SIZE: usize = BLOCK_SIZE / MINI_BLOCK_COUNT;

/// Marker byte that terminates a block when further blocks follow it
pub const BLOCK_CONTINUATION: u8 = 0xFF;

/// Leading byte of an encoded empty (but non-null) value
pub const EMPTY_SENTINEL: u8 = 1;

/// Leading byte of an encoded non-empty value
pub const NON_EMPTY_SENTINEL: u8 = 2;
46
47
/// Returns the length of the encoded representation of a byte array, including the null byte
48
#[inline]
49
0
pub fn encoded_len(a: Option<&[u8]>) -> usize {
50
0
    padded_length(a.map(|x| x.len()))
51
0
}
52
53
/// Returns the padded length of the encoded length of the given length
54
#[inline]
55
0
pub fn padded_length(a: Option<usize>) -> usize {
56
0
    match a {
57
0
        Some(a) if a <= BLOCK_SIZE => 1 + ceil(a, MINI_BLOCK_SIZE) * (MINI_BLOCK_SIZE + 1),
58
        // Each miniblock ends with a 1 byte continuation, therefore add
59
        // `(MINI_BLOCK_COUNT - 1)` additional bytes over non-miniblock size
60
0
        Some(a) => MINI_BLOCK_COUNT + ceil(a, BLOCK_SIZE) * (BLOCK_SIZE + 1),
61
0
        None => 1,
62
    }
63
0
}
64
65
/// Variable length values are encoded as
66
///
67
/// - single `0_u8` if null
68
/// - single `1_u8` if empty array
69
/// - `2_u8` if not empty, followed by one or more blocks
70
///
71
/// where a block is encoded as
72
///
73
/// - [`BLOCK_SIZE`] bytes of string data, padded with 0s
74
/// - `0xFF_u8` if this is not the last block for this string
75
/// - otherwise the length of the block as a `u8`
76
0
pub fn encode<'a, I: Iterator<Item = Option<&'a [u8]>>>(
77
0
    data: &mut [u8],
78
0
    offsets: &mut [usize],
79
0
    i: I,
80
0
    opts: SortOptions,
81
0
) {
82
0
    for (offset, maybe_val) in offsets.iter_mut().skip(1).zip(i) {
83
0
        *offset += encode_one(&mut data[*offset..], maybe_val, opts);
84
0
    }
85
0
}
86
87
0
/// Writes the encoding of a null value to the start of `out`
///
/// The single byte written is [`null_sentinel`] for `opts`. Returns the number of
/// bytes written, which is always 1.
///
/// # Panics
///
/// Panics if `out` is empty.
pub fn encode_null(out: &mut [u8], opts: SortOptions) -> usize {
    let sentinel = null_sentinel(opts);
    out[0] = sentinel;
    1
}
91
92
0
pub fn encode_empty(out: &mut [u8], opts: SortOptions) -> usize {
93
0
    out[0] = match opts.descending {
94
0
        true => !EMPTY_SENTINEL,
95
0
        false => EMPTY_SENTINEL,
96
    };
97
0
    1
98
0
}
99
100
0
pub fn encode_one(out: &mut [u8], val: Option<&[u8]>, opts: SortOptions) -> usize {
101
0
    match val {
102
0
        None => encode_null(out, opts),
103
0
        Some([]) => encode_empty(out, opts),
104
0
        Some(val) => {
105
            // Write `2_u8` to demarcate as non-empty, non-null string
106
0
            out[0] = NON_EMPTY_SENTINEL;
107
108
0
            let len = if val.len() <= BLOCK_SIZE {
109
0
                1 + encode_blocks::<MINI_BLOCK_SIZE>(&mut out[1..], val)
110
            } else {
111
0
                let (initial, rem) = val.split_at(BLOCK_SIZE);
112
0
                let offset = encode_blocks::<MINI_BLOCK_SIZE>(&mut out[1..], initial);
113
0
                out[offset] = BLOCK_CONTINUATION;
114
0
                1 + offset + encode_blocks::<BLOCK_SIZE>(&mut out[1 + offset..], rem)
115
            };
116
117
0
            if opts.descending {
118
                // Invert bits
119
0
                out[..len].iter_mut().for_each(|v| *v = !*v)
120
0
            }
121
0
            len
122
        }
123
    }
124
0
}
125
126
/// Writes `val` in `SIZE` blocks with the appropriate continuation tokens
127
#[inline]
128
0
fn encode_blocks<const SIZE: usize>(out: &mut [u8], val: &[u8]) -> usize {
129
0
    let block_count = ceil(val.len(), SIZE);
130
0
    let end_offset = block_count * (SIZE + 1);
131
0
    let to_write = &mut out[..end_offset];
132
133
0
    let chunks = val.chunks_exact(SIZE);
134
0
    let remainder = chunks.remainder();
135
0
    for (input, output) in chunks.clone().zip(to_write.chunks_exact_mut(SIZE + 1)) {
136
0
        let input: &[u8; SIZE] = input.try_into().unwrap();
137
0
        let out_block: &mut [u8; SIZE] = (&mut output[..SIZE]).try_into().unwrap();
138
0
139
0
        *out_block = *input;
140
0
141
0
        // Indicate that there are further blocks to follow
142
0
        output[SIZE] = BLOCK_CONTINUATION;
143
0
    }
144
145
0
    if !remainder.is_empty() {
146
0
        let start_offset = (block_count - 1) * (SIZE + 1);
147
0
        to_write[start_offset..start_offset + remainder.len()].copy_from_slice(remainder);
148
0
        *to_write.last_mut().unwrap() = remainder.len() as u8;
149
0
    } else {
150
0
        // We must overwrite the continuation marker written by the loop above
151
0
        *to_write.last_mut().unwrap() = SIZE as u8;
152
0
    }
153
0
    end_offset
154
0
}
155
156
/// Decodes a single block of data
157
/// The `f` function accepts a slice of the decoded data, it may be called multiple times
158
0
pub fn decode_blocks(row: &[u8], options: SortOptions, mut f: impl FnMut(&[u8])) -> usize {
159
0
    let (non_empty_sentinel, continuation) = match options.descending {
160
0
        true => (!NON_EMPTY_SENTINEL, !BLOCK_CONTINUATION),
161
0
        false => (NON_EMPTY_SENTINEL, BLOCK_CONTINUATION),
162
    };
163
164
0
    if row[0] != non_empty_sentinel {
165
        // Empty or null string
166
0
        return 1;
167
0
    }
168
169
    // Extracts the block length from the sentinel
170
0
    let block_len = |sentinel: u8| match options.descending {
171
0
        true => !sentinel as usize,
172
0
        false => sentinel as usize,
173
0
    };
174
175
0
    let mut idx = 1;
176
0
    for _ in 0..MINI_BLOCK_COUNT {
177
0
        let sentinel = row[idx + MINI_BLOCK_SIZE];
178
0
        if sentinel != continuation {
179
0
            f(&row[idx..idx + block_len(sentinel)]);
180
0
            return idx + MINI_BLOCK_SIZE + 1;
181
0
        }
182
0
        f(&row[idx..idx + MINI_BLOCK_SIZE]);
183
0
        idx += MINI_BLOCK_SIZE + 1;
184
    }
185
186
    loop {
187
0
        let sentinel = row[idx + BLOCK_SIZE];
188
0
        if sentinel != continuation {
189
0
            f(&row[idx..idx + block_len(sentinel)]);
190
0
            return idx + BLOCK_SIZE + 1;
191
0
        }
192
0
        f(&row[idx..idx + BLOCK_SIZE]);
193
0
        idx += BLOCK_SIZE + 1;
194
    }
195
0
}
196
197
/// Returns the number of bytes of encoded data
198
0
fn decoded_len(row: &[u8], options: SortOptions) -> usize {
199
0
    let mut len = 0;
200
0
    decode_blocks(row, options, |block| len += block.len());
201
0
    len
202
0
}
203
204
/// Decodes a binary array from `rows` with the provided `options`
205
0
pub fn decode_binary<I: OffsetSizeTrait>(
206
0
    rows: &mut [&[u8]],
207
0
    options: SortOptions,
208
0
) -> GenericBinaryArray<I> {
209
0
    let len = rows.len();
210
0
    let mut null_count = 0;
211
0
    let nulls = MutableBuffer::collect_bool(len, |x| {
212
0
        let valid = rows[x][0] != null_sentinel(options);
213
0
        null_count += !valid as usize;
214
0
        valid
215
0
    });
216
217
0
    let values_capacity = rows.iter().map(|row| decoded_len(row, options)).sum();
218
0
    let mut offsets = BufferBuilder::<I>::new(len + 1);
219
0
    offsets.append(I::zero());
220
0
    let mut values = MutableBuffer::new(values_capacity);
221
222
0
    for row in rows {
223
0
        let offset = decode_blocks(row, options, |b| values.extend_from_slice(b));
224
0
        *row = &row[offset..];
225
0
        offsets.append(I::from_usize(values.len()).expect("offset overflow"))
226
    }
227
228
0
    if options.descending {
229
0
        values.as_slice_mut().iter_mut().for_each(|o| *o = !*o)
230
0
    }
231
232
0
    let d = match I::IS_LARGE {
233
0
        true => DataType::LargeBinary,
234
0
        false => DataType::Binary,
235
    };
236
237
0
    let builder = ArrayDataBuilder::new(d)
238
0
        .len(len)
239
0
        .null_count(null_count)
240
0
        .null_bit_buffer(Some(nulls.into()))
241
0
        .add_buffer(offsets.finish())
242
0
        .add_buffer(values.into());
243
244
    // SAFETY:
245
    // Valid by construction above
246
0
    unsafe { GenericBinaryArray::from(builder.build_unchecked()) }
247
0
}
248
249
0
fn decode_binary_view_inner(
250
0
    rows: &mut [&[u8]],
251
0
    options: SortOptions,
252
0
    validate_utf8: bool,
253
0
) -> BinaryViewArray {
254
0
    let len = rows.len();
255
0
    let inline_str_max_len = MAX_INLINE_VIEW_LEN as usize;
256
257
0
    let mut null_count = 0;
258
259
0
    let nulls = MutableBuffer::collect_bool(len, |x| {
260
0
        let valid = rows[x][0] != null_sentinel(options);
261
0
        null_count += !valid as usize;
262
0
        valid
263
0
    });
264
265
    // If we are validating UTF-8, decode all string values (including short strings)
266
    // into the values buffer and validate UTF-8 once. If not validating,
267
    // we save memory by only copying long strings to the values buffer, as short strings
268
    // will be inlined into the view and do not need to be stored redundantly.
269
0
    let values_capacity = if validate_utf8 {
270
        // Capacity for all long and short strings
271
0
        rows.iter().map(|row| decoded_len(row, options)).sum()
272
    } else {
273
        // Capacity for all long strings plus room for one short string
274
0
        rows.iter().fold(0, |acc, row| {
275
0
            let len = decoded_len(row, options);
276
0
            if len > inline_str_max_len {
277
0
                acc + len
278
            } else {
279
0
                acc
280
            }
281
0
        }) + inline_str_max_len
282
    };
283
0
    let mut values = MutableBuffer::new(values_capacity);
284
285
0
    let mut views = BufferBuilder::<u128>::new(len);
286
0
    for row in rows {
287
0
        let start_offset = values.len();
288
0
        let offset = decode_blocks(row, options, |b| values.extend_from_slice(b));
289
        // Measure string length via change in values buffer.
290
        // Used to check if decoded value should be truncated (short string) when validate_utf8 is false
291
0
        let decoded_len = values.len() - start_offset;
292
0
        if row[0] == null_sentinel(options) {
293
0
            debug_assert_eq!(offset, 1);
294
0
            debug_assert_eq!(start_offset, values.len());
295
0
            views.append(0);
296
        } else {
297
            // Safety: we just appended the data to the end of the buffer
298
0
            let val = unsafe { values.get_unchecked_mut(start_offset..) };
299
300
0
            if options.descending {
301
0
                val.iter_mut().for_each(|o| *o = !*o);
302
0
            }
303
304
0
            let view = make_view(val, 0, start_offset as u32);
305
0
            views.append(view);
306
307
            // truncate inline string in values buffer if validate_utf8 is false
308
0
            if !validate_utf8 && decoded_len <= inline_str_max_len {
309
0
                values.truncate(start_offset);
310
0
            }
311
        }
312
0
        *row = &row[offset..];
313
    }
314
315
0
    if validate_utf8 {
316
0
        // the values contains all data, no matter if it is short or long
317
0
        // we can validate utf8 in one go.
318
0
        std::str::from_utf8(values.as_slice()).unwrap();
319
0
    }
320
321
0
    let builder = ArrayDataBuilder::new(DataType::BinaryView)
322
0
        .len(len)
323
0
        .null_count(null_count)
324
0
        .null_bit_buffer(Some(nulls.into()))
325
0
        .add_buffer(views.finish())
326
0
        .add_buffer(values.into());
327
328
    // SAFETY:
329
    // Valid by construction above
330
0
    unsafe { BinaryViewArray::from(builder.build_unchecked()) }
331
0
}
332
333
/// Decodes a binary view array from `rows` with the provided `options`
334
0
pub fn decode_binary_view(rows: &mut [&[u8]], options: SortOptions) -> BinaryViewArray {
335
0
    decode_binary_view_inner(rows, options, false)
336
0
}
337
338
/// Decodes a string array from `rows` with the provided `options`
339
///
340
/// # Safety
341
///
342
/// The row must contain valid UTF-8 data
343
0
pub unsafe fn decode_string<I: OffsetSizeTrait>(
344
0
    rows: &mut [&[u8]],
345
0
    options: SortOptions,
346
0
    validate_utf8: bool,
347
0
) -> GenericStringArray<I> {
348
0
    let decoded = decode_binary::<I>(rows, options);
349
350
0
    if validate_utf8 {
351
0
        return GenericStringArray::from(decoded);
352
0
    }
353
354
0
    let builder = decoded
355
0
        .into_data()
356
0
        .into_builder()
357
0
        .data_type(GenericStringArray::<I>::DATA_TYPE);
358
359
    // SAFETY:
360
    // Row data must have come from a valid UTF-8 array
361
0
    GenericStringArray::from(builder.build_unchecked())
362
0
}
363
364
/// Decodes a string view array from `rows` with the provided `options`
365
///
366
/// # Safety
367
///
368
/// The row must contain valid UTF-8 data
369
0
pub unsafe fn decode_string_view(
370
0
    rows: &mut [&[u8]],
371
0
    options: SortOptions,
372
0
    validate_utf8: bool,
373
0
) -> StringViewArray {
374
0
    let view = decode_binary_view_inner(rows, options, validate_utf8);
375
0
    view.to_string_view_unchecked()
376
0
}