Coverage Report

Created: 2025-08-26 07:03

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/Users/andrewlamb/Software/arrow-rs/arrow-row/src/fixed.rs
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
use crate::array::PrimitiveArray;
19
use crate::null_sentinel;
20
use arrow_array::builder::BufferBuilder;
21
use arrow_array::{ArrowPrimitiveType, BooleanArray, FixedSizeBinaryArray};
22
use arrow_buffer::{
23
    bit_util, i256, ArrowNativeType, BooleanBuffer, Buffer, IntervalDayTime, IntervalMonthDayNano,
24
    MutableBuffer, NullBuffer,
25
};
26
use arrow_data::{ArrayData, ArrayDataBuilder};
27
use arrow_schema::{DataType, SortOptions};
28
use half::f16;
29
30
/// Construction of a fixed-size encoded value from a slice of row bytes.
pub trait FromSlice {
    /// Builds `Self` from `slice`, bitwise-negating every byte when `invert` is `true`.
    fn from_slice(slice: &[u8], invert: bool) -> Self;
}

impl<const N: usize> FromSlice for [u8; N] {
    /// Copies `slice` into a `[u8; N]`, flipping all bits of each byte if `invert` is set.
    ///
    /// # Panics
    ///
    /// Panics if `slice.len() != N`.
    #[inline]
    fn from_slice(slice: &[u8], invert: bool) -> Self {
        let mut bytes = Self::try_from(slice).unwrap();
        if invert {
            for byte in &mut bytes {
                *byte = !*byte;
            }
        }
        bytes
    }
}
44
45
/// Encodes a value of a particular fixed width type into bytes according to the rules
46
/// described on [`super::RowConverter`]
47
pub trait FixedLengthEncoding: Copy {
48
    const ENCODED_LEN: usize = 1 + std::mem::size_of::<Self::Encoded>();
49
50
    type Encoded: Sized + Copy + FromSlice + AsRef<[u8]> + AsMut<[u8]>;
51
52
    fn encode(self) -> Self::Encoded;
53
54
    fn decode(encoded: Self::Encoded) -> Self;
55
}
56
57
impl FixedLengthEncoding for bool {
58
    type Encoded = [u8; 1];
59
60
0
    fn encode(self) -> [u8; 1] {
61
0
        [self as u8]
62
0
    }
63
64
0
    fn decode(encoded: Self::Encoded) -> Self {
65
0
        encoded[0] != 0
66
0
    }
67
}
68
69
macro_rules! encode_signed {
70
    ($n:expr, $t:ty) => {
71
        impl FixedLengthEncoding for $t {
72
            type Encoded = [u8; $n];
73
74
0
            fn encode(self) -> [u8; $n] {
75
0
                let mut b = self.to_be_bytes();
76
                // Toggle top "sign" bit to ensure consistent sort order
77
0
                b[0] ^= 0x80;
78
0
                b
79
0
            }
80
81
0
            fn decode(mut encoded: Self::Encoded) -> Self {
82
                // Toggle top "sign" bit
83
0
                encoded[0] ^= 0x80;
84
0
                Self::from_be_bytes(encoded)
85
0
            }
86
        }
87
    };
88
}
89
90
encode_signed!(1, i8);
91
encode_signed!(2, i16);
92
encode_signed!(4, i32);
93
encode_signed!(8, i64);
94
encode_signed!(16, i128);
95
encode_signed!(32, i256);
96
97
macro_rules! encode_unsigned {
98
    ($n:expr, $t:ty) => {
99
        impl FixedLengthEncoding for $t {
100
            type Encoded = [u8; $n];
101
102
0
            fn encode(self) -> [u8; $n] {
103
0
                self.to_be_bytes()
104
0
            }
105
106
0
            fn decode(encoded: Self::Encoded) -> Self {
107
0
                Self::from_be_bytes(encoded)
108
0
            }
109
        }
110
    };
111
}
112
113
encode_unsigned!(1, u8);
114
encode_unsigned!(2, u16);
115
encode_unsigned!(4, u32);
116
encode_unsigned!(8, u64);
117
118
impl FixedLengthEncoding for f16 {
    type Encoded = [u8; 2];

    /// Reinterprets the float's bits as an `i16`, flips the non-sign bits of
    /// negative values, and encodes the result as an `i16`
    fn encode(self) -> [u8; 2] {
        // https://github.com/rust-lang/rust/blob/9c20b2a8cc7588decb6de25ac6a7912dcef24d65/library/core/src/num/f32.rs#L1176-L1260
        let bits = self.to_bits() as i16;
        // 0x7FFF when the sign bit is set, 0 otherwise
        let mask = (((bits >> 15) as u16) >> 1) as i16;
        (bits ^ mask).encode()
    }

    /// Applies the same conditional bit flip to the decoded `i16`, which undoes `encode`
    fn decode(encoded: Self::Encoded) -> Self {
        let bits = i16::decode(encoded);
        // 0x7FFF when the sign bit is set, 0 otherwise
        let mask = (((bits >> 15) as u16) >> 1) as i16;
        Self::from_bits((bits ^ mask) as u16)
    }
}
134
135
impl FixedLengthEncoding for f32 {
136
    type Encoded = [u8; 4];
137
138
0
    fn encode(self) -> [u8; 4] {
139
        // https://github.com/rust-lang/rust/blob/9c20b2a8cc7588decb6de25ac6a7912dcef24d65/library/core/src/num/f32.rs#L1176-L1260
140
0
        let s = self.to_bits() as i32;
141
0
        let val = s ^ (((s >> 31) as u32) >> 1) as i32;
142
0
        val.encode()
143
0
    }
144
145
0
    fn decode(encoded: Self::Encoded) -> Self {
146
0
        let bits = i32::decode(encoded);
147
0
        let val = bits ^ (((bits >> 31) as u32) >> 1) as i32;
148
0
        Self::from_bits(val as u32)
149
0
    }
150
}
151
152
impl FixedLengthEncoding for f64 {
153
    type Encoded = [u8; 8];
154
155
0
    fn encode(self) -> [u8; 8] {
156
        // https://github.com/rust-lang/rust/blob/9c20b2a8cc7588decb6de25ac6a7912dcef24d65/library/core/src/num/f32.rs#L1176-L1260
157
0
        let s = self.to_bits() as i64;
158
0
        let val = s ^ (((s >> 63) as u64) >> 1) as i64;
159
0
        val.encode()
160
0
    }
161
162
0
    fn decode(encoded: Self::Encoded) -> Self {
163
0
        let bits = i64::decode(encoded);
164
0
        let val = bits ^ (((bits >> 63) as u64) >> 1) as i64;
165
0
        Self::from_bits(val as u64)
166
0
    }
167
}
168
169
impl FixedLengthEncoding for IntervalDayTime {
170
    type Encoded = [u8; 8];
171
172
0
    fn encode(self) -> Self::Encoded {
173
0
        let mut out = [0_u8; 8];
174
0
        out[..4].copy_from_slice(&self.days.encode());
175
0
        out[4..].copy_from_slice(&self.milliseconds.encode());
176
0
        out
177
0
    }
178
179
0
    fn decode(encoded: Self::Encoded) -> Self {
180
0
        Self {
181
0
            days: i32::decode(encoded[..4].try_into().unwrap()),
182
0
            milliseconds: i32::decode(encoded[4..].try_into().unwrap()),
183
0
        }
184
0
    }
185
}
186
187
impl FixedLengthEncoding for IntervalMonthDayNano {
188
    type Encoded = [u8; 16];
189
190
0
    fn encode(self) -> Self::Encoded {
191
0
        let mut out = [0_u8; 16];
192
0
        out[..4].copy_from_slice(&self.months.encode());
193
0
        out[4..8].copy_from_slice(&self.days.encode());
194
0
        out[8..].copy_from_slice(&self.nanoseconds.encode());
195
0
        out
196
0
    }
197
198
0
    fn decode(encoded: Self::Encoded) -> Self {
199
0
        Self {
200
0
            months: i32::decode(encoded[..4].try_into().unwrap()),
201
0
            days: i32::decode(encoded[4..8].try_into().unwrap()),
202
0
            nanoseconds: i64::decode(encoded[8..].try_into().unwrap()),
203
0
        }
204
0
    }
205
}
206
207
/// Returns the total encoded length (including null byte) for a value of type `T::Native`
208
0
pub const fn encoded_len<T>(_col: &PrimitiveArray<T>) -> usize
209
0
where
210
0
    T: ArrowPrimitiveType,
211
0
    T::Native: FixedLengthEncoding,
212
{
213
0
    T::Native::ENCODED_LEN
214
0
}
215
216
/// Fixed width types are encoded as
217
///
218
/// - 1 byte `0` if null or `1` if valid
219
/// - bytes of [`FixedLengthEncoding`]
220
0
pub fn encode<T: FixedLengthEncoding>(
221
0
    data: &mut [u8],
222
0
    offsets: &mut [usize],
223
0
    values: &[T],
224
0
    nulls: &NullBuffer,
225
0
    opts: SortOptions,
226
0
) {
227
0
    for (value_idx, is_valid) in nulls.iter().enumerate() {
228
0
        let offset = &mut offsets[value_idx + 1];
229
0
        let end_offset = *offset + T::ENCODED_LEN;
230
0
        if is_valid {
231
0
            let to_write = &mut data[*offset..end_offset];
232
0
            to_write[0] = 1;
233
0
            let mut encoded = values[value_idx].encode();
234
0
            if opts.descending {
235
                // Flip bits to reverse order
236
0
                encoded.as_mut().iter_mut().for_each(|v| *v = !*v)
237
0
            }
238
0
            to_write[1..].copy_from_slice(encoded.as_ref())
239
0
        } else {
240
0
            data[*offset] = null_sentinel(opts);
241
0
        }
242
0
        *offset = end_offset;
243
    }
244
0
}
245
246
/// Encoding for non-nullable primitive arrays.
247
/// Iterates directly over the `values`, and skips NULLs-checking.
248
0
pub fn encode_not_null<T: FixedLengthEncoding>(
249
0
    data: &mut [u8],
250
0
    offsets: &mut [usize],
251
0
    values: &[T],
252
0
    opts: SortOptions,
253
0
) {
254
0
    for (value_idx, val) in values.iter().enumerate() {
255
0
        let offset = &mut offsets[value_idx + 1];
256
0
        let end_offset = *offset + T::ENCODED_LEN;
257
258
0
        let to_write = &mut data[*offset..end_offset];
259
0
        to_write[0] = 1;
260
0
        let mut encoded = val.encode();
261
0
        if opts.descending {
262
            // Flip bits to reverse order
263
0
            encoded.as_mut().iter_mut().for_each(|v| *v = !*v)
264
0
        }
265
0
        to_write[1..].copy_from_slice(encoded.as_ref());
266
267
0
        *offset = end_offset;
268
    }
269
0
}
270
271
/// Boolean values are encoded as
272
///
273
/// - 1 byte `0` if null or `1` if valid
274
/// - bytes of [`FixedLengthEncoding`]
275
0
pub fn encode_boolean(
276
0
    data: &mut [u8],
277
0
    offsets: &mut [usize],
278
0
    values: &BooleanBuffer,
279
0
    nulls: &NullBuffer,
280
0
    opts: SortOptions,
281
0
) {
282
0
    for (idx, is_valid) in nulls.iter().enumerate() {
283
0
        let offset = &mut offsets[idx + 1];
284
0
        let end_offset = *offset + bool::ENCODED_LEN;
285
0
        if is_valid {
286
0
            let to_write = &mut data[*offset..end_offset];
287
0
            to_write[0] = 1;
288
0
            let mut encoded = values.value(idx).encode();
289
0
            if opts.descending {
290
                // Flip bits to reverse order
291
0
                encoded.as_mut().iter_mut().for_each(|v| *v = !*v)
292
0
            }
293
0
            to_write[1..].copy_from_slice(encoded.as_ref())
294
0
        } else {
295
0
            data[*offset] = null_sentinel(opts);
296
0
        }
297
0
        *offset = end_offset;
298
    }
299
0
}
300
301
/// Encoding for non-nullable boolean arrays.
302
/// Iterates directly over `values`, and skips NULLs-checking.
303
0
pub fn encode_boolean_not_null(
304
0
    data: &mut [u8],
305
0
    offsets: &mut [usize],
306
0
    values: &BooleanBuffer,
307
0
    opts: SortOptions,
308
0
) {
309
0
    for (value_idx, val) in values.iter().enumerate() {
310
0
        let offset = &mut offsets[value_idx + 1];
311
0
        let end_offset = *offset + bool::ENCODED_LEN;
312
313
0
        let to_write = &mut data[*offset..end_offset];
314
0
        to_write[0] = 1;
315
0
        let mut encoded = val.encode();
316
0
        if opts.descending {
317
            // Flip bits to reverse order
318
0
            encoded.as_mut().iter_mut().for_each(|v| *v = !*v)
319
0
        }
320
0
        to_write[1..].copy_from_slice(encoded.as_ref());
321
322
0
        *offset = end_offset;
323
    }
324
0
}
325
326
0
pub fn encode_fixed_size_binary(
327
0
    data: &mut [u8],
328
0
    offsets: &mut [usize],
329
0
    array: &FixedSizeBinaryArray,
330
0
    opts: SortOptions,
331
0
) {
332
0
    let len = array.value_length() as usize;
333
0
    for (offset, maybe_val) in offsets.iter_mut().skip(1).zip(array.iter()) {
334
0
        let end_offset = *offset + len + 1;
335
0
        if let Some(val) = maybe_val {
336
0
            let to_write = &mut data[*offset..end_offset];
337
0
            to_write[0] = 1;
338
0
            to_write[1..].copy_from_slice(&val[..len]);
339
0
            if opts.descending {
340
                // Flip bits to reverse order
341
0
                to_write[1..1 + len].iter_mut().for_each(|v| *v = !*v)
342
0
            }
343
0
        } else {
344
0
            data[*offset] = null_sentinel(opts);
345
0
        }
346
0
        *offset = end_offset;
347
    }
348
0
}
349
350
/// Removes the first `len` bytes from `src` and returns them.
///
/// After the call, `src` refers to the bytes that followed the returned prefix.
///
/// # Panics
///
/// Panics if `len > src.len()`.
#[inline]
fn split_off<'a>(src: &mut &'a [u8], len: usize) -> &'a [u8] {
    // Copy the inner reference out so both halves keep the full `'a` lifetime
    let whole: &'a [u8] = *src;
    let (head, tail) = whole.split_at(len);
    *src = tail;
    head
}
357
358
/// Decodes a `BooleanArray` from rows
359
0
pub fn decode_bool(rows: &mut [&[u8]], options: SortOptions) -> BooleanArray {
360
0
    let true_val = match options.descending {
361
0
        true => !1,
362
0
        false => 1,
363
    };
364
365
0
    let len = rows.len();
366
367
0
    let mut null_count = 0;
368
0
    let mut nulls = MutableBuffer::new(bit_util::ceil(len, 64) * 8);
369
0
    let mut values = MutableBuffer::new(bit_util::ceil(len, 64) * 8);
370
371
0
    let chunks = len / 64;
372
0
    let remainder = len % 64;
373
0
    for chunk in 0..chunks {
374
0
        let mut null_packed = 0;
375
0
        let mut values_packed = 0;
376
377
0
        for bit_idx in 0..64 {
378
0
            let i = split_off(&mut rows[bit_idx + chunk * 64], 2);
379
0
            let (null, value) = (i[0] == 1, i[1] == true_val);
380
0
            null_count += !null as usize;
381
0
            null_packed |= (null as u64) << bit_idx;
382
0
            values_packed |= (value as u64) << bit_idx;
383
0
        }
384
385
0
        nulls.push(null_packed);
386
0
        values.push(values_packed);
387
    }
388
389
0
    if remainder != 0 {
390
0
        let mut null_packed = 0;
391
0
        let mut values_packed = 0;
392
393
0
        for bit_idx in 0..remainder {
394
0
            let i = split_off(&mut rows[bit_idx + chunks * 64], 2);
395
0
            let (null, value) = (i[0] == 1, i[1] == true_val);
396
0
            null_count += !null as usize;
397
0
            null_packed |= (null as u64) << bit_idx;
398
0
            values_packed |= (value as u64) << bit_idx;
399
0
        }
400
401
0
        nulls.push(null_packed);
402
0
        values.push(values_packed);
403
0
    }
404
405
0
    let builder = ArrayDataBuilder::new(DataType::Boolean)
406
0
        .len(rows.len())
407
0
        .null_count(null_count)
408
0
        .add_buffer(values.into())
409
0
        .null_bit_buffer(Some(nulls.into()));
410
411
    // SAFETY:
412
    // Buffers are the correct length
413
0
    unsafe { BooleanArray::from(builder.build_unchecked()) }
414
0
}
415
416
/// Decodes a single byte from each row, interpreting `0x01` as a valid value
417
/// and all other values as a null
418
///
419
/// Returns the null count and null buffer
420
0
pub fn decode_nulls(rows: &[&[u8]]) -> (usize, Buffer) {
421
0
    let mut null_count = 0;
422
0
    let buffer = MutableBuffer::collect_bool(rows.len(), |idx| {
423
0
        let valid = rows[idx][0] == 1;
424
0
        null_count += !valid as usize;
425
0
        valid
426
0
    })
427
0
    .into();
428
0
    (null_count, buffer)
429
0
}
430
431
/// Decodes a `ArrayData` from rows based on the provided `FixedLengthEncoding` `T`
432
///
433
/// # Safety
434
///
435
/// `data_type` must be appropriate native type for `T`
436
0
unsafe fn decode_fixed<T: FixedLengthEncoding + ArrowNativeType>(
437
0
    rows: &mut [&[u8]],
438
0
    data_type: DataType,
439
0
    options: SortOptions,
440
0
) -> ArrayData {
441
0
    let len = rows.len();
442
443
0
    let mut values = BufferBuilder::<T>::new(len);
444
0
    let (null_count, nulls) = decode_nulls(rows);
445
446
0
    for row in rows {
447
0
        let i = split_off(row, T::ENCODED_LEN);
448
0
        let value = T::Encoded::from_slice(&i[1..], options.descending);
449
0
        values.append(T::decode(value));
450
0
    }
451
452
0
    let builder = ArrayDataBuilder::new(data_type)
453
0
        .len(len)
454
0
        .null_count(null_count)
455
0
        .add_buffer(values.finish())
456
0
        .null_bit_buffer(Some(nulls));
457
458
    // SAFETY: Buffers correct length
459
0
    builder.build_unchecked()
460
0
}
461
462
/// Decodes a `PrimitiveArray` from rows
463
0
pub fn decode_primitive<T: ArrowPrimitiveType>(
464
0
    rows: &mut [&[u8]],
465
0
    data_type: DataType,
466
0
    options: SortOptions,
467
0
) -> PrimitiveArray<T>
468
0
where
469
0
    T::Native: FixedLengthEncoding,
470
{
471
0
    assert!(PrimitiveArray::<T>::is_compatible(&data_type));
472
    // SAFETY:
473
    // Validated data type above
474
0
    unsafe { decode_fixed::<T::Native>(rows, data_type, options).into() }
475
0
}
476
477
/// Decodes a `FixedLengthBinary` from rows
478
0
pub fn decode_fixed_size_binary(
479
0
    rows: &mut [&[u8]],
480
0
    size: i32,
481
0
    options: SortOptions,
482
0
) -> FixedSizeBinaryArray {
483
0
    let len = rows.len();
484
485
0
    let mut values = MutableBuffer::new(size as usize * rows.len());
486
0
    let (null_count, nulls) = decode_nulls(rows);
487
488
0
    let encoded_len = size as usize + 1;
489
490
0
    for row in rows {
491
0
        let i = split_off(row, encoded_len);
492
0
        values.extend_from_slice(&i[1..]);
493
0
    }
494
495
0
    if options.descending {
496
0
        for v in values.as_slice_mut() {
497
0
            *v = !*v;
498
0
        }
499
0
    }
500
501
0
    let builder = ArrayDataBuilder::new(DataType::FixedSizeBinary(size))
502
0
        .len(len)
503
0
        .null_count(null_count)
504
0
        .add_buffer(values.into())
505
0
        .null_bit_buffer(Some(nulls));
506
507
    // SAFETY: Buffers correct length
508
0
    unsafe { builder.build_unchecked().into() }
509
0
}