/Users/andrewlamb/Software/arrow-rs/arrow-row/src/fixed.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use crate::array::PrimitiveArray; |
19 | | use crate::null_sentinel; |
20 | | use arrow_array::builder::BufferBuilder; |
21 | | use arrow_array::{ArrowPrimitiveType, BooleanArray, FixedSizeBinaryArray}; |
22 | | use arrow_buffer::{ |
23 | | bit_util, i256, ArrowNativeType, BooleanBuffer, Buffer, IntervalDayTime, IntervalMonthDayNano, |
24 | | MutableBuffer, NullBuffer, |
25 | | }; |
26 | | use arrow_data::{ArrayData, ArrayDataBuilder}; |
27 | | use arrow_schema::{DataType, SortOptions}; |
28 | | use half::f16; |
29 | | |
/// Builds a fixed-size encoded value out of a slice of row bytes.
pub trait FromSlice {
    /// Creates `Self` from `slice`, bitwise-complementing every byte when
    /// `invert` is `true`.
    fn from_slice(slice: &[u8], invert: bool) -> Self;
}

impl<const N: usize> FromSlice for [u8; N] {
    /// Copies `slice` into an `N`-byte array, flipping all bits of each byte
    /// when `invert` is set.
    ///
    /// # Panics
    ///
    /// Panics if `slice.len() != N`.
    #[inline]
    fn from_slice(slice: &[u8], invert: bool) -> Self {
        let mut bytes: [u8; N] = slice.try_into().unwrap();
        if invert {
            for byte in &mut bytes {
                *byte = !*byte;
            }
        }
        bytes
    }
}
44 | | |
45 | | /// Encodes a value of a particular fixed width type into bytes according to the rules |
46 | | /// described on [`super::RowConverter`] |
47 | | pub trait FixedLengthEncoding: Copy { |
48 | | const ENCODED_LEN: usize = 1 + std::mem::size_of::<Self::Encoded>(); |
49 | | |
50 | | type Encoded: Sized + Copy + FromSlice + AsRef<[u8]> + AsMut<[u8]>; |
51 | | |
52 | | fn encode(self) -> Self::Encoded; |
53 | | |
54 | | fn decode(encoded: Self::Encoded) -> Self; |
55 | | } |
56 | | |
57 | | impl FixedLengthEncoding for bool { |
58 | | type Encoded = [u8; 1]; |
59 | | |
60 | 0 | fn encode(self) -> [u8; 1] { |
61 | 0 | [self as u8] |
62 | 0 | } |
63 | | |
64 | 0 | fn decode(encoded: Self::Encoded) -> Self { |
65 | 0 | encoded[0] != 0 |
66 | 0 | } |
67 | | } |
68 | | |
69 | | macro_rules! encode_signed { |
70 | | ($n:expr, $t:ty) => { |
71 | | impl FixedLengthEncoding for $t { |
72 | | type Encoded = [u8; $n]; |
73 | | |
74 | 0 | fn encode(self) -> [u8; $n] { |
75 | 0 | let mut b = self.to_be_bytes(); |
76 | | // Toggle top "sign" bit to ensure consistent sort order |
77 | 0 | b[0] ^= 0x80; |
78 | 0 | b |
79 | 0 | } |
80 | | |
81 | 0 | fn decode(mut encoded: Self::Encoded) -> Self { |
82 | | // Toggle top "sign" bit |
83 | 0 | encoded[0] ^= 0x80; |
84 | 0 | Self::from_be_bytes(encoded) |
85 | 0 | } |
86 | | } |
87 | | }; |
88 | | } |
89 | | |
90 | | encode_signed!(1, i8); |
91 | | encode_signed!(2, i16); |
92 | | encode_signed!(4, i32); |
93 | | encode_signed!(8, i64); |
94 | | encode_signed!(16, i128); |
95 | | encode_signed!(32, i256); |
96 | | |
97 | | macro_rules! encode_unsigned { |
98 | | ($n:expr, $t:ty) => { |
99 | | impl FixedLengthEncoding for $t { |
100 | | type Encoded = [u8; $n]; |
101 | | |
102 | 0 | fn encode(self) -> [u8; $n] { |
103 | 0 | self.to_be_bytes() |
104 | 0 | } |
105 | | |
106 | 0 | fn decode(encoded: Self::Encoded) -> Self { |
107 | 0 | Self::from_be_bytes(encoded) |
108 | 0 | } |
109 | | } |
110 | | }; |
111 | | } |
112 | | |
113 | | encode_unsigned!(1, u8); |
114 | | encode_unsigned!(2, u16); |
115 | | encode_unsigned!(4, u32); |
116 | | encode_unsigned!(8, u64); |
117 | | |
118 | | impl FixedLengthEncoding for f16 { |
119 | | type Encoded = [u8; 2]; |
120 | | |
121 | 0 | fn encode(self) -> [u8; 2] { |
122 | | // https://github.com/rust-lang/rust/blob/9c20b2a8cc7588decb6de25ac6a7912dcef24d65/library/core/src/num/f32.rs#L1176-L1260 |
123 | 0 | let s = self.to_bits() as i16; |
124 | 0 | let val = s ^ (((s >> 15) as u16) >> 1) as i16; |
125 | 0 | val.encode() |
126 | 0 | } |
127 | | |
128 | 0 | fn decode(encoded: Self::Encoded) -> Self { |
129 | 0 | let bits = i16::decode(encoded); |
130 | 0 | let val = bits ^ (((bits >> 15) as u16) >> 1) as i16; |
131 | 0 | Self::from_bits(val as u16) |
132 | 0 | } |
133 | | } |
134 | | |
135 | | impl FixedLengthEncoding for f32 { |
136 | | type Encoded = [u8; 4]; |
137 | | |
138 | 0 | fn encode(self) -> [u8; 4] { |
139 | | // https://github.com/rust-lang/rust/blob/9c20b2a8cc7588decb6de25ac6a7912dcef24d65/library/core/src/num/f32.rs#L1176-L1260 |
140 | 0 | let s = self.to_bits() as i32; |
141 | 0 | let val = s ^ (((s >> 31) as u32) >> 1) as i32; |
142 | 0 | val.encode() |
143 | 0 | } |
144 | | |
145 | 0 | fn decode(encoded: Self::Encoded) -> Self { |
146 | 0 | let bits = i32::decode(encoded); |
147 | 0 | let val = bits ^ (((bits >> 31) as u32) >> 1) as i32; |
148 | 0 | Self::from_bits(val as u32) |
149 | 0 | } |
150 | | } |
151 | | |
152 | | impl FixedLengthEncoding for f64 { |
153 | | type Encoded = [u8; 8]; |
154 | | |
155 | 0 | fn encode(self) -> [u8; 8] { |
156 | | // https://github.com/rust-lang/rust/blob/9c20b2a8cc7588decb6de25ac6a7912dcef24d65/library/core/src/num/f32.rs#L1176-L1260 |
157 | 0 | let s = self.to_bits() as i64; |
158 | 0 | let val = s ^ (((s >> 63) as u64) >> 1) as i64; |
159 | 0 | val.encode() |
160 | 0 | } |
161 | | |
162 | 0 | fn decode(encoded: Self::Encoded) -> Self { |
163 | 0 | let bits = i64::decode(encoded); |
164 | 0 | let val = bits ^ (((bits >> 63) as u64) >> 1) as i64; |
165 | 0 | Self::from_bits(val as u64) |
166 | 0 | } |
167 | | } |
168 | | |
169 | | impl FixedLengthEncoding for IntervalDayTime { |
170 | | type Encoded = [u8; 8]; |
171 | | |
172 | 0 | fn encode(self) -> Self::Encoded { |
173 | 0 | let mut out = [0_u8; 8]; |
174 | 0 | out[..4].copy_from_slice(&self.days.encode()); |
175 | 0 | out[4..].copy_from_slice(&self.milliseconds.encode()); |
176 | 0 | out |
177 | 0 | } |
178 | | |
179 | 0 | fn decode(encoded: Self::Encoded) -> Self { |
180 | 0 | Self { |
181 | 0 | days: i32::decode(encoded[..4].try_into().unwrap()), |
182 | 0 | milliseconds: i32::decode(encoded[4..].try_into().unwrap()), |
183 | 0 | } |
184 | 0 | } |
185 | | } |
186 | | |
187 | | impl FixedLengthEncoding for IntervalMonthDayNano { |
188 | | type Encoded = [u8; 16]; |
189 | | |
190 | 0 | fn encode(self) -> Self::Encoded { |
191 | 0 | let mut out = [0_u8; 16]; |
192 | 0 | out[..4].copy_from_slice(&self.months.encode()); |
193 | 0 | out[4..8].copy_from_slice(&self.days.encode()); |
194 | 0 | out[8..].copy_from_slice(&self.nanoseconds.encode()); |
195 | 0 | out |
196 | 0 | } |
197 | | |
198 | 0 | fn decode(encoded: Self::Encoded) -> Self { |
199 | 0 | Self { |
200 | 0 | months: i32::decode(encoded[..4].try_into().unwrap()), |
201 | 0 | days: i32::decode(encoded[4..8].try_into().unwrap()), |
202 | 0 | nanoseconds: i64::decode(encoded[8..].try_into().unwrap()), |
203 | 0 | } |
204 | 0 | } |
205 | | } |
206 | | |
207 | | /// Returns the total encoded length (including null byte) for a value of type `T::Native` |
208 | 0 | pub const fn encoded_len<T>(_col: &PrimitiveArray<T>) -> usize |
209 | 0 | where |
210 | 0 | T: ArrowPrimitiveType, |
211 | 0 | T::Native: FixedLengthEncoding, |
212 | | { |
213 | 0 | T::Native::ENCODED_LEN |
214 | 0 | } |
215 | | |
216 | | /// Fixed width types are encoded as |
217 | | /// |
218 | | /// - 1 byte `0` if null or `1` if valid |
219 | | /// - bytes of [`FixedLengthEncoding`] |
220 | 0 | pub fn encode<T: FixedLengthEncoding>( |
221 | 0 | data: &mut [u8], |
222 | 0 | offsets: &mut [usize], |
223 | 0 | values: &[T], |
224 | 0 | nulls: &NullBuffer, |
225 | 0 | opts: SortOptions, |
226 | 0 | ) { |
227 | 0 | for (value_idx, is_valid) in nulls.iter().enumerate() { |
228 | 0 | let offset = &mut offsets[value_idx + 1]; |
229 | 0 | let end_offset = *offset + T::ENCODED_LEN; |
230 | 0 | if is_valid { |
231 | 0 | let to_write = &mut data[*offset..end_offset]; |
232 | 0 | to_write[0] = 1; |
233 | 0 | let mut encoded = values[value_idx].encode(); |
234 | 0 | if opts.descending { |
235 | | // Flip bits to reverse order |
236 | 0 | encoded.as_mut().iter_mut().for_each(|v| *v = !*v) |
237 | 0 | } |
238 | 0 | to_write[1..].copy_from_slice(encoded.as_ref()) |
239 | 0 | } else { |
240 | 0 | data[*offset] = null_sentinel(opts); |
241 | 0 | } |
242 | 0 | *offset = end_offset; |
243 | | } |
244 | 0 | } |
245 | | |
246 | | /// Encoding for non-nullable primitive arrays. |
247 | | /// Iterates directly over the `values`, and skips NULLs-checking. |
248 | 0 | pub fn encode_not_null<T: FixedLengthEncoding>( |
249 | 0 | data: &mut [u8], |
250 | 0 | offsets: &mut [usize], |
251 | 0 | values: &[T], |
252 | 0 | opts: SortOptions, |
253 | 0 | ) { |
254 | 0 | for (value_idx, val) in values.iter().enumerate() { |
255 | 0 | let offset = &mut offsets[value_idx + 1]; |
256 | 0 | let end_offset = *offset + T::ENCODED_LEN; |
257 | | |
258 | 0 | let to_write = &mut data[*offset..end_offset]; |
259 | 0 | to_write[0] = 1; |
260 | 0 | let mut encoded = val.encode(); |
261 | 0 | if opts.descending { |
262 | | // Flip bits to reverse order |
263 | 0 | encoded.as_mut().iter_mut().for_each(|v| *v = !*v) |
264 | 0 | } |
265 | 0 | to_write[1..].copy_from_slice(encoded.as_ref()); |
266 | | |
267 | 0 | *offset = end_offset; |
268 | | } |
269 | 0 | } |
270 | | |
271 | | /// Boolean values are encoded as |
272 | | /// |
273 | | /// - 1 byte `0` if null or `1` if valid |
274 | | /// - bytes of [`FixedLengthEncoding`] |
275 | 0 | pub fn encode_boolean( |
276 | 0 | data: &mut [u8], |
277 | 0 | offsets: &mut [usize], |
278 | 0 | values: &BooleanBuffer, |
279 | 0 | nulls: &NullBuffer, |
280 | 0 | opts: SortOptions, |
281 | 0 | ) { |
282 | 0 | for (idx, is_valid) in nulls.iter().enumerate() { |
283 | 0 | let offset = &mut offsets[idx + 1]; |
284 | 0 | let end_offset = *offset + bool::ENCODED_LEN; |
285 | 0 | if is_valid { |
286 | 0 | let to_write = &mut data[*offset..end_offset]; |
287 | 0 | to_write[0] = 1; |
288 | 0 | let mut encoded = values.value(idx).encode(); |
289 | 0 | if opts.descending { |
290 | | // Flip bits to reverse order |
291 | 0 | encoded.as_mut().iter_mut().for_each(|v| *v = !*v) |
292 | 0 | } |
293 | 0 | to_write[1..].copy_from_slice(encoded.as_ref()) |
294 | 0 | } else { |
295 | 0 | data[*offset] = null_sentinel(opts); |
296 | 0 | } |
297 | 0 | *offset = end_offset; |
298 | | } |
299 | 0 | } |
300 | | |
301 | | /// Encoding for non-nullable boolean arrays. |
302 | | /// Iterates directly over `values`, and skips NULLs-checking. |
303 | 0 | pub fn encode_boolean_not_null( |
304 | 0 | data: &mut [u8], |
305 | 0 | offsets: &mut [usize], |
306 | 0 | values: &BooleanBuffer, |
307 | 0 | opts: SortOptions, |
308 | 0 | ) { |
309 | 0 | for (value_idx, val) in values.iter().enumerate() { |
310 | 0 | let offset = &mut offsets[value_idx + 1]; |
311 | 0 | let end_offset = *offset + bool::ENCODED_LEN; |
312 | | |
313 | 0 | let to_write = &mut data[*offset..end_offset]; |
314 | 0 | to_write[0] = 1; |
315 | 0 | let mut encoded = val.encode(); |
316 | 0 | if opts.descending { |
317 | | // Flip bits to reverse order |
318 | 0 | encoded.as_mut().iter_mut().for_each(|v| *v = !*v) |
319 | 0 | } |
320 | 0 | to_write[1..].copy_from_slice(encoded.as_ref()); |
321 | | |
322 | 0 | *offset = end_offset; |
323 | | } |
324 | 0 | } |
325 | | |
326 | 0 | pub fn encode_fixed_size_binary( |
327 | 0 | data: &mut [u8], |
328 | 0 | offsets: &mut [usize], |
329 | 0 | array: &FixedSizeBinaryArray, |
330 | 0 | opts: SortOptions, |
331 | 0 | ) { |
332 | 0 | let len = array.value_length() as usize; |
333 | 0 | for (offset, maybe_val) in offsets.iter_mut().skip(1).zip(array.iter()) { |
334 | 0 | let end_offset = *offset + len + 1; |
335 | 0 | if let Some(val) = maybe_val { |
336 | 0 | let to_write = &mut data[*offset..end_offset]; |
337 | 0 | to_write[0] = 1; |
338 | 0 | to_write[1..].copy_from_slice(&val[..len]); |
339 | 0 | if opts.descending { |
340 | | // Flip bits to reverse order |
341 | 0 | to_write[1..1 + len].iter_mut().for_each(|v| *v = !*v) |
342 | 0 | } |
343 | 0 | } else { |
344 | 0 | data[*offset] = null_sentinel(opts); |
345 | 0 | } |
346 | 0 | *offset = end_offset; |
347 | | } |
348 | 0 | } |
349 | | |
/// Removes the first `len` bytes from `src` and returns them, leaving `src`
/// pointing at the remainder.
///
/// Panics if `src` holds fewer than `len` bytes.
#[inline]
fn split_off<'a>(src: &mut &'a [u8], len: usize) -> &'a [u8] {
    let whole: &'a [u8] = *src;
    let (head, tail) = whole.split_at(len);
    *src = tail;
    head
}
357 | | |
358 | | /// Decodes a `BooleanArray` from rows |
359 | 0 | pub fn decode_bool(rows: &mut [&[u8]], options: SortOptions) -> BooleanArray { |
360 | 0 | let true_val = match options.descending { |
361 | 0 | true => !1, |
362 | 0 | false => 1, |
363 | | }; |
364 | | |
365 | 0 | let len = rows.len(); |
366 | | |
367 | 0 | let mut null_count = 0; |
368 | 0 | let mut nulls = MutableBuffer::new(bit_util::ceil(len, 64) * 8); |
369 | 0 | let mut values = MutableBuffer::new(bit_util::ceil(len, 64) * 8); |
370 | | |
371 | 0 | let chunks = len / 64; |
372 | 0 | let remainder = len % 64; |
373 | 0 | for chunk in 0..chunks { |
374 | 0 | let mut null_packed = 0; |
375 | 0 | let mut values_packed = 0; |
376 | | |
377 | 0 | for bit_idx in 0..64 { |
378 | 0 | let i = split_off(&mut rows[bit_idx + chunk * 64], 2); |
379 | 0 | let (null, value) = (i[0] == 1, i[1] == true_val); |
380 | 0 | null_count += !null as usize; |
381 | 0 | null_packed |= (null as u64) << bit_idx; |
382 | 0 | values_packed |= (value as u64) << bit_idx; |
383 | 0 | } |
384 | | |
385 | 0 | nulls.push(null_packed); |
386 | 0 | values.push(values_packed); |
387 | | } |
388 | | |
389 | 0 | if remainder != 0 { |
390 | 0 | let mut null_packed = 0; |
391 | 0 | let mut values_packed = 0; |
392 | | |
393 | 0 | for bit_idx in 0..remainder { |
394 | 0 | let i = split_off(&mut rows[bit_idx + chunks * 64], 2); |
395 | 0 | let (null, value) = (i[0] == 1, i[1] == true_val); |
396 | 0 | null_count += !null as usize; |
397 | 0 | null_packed |= (null as u64) << bit_idx; |
398 | 0 | values_packed |= (value as u64) << bit_idx; |
399 | 0 | } |
400 | | |
401 | 0 | nulls.push(null_packed); |
402 | 0 | values.push(values_packed); |
403 | 0 | } |
404 | | |
405 | 0 | let builder = ArrayDataBuilder::new(DataType::Boolean) |
406 | 0 | .len(rows.len()) |
407 | 0 | .null_count(null_count) |
408 | 0 | .add_buffer(values.into()) |
409 | 0 | .null_bit_buffer(Some(nulls.into())); |
410 | | |
411 | | // SAFETY: |
412 | | // Buffers are the correct length |
413 | 0 | unsafe { BooleanArray::from(builder.build_unchecked()) } |
414 | 0 | } |
415 | | |
416 | | /// Decodes a single byte from each row, interpreting `0x01` as a valid value |
417 | | /// and all other values as a null |
418 | | /// |
419 | | /// Returns the null count and null buffer |
420 | 0 | pub fn decode_nulls(rows: &[&[u8]]) -> (usize, Buffer) { |
421 | 0 | let mut null_count = 0; |
422 | 0 | let buffer = MutableBuffer::collect_bool(rows.len(), |idx| { |
423 | 0 | let valid = rows[idx][0] == 1; |
424 | 0 | null_count += !valid as usize; |
425 | 0 | valid |
426 | 0 | }) |
427 | 0 | .into(); |
428 | 0 | (null_count, buffer) |
429 | 0 | } |
430 | | |
431 | | /// Decodes a `ArrayData` from rows based on the provided `FixedLengthEncoding` `T` |
432 | | /// |
433 | | /// # Safety |
434 | | /// |
435 | | /// `data_type` must be appropriate native type for `T` |
436 | 0 | unsafe fn decode_fixed<T: FixedLengthEncoding + ArrowNativeType>( |
437 | 0 | rows: &mut [&[u8]], |
438 | 0 | data_type: DataType, |
439 | 0 | options: SortOptions, |
440 | 0 | ) -> ArrayData { |
441 | 0 | let len = rows.len(); |
442 | | |
443 | 0 | let mut values = BufferBuilder::<T>::new(len); |
444 | 0 | let (null_count, nulls) = decode_nulls(rows); |
445 | | |
446 | 0 | for row in rows { |
447 | 0 | let i = split_off(row, T::ENCODED_LEN); |
448 | 0 | let value = T::Encoded::from_slice(&i[1..], options.descending); |
449 | 0 | values.append(T::decode(value)); |
450 | 0 | } |
451 | | |
452 | 0 | let builder = ArrayDataBuilder::new(data_type) |
453 | 0 | .len(len) |
454 | 0 | .null_count(null_count) |
455 | 0 | .add_buffer(values.finish()) |
456 | 0 | .null_bit_buffer(Some(nulls)); |
457 | | |
458 | | // SAFETY: Buffers correct length |
459 | 0 | builder.build_unchecked() |
460 | 0 | } |
461 | | |
462 | | /// Decodes a `PrimitiveArray` from rows |
463 | 0 | pub fn decode_primitive<T: ArrowPrimitiveType>( |
464 | 0 | rows: &mut [&[u8]], |
465 | 0 | data_type: DataType, |
466 | 0 | options: SortOptions, |
467 | 0 | ) -> PrimitiveArray<T> |
468 | 0 | where |
469 | 0 | T::Native: FixedLengthEncoding, |
470 | | { |
471 | 0 | assert!(PrimitiveArray::<T>::is_compatible(&data_type)); |
472 | | // SAFETY: |
473 | | // Validated data type above |
474 | 0 | unsafe { decode_fixed::<T::Native>(rows, data_type, options).into() } |
475 | 0 | } |
476 | | |
477 | | /// Decodes a `FixedLengthBinary` from rows |
478 | 0 | pub fn decode_fixed_size_binary( |
479 | 0 | rows: &mut [&[u8]], |
480 | 0 | size: i32, |
481 | 0 | options: SortOptions, |
482 | 0 | ) -> FixedSizeBinaryArray { |
483 | 0 | let len = rows.len(); |
484 | | |
485 | 0 | let mut values = MutableBuffer::new(size as usize * rows.len()); |
486 | 0 | let (null_count, nulls) = decode_nulls(rows); |
487 | | |
488 | 0 | let encoded_len = size as usize + 1; |
489 | | |
490 | 0 | for row in rows { |
491 | 0 | let i = split_off(row, encoded_len); |
492 | 0 | values.extend_from_slice(&i[1..]); |
493 | 0 | } |
494 | | |
495 | 0 | if options.descending { |
496 | 0 | for v in values.as_slice_mut() { |
497 | 0 | *v = !*v; |
498 | 0 | } |
499 | 0 | } |
500 | | |
501 | 0 | let builder = ArrayDataBuilder::new(DataType::FixedSizeBinary(size)) |
502 | 0 | .len(len) |
503 | 0 | .null_count(null_count) |
504 | 0 | .add_buffer(values.into()) |
505 | 0 | .null_bit_buffer(Some(nulls)); |
506 | | |
507 | | // SAFETY: Buffers correct length |
508 | 0 | unsafe { builder.build_unchecked().into() } |
509 | 0 | } |