/Users/andrewlamb/Software/arrow-rs/arrow-row/src/variable.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use crate::null_sentinel; |
19 | | use arrow_array::builder::BufferBuilder; |
20 | | use arrow_array::*; |
21 | | use arrow_buffer::bit_util::ceil; |
22 | | use arrow_buffer::MutableBuffer; |
23 | | use arrow_data::{ArrayDataBuilder, MAX_INLINE_VIEW_LEN}; |
24 | | use arrow_schema::{DataType, SortOptions}; |
25 | | use builder::make_view; |
26 | | |
/// Number of payload bytes carried by each full-size block of the
/// variable length encoding
pub const BLOCK_SIZE: usize = 32;

/// Number of mini-blocks the first block is divided into
///
/// Splitting the first block keeps the encoded form of short values compact
pub const MINI_BLOCK_COUNT: usize = 4;

/// Number of payload bytes carried by each mini-block
pub const MINI_BLOCK_SIZE: usize = BLOCK_SIZE / MINI_BLOCK_COUNT;

/// Marker byte written after a block when further blocks follow it
pub const BLOCK_CONTINUATION: u8 = u8::MAX;

/// Leading byte of the encoding of an empty value
pub const EMPTY_SENTINEL: u8 = 0x01;

/// Leading byte of the encoding of a non-empty value
pub const NON_EMPTY_SENTINEL: u8 = 0x02;
46 | | |
47 | | /// Returns the length of the encoded representation of a byte array, including the null byte |
48 | | #[inline] |
49 | 0 | pub fn encoded_len(a: Option<&[u8]>) -> usize { |
50 | 0 | padded_length(a.map(|x| x.len())) |
51 | 0 | } |
52 | | |
53 | | /// Returns the padded length of the encoded length of the given length |
54 | | #[inline] |
55 | 0 | pub fn padded_length(a: Option<usize>) -> usize { |
56 | 0 | match a { |
57 | 0 | Some(a) if a <= BLOCK_SIZE => 1 + ceil(a, MINI_BLOCK_SIZE) * (MINI_BLOCK_SIZE + 1), |
58 | | // Each miniblock ends with a 1 byte continuation, therefore add |
59 | | // `(MINI_BLOCK_COUNT - 1)` additional bytes over non-miniblock size |
60 | 0 | Some(a) => MINI_BLOCK_COUNT + ceil(a, BLOCK_SIZE) * (BLOCK_SIZE + 1), |
61 | 0 | None => 1, |
62 | | } |
63 | 0 | } |
64 | | |
65 | | /// Variable length values are encoded as |
66 | | /// |
67 | | /// - single `0_u8` if null |
68 | | /// - single `1_u8` if empty array |
69 | | /// - `2_u8` if not empty, followed by one or more blocks |
70 | | /// |
71 | | /// where a block is encoded as |
72 | | /// |
73 | | /// - [`BLOCK_SIZE`] bytes of string data, padded with 0s |
74 | | /// - `0xFF_u8` if this is not the last block for this string |
75 | | /// - otherwise the length of the block as a `u8` |
76 | 0 | pub fn encode<'a, I: Iterator<Item = Option<&'a [u8]>>>( |
77 | 0 | data: &mut [u8], |
78 | 0 | offsets: &mut [usize], |
79 | 0 | i: I, |
80 | 0 | opts: SortOptions, |
81 | 0 | ) { |
82 | 0 | for (offset, maybe_val) in offsets.iter_mut().skip(1).zip(i) { |
83 | 0 | *offset += encode_one(&mut data[*offset..], maybe_val, opts); |
84 | 0 | } |
85 | 0 | } |
86 | | |
87 | 0 | pub fn encode_null(out: &mut [u8], opts: SortOptions) -> usize { |
88 | 0 | out[0] = null_sentinel(opts); |
89 | 0 | 1 |
90 | 0 | } |
91 | | |
92 | 0 | pub fn encode_empty(out: &mut [u8], opts: SortOptions) -> usize { |
93 | 0 | out[0] = match opts.descending { |
94 | 0 | true => !EMPTY_SENTINEL, |
95 | 0 | false => EMPTY_SENTINEL, |
96 | | }; |
97 | 0 | 1 |
98 | 0 | } |
99 | | |
100 | 0 | pub fn encode_one(out: &mut [u8], val: Option<&[u8]>, opts: SortOptions) -> usize { |
101 | 0 | match val { |
102 | 0 | None => encode_null(out, opts), |
103 | 0 | Some([]) => encode_empty(out, opts), |
104 | 0 | Some(val) => { |
105 | | // Write `2_u8` to demarcate as non-empty, non-null string |
106 | 0 | out[0] = NON_EMPTY_SENTINEL; |
107 | | |
108 | 0 | let len = if val.len() <= BLOCK_SIZE { |
109 | 0 | 1 + encode_blocks::<MINI_BLOCK_SIZE>(&mut out[1..], val) |
110 | | } else { |
111 | 0 | let (initial, rem) = val.split_at(BLOCK_SIZE); |
112 | 0 | let offset = encode_blocks::<MINI_BLOCK_SIZE>(&mut out[1..], initial); |
113 | 0 | out[offset] = BLOCK_CONTINUATION; |
114 | 0 | 1 + offset + encode_blocks::<BLOCK_SIZE>(&mut out[1 + offset..], rem) |
115 | | }; |
116 | | |
117 | 0 | if opts.descending { |
118 | | // Invert bits |
119 | 0 | out[..len].iter_mut().for_each(|v| *v = !*v) |
120 | 0 | } |
121 | 0 | len |
122 | | } |
123 | | } |
124 | 0 | } |
125 | | |
126 | | /// Writes `val` in `SIZE` blocks with the appropriate continuation tokens |
127 | | #[inline] |
128 | 0 | fn encode_blocks<const SIZE: usize>(out: &mut [u8], val: &[u8]) -> usize { |
129 | 0 | let block_count = ceil(val.len(), SIZE); |
130 | 0 | let end_offset = block_count * (SIZE + 1); |
131 | 0 | let to_write = &mut out[..end_offset]; |
132 | | |
133 | 0 | let chunks = val.chunks_exact(SIZE); |
134 | 0 | let remainder = chunks.remainder(); |
135 | 0 | for (input, output) in chunks.clone().zip(to_write.chunks_exact_mut(SIZE + 1)) { |
136 | 0 | let input: &[u8; SIZE] = input.try_into().unwrap(); |
137 | 0 | let out_block: &mut [u8; SIZE] = (&mut output[..SIZE]).try_into().unwrap(); |
138 | 0 |
|
139 | 0 | *out_block = *input; |
140 | 0 |
|
141 | 0 | // Indicate that there are further blocks to follow |
142 | 0 | output[SIZE] = BLOCK_CONTINUATION; |
143 | 0 | } |
144 | | |
145 | 0 | if !remainder.is_empty() { |
146 | 0 | let start_offset = (block_count - 1) * (SIZE + 1); |
147 | 0 | to_write[start_offset..start_offset + remainder.len()].copy_from_slice(remainder); |
148 | 0 | *to_write.last_mut().unwrap() = remainder.len() as u8; |
149 | 0 | } else { |
150 | 0 | // We must overwrite the continuation marker written by the loop above |
151 | 0 | *to_write.last_mut().unwrap() = SIZE as u8; |
152 | 0 | } |
153 | 0 | end_offset |
154 | 0 | } |
155 | | |
156 | | /// Decodes a single block of data |
157 | | /// The `f` function accepts a slice of the decoded data, it may be called multiple times |
158 | 0 | pub fn decode_blocks(row: &[u8], options: SortOptions, mut f: impl FnMut(&[u8])) -> usize { |
159 | 0 | let (non_empty_sentinel, continuation) = match options.descending { |
160 | 0 | true => (!NON_EMPTY_SENTINEL, !BLOCK_CONTINUATION), |
161 | 0 | false => (NON_EMPTY_SENTINEL, BLOCK_CONTINUATION), |
162 | | }; |
163 | | |
164 | 0 | if row[0] != non_empty_sentinel { |
165 | | // Empty or null string |
166 | 0 | return 1; |
167 | 0 | } |
168 | | |
169 | | // Extracts the block length from the sentinel |
170 | 0 | let block_len = |sentinel: u8| match options.descending { |
171 | 0 | true => !sentinel as usize, |
172 | 0 | false => sentinel as usize, |
173 | 0 | }; |
174 | | |
175 | 0 | let mut idx = 1; |
176 | 0 | for _ in 0..MINI_BLOCK_COUNT { |
177 | 0 | let sentinel = row[idx + MINI_BLOCK_SIZE]; |
178 | 0 | if sentinel != continuation { |
179 | 0 | f(&row[idx..idx + block_len(sentinel)]); |
180 | 0 | return idx + MINI_BLOCK_SIZE + 1; |
181 | 0 | } |
182 | 0 | f(&row[idx..idx + MINI_BLOCK_SIZE]); |
183 | 0 | idx += MINI_BLOCK_SIZE + 1; |
184 | | } |
185 | | |
186 | | loop { |
187 | 0 | let sentinel = row[idx + BLOCK_SIZE]; |
188 | 0 | if sentinel != continuation { |
189 | 0 | f(&row[idx..idx + block_len(sentinel)]); |
190 | 0 | return idx + BLOCK_SIZE + 1; |
191 | 0 | } |
192 | 0 | f(&row[idx..idx + BLOCK_SIZE]); |
193 | 0 | idx += BLOCK_SIZE + 1; |
194 | | } |
195 | 0 | } |
196 | | |
197 | | /// Returns the number of bytes of encoded data |
198 | 0 | fn decoded_len(row: &[u8], options: SortOptions) -> usize { |
199 | 0 | let mut len = 0; |
200 | 0 | decode_blocks(row, options, |block| len += block.len()); |
201 | 0 | len |
202 | 0 | } |
203 | | |
204 | | /// Decodes a binary array from `rows` with the provided `options` |
205 | 0 | pub fn decode_binary<I: OffsetSizeTrait>( |
206 | 0 | rows: &mut [&[u8]], |
207 | 0 | options: SortOptions, |
208 | 0 | ) -> GenericBinaryArray<I> { |
209 | 0 | let len = rows.len(); |
210 | 0 | let mut null_count = 0; |
211 | 0 | let nulls = MutableBuffer::collect_bool(len, |x| { |
212 | 0 | let valid = rows[x][0] != null_sentinel(options); |
213 | 0 | null_count += !valid as usize; |
214 | 0 | valid |
215 | 0 | }); |
216 | | |
217 | 0 | let values_capacity = rows.iter().map(|row| decoded_len(row, options)).sum(); |
218 | 0 | let mut offsets = BufferBuilder::<I>::new(len + 1); |
219 | 0 | offsets.append(I::zero()); |
220 | 0 | let mut values = MutableBuffer::new(values_capacity); |
221 | | |
222 | 0 | for row in rows { |
223 | 0 | let offset = decode_blocks(row, options, |b| values.extend_from_slice(b)); |
224 | 0 | *row = &row[offset..]; |
225 | 0 | offsets.append(I::from_usize(values.len()).expect("offset overflow")) |
226 | | } |
227 | | |
228 | 0 | if options.descending { |
229 | 0 | values.as_slice_mut().iter_mut().for_each(|o| *o = !*o) |
230 | 0 | } |
231 | | |
232 | 0 | let d = match I::IS_LARGE { |
233 | 0 | true => DataType::LargeBinary, |
234 | 0 | false => DataType::Binary, |
235 | | }; |
236 | | |
237 | 0 | let builder = ArrayDataBuilder::new(d) |
238 | 0 | .len(len) |
239 | 0 | .null_count(null_count) |
240 | 0 | .null_bit_buffer(Some(nulls.into())) |
241 | 0 | .add_buffer(offsets.finish()) |
242 | 0 | .add_buffer(values.into()); |
243 | | |
244 | | // SAFETY: |
245 | | // Valid by construction above |
246 | 0 | unsafe { GenericBinaryArray::from(builder.build_unchecked()) } |
247 | 0 | } |
248 | | |
249 | 0 | fn decode_binary_view_inner( |
250 | 0 | rows: &mut [&[u8]], |
251 | 0 | options: SortOptions, |
252 | 0 | validate_utf8: bool, |
253 | 0 | ) -> BinaryViewArray { |
254 | 0 | let len = rows.len(); |
255 | 0 | let inline_str_max_len = MAX_INLINE_VIEW_LEN as usize; |
256 | | |
257 | 0 | let mut null_count = 0; |
258 | | |
259 | 0 | let nulls = MutableBuffer::collect_bool(len, |x| { |
260 | 0 | let valid = rows[x][0] != null_sentinel(options); |
261 | 0 | null_count += !valid as usize; |
262 | 0 | valid |
263 | 0 | }); |
264 | | |
265 | | // If we are validating UTF-8, decode all string values (including short strings) |
266 | | // into the values buffer and validate UTF-8 once. If not validating, |
267 | | // we save memory by only copying long strings to the values buffer, as short strings |
268 | | // will be inlined into the view and do not need to be stored redundantly. |
269 | 0 | let values_capacity = if validate_utf8 { |
270 | | // Capacity for all long and short strings |
271 | 0 | rows.iter().map(|row| decoded_len(row, options)).sum() |
272 | | } else { |
273 | | // Capacity for all long strings plus room for one short string |
274 | 0 | rows.iter().fold(0, |acc, row| { |
275 | 0 | let len = decoded_len(row, options); |
276 | 0 | if len > inline_str_max_len { |
277 | 0 | acc + len |
278 | | } else { |
279 | 0 | acc |
280 | | } |
281 | 0 | }) + inline_str_max_len |
282 | | }; |
283 | 0 | let mut values = MutableBuffer::new(values_capacity); |
284 | | |
285 | 0 | let mut views = BufferBuilder::<u128>::new(len); |
286 | 0 | for row in rows { |
287 | 0 | let start_offset = values.len(); |
288 | 0 | let offset = decode_blocks(row, options, |b| values.extend_from_slice(b)); |
289 | | // Measure string length via change in values buffer. |
290 | | // Used to check if decoded value should be truncated (short string) when validate_utf8 is false |
291 | 0 | let decoded_len = values.len() - start_offset; |
292 | 0 | if row[0] == null_sentinel(options) { |
293 | 0 | debug_assert_eq!(offset, 1); |
294 | 0 | debug_assert_eq!(start_offset, values.len()); |
295 | 0 | views.append(0); |
296 | | } else { |
297 | | // Safety: we just appended the data to the end of the buffer |
298 | 0 | let val = unsafe { values.get_unchecked_mut(start_offset..) }; |
299 | | |
300 | 0 | if options.descending { |
301 | 0 | val.iter_mut().for_each(|o| *o = !*o); |
302 | 0 | } |
303 | | |
304 | 0 | let view = make_view(val, 0, start_offset as u32); |
305 | 0 | views.append(view); |
306 | | |
307 | | // truncate inline string in values buffer if validate_utf8 is false |
308 | 0 | if !validate_utf8 && decoded_len <= inline_str_max_len { |
309 | 0 | values.truncate(start_offset); |
310 | 0 | } |
311 | | } |
312 | 0 | *row = &row[offset..]; |
313 | | } |
314 | | |
315 | 0 | if validate_utf8 { |
316 | 0 | // the values contains all data, no matter if it is short or long |
317 | 0 | // we can validate utf8 in one go. |
318 | 0 | std::str::from_utf8(values.as_slice()).unwrap(); |
319 | 0 | } |
320 | | |
321 | 0 | let builder = ArrayDataBuilder::new(DataType::BinaryView) |
322 | 0 | .len(len) |
323 | 0 | .null_count(null_count) |
324 | 0 | .null_bit_buffer(Some(nulls.into())) |
325 | 0 | .add_buffer(views.finish()) |
326 | 0 | .add_buffer(values.into()); |
327 | | |
328 | | // SAFETY: |
329 | | // Valid by construction above |
330 | 0 | unsafe { BinaryViewArray::from(builder.build_unchecked()) } |
331 | 0 | } |
332 | | |
333 | | /// Decodes a binary view array from `rows` with the provided `options` |
334 | 0 | pub fn decode_binary_view(rows: &mut [&[u8]], options: SortOptions) -> BinaryViewArray { |
335 | 0 | decode_binary_view_inner(rows, options, false) |
336 | 0 | } |
337 | | |
338 | | /// Decodes a string array from `rows` with the provided `options` |
339 | | /// |
340 | | /// # Safety |
341 | | /// |
342 | | /// The row must contain valid UTF-8 data |
343 | 0 | pub unsafe fn decode_string<I: OffsetSizeTrait>( |
344 | 0 | rows: &mut [&[u8]], |
345 | 0 | options: SortOptions, |
346 | 0 | validate_utf8: bool, |
347 | 0 | ) -> GenericStringArray<I> { |
348 | 0 | let decoded = decode_binary::<I>(rows, options); |
349 | | |
350 | 0 | if validate_utf8 { |
351 | 0 | return GenericStringArray::from(decoded); |
352 | 0 | } |
353 | | |
354 | 0 | let builder = decoded |
355 | 0 | .into_data() |
356 | 0 | .into_builder() |
357 | 0 | .data_type(GenericStringArray::<I>::DATA_TYPE); |
358 | | |
359 | | // SAFETY: |
360 | | // Row data must have come from a valid UTF-8 array |
361 | 0 | GenericStringArray::from(builder.build_unchecked()) |
362 | 0 | } |
363 | | |
364 | | /// Decodes a string view array from `rows` with the provided `options` |
365 | | /// |
366 | | /// # Safety |
367 | | /// |
368 | | /// The row must contain valid UTF-8 data |
369 | 0 | pub unsafe fn decode_string_view( |
370 | 0 | rows: &mut [&[u8]], |
371 | 0 | options: SortOptions, |
372 | 0 | validate_utf8: bool, |
373 | 0 | ) -> StringViewArray { |
374 | 0 | let view = decode_binary_view_inner(rows, options, validate_utf8); |
375 | 0 | view.to_string_view_unchecked() |
376 | 0 | } |