/Users/andrewlamb/Software/arrow-rs/arrow-buffer/src/builder/boolean.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use crate::bit_util::apply_bitwise_binary_op; |
19 | | use crate::{BooleanBuffer, Buffer, MutableBuffer, bit_util}; |
20 | | use std::ops::Range; |
21 | | |
22 | | /// Builder for [`BooleanBuffer`] |
23 | | /// |
24 | | /// # See Also |
25 | | /// |
26 | | /// * [`NullBuffer`] for building [`BooleanBuffer`]s for representing nulls |
27 | | /// |
28 | | /// [`NullBuffer`]: crate::NullBuffer |
29 | | #[derive(Debug)] |
30 | | pub struct BooleanBufferBuilder { |
31 | | buffer: MutableBuffer, |
32 | | len: usize, |
33 | | } |
34 | | |
35 | | impl BooleanBufferBuilder { |
36 | | /// Creates a new `BooleanBufferBuilder` with sufficient space for |
37 | | /// `capacity` bits (not bytes). |
38 | | /// |
39 | | /// The capacity is rounded up to the nearest multiple of 8 for the |
40 | | /// allocation. |
41 | | #[inline] |
42 | 83.7k | pub fn new(capacity: usize) -> Self { |
43 | 83.7k | let byte_capacity = bit_util::ceil(capacity, 8); |
44 | 83.7k | let buffer = MutableBuffer::new(byte_capacity); |
45 | 83.7k | Self { buffer, len: 0 } |
46 | 83.7k | } |
47 | | |
48 | | /// Creates a new `BooleanBufferBuilder` from [`MutableBuffer`] of `len` |
49 | 6 | pub fn new_from_buffer(buffer: MutableBuffer, len: usize) -> Self { |
50 | 6 | assert!(len <= buffer.len() * 8); |
51 | 6 | let mut s = Self { |
52 | 6 | len: buffer.len() * 8, |
53 | 6 | buffer, |
54 | 6 | }; |
55 | 6 | s.truncate(len); |
56 | 6 | s |
57 | 6 | } |
58 | | |
59 | | /// Returns the length of the buffer |
60 | | #[inline] |
61 | 2.39k | pub fn len(&self) -> usize { |
62 | 2.39k | self.len |
63 | 2.39k | } |
64 | | |
65 | | /// Sets a bit in the buffer at `index` |
66 | | #[inline] |
67 | 80.1k | pub fn set_bit(&mut self, index: usize, v: bool) { |
68 | 80.1k | if v { |
69 | 80.1k | bit_util::set_bit(self.buffer.as_mut(), index); |
70 | 80.1k | } else { |
71 | 3 | bit_util::unset_bit(self.buffer.as_mut(), index); |
72 | 3 | } |
73 | 80.1k | } |
74 | | |
75 | | /// Gets a bit in the buffer at `index` |
76 | | #[inline] |
77 | | pub fn get_bit(&self, index: usize) -> bool { |
78 | | bit_util::get_bit(self.buffer.as_slice(), index) |
79 | | } |
80 | | |
81 | | /// Returns true if empty |
82 | | #[inline] |
83 | | pub fn is_empty(&self) -> bool { |
84 | | self.len == 0 |
85 | | } |
86 | | |
87 | | /// Returns the capacity of the buffer, in bits (not bytes) |
88 | | /// |
89 | | /// Note this |
90 | | /// |
91 | | /// # Example |
92 | | /// ``` |
93 | | /// # use arrow_buffer::builder::BooleanBufferBuilder; |
94 | | /// // empty requires 0 bytes |
95 | | /// let b = BooleanBufferBuilder::new(0); |
96 | | /// assert_eq!(0, b.capacity()); |
97 | | /// // Creating space for 1 bit results in 64 bytes (space for 512 bits) |
98 | | /// // (64 is the minimum allocation size for 64 bit architectures) |
99 | | /// let mut b = BooleanBufferBuilder::new(1); |
100 | | /// assert_eq!(512, b.capacity()); |
101 | | /// // 1000 bits requires 128 bytes (space for 1024 bits) |
102 | | /// b.append_n(1000, true); |
103 | | /// assert_eq!(1024, b.capacity()); |
104 | | /// ``` |
105 | | #[inline] |
106 | 0 | pub fn capacity(&self) -> usize { |
107 | 0 | self.buffer.capacity() * 8 |
108 | 0 | } |
109 | | |
110 | | /// Advances the buffer by `additional` bits |
111 | | #[inline] |
112 | 1.22M | pub fn advance(&mut self, additional: usize) { |
113 | 1.22M | let new_len = self.len + additional; |
114 | 1.22M | let new_len_bytes = bit_util::ceil(new_len, 8); |
115 | 1.22M | if new_len_bytes > self.buffer.len() { |
116 | 228k | self.buffer.resize(new_len_bytes, 0); |
117 | 992k | } |
118 | 1.22M | self.len = new_len; |
119 | 1.22M | } |
120 | | |
121 | | /// Truncates the builder to the given length |
122 | | /// |
123 | | /// If `len` is greater than the buffer's current length, this has no effect |
124 | | #[inline] |
125 | 6 | pub fn truncate(&mut self, len: usize) { |
126 | 6 | if len > self.len { |
127 | 0 | return; |
128 | 6 | } |
129 | | |
130 | 6 | let new_len_bytes = bit_util::ceil(len, 8); |
131 | 6 | self.buffer.truncate(new_len_bytes); |
132 | 6 | self.len = len; |
133 | | |
134 | 6 | let remainder = self.len % 8; |
135 | 6 | if remainder != 0 { |
136 | 6 | let mask = (1_u8 << remainder).wrapping_sub(1); |
137 | 6 | *self.buffer.as_mut().last_mut().unwrap() &= mask; |
138 | 6 | }0 |
139 | 6 | } |
140 | | |
141 | | /// Reserve space to at least `additional` new bits. |
142 | | /// Capacity will be `>= self.len() + additional`. |
143 | | /// New bytes are uninitialized and reading them is undefined behavior. |
144 | | #[inline] |
145 | | pub fn reserve(&mut self, additional: usize) { |
146 | | let capacity = self.len + additional; |
147 | | if capacity > self.capacity() { |
148 | | // convert differential to bytes |
149 | | let additional = bit_util::ceil(capacity, 8) - self.buffer.len(); |
150 | | self.buffer.reserve(additional); |
151 | | } |
152 | | } |
153 | | |
154 | | /// Resizes the buffer, either truncating its contents (with no change in capacity), or |
155 | | /// growing it (potentially reallocating it) and writing `false` in the newly available bits. |
156 | | #[inline] |
157 | 5 | pub fn resize(&mut self, len: usize) { |
158 | 5 | match len.checked_sub(self.len) { |
159 | 5 | Some(delta) => self.advance(delta), |
160 | 0 | None => self.truncate(len), |
161 | | } |
162 | 5 | } |
163 | | |
164 | | /// Appends a boolean `v` into the buffer |
165 | | #[inline] |
166 | 1.13M | pub fn append(&mut self, v: bool) { |
167 | 1.13M | self.advance(1); |
168 | 1.13M | if v { |
169 | 874k | unsafe { bit_util::set_bit_raw(self.buffer.as_mut_ptr(), self.len - 1) }; |
170 | 874k | }257k |
171 | 1.13M | } |
172 | | |
173 | | /// Appends n `additional` bits of value `v` into the buffer |
174 | | #[inline] |
175 | 1.51k | pub fn append_n(&mut self, additional: usize, v: bool) { |
176 | 1.51k | match v { |
177 | | true => { |
178 | 1.50k | let new_len = self.len + additional; |
179 | 1.50k | let new_len_bytes = bit_util::ceil(new_len, 8); |
180 | 1.50k | let cur_remainder = self.len % 8; |
181 | 1.50k | let new_remainder = new_len % 8; |
182 | | |
183 | 1.50k | if cur_remainder != 0 { |
184 | | // Pad last byte with 1s |
185 | 19 | *self.buffer.as_slice_mut().last_mut().unwrap() |= !((1 << cur_remainder) - 1) |
186 | 1.48k | } |
187 | 1.50k | self.buffer.resize(new_len_bytes, 0xFF); |
188 | 1.50k | if new_remainder != 0 { |
189 | | // Clear remaining bits |
190 | 332 | *self.buffer.as_slice_mut().last_mut().unwrap() &= (1 << new_remainder) - 1 |
191 | 1.17k | } |
192 | 1.50k | self.len = new_len; |
193 | | } |
194 | 9 | false => self.advance(additional), |
195 | | } |
196 | 1.51k | } |
197 | | |
198 | | /// Appends a slice of booleans into the buffer |
199 | | #[inline] |
200 | 18 | pub fn append_slice(&mut self, slice: &[bool]) { |
201 | 18 | let additional = slice.len(); |
202 | 18 | self.advance(additional); |
203 | | |
204 | 18 | let offset = self.len() - additional; |
205 | 89 | for (i, v) in slice18 .iter18 ().enumerate18 () { |
206 | 89 | if *v { |
207 | 58 | unsafe { bit_util::set_bit_raw(self.buffer.as_mut_ptr(), offset + i) } |
208 | 31 | } |
209 | | } |
210 | 18 | } |
211 | | |
212 | | /// Append `range` bits from `to_set` |
213 | | /// |
214 | | /// `to_set` is a slice of bits packed LSB-first into `[u8]` |
215 | | /// |
216 | | /// # Panics |
217 | | /// |
218 | | /// Panics if `to_set` does not contain `ceil(range.end / 8)` bytes |
219 | 9.01k | pub fn append_packed_range(&mut self, range: Range<usize>, to_set: &[u8]) { |
220 | 9.01k | let offset_write = self.len; |
221 | 9.01k | let len = range.end - range.start; |
222 | | // allocate new bits as 0 |
223 | 9.01k | self.advance(len); |
224 | | // copy bits from to_set into self.buffer a word at a time |
225 | 9.01k | apply_bitwise_binary_op( |
226 | 9.01k | self.buffer.as_slice_mut(), |
227 | 9.01k | offset_write, |
228 | 9.01k | to_set, |
229 | 9.01k | range.start, |
230 | 9.01k | len, |
231 | | |_a, b| b, // copy bits from to_set |
232 | | ); |
233 | 9.01k | } |
234 | | |
235 | | /// Append [`BooleanBuffer`] to this [`BooleanBufferBuilder`] |
236 | 2.38k | pub fn append_buffer(&mut self, buffer: &BooleanBuffer) { |
237 | 2.38k | let range = buffer.offset()..buffer.offset() + buffer.len(); |
238 | 2.38k | self.append_packed_range(range, buffer.values()) |
239 | 2.38k | } |
240 | | |
241 | | /// Returns the packed bits |
242 | 0 | pub fn as_slice(&self) -> &[u8] { |
243 | 0 | self.buffer.as_slice() |
244 | 0 | } |
245 | | |
246 | | /// Returns the packed bits |
247 | 5 | pub fn as_slice_mut(&mut self) -> &mut [u8] { |
248 | 5 | self.buffer.as_slice_mut() |
249 | 5 | } |
250 | | |
251 | | /// Creates a [`BooleanBuffer`] |
252 | | #[inline] |
253 | 82.4k | pub fn finish(&mut self) -> BooleanBuffer { |
254 | 82.4k | let buf = std::mem::replace(&mut self.buffer, MutableBuffer::new(0)); |
255 | 82.4k | let len = std::mem::replace(&mut self.len, 0); |
256 | 82.4k | BooleanBuffer::new(buf.into(), 0, len) |
257 | 82.4k | } |
258 | | |
259 | | /// Builds the [BooleanBuffer] without resetting the builder. |
260 | 0 | pub fn finish_cloned(&self) -> BooleanBuffer { |
261 | 0 | BooleanBuffer::new(Buffer::from_slice_ref(self.as_slice()), 0, self.len) |
262 | 0 | } |
263 | | } |
264 | | |
265 | | impl From<BooleanBufferBuilder> for Buffer { |
266 | | #[inline] |
267 | 1.30k | fn from(builder: BooleanBufferBuilder) -> Self { |
268 | 1.30k | builder.buffer.into() |
269 | 1.30k | } |
270 | | } |
271 | | |
272 | | impl From<BooleanBufferBuilder> for BooleanBuffer { |
273 | | #[inline] |
274 | | fn from(builder: BooleanBufferBuilder) -> Self { |
275 | | BooleanBuffer::new(builder.buffer.into(), 0, builder.len) |
276 | | } |
277 | | } |
278 | | |
279 | | #[cfg(test)] |
280 | | mod tests { |
281 | | use super::*; |
282 | | |
283 | | #[test] |
284 | | fn test_boolean_buffer_builder_write_bytes() { |
285 | | let mut b = BooleanBufferBuilder::new(4); |
286 | | b.append(false); |
287 | | b.append(true); |
288 | | b.append(false); |
289 | | b.append(true); |
290 | | assert_eq!(4, b.len()); |
291 | | assert_eq!(512, b.capacity()); |
292 | | let buffer = b.finish(); |
293 | | assert_eq!(4, buffer.len()); |
294 | | |
295 | | // Overallocate capacity |
296 | | let mut b = BooleanBufferBuilder::new(8); |
297 | | b.append_slice(&[false, true, false, true]); |
298 | | assert_eq!(4, b.len()); |
299 | | assert_eq!(512, b.capacity()); |
300 | | let buffer = b.finish(); |
301 | | assert_eq!(4, buffer.len()); |
302 | | } |
303 | | |
304 | | #[test] |
305 | | fn test_boolean_buffer_builder_unset_first_bit() { |
306 | | let mut buffer = BooleanBufferBuilder::new(4); |
307 | | buffer.append(true); |
308 | | buffer.append(true); |
309 | | buffer.append(false); |
310 | | buffer.append(true); |
311 | | buffer.set_bit(0, false); |
312 | | assert_eq!(buffer.len(), 4); |
313 | | assert_eq!(buffer.finish().values(), &[0b1010_u8]); |
314 | | } |
315 | | |
316 | | #[test] |
317 | | fn test_boolean_buffer_builder_unset_last_bit() { |
318 | | let mut buffer = BooleanBufferBuilder::new(4); |
319 | | buffer.append(true); |
320 | | buffer.append(true); |
321 | | buffer.append(false); |
322 | | buffer.append(true); |
323 | | buffer.set_bit(3, false); |
324 | | assert_eq!(buffer.len(), 4); |
325 | | assert_eq!(buffer.finish().values(), &[0b0011_u8]); |
326 | | } |
327 | | |
328 | | #[test] |
329 | | fn test_boolean_buffer_builder_unset_an_inner_bit() { |
330 | | let mut buffer = BooleanBufferBuilder::new(5); |
331 | | buffer.append(true); |
332 | | buffer.append(true); |
333 | | buffer.append(false); |
334 | | buffer.append(true); |
335 | | buffer.set_bit(1, false); |
336 | | assert_eq!(buffer.len(), 4); |
337 | | assert_eq!(buffer.finish().values(), &[0b1001_u8]); |
338 | | } |
339 | | |
340 | | #[test] |
341 | | fn test_boolean_buffer_builder_unset_several_bits() { |
342 | | let mut buffer = BooleanBufferBuilder::new(5); |
343 | | buffer.append(true); |
344 | | buffer.append(true); |
345 | | buffer.append(true); |
346 | | buffer.append(false); |
347 | | buffer.append(true); |
348 | | buffer.set_bit(1, false); |
349 | | buffer.set_bit(2, false); |
350 | | assert_eq!(buffer.len(), 5); |
351 | | assert_eq!(buffer.finish().values(), &[0b10001_u8]); |
352 | | } |
353 | | |
354 | | #[test] |
355 | | fn test_boolean_buffer_builder_unset_several_bits_bigger_than_one_byte() { |
356 | | let mut buffer = BooleanBufferBuilder::new(16); |
357 | | buffer.append_n(10, true); |
358 | | buffer.set_bit(0, false); |
359 | | buffer.set_bit(3, false); |
360 | | buffer.set_bit(9, false); |
361 | | assert_eq!(buffer.len(), 10); |
362 | | assert_eq!(buffer.finish().values(), &[0b11110110_u8, 0b01_u8]); |
363 | | } |
364 | | |
365 | | #[test] |
366 | | fn test_boolean_buffer_builder_flip_several_bits_bigger_than_one_byte() { |
367 | | let mut buffer = BooleanBufferBuilder::new(16); |
368 | | buffer.append_n(5, true); |
369 | | buffer.append_n(5, false); |
370 | | buffer.append_n(5, true); |
371 | | buffer.set_bit(0, false); |
372 | | buffer.set_bit(3, false); |
373 | | buffer.set_bit(9, false); |
374 | | buffer.set_bit(6, true); |
375 | | buffer.set_bit(14, true); |
376 | | buffer.set_bit(13, false); |
377 | | assert_eq!(buffer.len(), 15); |
378 | | assert_eq!(buffer.finish().values(), &[0b01010110_u8, 0b1011100_u8]); |
379 | | } |
380 | | |
381 | | #[test] |
382 | | fn test_bool_buffer_builder_get_first_bit() { |
383 | | let mut buffer = BooleanBufferBuilder::new(16); |
384 | | buffer.append_n(8, true); |
385 | | buffer.append_n(8, false); |
386 | | assert!(buffer.get_bit(0)); |
387 | | } |
388 | | |
389 | | #[test] |
390 | | fn test_bool_buffer_builder_get_first_bit_not_requires_mutability() { |
391 | | let buffer = { |
392 | | let mut buffer = BooleanBufferBuilder::new(16); |
393 | | buffer.append_n(8, true); |
394 | | buffer |
395 | | }; |
396 | | |
397 | | assert!(buffer.get_bit(0)); |
398 | | } |
399 | | |
400 | | #[test] |
401 | | fn test_bool_buffer_builder_get_last_bit() { |
402 | | let mut buffer = BooleanBufferBuilder::new(16); |
403 | | buffer.append_n(8, true); |
404 | | buffer.append_n(8, false); |
405 | | assert!(!buffer.get_bit(15)); |
406 | | } |
407 | | |
408 | | #[test] |
409 | | fn test_bool_buffer_builder_get_an_inner_bit() { |
410 | | let mut buffer = BooleanBufferBuilder::new(16); |
411 | | buffer.append_n(4, false); |
412 | | buffer.append_n(8, true); |
413 | | buffer.append_n(4, false); |
414 | | assert!(buffer.get_bit(11)); |
415 | | } |
416 | | |
417 | | #[test] |
418 | | fn test_bool_buffer_fuzz() { |
419 | | use rand::prelude::*; |
420 | | |
421 | | let mut buffer = BooleanBufferBuilder::new(12); |
422 | | let mut all_bools = vec![]; |
423 | | let mut rng = rand::rng(); |
424 | | |
425 | | let src_len = 32; |
426 | | let (src, compacted_src) = { |
427 | | let src: Vec<_> = std::iter::from_fn(|| Some(rng.next_u32() & 1 == 0)) |
428 | | .take(src_len) |
429 | | .collect(); |
430 | | |
431 | | let mut compacted_src = BooleanBufferBuilder::new(src_len); |
432 | | compacted_src.append_slice(&src); |
433 | | (src, compacted_src.finish()) |
434 | | }; |
435 | | |
436 | | for _ in 0..100 { |
437 | | let a = rng.next_u32() as usize % src_len; |
438 | | let b = rng.next_u32() as usize % src_len; |
439 | | |
440 | | let start = a.min(b); |
441 | | let end = a.max(b); |
442 | | |
443 | | buffer.append_packed_range(start..end, compacted_src.values()); |
444 | | all_bools.extend_from_slice(&src[start..end]); |
445 | | } |
446 | | |
447 | | let mut compacted = BooleanBufferBuilder::new(all_bools.len()); |
448 | | compacted.append_slice(&all_bools); |
449 | | |
450 | | assert_eq!(buffer.finish(), compacted.finish()) |
451 | | } |
452 | | |
453 | | #[test] |
454 | | fn test_boolean_array_builder_resize() { |
455 | | let mut builder = BooleanBufferBuilder::new(20); |
456 | | builder.append_n(4, true); |
457 | | builder.append_n(7, false); |
458 | | builder.append_n(2, true); |
459 | | builder.resize(20); |
460 | | |
461 | | assert_eq!(builder.len(), 20); |
462 | | assert_eq!(builder.as_slice(), &[0b00001111, 0b00011000, 0b00000000]); |
463 | | |
464 | | builder.resize(5); |
465 | | assert_eq!(builder.len(), 5); |
466 | | assert_eq!(builder.as_slice(), &[0b00001111]); |
467 | | |
468 | | builder.append_n(4, true); |
469 | | assert_eq!(builder.len(), 9); |
470 | | assert_eq!(builder.as_slice(), &[0b11101111, 0b00000001]); |
471 | | } |
472 | | |
473 | | #[test] |
474 | | fn test_truncate() { |
475 | | let b = MutableBuffer::from_iter([true, true, true, true]); |
476 | | let mut builder = BooleanBufferBuilder::new_from_buffer(b, 2); |
477 | | builder.advance(2); |
478 | | let finished = builder.finish(); |
479 | | assert_eq!(finished.values(), &[0b00000011]); |
480 | | |
481 | | let mut builder = BooleanBufferBuilder::new(10); |
482 | | builder.append_n(5, true); |
483 | | builder.resize(3); |
484 | | builder.advance(2); |
485 | | let finished = builder.finish(); |
486 | | assert_eq!(finished.values(), &[0b00000111]); |
487 | | |
488 | | let mut builder = BooleanBufferBuilder::new(10); |
489 | | builder.append_n(16, true); |
490 | | assert_eq!(builder.as_slice(), &[0xFF, 0xFF]); |
491 | | builder.truncate(20); |
492 | | assert_eq!(builder.as_slice(), &[0xFF, 0xFF]); |
493 | | builder.truncate(14); |
494 | | assert_eq!(builder.as_slice(), &[0xFF, 0b00111111]); |
495 | | builder.append(false); |
496 | | builder.append(true); |
497 | | assert_eq!(builder.as_slice(), &[0xFF, 0b10111111]); |
498 | | builder.append_packed_range(0..3, &[0xFF]); |
499 | | assert_eq!(builder.as_slice(), &[0xFF, 0b10111111, 0b00000111]); |
500 | | builder.truncate(17); |
501 | | assert_eq!(builder.as_slice(), &[0xFF, 0b10111111, 0b00000001]); |
502 | | builder.append_packed_range(0..2, &[2]); |
503 | | assert_eq!(builder.as_slice(), &[0xFF, 0b10111111, 0b0000101]); |
504 | | builder.truncate(8); |
505 | | assert_eq!(builder.as_slice(), &[0xFF]); |
506 | | builder.resize(14); |
507 | | assert_eq!(builder.as_slice(), &[0xFF, 0x00]); |
508 | | builder.truncate(0); |
509 | | assert_eq!(builder.as_slice(), &[]); |
510 | | } |
511 | | |
512 | | #[test] |
513 | | fn test_boolean_builder_increases_buffer_len() { |
514 | | // 00000010 01001000 |
515 | | let buf = Buffer::from([72_u8, 2_u8]); |
516 | | let mut builder = BooleanBufferBuilder::new(8); |
517 | | |
518 | | for i in 0..16 { |
519 | | if i == 3 || i == 6 || i == 9 { |
520 | | builder.append(true); |
521 | | } else { |
522 | | builder.append(false); |
523 | | } |
524 | | } |
525 | | let buf2 = builder.finish(); |
526 | | |
527 | | assert_eq!(buf.len(), buf2.inner().len()); |
528 | | assert_eq!(buf.as_slice(), buf2.values()); |
529 | | } |
530 | | } |