/Users/andrewlamb/Software/arrow-rs/arrow-buffer/src/buffer/null.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use crate::bit_iterator::{BitIndexIterator, BitIterator, BitSliceIterator}; |
19 | | use crate::buffer::BooleanBuffer; |
20 | | use crate::{Buffer, MutableBuffer}; |
21 | | |
22 | | /// A [`BooleanBuffer`] used to encode validity for Arrow arrays |
23 | | /// |
24 | | /// In the [Arrow specification], array validity is encoded in a packed bitmask with a |
25 | | /// `true` value indicating the corresponding slot is not null, and `false` indicating |
26 | | /// that it is null. |
27 | | /// |
28 | | /// `NullBuffer`s can be creating using [`NullBufferBuilder`] |
29 | | /// |
30 | | /// [Arrow specification]: https://arrow.apache.org/docs/format/Columnar.html#validity-bitmaps |
31 | | /// [`NullBufferBuilder`]: crate::NullBufferBuilder |
32 | | #[derive(Debug, Clone, Eq, PartialEq)] |
33 | | pub struct NullBuffer { |
34 | | buffer: BooleanBuffer, |
35 | | null_count: usize, |
36 | | } |
37 | | |
38 | | impl NullBuffer { |
39 | | /// Create a new [`NullBuffer`] computing the null count |
40 | 215 | pub fn new(buffer: BooleanBuffer) -> Self { |
41 | 215 | let null_count = buffer.len() - buffer.count_set_bits(); |
42 | 215 | Self { buffer, null_count } |
43 | 215 | } |
44 | | |
45 | | /// Create a new [`NullBuffer`] of length `len` where all values are null |
46 | 0 | pub fn new_null(len: usize) -> Self { |
47 | 0 | Self { |
48 | 0 | buffer: BooleanBuffer::new_unset(len), |
49 | 0 | null_count: len, |
50 | 0 | } |
51 | 0 | } |
52 | | |
53 | | /// Create a new [`NullBuffer`] of length `len` where all values are valid |
54 | | /// |
55 | | /// Note: it is more efficient to not set the null buffer if it is known to |
56 | | /// be all valid (aka all values are not null) |
57 | 0 | pub fn new_valid(len: usize) -> Self { |
58 | 0 | Self { |
59 | 0 | buffer: BooleanBuffer::new_set(len), |
60 | 0 | null_count: 0, |
61 | 0 | } |
62 | 0 | } |
63 | | |
64 | | /// Create a new [`NullBuffer`] with the provided `buffer` and `null_count` |
65 | | /// |
66 | | /// # Safety |
67 | | /// |
68 | | /// `buffer` must contain `null_count` `0` bits |
69 | 16 | pub unsafe fn new_unchecked(buffer: BooleanBuffer, null_count: usize) -> Self { |
70 | 16 | Self { buffer, null_count } |
71 | 16 | } |
72 | | |
73 | | /// Computes the union of the nulls in two optional [`NullBuffer`] |
74 | | /// |
75 | | /// This is commonly used by binary operations where the result is NULL if either |
76 | | /// of the input values is NULL. Handling the null mask separately in this way |
77 | | /// can yield significant performance improvements over an iterator approach |
78 | 0 | pub fn union(lhs: Option<&NullBuffer>, rhs: Option<&NullBuffer>) -> Option<NullBuffer> { |
79 | 0 | match (lhs, rhs) { |
80 | 0 | (Some(lhs), Some(rhs)) => Some(Self::new(lhs.inner() & rhs.inner())), |
81 | 0 | (Some(n), None) | (None, Some(n)) => Some(n.clone()), |
82 | 0 | (None, None) => None, |
83 | | } |
84 | 0 | } |
85 | | |
86 | | /// Returns true if all nulls in `other` also exist in self |
87 | 2 | pub fn contains(&self, other: &NullBuffer) -> bool { |
88 | 2 | if other.null_count == 0 { |
89 | 0 | return true; |
90 | 2 | } |
91 | 2 | let lhs = self.inner().bit_chunks().iter_padded(); |
92 | 2 | let rhs = other.inner().bit_chunks().iter_padded(); |
93 | 2 | lhs.zip(rhs).all(|(l, r)| (l & !r) == 0) |
94 | 2 | } |
95 | | |
96 | | /// Returns a new [`NullBuffer`] where each bit in the current null buffer |
97 | | /// is repeated `count` times. This is useful for masking the nulls of |
98 | | /// the child of a FixedSizeListArray based on its parent |
99 | 0 | pub fn expand(&self, count: usize) -> Self { |
100 | 0 | let capacity = self.buffer.len().checked_mul(count).unwrap(); |
101 | 0 | let mut buffer = MutableBuffer::new_null(capacity); |
102 | | |
103 | | // Expand each bit within `null_mask` into `element_len` |
104 | | // bits, constructing the implicit mask of the child elements |
105 | 0 | for i in 0..self.buffer.len() { |
106 | 0 | if self.is_null(i) { |
107 | 0 | continue; |
108 | 0 | } |
109 | 0 | for j in 0..count { |
110 | 0 | crate::bit_util::set_bit(buffer.as_mut(), i * count + j) |
111 | | } |
112 | | } |
113 | 0 | Self { |
114 | 0 | buffer: BooleanBuffer::new(buffer.into(), 0, capacity), |
115 | 0 | null_count: self.null_count * count, |
116 | 0 | } |
117 | 0 | } |
118 | | |
119 | | /// Returns the length of this [`NullBuffer`] in bits |
120 | | #[inline] |
121 | 141 | pub fn len(&self) -> usize { |
122 | 141 | self.buffer.len() |
123 | 141 | } |
124 | | |
125 | | /// Returns the offset of this [`NullBuffer`] in bits |
126 | | #[inline] |
127 | 501 | pub fn offset(&self) -> usize { |
128 | 501 | self.buffer.offset() |
129 | 501 | } |
130 | | |
131 | | /// Returns true if this [`NullBuffer`] is empty |
132 | | #[inline] |
133 | | pub fn is_empty(&self) -> bool { |
134 | | self.buffer.is_empty() |
135 | | } |
136 | | |
137 | | /// Free up unused memory. |
138 | 0 | pub fn shrink_to_fit(&mut self) { |
139 | 0 | self.buffer.shrink_to_fit(); |
140 | 0 | } |
141 | | |
142 | | /// Returns the null count for this [`NullBuffer`] |
143 | | #[inline] |
144 | 409 | pub fn null_count(&self) -> usize { |
145 | 409 | self.null_count |
146 | 409 | } |
147 | | |
148 | | /// Returns `true` if the value at `idx` is not null |
149 | | #[inline] |
150 | 386 | pub fn is_valid(&self, idx: usize) -> bool { |
151 | 386 | self.buffer.value(idx) |
152 | 386 | } |
153 | | |
154 | | /// Returns `true` if the value at `idx` is null |
155 | | #[inline] |
156 | 386 | pub fn is_null(&self, idx: usize) -> bool { |
157 | 386 | !self.is_valid(idx) |
158 | 386 | } |
159 | | |
160 | | /// Returns the packed validity of this [`NullBuffer`] not including any offset |
161 | | #[inline] |
162 | 442 | pub fn validity(&self) -> &[u8] { |
163 | 442 | self.buffer.values() |
164 | 442 | } |
165 | | |
166 | | /// Slices this [`NullBuffer`] by the provided `offset` and `length` |
167 | 36 | pub fn slice(&self, offset: usize, len: usize) -> Self { |
168 | 36 | Self::new(self.buffer.slice(offset, len)) |
169 | 36 | } |
170 | | |
171 | | /// Returns an iterator over the bits in this [`NullBuffer`] |
172 | | /// |
173 | | /// * `true` indicates that the corresponding value is not NULL |
174 | | /// * `false` indicates that the corresponding value is NULL |
175 | | /// |
176 | | /// Note: [`Self::valid_indices`] will be significantly faster for most use-cases |
177 | 0 | pub fn iter(&self) -> BitIterator<'_> { |
178 | 0 | self.buffer.iter() |
179 | 0 | } |
180 | | |
181 | | /// Returns a [`BitIndexIterator`] over the valid indices in this [`NullBuffer`] |
182 | | /// |
183 | | /// Valid indices indicate the corresponding value is not NULL |
184 | 0 | pub fn valid_indices(&self) -> BitIndexIterator<'_> { |
185 | 0 | self.buffer.set_indices() |
186 | 0 | } |
187 | | |
188 | | /// Returns a [`BitSliceIterator`] yielding contiguous ranges of valid indices |
189 | | /// |
190 | | /// Valid indices indicate the corresponding value is not NULL |
191 | 0 | pub fn valid_slices(&self) -> BitSliceIterator<'_> { |
192 | 0 | self.buffer.set_slices() |
193 | 0 | } |
194 | | |
195 | | /// Calls the provided closure for each index in this null mask that is set |
196 | | #[inline] |
197 | 0 | pub fn try_for_each_valid_idx<E, F: FnMut(usize) -> Result<(), E>>( |
198 | 0 | &self, |
199 | 0 | f: F, |
200 | 0 | ) -> Result<(), E> { |
201 | 0 | if self.null_count == self.len() { |
202 | 0 | return Ok(()); |
203 | 0 | } |
204 | 0 | self.valid_indices().try_for_each(f) |
205 | 0 | } |
206 | | |
207 | | /// Returns the inner [`BooleanBuffer`] |
208 | | #[inline] |
209 | 38 | pub fn inner(&self) -> &BooleanBuffer { |
210 | 38 | &self.buffer |
211 | 38 | } |
212 | | |
213 | | /// Returns the inner [`BooleanBuffer`] |
214 | | #[inline] |
215 | 0 | pub fn into_inner(self) -> BooleanBuffer { |
216 | 0 | self.buffer |
217 | 0 | } |
218 | | |
219 | | /// Returns the underlying [`Buffer`] |
220 | | #[inline] |
221 | 68 | pub fn buffer(&self) -> &Buffer { |
222 | 68 | self.buffer.inner() |
223 | 68 | } |
224 | | } |
225 | | |
226 | | impl<'a> IntoIterator for &'a NullBuffer { |
227 | | type Item = bool; |
228 | | type IntoIter = BitIterator<'a>; |
229 | | |
230 | 0 | fn into_iter(self) -> Self::IntoIter { |
231 | 0 | self.buffer.iter() |
232 | 0 | } |
233 | | } |
234 | | |
235 | | impl From<BooleanBuffer> for NullBuffer { |
236 | 1 | fn from(value: BooleanBuffer) -> Self { |
237 | 1 | Self::new(value) |
238 | 1 | } |
239 | | } |
240 | | |
241 | | impl From<&[bool]> for NullBuffer { |
242 | 0 | fn from(value: &[bool]) -> Self { |
243 | 0 | BooleanBuffer::from(value).into() |
244 | 0 | } |
245 | | } |
246 | | |
247 | | impl<const N: usize> From<&[bool; N]> for NullBuffer { |
248 | | fn from(value: &[bool; N]) -> Self { |
249 | | value[..].into() |
250 | | } |
251 | | } |
252 | | |
253 | | impl From<Vec<bool>> for NullBuffer { |
254 | 1 | fn from(value: Vec<bool>) -> Self { |
255 | 1 | BooleanBuffer::from(value).into() |
256 | 1 | } |
257 | | } |
258 | | |
259 | | impl FromIterator<bool> for NullBuffer { |
260 | | fn from_iter<T: IntoIterator<Item = bool>>(iter: T) -> Self { |
261 | | BooleanBuffer::from_iter(iter).into() |
262 | | } |
263 | | } |
264 | | |
265 | | #[cfg(test)] |
266 | | mod tests { |
267 | | use super::*; |
268 | | #[test] |
269 | | fn test_size() { |
270 | | // This tests that the niche optimisation eliminates the overhead of an option |
271 | | assert_eq!( |
272 | | std::mem::size_of::<NullBuffer>(), |
273 | | std::mem::size_of::<Option<NullBuffer>>() |
274 | | ); |
275 | | } |
276 | | } |