Coverage Report

Created: 2025-11-17 14:14

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/Users/andrewlamb/Software/arrow-rs/arrow-buffer/src/buffer/null.rs
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
use crate::bit_iterator::{BitIndexIterator, BitIterator, BitSliceIterator};
19
use crate::buffer::BooleanBuffer;
20
use crate::{Buffer, MutableBuffer};
21
22
/// A [`BooleanBuffer`] used to encode validity for Arrow arrays
23
///
24
/// In the [Arrow specification], array validity is encoded in a packed bitmask with a
25
/// `true` value indicating the corresponding slot is not null, and `false` indicating
26
/// that it is null.
27
///
28
/// `NullBuffer`s can be creating using [`NullBufferBuilder`]
29
///
30
/// [Arrow specification]: https://arrow.apache.org/docs/format/Columnar.html#validity-bitmaps
31
/// [`NullBufferBuilder`]: crate::NullBufferBuilder
32
#[derive(Debug, Clone, Eq, PartialEq)]
33
pub struct NullBuffer {
34
    buffer: BooleanBuffer,
35
    null_count: usize,
36
}
37
38
impl NullBuffer {
39
    /// Create a new [`NullBuffer`] computing the null count
40
5.41k
    pub fn new(buffer: BooleanBuffer) -> Self {
41
5.41k
        let null_count = buffer.len() - buffer.count_set_bits();
42
5.41k
        Self { buffer, null_count }
43
5.41k
    }
44
45
    /// Create a new [`NullBuffer`] of length `len` where all values are null
46
138
    pub fn new_null(len: usize) -> Self {
47
138
        Self {
48
138
            buffer: BooleanBuffer::new_unset(len),
49
138
            null_count: len,
50
138
        }
51
138
    }
52
53
    /// Create a new [`NullBuffer`] of length `len` where all values are valid
54
    ///
55
    /// Note: it is more efficient to not set the null buffer if it is known to
56
    /// be all valid (aka all values are not null)
57
1
    pub fn new_valid(len: usize) -> Self {
58
1
        Self {
59
1
            buffer: BooleanBuffer::new_set(len),
60
1
            null_count: 0,
61
1
        }
62
1
    }
63
64
    /// Create a new [`NullBuffer`] with the provided `buffer` and `null_count`
65
    ///
66
    /// # Safety
67
    ///
68
    /// `buffer` must contain `null_count` `0` bits
69
1.10k
    pub unsafe fn new_unchecked(buffer: BooleanBuffer, null_count: usize) -> Self {
70
1.10k
        Self { buffer, null_count }
71
1.10k
    }
72
73
    /// Computes the union of the nulls in two optional [`NullBuffer`]
74
    ///
75
    /// This is commonly used by binary operations where the result is NULL if either
76
    /// of the input values is NULL. Handling the null mask separately in this way
77
    /// can yield significant performance improvements over an iterator approach
78
0
    pub fn union(lhs: Option<&NullBuffer>, rhs: Option<&NullBuffer>) -> Option<NullBuffer> {
79
0
        match (lhs, rhs) {
80
0
            (Some(lhs), Some(rhs)) => Some(Self::new(lhs.inner() & rhs.inner())),
81
0
            (Some(n), None) | (None, Some(n)) => Some(n.clone()),
82
0
            (None, None) => None,
83
        }
84
0
    }
85
86
    /// Returns true if all nulls in `other` also exist in self
87
0
    pub fn contains(&self, other: &NullBuffer) -> bool {
88
0
        if other.null_count == 0 {
89
0
            return true;
90
0
        }
91
0
        let lhs = self.inner().bit_chunks().iter_padded();
92
0
        let rhs = other.inner().bit_chunks().iter_padded();
93
0
        lhs.zip(rhs).all(|(l, r)| (l & !r) == 0)
94
0
    }
95
96
    /// Returns a new [`NullBuffer`] where each bit in the current null buffer
97
    /// is repeated `count` times. This is useful for masking the nulls of
98
    /// the child of a FixedSizeListArray based on its parent
99
1
    pub fn expand(&self, count: usize) -> Self {
100
1
        let capacity = self.buffer.len().checked_mul(count).unwrap();
101
1
        let mut buffer = MutableBuffer::new_null(capacity);
102
103
        // Expand each bit within `null_mask` into `element_len`
104
        // bits, constructing the implicit mask of the child elements
105
5
        for i in 0..
self.buffer1
.
len1
() {
106
5
            if self.is_null(i) {
107
2
                continue;
108
3
            }
109
6
            for j in 0..
count3
{
110
6
                crate::bit_util::set_bit(buffer.as_mut(), i * count + j)
111
            }
112
        }
113
1
        Self {
114
1
            buffer: BooleanBuffer::new(buffer.into(), 0, capacity),
115
1
            null_count: self.null_count * count,
116
1
        }
117
1
    }
118
119
    /// Returns the length of this [`NullBuffer`] in bits
120
    #[inline]
121
818
    pub fn len(&self) -> usize {
122
818
        self.buffer.len()
123
818
    }
124
125
    /// Returns the offset of this [`NullBuffer`] in bits
126
    #[inline]
127
3.60k
    pub fn offset(&self) -> usize {
128
3.60k
        self.buffer.offset()
129
3.60k
    }
130
131
    /// Returns true if this [`NullBuffer`] is empty
132
    #[inline]
133
    pub fn is_empty(&self) -> bool {
134
        self.buffer.is_empty()
135
    }
136
137
    /// Free up unused memory.
138
0
    pub fn shrink_to_fit(&mut self) {
139
0
        self.buffer.shrink_to_fit();
140
0
    }
141
142
    /// Returns the null count for this [`NullBuffer`]
143
    #[inline]
144
14.6k
    pub fn null_count(&self) -> usize {
145
14.6k
        self.null_count
146
14.6k
    }
147
148
    /// Returns `true` if the value at `idx` is not null
149
    #[inline]
150
209k
    pub fn is_valid(&self, idx: usize) -> bool {
151
209k
        self.buffer.value(idx)
152
209k
    }
153
154
    /// Returns `true` if the value at `idx` is null
155
    #[inline]
156
209k
    pub fn is_null(&self, idx: usize) -> bool {
157
209k
        !self.is_valid(idx)
158
209k
    }
159
160
    /// Returns the packed validity of this [`NullBuffer`] not including any offset
161
    #[inline]
162
3.81k
    pub fn validity(&self) -> &[u8] {
163
3.81k
        self.buffer.values()
164
3.81k
    }
165
166
    /// Slices this [`NullBuffer`] by the provided `offset` and `length`
167
1.67k
    pub fn slice(&self, offset: usize, len: usize) -> Self {
168
1.67k
        Self::new(self.buffer.slice(offset, len))
169
1.67k
    }
170
171
    /// Returns an iterator over the bits in this [`NullBuffer`]
172
    ///
173
    /// * `true` indicates that the corresponding value is not NULL
174
    /// * `false` indicates that the corresponding value is NULL
175
    ///
176
    /// Note: [`Self::valid_indices`] will be significantly faster for most use-cases
177
1
    pub fn iter(&self) -> BitIterator<'_> {
178
1
        self.buffer.iter()
179
1
    }
180
181
    /// Returns a [`BitIndexIterator`] over the valid indices in this [`NullBuffer`]
182
    ///
183
    /// Valid indices indicate the corresponding value is not NULL
184
51
    pub fn valid_indices(&self) -> BitIndexIterator<'_> {
185
51
        self.buffer.set_indices()
186
51
    }
187
188
    /// Returns a [`BitSliceIterator`] yielding contiguous ranges of valid indices
189
    ///
190
    /// Valid indices indicate the corresponding value is not NULL
191
0
    pub fn valid_slices(&self) -> BitSliceIterator<'_> {
192
0
        self.buffer.set_slices()
193
0
    }
194
195
    /// Calls the provided closure for each index in this null mask that is set
196
    #[inline]
197
    pub fn try_for_each_valid_idx<E, F: FnMut(usize) -> Result<(), E>>(
198
        &self,
199
        f: F,
200
    ) -> Result<(), E> {
201
        if self.null_count == self.len() {
202
            return Ok(());
203
        }
204
        self.valid_indices().try_for_each(f)
205
    }
206
207
    /// Returns the inner [`BooleanBuffer`]
208
    #[inline]
209
2.65k
    pub fn inner(&self) -> &BooleanBuffer {
210
2.65k
        &self.buffer
211
2.65k
    }
212
213
    /// Returns the inner [`BooleanBuffer`]
214
    #[inline]
215
    pub fn into_inner(self) -> BooleanBuffer {
216
        self.buffer
217
    }
218
219
    /// Returns the underlying [`Buffer`]
220
    #[inline]
221
301
    pub fn buffer(&self) -> &Buffer {
222
301
        self.buffer.inner()
223
301
    }
224
}
225
226
impl<'a> IntoIterator for &'a NullBuffer {
227
    type Item = bool;
228
    type IntoIter = BitIterator<'a>;
229
230
0
    fn into_iter(self) -> Self::IntoIter {
231
0
        self.buffer.iter()
232
0
    }
233
}
234
235
impl From<BooleanBuffer> for NullBuffer {
236
28
    fn from(value: BooleanBuffer) -> Self {
237
28
        Self::new(value)
238
28
    }
239
}
240
241
impl From<&[bool]> for NullBuffer {
242
3
    fn from(value: &[bool]) -> Self {
243
3
        BooleanBuffer::from(value).into()
244
3
    }
245
}
246
247
impl<const N: usize> From<&[bool; N]> for NullBuffer {
248
3
    fn from(value: &[bool; N]) -> Self {
249
3
        value[..].into()
250
3
    }
251
}
252
253
impl From<Vec<bool>> for NullBuffer {
254
11
    fn from(value: Vec<bool>) -> Self {
255
11
        BooleanBuffer::from(value).into()
256
11
    }
257
}
258
259
impl FromIterator<bool> for NullBuffer {
260
1
    fn from_iter<T: IntoIterator<Item = bool>>(iter: T) -> Self {
261
1
        BooleanBuffer::from_iter(iter).into()
262
1
    }
263
}
264
265
#[cfg(test)]
266
mod tests {
267
    use super::*;
268
    #[test]
269
    fn test_size() {
270
        // This tests that the niche optimisation eliminates the overhead of an option
271
        assert_eq!(
272
            std::mem::size_of::<NullBuffer>(),
273
            std::mem::size_of::<Option<NullBuffer>>()
274
        );
275
    }
276
}