Coverage Report

Created: 2025-08-26 07:03

/Users/andrewlamb/Software/arrow-rs/arrow-buffer/src/util/bit_iterator.rs
Coverage summary for this run — covered: BitSliceIterator::new (121), BitSliceIterator::next (139), BitIndexIterator::new (9), BitIndexIterator::next (21); uncovered (0 executions): BitIterator, BitIndexU32Iterator, try_for_each_valid_idx.

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

//! Types for iterating over packed bitmasks

use crate::bit_chunk_iterator::{UnalignedBitChunk, UnalignedBitChunkIterator};
use crate::bit_util::{ceil, get_bit_raw};

/// Iterator over the bits within a packed bitmask
///
/// To efficiently iterate over just the set bits see [`BitIndexIterator`] and [`BitSliceIterator`]
pub struct BitIterator<'a> {
    buffer: &'a [u8],
    current_offset: usize,
    end_offset: usize,
}

impl<'a> BitIterator<'a> {
    /// Create a new [`BitIterator`] from the provided `buffer`,
    /// and `offset` and `len` in bits
    ///
    /// # Panics
    ///
    /// Panics if `buffer` is too short for the provided offset and length
    pub fn new(buffer: &'a [u8], offset: usize, len: usize) -> Self {
        let end_offset = offset.checked_add(len).unwrap();
        let required_len = ceil(end_offset, 8);
        assert!(
            buffer.len() >= required_len,
            "BitIterator buffer too small, expected {required_len} got {}",
            buffer.len()
        );

        Self {
            buffer,
            current_offset: offset,
            end_offset,
        }
    }
}

impl Iterator for BitIterator<'_> {
    type Item = bool;

    fn next(&mut self) -> Option<Self::Item> {
        if self.current_offset == self.end_offset {
            return None;
        }
        // Safety:
        // offsets in bounds
        let v = unsafe { get_bit_raw(self.buffer.as_ptr(), self.current_offset) };
        self.current_offset += 1;
        Some(v)
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining_bits = self.end_offset - self.current_offset;
        (remaining_bits, Some(remaining_bits))
    }
}

impl ExactSizeIterator for BitIterator<'_> {}

impl DoubleEndedIterator for BitIterator<'_> {
    fn next_back(&mut self) -> Option<Self::Item> {
        if self.current_offset == self.end_offset {
            return None;
        }
        self.end_offset -= 1;
        // Safety:
        // offsets in bounds
        let v = unsafe { get_bit_raw(self.buffer.as_ptr(), self.end_offset) };
        Some(v)
    }
}
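
// Illustrative sketch (not part of the upstream file): `BitIterator` yields one
// `bool` per bit, least-significant bit first within each byte, and can also be
// consumed from the back via `DoubleEndedIterator`.
#[cfg(test)]
mod bit_iterator_usage_sketch {
    use super::*;

    #[test]
    fn walk_bits_both_ways() {
        // 0b0110_0101: bits 0, 2, 5 and 6 are set
        let mask = [0b0110_0101u8];
        let bits: Vec<bool> = BitIterator::new(&mask, 0, 8).collect();
        assert_eq!(
            bits,
            vec![true, false, true, false, false, true, true, false]
        );

        // Consume from the back: bit 2 first, then bits 0 and 1 from the front
        let mut iter = BitIterator::new(&mask, 0, 3);
        assert_eq!(iter.next_back(), Some(true));
        assert_eq!(iter.next(), Some(true));
        assert_eq!(iter.next(), Some(false));
        assert_eq!(iter.next(), None);
    }
}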

/// Iterator of contiguous ranges of set bits within a provided packed bitmask
///
/// Returns `(usize, usize)` pairs, each representing an interval where the corresponding
/// bits in the provided mask are set
///
/// The first value is the start of the range (inclusive) and the second value is the end of the range (exclusive)
#[derive(Debug)]
pub struct BitSliceIterator<'a> {
    iter: UnalignedBitChunkIterator<'a>,
    len: usize,
    current_offset: i64,
    current_chunk: u64,
}

impl<'a> BitSliceIterator<'a> {
    /// Create a new [`BitSliceIterator`] from the provided `buffer`,
    /// and `offset` and `len` in bits
    pub fn new(buffer: &'a [u8], offset: usize, len: usize) -> Self {
        let chunk = UnalignedBitChunk::new(buffer, offset, len);
        let mut iter = chunk.iter();

        let current_offset = -(chunk.lead_padding() as i64);
        let current_chunk = iter.next().unwrap_or(0);

        Self {
            iter,
            len,
            current_offset,
            current_chunk,
        }
    }

    /// Returns `Some((chunk_offset, bit_offset))` for the next chunk that has at
    /// least one bit set, or `None` if there is no such chunk.
    ///
    /// Where `chunk_offset` is the bit offset to the current `u64` chunk
    /// and `bit_offset` is the offset of the first `1` bit in that chunk
    fn advance_to_set_bit(&mut self) -> Option<(i64, u32)> {
        loop {
            if self.current_chunk != 0 {
                // Find the index of the first 1
                let bit_pos = self.current_chunk.trailing_zeros();
                return Some((self.current_offset, bit_pos));
            }

            self.current_chunk = self.iter.next()?;
            self.current_offset += 64;
        }
    }
}

impl Iterator for BitSliceIterator<'_> {
    type Item = (usize, usize);

    fn next(&mut self) -> Option<Self::Item> {
        // Used as termination condition
        if self.len == 0 {
            return None;
        }

        let (start_chunk, start_bit) = self.advance_to_set_bit()?;

        // Set bits up to start
        self.current_chunk |= (1 << start_bit) - 1;

        loop {
            if self.current_chunk != u64::MAX {
                // Find the index of the first 0
                let end_bit = self.current_chunk.trailing_ones();

                // Zero out up to end_bit
                self.current_chunk &= !((1 << end_bit) - 1);

                return Some((
                    (start_chunk + start_bit as i64) as usize,
                    (self.current_offset + end_bit as i64) as usize,
                ));
            }

            match self.iter.next() {
                Some(next) => {
                    self.current_chunk = next;
                    self.current_offset += 64;
                }
                None => {
                    return Some((
                        (start_chunk + start_bit as i64) as usize,
                        std::mem::replace(&mut self.len, 0),
                    ));
                }
            }
        }
    }
}
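
// Illustrative sketch (not part of the upstream file): `BitSliceIterator`
// coalesces consecutive set bits into half-open `(start, end)` ranges, which is
// why it wins on masks containing long runs.
#[cfg(test)]
mod bit_slice_iterator_usage_sketch {
    use super::*;

    #[test]
    fn contiguous_ranges() {
        // 0b1100_0110: two runs of set bits, at 1..3 and 6..8
        let mask = [0b1100_0110u8];
        let ranges: Vec<_> = BitSliceIterator::new(&mask, 0, 8).collect();
        assert_eq!(ranges, vec![(1, 3), (6, 8)]);

        // Returned ranges are relative to `offset`, not to the buffer start
        let ranges: Vec<_> = BitSliceIterator::new(&mask, 4, 4).collect();
        assert_eq!(ranges, vec![(2, 4)]);
    }
}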

/// An iterator that yields the `usize` index of each set bit in a provided bitmask
///
/// This provides the best performance on most masks, apart from those which contain
/// large runs and therefore favour [`BitSliceIterator`]
#[derive(Debug)]
pub struct BitIndexIterator<'a> {
    current_chunk: u64,
    chunk_offset: i64,
    iter: UnalignedBitChunkIterator<'a>,
}

impl<'a> BitIndexIterator<'a> {
    /// Create a new [`BitIndexIterator`] from the provided `buffer`,
    /// and `offset` and `len` in bits
    pub fn new(buffer: &'a [u8], offset: usize, len: usize) -> Self {
        let chunks = UnalignedBitChunk::new(buffer, offset, len);
        let mut iter = chunks.iter();

        let current_chunk = iter.next().unwrap_or(0);
        let chunk_offset = -(chunks.lead_padding() as i64);

        Self {
            current_chunk,
            chunk_offset,
            iter,
        }
    }
}

impl Iterator for BitIndexIterator<'_> {
    type Item = usize;

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            if self.current_chunk != 0 {
                let bit_pos = self.current_chunk.trailing_zeros();
                self.current_chunk ^= 1 << bit_pos;
                return Some((self.chunk_offset + bit_pos as i64) as usize);
            }

            self.current_chunk = self.iter.next()?;
            self.chunk_offset += 64;
        }
    }
}
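
// Illustrative sketch (not part of the upstream file): `BitIndexIterator`
// yields the index of each set bit directly, typically the fastest choice for
// masks without long runs.
#[cfg(test)]
mod bit_index_iterator_usage_sketch {
    use super::*;

    #[test]
    fn set_bit_indexes() {
        // Bits 2, 3, 4, 5 and 8 are set
        let mask = [0b0011_1100u8, 0b0000_0001u8];
        let indexes: Vec<usize> = BitIndexIterator::new(&mask, 0, 16).collect();
        assert_eq!(indexes, vec![2, 3, 4, 5, 8]);

        // With a 3 bit offset, yielded indexes are relative to the slice start
        let indexes: Vec<usize> = BitIndexIterator::new(&mask, 3, 13).collect();
        assert_eq!(indexes, vec![0, 1, 2, 5]);
    }
}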

/// An iterator that yields the `u32` index of each set bit in a provided bitmask
///
/// Respects arbitrary offsets and slice lead/trail padding exactly like [`BitIndexIterator`]
#[derive(Debug)]
pub struct BitIndexU32Iterator<'a> {
    curr: u64,
    chunk_offset: i64,
    iter: UnalignedBitChunkIterator<'a>,
}

impl<'a> BitIndexU32Iterator<'a> {
    /// Create a new [`BitIndexU32Iterator`] from the provided `buffer`,
    /// and `offset` and `len` in bits.
    pub fn new(buffer: &'a [u8], offset: usize, len: usize) -> Self {
        // Build the aligned chunks (including masked prefix/suffix)
        let chunks = UnalignedBitChunk::new(buffer, offset, len);
        let mut iter = chunks.iter();

        // First 64-bit word (masked for lead padding), or 0 if empty
        let curr = iter.next().unwrap_or(0);
        // Negative lead padding ensures the first bit in `curr` maps to index 0
        let chunk_offset = -(chunks.lead_padding() as i64);

        Self {
            curr,
            chunk_offset,
            iter,
        }
    }
}

impl Iterator for BitIndexU32Iterator<'_> {
    type Item = u32;

    #[inline(always)]
    fn next(&mut self) -> Option<u32> {
        loop {
            if self.curr != 0 {
                // Position of the least-significant set bit
                let tz = self.curr.trailing_zeros();
                // Clear that bit
                self.curr &= self.curr - 1;
                // Return global index = chunk_offset + tz
                return Some((self.chunk_offset + tz as i64) as u32);
            }
            // Advance to the next 64-bit chunk
            match self.iter.next() {
                Some(next_chunk) => {
                    // Move the offset forward by 64 bits
                    self.chunk_offset += 64;
                    self.curr = next_chunk;
                }
                None => return None,
            }
        }
    }
}
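
// Illustrative sketch (not part of the upstream file): `BitIndexU32Iterator`
// visits exactly the same bits as `BitIndexIterator`, but yields `u32` and
// clears each visited bit with `curr & (curr - 1)` instead of an XOR.
#[cfg(test)]
mod bit_index_u32_usage_sketch {
    use super::*;

    #[test]
    fn matches_usize_iterator() {
        let mask = [0b1001_0110u8, 0b0000_1010u8];
        let as_u32: Vec<u32> = BitIndexU32Iterator::new(&mask, 1, 11).collect();
        let via_usize: Vec<u32> = BitIndexIterator::new(&mask, 1, 11)
            .map(|i| i as u32)
            .collect();
        assert_eq!(as_u32, via_usize);
    }
}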

/// Calls the provided closure for each index in the provided null mask that is set,
/// using an adaptive strategy based on the null count
///
/// Ideally this would be encapsulated in an [`Iterator`] that would determine the optimal
/// strategy up front, and then yield indexes based on this.
///
/// Unfortunately, external iteration based on the resulting [`Iterator`] would match the strategy
/// variant on each call to [`Iterator::next`], and LLVM generally cannot eliminate this.
///
/// One solution to this might be internal iteration, e.g. [`Iterator::try_fold`], however,
/// it is currently [not possible] to override this for custom iterators in stable Rust.
///
/// As such this is the next best option
///
/// [not possible]: https://github.com/rust-lang/rust/issues/69595
#[inline]
pub fn try_for_each_valid_idx<E, F: FnMut(usize) -> Result<(), E>>(
    len: usize,
    offset: usize,
    null_count: usize,
    nulls: Option<&[u8]>,
    f: F,
) -> Result<(), E> {
    let valid_count = len - null_count;

    if valid_count == len {
        (0..len).try_for_each(f)
    } else if null_count != len {
        BitIndexIterator::new(nulls.unwrap(), offset, len).try_for_each(f)
    } else {
        Ok(())
    }
}
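
// Illustrative sketch (not part of the upstream file): the three branches of
// `try_for_each_valid_idx`. With no nulls it walks `0..len` without reading the
// mask, with some nulls it consults the validity bitmap, and with all nulls it
// calls the closure zero times.
#[cfg(test)]
mod try_for_each_valid_idx_usage_sketch {
    use super::*;

    #[test]
    fn adaptive_strategies() {
        // No nulls: every index is visited and the mask is never read
        let mut seen = vec![];
        try_for_each_valid_idx::<(), _>(4, 0, 0, None, |i| {
            seen.push(i);
            Ok(())
        })
        .unwrap();
        assert_eq!(seen, vec![0, 1, 2, 3]);

        // Some nulls: only the set bits of the validity mask are visited
        let mut seen = vec![];
        let validity = [0b0000_0101u8]; // indexes 0 and 2 are valid
        try_for_each_valid_idx::<(), _>(4, 0, 2, Some(&validity), |i| {
            seen.push(i);
            Ok(())
        })
        .unwrap();
        assert_eq!(seen, vec![0, 2]);

        // All nulls: the closure is never invoked
        let mut called = false;
        try_for_each_valid_idx::<(), _>(4, 0, 4, None, |_| {
            called = true;
            Ok(())
        })
        .unwrap();
        assert!(!called);
    }
}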

// Note: further tests located in arrow_select::filter module

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_bit_iterator_size_hint() {
        let mut b = BitIterator::new(&[0b00000011], 0, 2);
        assert_eq!(
            b.size_hint(),
            (2, Some(2)),
            "Expected size_hint to be (2, Some(2))"
        );

        b.next();
        assert_eq!(
            b.size_hint(),
            (1, Some(1)),
            "Expected size_hint to be (1, Some(1)) after one bit consumed"
        );

        b.next();
        assert_eq!(
            b.size_hint(),
            (0, Some(0)),
            "Expected size_hint to be (0, Some(0)) after all bits consumed"
        );
    }

    #[test]
    fn test_bit_iterator() {
        let mask = &[0b00010010, 0b00100011, 0b00000101, 0b00010001, 0b10010011];
        let actual: Vec<_> = BitIterator::new(mask, 0, 5).collect();
        assert_eq!(actual, &[false, true, false, false, true]);

        let actual: Vec<_> = BitIterator::new(mask, 4, 5).collect();
        assert_eq!(actual, &[true, false, false, false, true]);

        let actual: Vec<_> = BitIterator::new(mask, 12, 14).collect();
        assert_eq!(
            actual,
            &[
                false, true, false, false, true, false, true, false, false, false, false, false,
                true, false
            ]
        );

        assert_eq!(BitIterator::new(mask, 0, 0).count(), 0);
        assert_eq!(BitIterator::new(mask, 40, 0).count(), 0);
    }

    #[test]
    #[should_panic(expected = "BitIterator buffer too small, expected 3 got 2")]
    fn test_bit_iterator_bounds() {
        let mask = &[223, 23];
        BitIterator::new(mask, 17, 0);
    }

    #[test]
    fn test_bit_index_u32_iterator_basic() {
        let mask = &[0b00010010, 0b00100011];

        let result: Vec<u32> = BitIndexU32Iterator::new(mask, 0, 16).collect();
        let expected: Vec<u32> = BitIndexIterator::new(mask, 0, 16)
            .map(|i| i as u32)
            .collect();
        assert_eq!(result, expected);

        let result: Vec<u32> = BitIndexU32Iterator::new(mask, 4, 8).collect();
        let expected: Vec<u32> = BitIndexIterator::new(mask, 4, 8)
            .map(|i| i as u32)
            .collect();
        assert_eq!(result, expected);

        let result: Vec<u32> = BitIndexU32Iterator::new(mask, 10, 4).collect();
        let expected: Vec<u32> = BitIndexIterator::new(mask, 10, 4)
            .map(|i| i as u32)
            .collect();
        assert_eq!(result, expected);

        let result: Vec<u32> = BitIndexU32Iterator::new(mask, 0, 0).collect();
        let expected: Vec<u32> = BitIndexIterator::new(mask, 0, 0)
            .map(|i| i as u32)
            .collect();
        assert_eq!(result, expected);
    }

    #[test]
    fn test_bit_index_u32_iterator_all_set() {
        let mask = &[0xFF, 0xFF];
        let result: Vec<u32> = BitIndexU32Iterator::new(mask, 0, 16).collect();
        let expected: Vec<u32> = BitIndexIterator::new(mask, 0, 16)
            .map(|i| i as u32)
            .collect();
        assert_eq!(result, expected);
    }

    #[test]
    fn test_bit_index_u32_iterator_none_set() {
        let mask = &[0x00, 0x00];
        let result: Vec<u32> = BitIndexU32Iterator::new(mask, 0, 16).collect();
        let expected: Vec<u32> = BitIndexIterator::new(mask, 0, 16)
            .map(|i| i as u32)
            .collect();
        assert_eq!(result, expected);
    }

    #[test]
    fn test_bit_index_u32_cross_chunk() {
        let mut buf = vec![0u8; 16];
        for bit in 60..68 {
            let byte = (bit / 8) as usize;
            let bit_in_byte = bit % 8;
            buf[byte] |= 1 << bit_in_byte;
        }
        let offset = 58;
        let len = 10;

        let result: Vec<u32> = BitIndexU32Iterator::new(&buf, offset, len).collect();
        let expected: Vec<u32> = BitIndexIterator::new(&buf, offset, len)
            .map(|i| i as u32)
            .collect();
        assert_eq!(result, expected);
    }

    #[test]
    fn test_bit_index_u32_unaligned_offset() {
        let mask = &[0b0110_1100, 0b1010_0000];
        let offset = 2;
        let len = 12;

        let result: Vec<u32> = BitIndexU32Iterator::new(mask, offset, len).collect();
        let expected: Vec<u32> = BitIndexIterator::new(mask, offset, len)
            .map(|i| i as u32)
            .collect();
        assert_eq!(result, expected);
    }

    #[test]
    fn test_bit_index_u32_long_all_set() {
        let len = 200;
        let num_bytes = len / 8 + if len % 8 != 0 { 1 } else { 0 };
        let bytes = vec![0xFFu8; num_bytes];

        let result: Vec<u32> = BitIndexU32Iterator::new(&bytes, 0, len).collect();
        let expected: Vec<u32> = BitIndexIterator::new(&bytes, 0, len)
            .map(|i| i as u32)
            .collect();
        assert_eq!(result, expected);
    }

    #[test]
    fn test_bit_index_u32_none_set() {
        let len = 50;
        let num_bytes = len / 8 + if len % 8 != 0 { 1 } else { 0 };
        let bytes = vec![0u8; num_bytes];

        let result: Vec<u32> = BitIndexU32Iterator::new(&bytes, 0, len).collect();
        let expected: Vec<u32> = BitIndexIterator::new(&bytes, 0, len)
            .map(|i| i as u32)
            .collect();
        assert_eq!(result, expected);
    }
}