Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,7 @@ members = [
[profile.release]
lto = true
codegen-units = 1

[patch.crates-io]
arrow = { git = "https://github.com/apache/arrow-rs.git", rev = "731e132489b99cd688f884642cf20de52aed24d0" }
parquet = { git = "https://github.com/apache/arrow-rs.git", rev = "731e132489b99cd688f884642cf20de52aed24d0" }
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This relies on apache/arrow-rs@e375bba; the patch will be removed once arrow 9.0.1 is released.

2 changes: 2 additions & 0 deletions datafusion/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,8 @@ pub use arrow;
pub use parquet;

pub(crate) mod field_util;
#[allow(dead_code)]
pub(crate) mod row;

#[cfg(feature = "pyarrow")]
mod pyarrow;
Expand Down
194 changes: 194 additions & 0 deletions datafusion/src/row/bitmap.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! General utilities for null bit section handling based on [arrow::util::bit_util]
use arrow::util::bit_util::{
ceil, get_bit_raw, round_upto_power_of_2, set_bit_raw, unset_bit_raw,
};
use std::fmt::Write;

/// Single-bit masks: entry `i` has only bit `i` set, counting from the least significant bit.
const BIT_MASK: [u8; 8] = [0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80];
/// Low-bit masks: entry `i` has bits `0..=i` set, so the last entry is `0xFF`.
const ALL_VALID_MASK: [u8; 8] = [0x01, 0x03, 0x07, 0x0F, 0x1F, 0x3F, 0x7F, 0xFF];

/// Tests a single bit of `byte`.
///
/// `i` is the bit index counted from the least significant bit. It is used to
/// index the eight-entry `BIT_MASK` table, so an index of 8 or more panics.
#[inline]
pub fn is_set(byte: u8, i: usize) -> bool {
    let mask = BIT_MASK[i];
    byte & mask != 0
}

/// Sets bit at position `i` in `data`.
///
/// # Panics
///
/// Panics if `i` addresses a byte outside `data`, i.e. if `i / 8 >= data.len()`.
#[inline]
pub fn set_bit(data: &mut [u8], i: usize) {
    // This function is safe to call, so an out-of-range index must be rejected
    // here rather than be handed to the unchecked raw-pointer helper.
    assert!(
        i / 8 < data.len(),
        "bit index {} out of bounds for bitmap of {} bytes",
        i,
        data.len()
    );
    // SAFETY: the assertion above guarantees that byte `i / 8`, the byte
    // `set_bit_raw` modifies, lies inside `data`.
    unsafe {
        set_bit_raw(data.as_mut_ptr(), i);
    }
}

/// Unsets bit at position `i` in `data`.
///
/// # Panics
///
/// Panics if `i` addresses a byte outside `data`, i.e. if `i / 8 >= data.len()`.
#[inline]
pub fn unset_bit(data: &mut [u8], i: usize) {
    // This function is safe to call, so an out-of-range index must be rejected
    // here rather than be handed to the unchecked raw-pointer helper.
    assert!(
        i / 8 < data.len(),
        "bit index {} out of bounds for bitmap of {} bytes",
        i,
        data.len()
    );
    // SAFETY: the assertion above guarantees that byte `i / 8`, the byte
    // `unset_bit_raw` modifies, lies inside `data`.
    unsafe {
        unset_bit_raw(data.as_mut_ptr(), i);
    }
}

/// Returns whether bit at position `i` in `data` is set or not.
///
/// # Panics
///
/// Panics if `i` addresses a byte outside `data`, i.e. if `i / 8 >= data.len()`.
#[inline]
pub fn get_bit(data: &[u8], i: usize) -> bool {
    // This function is safe to call, so an out-of-range index must be rejected
    // here rather than be handed to the unchecked raw-pointer helper.
    assert!(
        i / 8 < data.len(),
        "bit index {} out of bounds for bitmap of {} bytes",
        i,
        data.len()
    );
    // SAFETY: the assertion above guarantees that byte `i / 8`, the byte
    // `get_bit_raw` reads, lies inside `data`.
    unsafe { get_bit_raw(data.as_ptr(), i) }
}

/// Number of bytes needed to store `n` bits, i.e. `n / 8` rounded up.
#[inline]
pub fn null_width(n: usize) -> usize {
    const BITS_PER_BYTE: usize = 8;
    ceil(n, BITS_PER_BYTE)
}

/// Rounds `n` up to a word boundary of 8.
///
/// Delegates to arrow's `round_upto_power_of_2` with a factor of 8, which per
/// arrow's documentation yields the nearest multiple of 8 that is `>= n`.
#[inline]
pub fn align_word(n: usize) -> usize {
    const WORD_SIZE: usize = 8;
    round_upto_power_of_2(n, WORD_SIZE)
}

/// Returns if all fields are valid, i.e. whether each of the first `n` bits of
/// `data` is set.
///
/// Only the first `n` bits are inspected. Bits of the trailing partial byte
/// that lie beyond `n` are masked out, so stray bits there cannot turn an
/// all-valid bitmap into a negative answer.
///
/// `data` is expected to hold at least `n` bits; whole bytes missing from a
/// too-short slice are not inspected.
///
/// # Panics
///
/// Panics if `n` is not a multiple of 8 and `data` has no byte at index `n / 8`.
pub fn all_valid(data: &[u8], n: usize) -> bool {
    let full_bytes = n / 8;
    let tail_bits = n % 8;

    // Every completely used byte must have all eight bits set.
    let full_bytes_valid = data
        .iter()
        .take(full_bytes)
        .all(|&byte| byte == ALL_VALID_MASK[7]);
    if !full_bytes_valid {
        return false;
    }
    if tail_bits == 0 {
        return true;
    }

    // Compare only the `tail_bits` low bits of the last byte.
    let mask = ALL_VALID_MASK[tail_bits - 1];
    data[full_bytes] & mask == mask
}

/// Show null bit for each field in a tuple, 1 for valid and 0 for null.
/// For a tuple with nine total fields, valid at field 0, 6, 7, 8 shows as `[10000011, 1]`.
///
/// Bits are printed in field order, in groups of eight separated by `", "`.
/// A tuple without fields prints as `[]`.
pub struct NullBitsFormatter<'a> {
    /// Bitmap holding one validity bit per field.
    null_bits: &'a [u8],
    /// Number of leading bits of `null_bits` to print.
    field_count: usize,
}

impl<'a> NullBitsFormatter<'a> {
    /// Creates a formatter over the first `field_count` bits of `null_bits`.
    ///
    /// Bytes beyond those needed for `field_count` bits are ignored.
    ///
    /// # Panics
    ///
    /// Panics if `null_bits` holds fewer than `field_count` bits. Formatting
    /// reads every one of those bits, so the bitmap must cover them all.
    pub fn new(null_bits: &'a [u8], field_count: usize) -> Self {
        assert!(
            field_count <= null_bits.len().saturating_mul(8),
            "field count {} exceeds the {} bits available in the null bitmap",
            field_count,
            null_bits.len().saturating_mul(8)
        );
        Self {
            null_bits,
            field_count,
        }
    }
}

impl<'a> std::fmt::Debug for NullBitsFormatter<'a> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Write the opening bracket unconditionally so that a tuple with zero
        // fields renders as `[]` rather than a lone `]`.
        f.write_char('[')?;
        for i in 0..self.field_count {
            // Separate the bits of consecutive bytes.
            if i > 0 && i % 8 == 0 {
                f.write_str(", ")?;
            }
            let bit = if get_bit(self.null_bits, i) { '1' } else { '0' };
            f.write_char(bit)?;
        }
        f.write_char(']')
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use rand::Rng;

    /// Packs `bs` into a bitmap and checks that `all_valid` agrees with a
    /// direct scan of `bs`.
    fn test_validity(bs: &[bool]) {
        let mut bitmap = vec![0; null_width(bs.len())];
        for (idx, &valid) in bs.iter().enumerate() {
            if valid {
                set_bit(&mut bitmap, idx);
            } else {
                unset_bit(&mut bitmap, idx);
            }
        }
        let expected = !bs.contains(&false);
        assert_eq!(all_valid(&bitmap, bs.len()), expected);
    }

    #[test]
    fn test_all_valid() {
        for len in [4, 8, 12, 16, 19, 23, 32, 44] {
            // Random validity flags, which usually include at least one null.
            let mut random = vec![false; len];
            rand::thread_rng().fill(&mut random[..]);
            test_validity(&random);

            // Every field valid.
            let all_true = vec![true; len];
            test_validity(&all_true);
        }
    }

    #[test]
    fn test_formatter() -> std::fmt::Result {
        /// Renders the first `fields` bits of `bits` through the `Debug` impl.
        fn render(bits: &[u8], fields: usize) -> String {
            format!("{:?}", NullBitsFormatter::new(bits, fields))
        }

        assert_eq!(render(&[0b11000001], 8), "[10000011]");
        assert_eq!(render(&[0b11000001, 1], 9), "[10000011, 1]");

        // A single set bit followed by a growing number of unset bits.
        assert_eq!(render(&[1], 2), "[10]");
        assert_eq!(render(&[1], 3), "[100]");
        assert_eq!(render(&[1], 4), "[1000]");
        assert_eq!(render(&[1], 5), "[10000]");
        assert_eq!(render(&[1], 6), "[100000]");
        assert_eq!(render(&[1], 7), "[1000000]");
        assert_eq!(render(&[1], 8), "[10000000]");

        // extra bytes are ignored
        assert_eq!(render(&[0b11000001, 1, 1, 1], 9), "[10000011, 1]");
        assert_eq!(render(&[0b11000001, 1, 1], 16), "[10000011, 10000000]");
        Ok(())
    }
}
Loading