Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,7 @@ members = [
[profile.release]
lto = true
codegen-units = 1

[patch.crates-io]
arrow = { git = "https://github.com/apache/arrow-rs.git", rev = "731e132489b99cd688f884642cf20de52aed24d0" }
parquet = { git = "https://github.com/apache/arrow-rs.git", rev = "731e132489b99cd688f884642cf20de52aed24d0" }
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Relies on apache/arrow-rs@e375bba, will remove this once we have arrow 9.0.1 released.

2 changes: 2 additions & 0 deletions datafusion/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,8 @@ pub use arrow;
pub use parquet;

pub(crate) mod field_util;
#[allow(dead_code)]
pub(crate) mod row;

#[cfg(feature = "pyarrow")]
mod pyarrow;
Expand Down
132 changes: 132 additions & 0 deletions datafusion/src/row/bitmap/fmt.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use std::fmt::Write;

use super::is_set;

/// Formats a window of bits from `bytes` as a bracketed list of binary byte
/// literals, e.g. `[0b1100000_, 0b______01]`.
///
/// The window starts at bit `offset` of `bytes[0]` and spans `length` bits.
/// Every byte touched by the window is printed with its most significant bit
/// first; bit positions of a printed byte that lie outside the window are
/// rendered as `_`. Bytes of `bytes` that lie past the window are ignored.
///
/// # Panics
///
/// Panics if `offset >= 8`, or if `length > 0` and `bytes` does not contain
/// every byte touched by the window.
pub fn fmt(
    bytes: &[u8],
    offset: usize,
    length: usize,
    f: &mut std::fmt::Formatter<'_>,
) -> std::fmt::Result {
    assert!(offset < 8);

    f.write_char('[')?;
    let mut remaining = length;
    if remaining == 0 {
        return f.write_char(']');
    }

    // First byte: it may be cut off on the low side (by `offset`) and, when
    // the whole window fits inside it, on the high side as well.
    let first = bytes[0];
    let bytes = &bytes[1..];
    let empty_before = 8usize.saturating_sub(remaining + offset);
    f.write_str("0b")?;
    for _ in 0..empty_before {
        f.write_char('_')?;
    }
    let until = std::cmp::min(8, offset + remaining);
    for bit in (offset..until).rev() {
        f.write_char(if is_set(first, bit) { '1' } else { '0' })?;
    }
    for _ in 0..offset {
        f.write_char('_')?;
    }
    remaining -= until - offset;
    if remaining == 0 {
        return f.write_char(']');
    }

    // Bytes that are entirely covered by the window.
    let number_of_bytes = remaining / 8;
    for byte in &bytes[..number_of_bytes] {
        f.write_str(", ")?;
        f.write_fmt(format_args!("{:#010b}", byte))?;
    }
    remaining -= number_of_bytes * 8;
    if remaining == 0 {
        return f.write_char(']');
    }

    // Trailing partial byte. `bytes` no longer contains the first byte, so the
    // byte right after the full ones sits at index `number_of_bytes`; indexing
    // by the total byte count would pick up an unrelated byte whenever the
    // slice is longer than the window. At this point `remaining` equals
    // `(length + offset) % 8`, the number of window bits in that byte.
    let last = bytes[number_of_bytes];
    f.write_str(", ")?;
    f.write_str("0b")?;
    for _ in 0..(8 - remaining) {
        f.write_char('_')?;
    }
    for bit in (0..remaining).rev() {
        f.write_char(if is_set(last, bit) { '1' } else { '0' })?;
    }
    f.write_char(']')
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Adapter that routes `{:?}` through [`fmt`] for a `(bytes, offset, length)` triple.
    struct A<'a>(&'a [u8], usize, usize);

    impl<'a> std::fmt::Debug for A<'a> {
        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
            let A(bytes, offset, length) = *self;
            fmt(bytes, offset, length, f)
        }
    }

    #[test]
    fn test_debug() -> std::fmt::Result {
        // Each case is (bytes, offset, length, expected rendering).
        let cases: [(&[u8], usize, usize, &str); 13] = [
            (&[1], 0, 0, "[]"),
            (&[0b11000001], 0, 8, "[0b11000001]"),
            (&[0b11000001, 1], 0, 9, "[0b11000001, 0b_______1]"),
            (&[1], 0, 2, "[0b______01]"),
            (&[1], 1, 2, "[0b_____00_]"),
            (&[1], 2, 2, "[0b____00__]"),
            (&[1], 3, 2, "[0b___00___]"),
            (&[1], 4, 2, "[0b__00____]"),
            (&[1], 5, 2, "[0b_00_____]"),
            (&[1], 6, 2, "[0b00______]"),
            (&[0b11000001, 1], 1, 9, "[0b1100000_, 0b______01]"),
            // extra bytes are ignored
            (&[0b11000001, 1, 1, 1], 1, 9, "[0b1100000_, 0b______01]"),
            (
                &[0b11000001, 1, 1],
                2,
                16,
                "[0b110000__, 0b00000001, 0b______01]",
            ),
        ];

        for &(bytes, offset, length, expected) in cases.iter() {
            assert_eq!(format!("{:?}", A(bytes, offset, length)), expected);
        }
        Ok(())
    }
}
126 changes: 126 additions & 0 deletions datafusion/src/row/bitmap/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! General utilities for null bit section handling
//!
//! Note: this is a tailored version based on [arrow2 bitmap utils](https://github.com/jorgecarleitao/arrow2/tree/main/src/bitmap/utils)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW this appears to itself be a copy of https://docs.rs/arrow/latest/arrow/util/bit_util/index.html

Copy link
Member Author

@yjshen yjshen Feb 9, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The bitmap is rewritten on top of arrow/util/bit_util, along with a much-simplified version of fmt.

mod fmt;

pub use fmt::fmt;

/// `BIT_MASK[i]` has only bit `i` set.
const BIT_MASK: [u8; 8] = [
    0b0000_0001,
    0b0000_0010,
    0b0000_0100,
    0b0000_1000,
    0b0001_0000,
    0b0010_0000,
    0b0100_0000,
    0b1000_0000,
];
/// `UNSET_BIT_MASK[i]` has every bit set except bit `i`.
const UNSET_BIT_MASK: [u8; 8] = [
    0b1111_1110,
    0b1111_1101,
    0b1111_1011,
    0b1111_0111,
    0b1110_1111,
    0b1101_1111,
    0b1011_1111,
    0b0111_1111,
];
/// `ALL_VALID_MASK[i]` has the `i + 1` lowest bits set.
const ALL_VALID_MASK: [u8; 8] = [
    0b0000_0001,
    0b0000_0011,
    0b0000_0111,
    0b0000_1111,
    0b0001_1111,
    0b0011_1111,
    0b0111_1111,
    0b1111_1111,
];

/// Tests whether bit `i` of `byte` is set, where bit 0 is the least
/// significant bit.
///
/// # Panics
///
/// Panics if `i >= 8`.
#[inline]
pub fn is_set(byte: u8, i: usize) -> bool {
    let mask = BIT_MASK[i];
    // `mask` has exactly one bit set, so the masked value is either 0 or `mask`.
    byte & mask == mask
}

/// Returns a copy of `byte` with bit `i` forced to `value` (`true` sets the
/// bit, `false` clears it); all other bits are left untouched.
///
/// # Panics
///
/// Panics if `i >= 8`.
#[inline]
pub fn set(byte: u8, i: usize, value: bool) -> u8 {
    match value {
        true => byte | BIT_MASK[i],
        false => byte & UNSET_BIT_MASK[i],
    }
}

/// Writes `value` into bit `i` of `data`, treating `data` as one long bit
/// sequence in which bit 0 is the least significant bit of `data[0]`.
///
/// # Panics
///
/// Panics if `i / 8` is not a valid index into `data`.
#[inline]
pub fn set_bit(data: &mut [u8], i: usize, value: bool) {
    let (byte_index, bit_index) = (i / 8, i % 8);
    let slot = &mut data[byte_index];
    *slot = set(*slot, bit_index, value);
}

/// Reads bit `i` of `data` without bounds checking the byte access, treating
/// `data` as one long bit sequence in which bit 0 is the least significant
/// bit of `data[0]`.
///
/// # Safety
/// The caller must guarantee `i < data.len() * 8`; any larger `i` results in
/// undefined behavior.
#[inline]
pub unsafe fn get_bit_unchecked(data: &[u8], i: usize) -> bool {
    // Raw pointer read of byte `i / 8`; in bounds only under the contract above.
    let byte = *data.as_ptr().add(i / 8);
    let mask = BIT_MASK[i % 8];
    byte & mask != 0
}

/// Returns the number of bytes required to hold `bits` bits, i.e. `bits / 8`
/// rounded up.
///
/// The result is exact for every `usize` input: the rounding is done without
/// adding to `bits`, so it can neither overflow nor under-count for values
/// close to `usize::MAX`.
#[inline]
pub fn bytes_for(bits: usize) -> usize {
    bits / 8 + usize::from(bits % 8 != 0)
}

/// Returns `true` if the first `n` bits of `data` are all set, i.e. all `n`
/// fields are valid.
///
/// Bit 0 is the least significant bit of `data[0]`. Only the first `n` bits
/// are inspected: bits of the last, partially used byte that lie at or beyond
/// position `n` do not influence the result.
///
/// If `data` holds fewer than `n / 8` bytes, only the bytes that are present
/// take part in the full-byte check.
///
/// # Panics
///
/// Panics if `n % 8 != 0` and `data` has no byte at index `n / 8`.
pub fn all_valid(data: &[u8], n: usize) -> bool {
    let full_bytes = n / 8;
    let trailing_bits = n % 8;

    if !data
        .iter()
        .take(full_bytes)
        .all(|byte| *byte == ALL_VALID_MASK[7])
    {
        return false;
    }
    if trailing_bits == 0 {
        return true;
    }

    // Mask off the unused high bits so that stale or padding bits past
    // position `n` cannot turn an all-valid prefix into a false negative.
    let mask = ALL_VALID_MASK[trailing_bits - 1];
    data[full_bytes] & mask == mask
}

#[cfg(test)]
mod tests {
    use super::*;
    use rand::Rng;

    /// Packs `bs` into a zero-initialised bitmap with `set_bit` and checks
    /// that `all_valid` agrees with a direct scan of the booleans.
    fn test_validity(bs: &[bool]) {
        let mut data = vec![0; bytes_for(bs.len())];
        bs.iter()
            .enumerate()
            .for_each(|(i, b)| set_bit(&mut data, i, *b));
        let expected = !bs.contains(&false);
        assert_eq!(all_valid(&data, bs.len()), expected);
    }

    #[test]
    fn test_all_valid() {
        let mut rng = rand::thread_rng();
        for len in [4, 8, 12, 16, 19, 23, 32, 44] {
            // randomly filled bits (these usually, but not necessarily, contain a false)
            let mut random_bits: Vec<bool> = vec![false; len];
            rng.fill(&mut random_bits[..]);
            test_validity(&random_bits);

            // all true
            let all_true = vec![true; len];
            test_validity(&all_true);
        }
    }
}
Loading