Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
209 changes: 180 additions & 29 deletions parquet-variant/src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ use chrono::Timelike;
use indexmap::{IndexMap, IndexSet};
use uuid::Uuid;

use std::collections::HashMap;

const BASIC_TYPE_BITS: u8 = 2;
const UNIX_EPOCH_DATE: chrono::NaiveDate = chrono::NaiveDate::from_ymd_opt(1970, 1, 1).unwrap();

Expand Down Expand Up @@ -431,13 +433,97 @@ impl ValueBuilder {
}
}

/// A trait for building variant metadata dictionaries, to be used in conjunction with a
/// [`ValueBuilder`]. The trait provides methods for managing field names and their IDs, as well as
/// rolling back a failed builder operation that might have created new field ids.
pub trait MetadataBuilder: std::fmt::Debug {
/// Attempts to register a field name, returning the corresponding (possibly newly-created)
/// field id on success. Attempting to register the same field name twice will _generally_
/// produce the same field id both times, but the variant spec does not actually require it.
fn try_upsert_field_name(&mut self, field_name: &str) -> Result<u32, ArrowError>;

/// Retrieves the field name for a given field id, which must be less than
/// [`Self::num_field_names`]. Panics if the field id is out of bounds.
fn field_name(&self, field_id: usize) -> &str;

/// Returns the number of field names stored in this metadata builder. Any number less than this
/// is a valid field id. The builder can be reverted back to this size later on (discarding any
/// newer/higher field ids) by calling [`Self::truncate_field_names`].
fn num_field_names(&self) -> usize;

/// Reverts the field names to a previous size, discarding any newly out of bounds field ids.
fn truncate_field_names(&mut self, new_size: usize);
}

impl MetadataBuilder for BasicMetadataBuilder {
fn try_upsert_field_name(&mut self, field_name: &str) -> Result<u32, ArrowError> {
Ok(self.upsert_field_name(field_name))
}
fn field_name(&self, field_id: usize) -> &str {
self.field_name(field_id)
}
fn num_field_names(&self) -> usize {
self.num_field_names()
}
fn truncate_field_names(&mut self, new_size: usize) {
self.field_names.truncate(new_size)
}
}

/// A metadata builder that cannot register new field names, and merely returns the field id
/// associated with a known field name. This is useful for variant unshredding operations, where the
/// metadata column is fixed and -- per variant shredding spec -- already contains all field names
/// from the typed_value column. It is also useful when projecting a subset of fields from a variant
/// object value, since the bytes can be copied across directly without re-encoding their field ids.
#[derive(Debug)]
pub struct ReadOnlyMetadataBuilder<'m> {
metadata: VariantMetadata<'m>,
known_field_names: HashMap<&'m str, u32>,
}

impl<'m> ReadOnlyMetadataBuilder<'m> {
/// Creates a new read-only metadata builder from the given metadata dictionary.
pub fn new(metadata: VariantMetadata<'m>) -> Self {
Self {
metadata,
known_field_names: HashMap::new(),
}
}
}

impl MetadataBuilder for ReadOnlyMetadataBuilder<'_> {
fn try_upsert_field_name(&mut self, field_name: &str) -> Result<u32, ArrowError> {
if let Some(field_id) = self.known_field_names.get(field_name) {
return Ok(*field_id);
}

let Some((field_id, field_name)) = self.metadata.get_entry(field_name) else {
return Err(ArrowError::InvalidArgumentError(format!(
"Field name '{field_name}' not found in metadata dictionary"
)));
};

self.known_field_names.insert(field_name, field_id);
Ok(field_id)
}
fn field_name(&self, field_id: usize) -> &str {
&self.metadata[field_id]
}
fn num_field_names(&self) -> usize {
self.metadata.len()
}
fn truncate_field_names(&mut self, new_size: usize) {
debug_assert_eq!(self.metadata.len(), new_size);
}
}

/// Builder for constructing metadata for [`Variant`] values.
///
/// This is used internally by the [`VariantBuilder`] to construct the metadata
///
/// You can use an existing `Vec<u8>` as the metadata buffer by using the `from` impl.
#[derive(Default, Debug)]
struct MetadataBuilder {
struct BasicMetadataBuilder {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was thinking that Basic is a bit generic and it is somewhat unclear what the difference between a "ReadOnly" builder and a "Basic" builder are.

Here are some other ideas for names:

  • WriteableMetadataBuilder -- can update the metadata (this is my preference)
  • OwnedMetadataBuilder --- the builder owns the underlying structures (can thus can make changes)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Went with WritableMetadataBuilder.

// Field names -- field_ids are assigned in insert order
field_names: IndexSet<String>,

Expand All @@ -449,7 +535,7 @@ struct MetadataBuilder {
}

/// Create a new MetadataBuilder that will write to the specified metadata buffer
impl From<Vec<u8>> for MetadataBuilder {
impl From<Vec<u8>> for BasicMetadataBuilder {
fn from(metadata_buffer: Vec<u8>) -> Self {
Self {
metadata_buffer,
Expand All @@ -458,7 +544,7 @@ impl From<Vec<u8>> for MetadataBuilder {
}
}

impl MetadataBuilder {
impl BasicMetadataBuilder {
/// Upsert field name to dictionary, return its ID
fn upsert_field_name(&mut self, field_name: &str) -> u32 {
let (id, new_entry) = self.field_names.insert_full(field_name.to_string());
Expand Down Expand Up @@ -549,7 +635,7 @@ impl MetadataBuilder {
}
}

impl<S: AsRef<str>> FromIterator<S> for MetadataBuilder {
impl<S: AsRef<str>> FromIterator<S> for BasicMetadataBuilder {
fn from_iter<T: IntoIterator<Item = S>>(iter: T) -> Self {
let mut this = Self::default();
this.extend(iter);
Expand All @@ -558,7 +644,7 @@ impl<S: AsRef<str>> FromIterator<S> for MetadataBuilder {
}
}

impl<S: AsRef<str>> Extend<S> for MetadataBuilder {
impl<S: AsRef<str>> Extend<S> for BasicMetadataBuilder {
fn extend<T: IntoIterator<Item = S>>(&mut self, iter: T) {
let iter = iter.into_iter();
let (min, _) = iter.size_hint();
Expand Down Expand Up @@ -589,14 +675,14 @@ enum ParentState<'a> {
Variant {
value_builder: &'a mut ValueBuilder,
saved_value_builder_offset: usize,
metadata_builder: &'a mut MetadataBuilder,
metadata_builder: &'a mut dyn MetadataBuilder,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

using &dyn here I think means all accesses now need a level of indirection (and I think we can see this slowdown of a few percent in the benchmarks)

I wonder if (maybe as a follow on PR) we can consider making parent state generic and squeezing that performance back. It may also be premature optimization for now

Copy link
Contributor Author

@scovich scovich Aug 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I was trying to avoid the "pollution" that generics would cause, which is clearly visible in my first attempt:

It's not just ParentState, but also ListBuilder, ObjectBuilder and even the VariantBuilderExt trait. Plus it requires a bunch of methods that are only defined for specific combinations of types.

But we may need to go generic now, because VariantArrayVariantBuilder::finish needs access to BasicMetadataBuilder::finish in order to work correctly, and that's unavailable because the parent state has upcast it to &mut dyn MetadataBuilder.

I initially reached for std::any::AsAny, but it turns out that types with non-static lifetimes cannot implement std::any::Any because the lifetime information would be lost.

In order to get the latest merge with upstream to compile, I had to define a new MetadataBuilder::finish trait method to expose the underlying BasicMetadataBuilder::finish, but that also forced a no-op ReadOnlyMetadataBuilder::finish.

Honestly, the generic mess is messy enough that I lean strongly toward keeping the dyn indirection unless we're really certain we need generics for performance or functionality reasons.

Copy link
Contributor Author

@scovich scovich Aug 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A related issue we'll hit shortly: VariantArrayBuilder is currently hard-wired to use a BasicMetadataBuilder that produces a new metadata column.

Once we need a variant array builder that reuses an existing metadata column (e.g. for unshredding), we'll be forced to decide whether we want to make VariantArrayBuilder generic over M: MetadataBuilder or just define a second builder class.

I suspect that choice will ultimately decide the generic-vs-not question.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suppose we could also avoid generics altogether by defining our own versions of Any and AsAny that preserve lifetime info... but they would become part of the public arrow-rs API which seems awkward.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Honestly, the generic mess is messy enough that I lean strongly toward keeping the dyn indirection unless we're really certain we need generics for performance or functionality reasons.

I think this is a wise strategy. Let's wait for some more "end to end" type benchmarks (like reading/writing JSON to arrays, or shredding variants) and see if we need to try and squeeze a few more percent out

saved_metadata_builder_dict_size: usize,
finished: bool,
},
List {
value_builder: &'a mut ValueBuilder,
saved_value_builder_offset: usize,
metadata_builder: &'a mut MetadataBuilder,
metadata_builder: &'a mut dyn MetadataBuilder,
saved_metadata_builder_dict_size: usize,
offsets: &'a mut Vec<usize>,
saved_offsets_size: usize,
Expand All @@ -605,7 +691,7 @@ enum ParentState<'a> {
Object {
value_builder: &'a mut ValueBuilder,
saved_value_builder_offset: usize,
metadata_builder: &'a mut MetadataBuilder,
metadata_builder: &'a mut dyn MetadataBuilder,
saved_metadata_builder_dict_size: usize,
fields: &'a mut IndexMap<u32, usize>,
saved_fields_size: usize,
Expand All @@ -616,7 +702,7 @@ enum ParentState<'a> {
impl<'a> ParentState<'a> {
fn variant(
value_builder: &'a mut ValueBuilder,
metadata_builder: &'a mut MetadataBuilder,
metadata_builder: &'a mut dyn MetadataBuilder,
) -> Self {
ParentState::Variant {
saved_value_builder_offset: value_builder.offset(),
Expand All @@ -629,7 +715,7 @@ impl<'a> ParentState<'a> {

fn list(
value_builder: &'a mut ValueBuilder,
metadata_builder: &'a mut MetadataBuilder,
metadata_builder: &'a mut dyn MetadataBuilder,
offsets: &'a mut Vec<usize>,
saved_parent_value_builder_offset: usize,
) -> Self {
Expand All @@ -653,7 +739,7 @@ impl<'a> ParentState<'a> {

fn try_object(
value_builder: &'a mut ValueBuilder,
metadata_builder: &'a mut MetadataBuilder,
metadata_builder: &'a mut dyn MetadataBuilder,
fields: &'a mut IndexMap<u32, usize>,
saved_parent_value_builder_offset: usize,
field_name: &str,
Expand All @@ -665,7 +751,7 @@ impl<'a> ParentState<'a> {
let saved_value_builder_offset = value_builder.offset();
let saved_fields_size = fields.len();
let saved_metadata_builder_dict_size = metadata_builder.num_field_names();
let field_id = metadata_builder.upsert_field_name(field_name);
let field_id = metadata_builder.try_upsert_field_name(field_name)?;
let field_start = saved_value_builder_offset - saved_parent_value_builder_offset;
if fields.insert(field_id, field_start).is_some() && validate_unique_fields {
return Err(ArrowError::InvalidArgumentError(format!(
Expand All @@ -688,7 +774,7 @@ impl<'a> ParentState<'a> {
self.value_and_metadata_builders().0
}

fn metadata_builder(&mut self) -> &mut MetadataBuilder {
fn metadata_builder(&mut self) -> &mut dyn MetadataBuilder {
self.value_and_metadata_builders().1
}

Expand Down Expand Up @@ -754,9 +840,7 @@ impl<'a> ParentState<'a> {
value_builder
.inner_mut()
.truncate(*saved_value_builder_offset);
metadata_builder
.field_names
.truncate(*saved_metadata_builder_dict_size);
metadata_builder.truncate_field_names(*saved_metadata_builder_dict_size);
}
};

Expand All @@ -778,7 +862,7 @@ impl<'a> ParentState<'a> {

/// Return mutable references to the value and metadata builders that this
/// parent state is using.
fn value_and_metadata_builders(&mut self) -> (&mut ValueBuilder, &mut MetadataBuilder) {
fn value_and_metadata_builders(&mut self) -> (&mut ValueBuilder, &mut dyn MetadataBuilder) {
match self {
ParentState::Variant {
value_builder,
Expand Down Expand Up @@ -1079,7 +1163,7 @@ impl Drop for ParentState<'_> {
#[derive(Default, Debug)]
pub struct VariantBuilder {
value_builder: ValueBuilder,
metadata_builder: MetadataBuilder,
metadata_builder: BasicMetadataBuilder,
validate_unique_fields: bool,
}

Expand All @@ -1088,7 +1172,7 @@ impl VariantBuilder {
pub fn new() -> Self {
Self {
value_builder: ValueBuilder::new(),
metadata_builder: MetadataBuilder::default(),
metadata_builder: BasicMetadataBuilder::default(),
validate_unique_fields: false,
}
}
Expand All @@ -1105,7 +1189,7 @@ impl VariantBuilder {
pub fn new_with_buffers(metadata_buffer: Vec<u8>, value_buffer: Vec<u8>) -> Self {
Self {
value_builder: ValueBuilder::from(value_buffer),
metadata_builder: MetadataBuilder::from(metadata_buffer),
metadata_builder: BasicMetadataBuilder::from(metadata_buffer),
validate_unique_fields: false,
}
}
Expand Down Expand Up @@ -2703,28 +2787,28 @@ mod tests {

#[test]
fn test_metadata_builder_from_iter() {
let metadata = MetadataBuilder::from_iter(vec!["apple", "banana", "cherry"]);
let metadata = BasicMetadataBuilder::from_iter(vec!["apple", "banana", "cherry"]);
assert_eq!(metadata.num_field_names(), 3);
assert_eq!(metadata.field_name(0), "apple");
assert_eq!(metadata.field_name(1), "banana");
assert_eq!(metadata.field_name(2), "cherry");
assert!(metadata.is_sorted);

let metadata = MetadataBuilder::from_iter(["zebra", "apple", "banana"]);
let metadata = BasicMetadataBuilder::from_iter(["zebra", "apple", "banana"]);
assert_eq!(metadata.num_field_names(), 3);
assert_eq!(metadata.field_name(0), "zebra");
assert_eq!(metadata.field_name(1), "apple");
assert_eq!(metadata.field_name(2), "banana");
assert!(!metadata.is_sorted);

let metadata = MetadataBuilder::from_iter(Vec::<&str>::new());
let metadata = BasicMetadataBuilder::from_iter(Vec::<&str>::new());
assert_eq!(metadata.num_field_names(), 0);
assert!(!metadata.is_sorted);
}

#[test]
fn test_metadata_builder_extend() {
let mut metadata = MetadataBuilder::default();
let mut metadata = BasicMetadataBuilder::default();
assert_eq!(metadata.num_field_names(), 0);
assert!(!metadata.is_sorted);

Expand All @@ -2749,7 +2833,7 @@ mod tests {

#[test]
fn test_metadata_builder_extend_sort_order() {
let mut metadata = MetadataBuilder::default();
let mut metadata = BasicMetadataBuilder::default();

metadata.extend(["middle"]);
assert!(metadata.is_sorted);
Expand All @@ -2765,17 +2849,20 @@ mod tests {
#[test]
fn test_metadata_builder_from_iter_with_string_types() {
// &str
let metadata = MetadataBuilder::from_iter(["a", "b", "c"]);
let metadata = BasicMetadataBuilder::from_iter(["a", "b", "c"]);
assert_eq!(metadata.num_field_names(), 3);

// string
let metadata =
MetadataBuilder::from_iter(vec!["a".to_string(), "b".to_string(), "c".to_string()]);
let metadata = BasicMetadataBuilder::from_iter(vec![
"a".to_string(),
"b".to_string(),
"c".to_string(),
]);
assert_eq!(metadata.num_field_names(), 3);

// mixed types (anything that implements AsRef<str>)
let field_names: Vec<Box<str>> = vec!["a".into(), "b".into(), "c".into()];
let metadata = MetadataBuilder::from_iter(field_names);
let metadata = BasicMetadataBuilder::from_iter(field_names);
assert_eq!(metadata.num_field_names(), 3);
}

Expand Down Expand Up @@ -3255,4 +3342,68 @@ mod tests {

assert_eq!(format!("{v1:?}"), format!("{v2:?}"));
}

#[test]
fn test_read_only_metadata_builder() {
// First create some metadata with a few field names
let mut default_builder = VariantBuilder::new();
default_builder.add_field_name("name");
default_builder.add_field_name("age");
default_builder.add_field_name("active");
let (metadata_bytes, _) = default_builder.finish();

// Use the metadata to build new variant values
let metadata = VariantMetadata::try_new(&metadata_bytes).unwrap();
let mut metadata_builder = ReadOnlyMetadataBuilder::new(metadata);
let mut value_builder = ValueBuilder::new();

{
let state = ParentState::variant(&mut value_builder, &mut metadata_builder);
let mut obj = ObjectBuilder::new(state, false);

// These should succeed because the fields exist in the metadata
obj.insert("name", "Alice");
obj.insert("age", 30i8);
obj.insert("active", true);
obj.finish().unwrap();
}

let value = value_builder.into_inner();

// Verify the variant was built correctly
let variant = Variant::try_new(&metadata_bytes, &value).unwrap();
let obj = variant.as_object().unwrap();
assert_eq!(obj.get("name"), Some(Variant::from("Alice")));
assert_eq!(obj.get("age"), Some(Variant::Int8(30)));
assert_eq!(obj.get("active"), Some(Variant::from(true)));
}

#[test]
fn test_read_only_metadata_builder_fails_on_unknown_field() {
// Create metadata with only one field
let mut default_builder = VariantBuilder::new();
default_builder.add_field_name("known_field");
let (metadata_bytes, _) = default_builder.finish();

// Use the metadata to build new variant values
let metadata = VariantMetadata::try_new(&metadata_bytes).unwrap();
let mut metadata_builder = ReadOnlyMetadataBuilder::new(metadata);
let mut value_builder = ValueBuilder::new();

{
let state = ParentState::variant(&mut value_builder, &mut metadata_builder);
let mut obj = ObjectBuilder::new(state, false);

// This should succeed
obj.insert("known_field", "value");

// This should fail because "unknown_field" is not in the metadata
let result = obj.try_insert("unknown_field", "value");
assert!(result.is_err());
assert!(result
.unwrap_err()
.to_string()
.contains("Field name 'unknown_field' not found"));
}
}
}
Loading
Loading