diff --git a/benchmarks/src/tpch/convert.rs b/benchmarks/src/tpch/convert.rs index 4b6627234334..f0a0771477fb 100644 --- a/benchmarks/src/tpch/convert.rs +++ b/benchmarks/src/tpch/convert.rs @@ -86,10 +86,9 @@ impl ConvertOpt { // Select all apart from the padding column let selection = csv .schema() - .fields() .iter() .take(schema.fields.len() - 1) - .map(|d| Expr::Column(d.qualified_column())) + .map(col) .collect(); csv = csv.select(selection)?; diff --git a/datafusion-examples/examples/expr_api.rs b/datafusion-examples/examples/expr_api.rs index 5f9f3106e14d..1e1947c7d9e5 100644 --- a/datafusion-examples/examples/expr_api.rs +++ b/datafusion-examples/examples/expr_api.rs @@ -22,7 +22,7 @@ use arrow::array::{BooleanArray, Int32Array}; use arrow::record_batch::RecordBatch; use datafusion::arrow::datatypes::{DataType, Field, Schema, TimeUnit}; -use datafusion::common::{DFField, DFSchema}; +use datafusion::common::DFSchema; use datafusion::error::Result; use datafusion::optimizer::simplify_expressions::ExprSimplifier; use datafusion::physical_expr::{ @@ -273,18 +273,16 @@ fn expression_type_demo() -> Result<()> { // a schema. In this case we create a schema where the column `c` is of // type Utf8 (a String / VARCHAR) let schema = DFSchema::new_with_metadata( - vec![DFField::new_unqualified("c", DataType::Utf8, true)], + vec![Field::new("c", DataType::Utf8, true)], HashMap::new(), - ) - .unwrap(); + ); assert_eq!("Utf8", format!("{}", expr.get_type(&schema).unwrap())); // Using a schema where the column `foo` is of type Int32 let schema = DFSchema::new_with_metadata( - vec![DFField::new_unqualified("c", DataType::Int32, true)], + vec![Field::new("c", DataType::Int32, true)], HashMap::new(), - ) - .unwrap(); + ); assert_eq!("Int32", format!("{}", expr.get_type(&schema).unwrap())); // Get the type of an expression that adds 2 columns. 
Adding an Int32 @@ -292,12 +290,11 @@ fn expression_type_demo() -> Result<()> { let expr = col("c1") + col("c2"); let schema = DFSchema::new_with_metadata( vec![ - DFField::new_unqualified("c1", DataType::Int32, true), - DFField::new_unqualified("c2", DataType::Float32, true), + Field::new("c1", DataType::Int32, true), + Field::new("c2", DataType::Float32, true), ], HashMap::new(), - ) - .unwrap(); + ); assert_eq!("Float32", format!("{}", expr.get_type(&schema).unwrap())); Ok(()) diff --git a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs index f0edc7175948..3992d381925a 100644 --- a/datafusion/common/src/column.rs +++ b/datafusion/common/src/column.rs @@ -19,7 +19,9 @@ use crate::error::_schema_err; use crate::utils::{parse_identifiers_normalized, quote_identifier}; -use crate::{DFSchema, DataFusionError, OwnedTableReference, Result, SchemaError}; +use crate::{ + DFFieldRef, DFSchema, DataFusionError, OwnedTableReference, Result, SchemaError, +}; use std::collections::HashSet; use std::convert::Infallible; use std::fmt; @@ -178,11 +180,13 @@ impl Column { } for schema in schemas { - let fields = schema.fields_with_unqualified_name(&self.name); - match fields.len() { + let mut qualified_fields = + schema.qualified_fields_with_unqualified_name(&self.name); + match qualified_fields.len() { 0 => continue, 1 => { - return Ok(fields[0].qualified_column()); + let dffield = qualified_fields.swap_remove(0); + return Ok(dffield.into()); } _ => { // More than 1 fields in this schema have their names set to self.name. @@ -198,14 +202,13 @@ impl Column { // We will use the relation from the first matched field to normalize self. 
// Compare matched fields with one USING JOIN clause at a time + let columns = schema.columns_with_unqualified_name(&self.name); for using_col in using_columns { - let all_matched = fields - .iter() - .all(|f| using_col.contains(&f.qualified_column())); + let all_matched = columns.iter().all(|f| using_col.contains(f)); // All matched fields belong to the same using column set, in orther words // the same join clause. We simply pick the qualifer from the first match. if all_matched { - return Ok(fields[0].qualified_column()); + return Ok(columns[0].clone()); } } } @@ -214,10 +217,7 @@ impl Column { _schema_err!(SchemaError::FieldNotFound { field: Box::new(Column::new(self.relation.clone(), self.name)), - valid_fields: schemas - .iter() - .flat_map(|s| s.fields().iter().map(|f| f.qualified_column())) - .collect(), + valid_fields: schemas.iter().flat_map(|s| s.columns()).collect(), }) } @@ -267,13 +267,18 @@ impl Column { } for schema_level in schemas { - let fields = schema_level + let qualified_fields = schema_level .iter() - .flat_map(|s| s.fields_with_unqualified_name(&self.name)) + .flat_map(|s| s.qualified_fields_with_unqualified_name(&self.name)) .collect::>(); - match fields.len() { + match qualified_fields.len() { 0 => continue, - 1 => return Ok(fields[0].qualified_column()), + 1 => { + return Ok(Column::new( + qualified_fields[0].owned_qualifier(), + qualified_fields[0].name(), + )) + } _ => { // More than 1 fields in this schema have their names set to self.name. // @@ -288,14 +293,16 @@ impl Column { // We will use the relation from the first matched field to normalize self. 
// Compare matched fields with one USING JOIN clause at a time + let columns = schema_level + .iter() + .flat_map(|s| s.columns_with_unqualified_name(&self.name)) + .collect::>(); for using_col in using_columns { - let all_matched = fields - .iter() - .all(|f| using_col.contains(&f.qualified_column())); + let all_matched = columns.iter().all(|c| using_col.contains(c)); // All matched fields belong to the same using column set, in orther words // the same join clause. We simply pick the qualifer from the first match. if all_matched { - return Ok(fields[0].qualified_column()); + return Ok(columns[0].clone()); } } @@ -312,12 +319,18 @@ impl Column { valid_fields: schemas .iter() .flat_map(|s| s.iter()) - .flat_map(|s| s.fields().iter().map(|f| f.qualified_column())) + .flat_map(|s| s.columns()) .collect(), }) } } +impl<'a> From> for Column { + fn from(f: DFFieldRef<'a>) -> Self { + Column::new(f.owned_qualifier(), f.name()) + } +} + impl From<&str> for Column { fn from(c: &str) -> Self { Self::from_qualified_name(c) @@ -355,36 +368,25 @@ impl fmt::Display for Column { #[cfg(test)] mod tests { use super::*; - use crate::DFField; use arrow::datatypes::DataType; - use std::collections::HashMap; - - fn create_schema(names: &[(Option<&str>, &str)]) -> Result { - let fields = names - .iter() - .map(|(qualifier, name)| { - DFField::new( - qualifier.to_owned().map(|s| s.to_string()), - name, - DataType::Boolean, - true, - ) - }) - .collect::>(); - DFSchema::new_with_metadata(fields, HashMap::new()) + use arrow_schema::{Field, SchemaBuilder}; + + fn create_qualified_schema(qualifier: &str, names: Vec<&str>) -> Result { + let mut schema_builder = SchemaBuilder::new(); + schema_builder.extend( + names + .iter() + .map(|f| Field::new(*f, DataType::Boolean, true)), + ); + let schema = Arc::new(schema_builder.finish()); + DFSchema::try_from_qualified_schema(qualifier, &schema) } #[test] fn test_normalize_with_schemas_and_ambiguity_check() -> Result<()> { - let schema1 = 
create_schema(&[(Some("t1"), "a"), (Some("t1"), "b")])?; - let schema2 = create_schema(&[(Some("t2"), "c"), (Some("t2"), "d")])?; - let schema3 = create_schema(&[ - (Some("t3"), "a"), - (Some("t3"), "b"), - (Some("t3"), "c"), - (Some("t3"), "d"), - (Some("t3"), "e"), - ])?; + let schema1 = create_qualified_schema("t1", vec!["a", "b"])?; + let schema2 = create_qualified_schema("t2", vec!["c", "d"])?; + let schema3 = create_qualified_schema("t3", vec!["a", "b", "c", "d", "e"])?; // already normalized let col = Column::new(Some("t1"), "a"); diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 90fb0b035d35..9476751098b3 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -24,16 +24,15 @@ use std::fmt::{Display, Formatter}; use std::hash::Hash; use std::sync::Arc; -use crate::error::{ - unqualified_field_not_found, DataFusionError, Result, SchemaError, _plan_err, - _schema_err, -}; +use crate::error::{DataFusionError, Result, _plan_err, _schema_err}; use crate::{ - field_not_found, Column, FunctionalDependencies, OwnedTableReference, TableReference, + field_not_found, unqualified_field_not_found, Column, FunctionalDependencies, + OwnedTableReference, SchemaError, TableReference, }; use arrow::compute::can_cast_types; -use arrow::datatypes::{DataType, Field, FieldRef, Fields, Schema, SchemaRef}; +use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef}; +use arrow_schema::{FieldRef, SchemaBuilder}; /// A reference-counted reference to a [DFSchema]. 
pub type DFSchemaRef = Arc; @@ -95,22 +94,24 @@ pub type DFSchemaRef = Arc; /// Use the `Into` trait to convert `DFSchema` into an Arrow schema: /// /// ```rust -/// use datafusion_common::{DFSchema, DFField}; +/// use datafusion_common::DFSchema; /// use arrow_schema::Schema; +/// use arrow::datatypes::Field; /// use std::collections::HashMap; /// /// let df_schema = DFSchema::new_with_metadata(vec![ -/// DFField::new_unqualified("c1", arrow::datatypes::DataType::Int32, false), -/// ], HashMap::new()).unwrap(); +/// Field::new("c1", arrow::datatypes::DataType::Int32, false), +/// ],HashMap::new()); /// let schema = Schema::from(df_schema); /// assert_eq!(schema.fields().len(), 1); /// ``` #[derive(Debug, Clone, PartialEq, Eq)] pub struct DFSchema { - /// Fields - fields: Vec, - /// Additional metadata in form of key value pairs - metadata: HashMap, + /// Inner Arrow schema reference. + inner: SchemaRef, + /// Optional qualifiers for each column in this schema. In the same order as + /// the `self.inner.fields()` + field_qualifiers: Vec>, /// Stores functional dependencies in the schema. 
functional_dependencies: FunctionalDependencies, } @@ -119,66 +120,124 @@ impl DFSchema { /// Creates an empty `DFSchema` pub fn empty() -> Self { Self { - fields: vec![], - metadata: HashMap::new(), + inner: Arc::new(Schema::new([])), + field_qualifiers: vec![], functional_dependencies: FunctionalDependencies::empty(), } } - /// Create a new `DFSchema` + /// Create a new `DFSchema` from an Arrow schema pub fn new_with_metadata( - fields: Vec, + fields: Vec, metadata: HashMap, + ) -> Self { + let field_count = fields.len(); + let schema = Arc::new(Schema::new_with_metadata(fields, metadata)); + Self { + inner: schema, + field_qualifiers: vec![None; field_count], + functional_dependencies: FunctionalDependencies::empty(), + } + } + + /// Create a `DFSchema` from an Arrow schema and a given qualifier + /// + /// To create a schema from an Arrow schema without a qualifier, use + /// `DFSchema::try_from`. + pub fn try_from_qualified_schema<'a>( + qualifier: impl Into>, + schema: &Schema, ) -> Result { + let qualifier = qualifier.into(); + let owned_qualifier = qualifier.to_owned_reference(); + let schema = DFSchema { + inner: schema.clone().into(), + field_qualifiers: vec![Some(owned_qualifier); schema.fields.len()], + functional_dependencies: FunctionalDependencies::empty(), + }; + schema.check_names()?; + Ok(schema) + } + + /// Create a `DFSchema` from an Arrow schema where all the fields have a given qualifier + pub fn from_field_specific_qualified_schema<'a>( + qualifiers: Vec>>>, + schema: &SchemaRef, + ) -> Result { + let owned_qualifiers = qualifiers + .into_iter() + .map(|qualifier| qualifier.map(|q| q.into().to_owned_reference())) + .collect(); + let dfschema = Self { + inner: schema.clone(), + field_qualifiers: owned_qualifiers, + functional_dependencies: FunctionalDependencies::empty(), + }; + dfschema.check_names()?; + Ok(dfschema) + } + + /// Create a `DFSchema` from an Arrow schema where all the fields have a given qualifier + pub fn 
from_qualified_fields( + dffields: Vec, + metadata: HashMap, + ) -> Result { + let fields = dffields.iter().map(|f| f.owned_field()).collect::>(); + let qualifiers = dffields + .iter() + .map(|f| f.owned_qualifier()) + .collect::>(); + + let schema = Arc::new(Schema::new_with_metadata(fields, metadata)); + + let dfschema = Self { + inner: schema, + field_qualifiers: qualifiers, + functional_dependencies: FunctionalDependencies::empty(), + }; + dfschema.check_names()?; + Ok(dfschema) + } + + /// Create a `DFSchema` from an Arrow schema where all the fields have a given qualifier + pub fn from_owned_qualified_fields( + field_qualifiers: Vec>, + fields: impl Into, + metadata: HashMap, + ) -> Result { + let schema = Arc::new(Schema::new_with_metadata(fields, metadata)); + let dfschema = Self { + inner: schema, + field_qualifiers, + functional_dependencies: FunctionalDependencies::empty(), + }; + dfschema.check_names()?; + Ok(dfschema) + } + + /// Check if the schema have some fields with the same name + fn check_names(&self) -> Result<()> { let mut qualified_names = BTreeSet::new(); let mut unqualified_names = BTreeSet::new(); - for field in &fields { - if let Some(qualifier) = field.qualifier() { + for (field, qualifier) in self.inner.fields().iter().zip(&self.field_qualifiers) { + if let Some(qualifier) = qualifier { qualified_names.insert((qualifier, field.name())); } else if !unqualified_names.insert(field.name()) { return _schema_err!(SchemaError::DuplicateUnqualifiedField { - name: field.name().to_string(), + name: field.name().to_string() }); } } - // Check for mix of qualified and unqualified fields with same unqualified name. - // The BTreeSet storage makes sure that errors are reported in deterministic order. 
- for (qualifier, name) in &qualified_names { + for (qualifier, name) in qualified_names { if unqualified_names.contains(name) { return _schema_err!(SchemaError::AmbiguousReference { - field: Column { - relation: Some((*qualifier).clone()), - name: name.to_string(), - } + field: Column::new(Some(qualifier.to_owned_reference()), name) }); } } - Ok(Self { - fields, - metadata, - functional_dependencies: FunctionalDependencies::empty(), - }) - } - - /// Create a `DFSchema` from an Arrow schema and a given qualifier - /// - /// To create a schema from an Arrow schema without a qualifier, use - /// `DFSchema::try_from`. - pub fn try_from_qualified_schema<'a>( - qualifier: impl Into>, - schema: &Schema, - ) -> Result { - let qualifier = qualifier.into(); - Self::new_with_metadata( - schema - .fields() - .iter() - .map(|f| DFField::from_qualified(qualifier.clone(), f.clone())) - .collect(), - schema.metadata().clone(), - ) + Ok(()) } /// Assigns functional dependencies. @@ -186,7 +245,7 @@ impl DFSchema { mut self, functional_dependencies: FunctionalDependencies, ) -> Result { - if functional_dependencies.is_valid(self.fields.len()) { + if functional_dependencies.is_valid(self.inner.fields.len()) { self.functional_dependencies = functional_dependencies; Ok(self) } else { @@ -200,50 +259,74 @@ impl DFSchema { /// Create a new schema that contains the fields from this schema followed by the fields /// from the supplied schema. An error will be returned if there are duplicate field names. 
pub fn join(&self, schema: &DFSchema) -> Result { - let mut fields = self.fields.clone(); - let mut metadata = self.metadata.clone(); - fields.extend_from_slice(schema.fields().as_slice()); - metadata.extend(schema.metadata.clone()); - Self::new_with_metadata(fields, metadata) + let mut schema_builder = SchemaBuilder::new(); + schema_builder.extend(self.inner.fields().iter().cloned()); + schema_builder.extend(schema.fields().iter().cloned()); + let new_schema = schema_builder.finish(); + + let mut new_metadata = self.inner.metadata.clone(); + new_metadata.extend(schema.inner.metadata.clone()); + let new_schema_with_metadata = new_schema.with_metadata(new_metadata); + + let mut new_qualifiers = self.field_qualifiers.clone(); + new_qualifiers.extend_from_slice(schema.field_qualifiers.as_slice()); + + let new_self = Self { + inner: Arc::new(new_schema_with_metadata), + field_qualifiers: new_qualifiers, + functional_dependencies: FunctionalDependencies::empty(), + }; + new_self.check_names()?; + Ok(new_self) } /// Modify this schema by appending the fields from the supplied schema, ignoring any /// duplicate fields. 
pub fn merge(&mut self, other_schema: &DFSchema) { - if other_schema.fields.is_empty() { + if other_schema.inner.fields.is_empty() { return; } - let self_fields: HashSet<&DFField> = self.fields.iter().collect(); - let self_unqualified_names: HashSet<&str> = - self.fields.iter().map(|x| x.name().as_str()).collect(); - - let mut fields_to_add = vec![]; + let self_fields: HashSet = + self.iter().map(|f| f.qualified_name()).collect(); - for field in other_schema.fields() { + let mut schema_builder = SchemaBuilder::from(self.inner.fields.clone()); + let mut qualifiers = Vec::new(); + for f in other_schema.iter() { // skip duplicate columns - let duplicated_field = match field.qualifier() { - Some(_) => self_fields.contains(field), - // for unqualified columns, check as unqualified name - None => self_unqualified_names.contains(field.name().as_str()), - }; + let duplicated_field = self_fields.contains(&f.qualified_name()); if !duplicated_field { - fields_to_add.push(field.clone()); + schema_builder.push(f.owned_field()); + qualifiers.push(f.owned_qualifier()); } } - self.fields.extend(fields_to_add); - self.metadata.extend(other_schema.metadata.clone()) + let mut metadata = self.inner.metadata.clone(); + metadata.extend(other_schema.inner.metadata.clone()); + + let finished = schema_builder.finish(); + let finished_with_metadata = finished.with_metadata(metadata); + self.inner = finished_with_metadata.into(); + self.field_qualifiers.extend(qualifiers); } /// Get a list of fields - pub fn fields(&self) -> &Vec { - &self.fields + pub fn fields(&self) -> &Fields { + &self.inner.fields } /// Returns an immutable reference of a specific `Field` instance selected using an /// offset within the internal `fields` vector - pub fn field(&self, i: usize) -> &DFField { - &self.fields[i] + pub fn field(&self, i: usize) -> &Field { + &self.inner.fields[i] + } + + /// Returns an immutable reference of a specific `Field` instance selected using an + /// offset within the internal 
`fields` vector and its qualifier + pub fn qualified_field(&self, i: usize) -> DFFieldRef { + DFFieldRef { + qualifier: self.field_qualifiers[i].as_ref(), + field: self.field(i), + } } pub fn index_of_column_by_name( @@ -252,21 +335,18 @@ impl DFSchema { name: &str, ) -> Result> { let mut matches = self - .fields .iter() .enumerate() - .filter(|(_, field)| match (qualifier, &field.qualifier) { + .filter(|(_, f)| match (qualifier, f.qualifier) { // field to lookup is qualified. // current field is qualified and not shared between relations, compare both // qualifier and name. - (Some(q), Some(field_q)) => { - q.resolved_eq(field_q) && field.name() == name - } + (Some(q), Some(field_q)) => q.resolved_eq(field_q) && f.name() == name, // field to lookup is qualified but current field is unqualified. (Some(qq), None) => { // the original field may now be aliased with a name that matches the // original qualified name - let column = Column::from_qualified_name(field.name()); + let column = Column::from_qualified_name(f.name()); match column { Column { relation: Some(r), @@ -276,7 +356,7 @@ impl DFSchema { } } // field to lookup is unqualified, no need to compare qualifier - (None, Some(_)) | (None, None) => field.name() == name, + (None, Some(_)) | (None, None) => f.name() == name, }) .map(|(idx, _)| idx); Ok(matches.next()) @@ -299,7 +379,7 @@ impl DFSchema { &self, qualifier: Option<&TableReference>, name: &str, - ) -> Result<&DFField> { + ) -> Result<&Field> { if let Some(qualifier) = qualifier { self.field_with_qualified_name(qualifier, name) } else { @@ -307,11 +387,33 @@ impl DFSchema { } } + /// Find the qualified field with the given name + pub fn qualified_field_with_name<'a>( + &'a self, + qualifier: Option<&TableReference>, + name: &str, + ) -> Result> { + if let Some(qualifier) = qualifier { + let idx = self + .index_of_column_by_name(Some(qualifier), name)? 
+ .ok_or_else(|| { + field_not_found(Some(qualifier.to_string()), name, self) + })?; + + Ok(DFFieldRef::new( + self.field_qualifiers[idx].as_ref(), + self.field(idx), + )) + } else { + self.qualified_field_with_unqualified_name(name) + } + } + /// Find all fields having the given qualifier - pub fn fields_with_qualified(&self, qualifier: &TableReference) -> Vec<&DFField> { - self.fields - .iter() - .filter(|field| field.qualifier().map(|q| q.eq(qualifier)).unwrap_or(false)) + pub fn fields_with_qualified(&self, qualifier: &TableReference) -> Vec<&Field> { + self.iter() + .filter(|f| f.qualifier.map(|q| q.eq(qualifier)).unwrap_or(false)) + .map(|f| f.field) .collect() } @@ -320,31 +422,82 @@ impl DFSchema { &self, qualifier: &TableReference, ) -> Vec { - self.fields - .iter() + self.iter() .enumerate() - .filter_map(|(idx, field)| { - field - .qualifier() - .and_then(|q| q.eq(qualifier).then_some(idx)) + .filter_map(|(idx, f)| { + f.qualifier.and_then(|q| q.eq(qualifier).then_some(idx)) }) .collect() } - /// Find all fields match the given name - pub fn fields_with_unqualified_name(&self, name: &str) -> Vec<&DFField> { - self.fields + /// Find all fields that match the given name + pub fn fields_with_unqualified_name(&self, name: &str) -> Vec<&Field> { + self.fields() .iter() .filter(|field| field.name() == name) + .map(|f| f.as_ref()) + .collect() + } + + /// Find all fields that match the given name and return them with their qualifier + pub fn qualified_fields_with_unqualified_name(&self, name: &str) -> Vec { + self.iter().filter(|f| f.name() == name).collect() + } + + /// Find all fields that match the given name and convert to column + pub fn columns_with_unqualified_name(&self, name: &str) -> Vec { + self.iter() + .filter(|f| f.name() == name) + .map(|f| f.into()) .collect() } + /// Return all `Column`s for the schema + pub fn columns(&self) -> Vec { + self.iter().map(|f| f.into()).collect() + } + + /// Find the qualified field with the given unqualified name 
+ pub fn qualified_field_with_unqualified_name( + &self, + name: &str, + ) -> Result { + let mut matches = self.qualified_fields_with_unqualified_name(name); + match matches.len() { + 0 => Err(unqualified_field_not_found(name, self)), + 1 => Ok(matches.swap_remove(0)), + _ => { + // When `matches` size > 1, it doesn't necessarily mean an `ambiguous name` problem. + // Because name may generate from Alias/... . It means that it don't own qualifier. + // For example: + // Join on id = b.id + // Project a.id as id TableScan b id + // In this case, there isn't `ambiguous name` problem. When `matches` just contains + // one field without qualifier, we should return it. + let mut fields_without_qualifier = matches + .into_iter() + .filter(|f| f.qualifier.is_none()) + .collect::>(); + if fields_without_qualifier.len() == 1 { + Ok(fields_without_qualifier.swap_remove(0)) + } else { + _schema_err!(SchemaError::AmbiguousReference { + field: Column { + relation: None, + name: name.to_string(), + }, + }) + } + } + } + } + /// Find the field with the given name - pub fn field_with_unqualified_name(&self, name: &str) -> Result<&DFField> { - let matches = self.fields_with_unqualified_name(name); + pub fn field_with_unqualified_name(&self, name: &str) -> Result<&Field> { + let matches = self.qualified_fields_with_unqualified_name(name); match matches.len() { 0 => Err(unqualified_field_not_found(name, self)), - 1 => Ok(matches[0]), + 1 => Ok(matches[0].field), _ => { // When `matches` size > 1, it doesn't necessarily mean an `ambiguous name` problem. // Because name may generate from Alias/... . It means that it don't own qualifier. 
@@ -358,7 +511,7 @@ impl DFSchema { .filter(|f| f.qualifier.is_none()) .collect::>(); if fields_without_qualifier.len() == 1 { - Ok(fields_without_qualifier[0]) + Ok(fields_without_qualifier[0].field) } else { _schema_err!(SchemaError::AmbiguousReference { field: Column { @@ -376,7 +529,7 @@ impl DFSchema { &self, qualifier: &TableReference, name: &str, - ) -> Result<&DFField> { + ) -> Result<&Field> { let idx = self .index_of_column_by_name(Some(qualifier), name)? .ok_or_else(|| field_not_found(Some(qualifier.to_string()), name, self))?; @@ -385,13 +538,21 @@ impl DFSchema { } /// Find the field with the given qualified column - pub fn field_from_column(&self, column: &Column) -> Result<&DFField> { + pub fn field_from_column(&self, column: &Column) -> Result<&Field> { match &column.relation { Some(r) => self.field_with_qualified_name(r, &column.name), None => self.field_with_unqualified_name(&column.name), } } + /// Find the field with the given qualified column + pub fn qualified_field_from_column<'a>( + &'a self, + column: &Column, + ) -> Result> { + self.qualified_field_with_name(column.relation.as_ref(), &column.name) + } + /// Find if the field exists with the given name pub fn has_column_with_unqualified_name(&self, name: &str) -> bool { self.fields().iter().any(|field| field.name() == name) @@ -403,9 +564,8 @@ impl DFSchema { qualifier: &TableReference, name: &str, ) -> bool { - self.fields().iter().any(|field| { - field.qualifier().map(|q| q.eq(qualifier)).unwrap_or(false) - && field.name() == name + self.iter().any(|f| { + f.qualifier.map(|q| q == qualifier).unwrap_or(false) && f.name() == name }) } @@ -419,7 +579,8 @@ impl DFSchema { /// Check to see if unqualified field names matches field names in Arrow schema pub fn matches_arrow_schema(&self, arrow_schema: &Schema) -> bool { - self.fields + self.inner + .fields .iter() .zip(arrow_schema.fields().iter()) .all(|(dffield, arrowfield)| dffield.name() == arrowfield.name()) @@ -457,8 +618,8 @@ impl 
DFSchema { if self.fields().len() != other.fields().len() { return false; } - let self_fields = self.fields().iter(); - let other_fields = other.fields().iter(); + let self_fields = self.iter(); + let other_fields = other.iter(); self_fields.zip(other_fields).all(|(f1, f2)| { f1.qualifier() == f2.qualifier() && f1.name() == f2.name() @@ -479,8 +640,8 @@ impl DFSchema { if self.fields().len() != other.fields().len() { return false; } - let self_fields = self.fields().iter(); - let other_fields = other.fields().iter(); + let self_fields = self.iter(); + let other_fields = other.iter(); self_fields.zip(other_fields).all(|(f1, f2)| { f1.qualifier() == f2.qualifier() && f1.name() == f2.name() @@ -588,12 +749,9 @@ impl DFSchema { /// Strip all field qualifier in schema pub fn strip_qualifiers(self) -> Self { DFSchema { - fields: self - .fields - .into_iter() - .map(|f| f.strip_qualifier()) - .collect(), - ..self + field_qualifiers: vec![None; self.inner.fields.len()], + inner: self.inner, + functional_dependencies: self.functional_dependencies, } } @@ -601,47 +759,73 @@ impl DFSchema { pub fn replace_qualifier(self, qualifier: impl Into) -> Self { let qualifier = qualifier.into(); DFSchema { - fields: self - .fields - .into_iter() - .map(|f| DFField::from_qualified(qualifier.clone(), f.field)) - .collect(), - ..self + field_qualifiers: vec![Some(qualifier); self.inner.fields.len()], + inner: self.inner, + functional_dependencies: self.functional_dependencies, } } /// Get list of fully-qualified field names in this schema pub fn field_names(&self) -> Vec { - self.fields - .iter() - .map(|f| f.qualified_name()) - .collect::>() + self.iter().map(|f| f.qualified_name()).collect::>() } /// Get metadata of this schema pub fn metadata(&self) -> &HashMap { - &self.metadata + &self.inner.metadata } /// Get functional dependencies pub fn functional_dependencies(&self) -> &FunctionalDependencies { &self.functional_dependencies } + + /// Iterate over the qualifiers and fields in the 
DFSchema and return DFFieldRef. + pub fn iter(&self) -> impl Iterator> { + self.field_qualifiers + .iter() + .zip(self.inner.fields()) + .map(|(qualifier, field)| DFFieldRef::new(qualifier.as_ref(), field)) + } + + pub fn iter_owned( + &self, + ) -> impl Iterator, FieldRef)> + '_ { + self.field_qualifiers + .iter() + .zip(self.inner.fields()) + .map(|(qualifier, field)| (qualifier.as_ref().cloned(), field.to_owned())) + } + + /// Special cases for FieldRef that we need Arc to avoid too many clone + pub fn iter_dffield_with_arc(&self) -> impl Iterator> { + self.field_qualifiers + .iter() + .zip(self.inner.fields()) + .map(|(qualifier, field)| DFFieldRefWithArc::new(qualifier.as_ref(), field)) + } + + /// Iterate over the qualifiers in the DFSchema and return TableReference. + pub fn iter_qualifier(&self) -> impl Iterator>> { + self.field_qualifiers + .iter() + .map(|qualifier| qualifier.as_ref()) + } } impl From for Schema { /// Convert DFSchema into a Schema fn from(df_schema: DFSchema) -> Self { - let fields: Fields = df_schema.fields.into_iter().map(|f| f.field).collect(); - Schema::new_with_metadata(fields, df_schema.metadata) + let fields: Fields = df_schema.inner.fields.clone(); + Schema::new_with_metadata(fields, df_schema.inner.metadata.clone()) } } impl From<&DFSchema> for Schema { /// Convert DFSchema reference into a Schema fn from(df_schema: &DFSchema) -> Self { - let fields: Fields = df_schema.fields.iter().map(|f| f.field.clone()).collect(); - Schema::new_with_metadata(fields, df_schema.metadata.clone()) + let fields: Fields = df_schema.inner.fields.clone(); + Schema::new_with_metadata(fields, df_schema.inner.metadata.clone()) } } @@ -649,14 +833,13 @@ impl From<&DFSchema> for Schema { impl TryFrom for DFSchema { type Error = DataFusionError; fn try_from(schema: Schema) -> Result { - Self::new_with_metadata( - schema - .fields() - .iter() - .map(|f| DFField::from(f.clone())) - .collect(), - schema.metadata().clone(), - ) + let field_count = 
schema.fields.len(); + let dfschema = Self { + inner: schema.into(), + field_qualifiers: vec![None; field_count], + functional_dependencies: FunctionalDependencies::empty(), + }; + Ok(dfschema) } } @@ -669,8 +852,8 @@ impl From for SchemaRef { // Hashing refers to a subset of fields considered in PartialEq. impl Hash for DFSchema { fn hash(&self, state: &mut H) { - self.fields.hash(state); - self.metadata.len().hash(state); // HashMap is not hashable + self.inner.fields.hash(state); + self.inner.metadata.len().hash(state); // HashMap is not hashable } } @@ -705,9 +888,19 @@ impl ToDFSchema for SchemaRef { } } -impl ToDFSchema for Vec { +impl ToDFSchema for Vec { fn to_dfschema(self) -> Result { - DFSchema::new_with_metadata(self, HashMap::new()) + let field_count = self.len(); + let schema = Schema { + fields: self.into(), + metadata: HashMap::new(), + }; + let dfschema = DFSchema { + inner: schema.into(), + field_qualifiers: vec![None; field_count], + functional_dependencies: FunctionalDependencies::empty(), + }; + Ok(dfschema) } } @@ -716,12 +909,11 @@ impl Display for DFSchema { write!( f, "fields:[{}], metadata:{:?}", - self.fields - .iter() - .map(|field| field.qualified_name()) + self.iter() + .map(|f| f.qualified_name()) .collect::>() .join(", "), - self.metadata + self.inner.metadata ) } } @@ -783,138 +975,6 @@ impl ExprSchema for DFSchema { } } -/// DFField wraps an Arrow field and adds an optional qualifier -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct DFField { - /// Optional qualifier (usually a table or relation name) - qualifier: Option, - /// Arrow field definition - field: FieldRef, -} - -impl DFField { - /// Creates a new `DFField` - pub fn new>( - qualifier: Option, - name: &str, - data_type: DataType, - nullable: bool, - ) -> Self { - DFField { - qualifier: qualifier.map(|s| s.into()), - field: Arc::new(Field::new(name, data_type, nullable)), - } - } - - /// Convenience method for creating new `DFField` without a qualifier - pub fn 
new_unqualified(name: &str, data_type: DataType, nullable: bool) -> Self { - DFField { - qualifier: None, - field: Arc::new(Field::new(name, data_type, nullable)), - } - } - - /// Create a qualified field from an existing Arrow field - pub fn from_qualified<'a>( - qualifier: impl Into>, - field: impl Into, - ) -> Self { - Self { - qualifier: Some(qualifier.into().to_owned_reference()), - field: field.into(), - } - } - - /// Returns an immutable reference to the `DFField`'s unqualified name - pub fn name(&self) -> &String { - self.field.name() - } - - /// Returns an immutable reference to the `DFField`'s data-type - pub fn data_type(&self) -> &DataType { - self.field.data_type() - } - - /// Indicates whether this `DFField` supports null values - pub fn is_nullable(&self) -> bool { - self.field.is_nullable() - } - - pub fn metadata(&self) -> &HashMap { - self.field.metadata() - } - - /// Returns a string to the `DFField`'s qualified name - pub fn qualified_name(&self) -> String { - if let Some(qualifier) = &self.qualifier { - format!("{}.{}", qualifier, self.field.name()) - } else { - self.field.name().to_owned() - } - } - - /// Builds a qualified column based on self - pub fn qualified_column(&self) -> Column { - Column { - relation: self.qualifier.clone(), - name: self.field.name().to_string(), - } - } - - /// Builds an unqualified column based on self - pub fn unqualified_column(&self) -> Column { - Column { - relation: None, - name: self.field.name().to_string(), - } - } - - /// Get the optional qualifier - pub fn qualifier(&self) -> Option<&OwnedTableReference> { - self.qualifier.as_ref() - } - - /// Get the arrow field - pub fn field(&self) -> &FieldRef { - &self.field - } - - /// Return field with qualifier stripped - pub fn strip_qualifier(mut self) -> Self { - self.qualifier = None; - self - } - - /// Return field with nullable specified - pub fn with_nullable(mut self, nullable: bool) -> Self { - let f = 
self.field().as_ref().clone().with_nullable(nullable); - self.field = f.into(); - self - } - - /// Return field with new metadata - pub fn with_metadata(mut self, metadata: HashMap) -> Self { - let f = self.field().as_ref().clone().with_metadata(metadata); - self.field = f.into(); - self - } -} - -impl From for DFField { - fn from(value: FieldRef) -> Self { - Self { - qualifier: None, - field: value, - } - } -} - -impl From for DFField { - fn from(value: Field) -> Self { - Self::from(Arc::new(value)) - } -} - /// DataFusion-specific extensions to [`Schema`]. pub trait SchemaExt { /// This is a specialized version of Eq that ignores differences @@ -967,6 +1027,94 @@ impl SchemaExt for Schema { } } +pub struct DFFieldRef<'a> { + /// Optional qualifier (usually a table or relation name) + qualifier: Option<&'a OwnedTableReference>, + /// Arrow field definition + field: &'a Field, +} + +pub struct DFFieldRefWithArc<'a> { + /// Optional qualifier (usually a table or relation name) + qualifier: Option<&'a OwnedTableReference>, + /// Arrow field definition + field: &'a FieldRef, +} + +impl<'a> DFFieldRefWithArc<'a> { + pub fn new(qualifier: Option<&'a OwnedTableReference>, field: &'a FieldRef) -> Self { + Self { qualifier, field } + } + + pub fn owned_field(&self) -> FieldRef { + self.field.to_owned() + } + + pub fn field(&self) -> &FieldRef { + self.field + } + + pub fn owned_qualifier(&self) -> Option { + self.qualifier.cloned() + } + + pub fn qualifier(&self) -> Option<&TableReference> { + self.qualifier + } +} + +impl<'a> DFFieldRef<'a> { + pub fn new(qualifier: Option<&'a OwnedTableReference>, field: &'a Field) -> Self { + Self { qualifier, field } + } + + pub fn new_with_longer_lifetime_field<'b>( + qualifier: Option<&'a OwnedTableReference>, + field: &'b Field, + ) -> Self + where + 'b: 'a, + { + Self { qualifier, field } + } + + pub fn owned_field(&self) -> Field { + self.field.to_owned() + } + + pub fn field(&self) -> &Field { + self.field + } + + pub fn 
name(&self) -> &String { + self.field.name() + } + + pub fn is_nullable(&self) -> bool { + self.field.is_nullable() + } + + pub fn data_type(&self) -> &DataType { + self.field.data_type() + } + + pub fn owned_qualifier(&self) -> Option { + self.qualifier.cloned() + } + + pub fn qualifier(&self) -> Option<&TableReference> { + self.qualifier + } + + pub fn qualified_name(&self) -> String { + if let Some(q) = self.qualifier { + format!("{}.{}", q, self.name()) + } else { + self.name().to_owned() + } + } +} + #[cfg(test)] mod tests { use crate::assert_contains; @@ -1007,22 +1155,6 @@ mod tests { Ok(()) } - #[test] - fn from_unqualified_field() { - let field = Field::new("c0", DataType::Boolean, true); - let field = DFField::from(field); - assert_eq!("c0", field.name()); - assert_eq!("c0", field.qualified_name()); - } - - #[test] - fn from_qualified_field() { - let field = Field::new("c0", DataType::Boolean, true); - let field = DFField::from_qualified("t1", field); - assert_eq!("c0", field.name()); - assert_eq!("t1.c0", field.qualified_name()); - } - #[test] fn from_unqualified_schema() -> Result<()> { let schema = DFSchema::try_from(test_schema_1())?; @@ -1037,6 +1169,36 @@ mod tests { Ok(()) } + #[test] + fn test_from_field_specific_qualified_schema() -> Result<()> { + let schema = DFSchema::from_field_specific_qualified_schema( + vec![Some("t1"), None], + &Arc::new(Schema::new(vec![ + Field::new("c0", DataType::Boolean, true), + Field::new("c1", DataType::Boolean, true), + ])), + )?; + assert_eq!("fields:[t1.c0, c1], metadata:{}", schema.to_string()); + Ok(()) + } + + #[test] + fn test_from_qualified_fields() -> Result<()> { + let qualifier: OwnedTableReference = "t0".into(); + let field_c0 = Field::new("c0", DataType::Boolean, true); + let field_c1 = Field::new("c1", DataType::Boolean, true); + + let schema = DFSchema::from_qualified_fields( + vec![ + DFFieldRef::new(Some(&qualifier), &field_c0), + DFFieldRef::new(None, &field_c1), + ], + HashMap::new(), + )?; + 
assert_eq!("fields:[t0.c0, c1], metadata:{}", schema.to_string()); + Ok(()) + } + #[test] fn from_qualified_schema_into_arrow_schema() -> Result<()> { let schema = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?; @@ -1168,252 +1330,6 @@ mod tests { assert_eq!(err.strip_backtrace(), "Schema error: No field named c0."); } - #[test] - fn equivalent_names_and_types() { - let arrow_field1 = Field::new("f1", DataType::Int16, true); - let arrow_field1_meta = arrow_field1.clone().with_metadata(test_metadata_n(2)); - - let field1_i16_t = DFField::from(arrow_field1); - let field1_i16_t_meta = DFField::from(arrow_field1_meta); - let field1_i16_t_qualified = - DFField::from_qualified("foo", field1_i16_t.field().clone()); - let field1_i16_f = DFField::from(Field::new("f1", DataType::Int16, false)); - let field1_i32_t = DFField::from(Field::new("f1", DataType::Int32, true)); - let field2_i16_t = DFField::from(Field::new("f2", DataType::Int16, true)); - let field3_i16_t = DFField::from(Field::new("f3", DataType::Int16, true)); - - let dict = - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)); - let field_dict_t = DFField::from(Field::new("f_dict", dict.clone(), true)); - let field_dict_f = DFField::from(Field::new("f_dict", dict, false)); - - let list_t = DFField::from(Field::new_list( - "f_list", - field1_i16_t.field().clone(), - true, - )); - let list_f = DFField::from(Field::new_list( - "f_list", - field1_i16_f.field().clone(), - false, - )); - - let list_f_name = DFField::from(Field::new_list( - "f_list", - field2_i16_t.field().clone(), - false, - )); - - let struct_t = DFField::from(Field::new_struct( - "f_struct", - vec![field1_i16_t.field().clone()], - true, - )); - let struct_f = DFField::from(Field::new_struct( - "f_struct", - vec![field1_i16_f.field().clone()], - false, - )); - - let struct_f_meta = DFField::from(Field::new_struct( - "f_struct", - vec![field1_i16_t_meta.field().clone()], - false, - )); - - let struct_f_type = 
DFField::from(Field::new_struct( - "f_struct", - vec![field1_i32_t.field().clone()], - false, - )); - - // same - TestCase { - fields1: vec![&field1_i16_t], - fields2: vec![&field1_i16_t], - expected_dfschema: true, - expected_arrow: true, - } - .run(); - - // same but metadata is different, should still be true - TestCase { - fields1: vec![&field1_i16_t_meta], - fields2: vec![&field1_i16_t], - expected_dfschema: true, - expected_arrow: true, - } - .run(); - - // different name - TestCase { - fields1: vec![&field1_i16_t], - fields2: vec![&field2_i16_t], - expected_dfschema: false, - expected_arrow: false, - } - .run(); - - // different type - TestCase { - fields1: vec![&field1_i16_t], - fields2: vec![&field1_i32_t], - expected_dfschema: false, - expected_arrow: false, - } - .run(); - - // different nullability - TestCase { - fields1: vec![&field1_i16_t], - fields2: vec![&field1_i16_f], - expected_dfschema: true, - expected_arrow: true, - } - .run(); - - // different qualifier - TestCase { - fields1: vec![&field1_i16_t], - fields2: vec![&field1_i16_t_qualified], - expected_dfschema: false, - expected_arrow: true, - } - .run(); - - // different name after first - TestCase { - fields1: vec![&field2_i16_t, &field1_i16_t], - fields2: vec![&field2_i16_t, &field3_i16_t], - expected_dfschema: false, - expected_arrow: false, - } - .run(); - - // different number - TestCase { - fields1: vec![&field1_i16_t, &field2_i16_t], - fields2: vec![&field1_i16_t], - expected_dfschema: false, - expected_arrow: false, - } - .run(); - - // dictionary - TestCase { - fields1: vec![&field_dict_t], - fields2: vec![&field_dict_t], - expected_dfschema: true, - expected_arrow: true, - } - .run(); - - // dictionary (different nullable) - TestCase { - fields1: vec![&field_dict_t], - fields2: vec![&field_dict_f], - expected_dfschema: true, - expected_arrow: true, - } - .run(); - - // dictionary (wrong type) - TestCase { - fields1: vec![&field_dict_t], - fields2: vec![&field1_i16_t], - 
expected_dfschema: false, - expected_arrow: false, - } - .run(); - - // list (different embedded nullability) - TestCase { - fields1: vec![&list_t], - fields2: vec![&list_f], - expected_dfschema: true, - expected_arrow: true, - } - .run(); - - // list (different sub field names) - TestCase { - fields1: vec![&list_t], - fields2: vec![&list_f_name], - expected_dfschema: false, - expected_arrow: false, - } - .run(); - - // struct - TestCase { - fields1: vec![&struct_t], - fields2: vec![&struct_f], - expected_dfschema: true, - expected_arrow: true, - } - .run(); - - // struct (different embedded meta) - TestCase { - fields1: vec![&struct_t], - fields2: vec![&struct_f_meta], - expected_dfschema: true, - expected_arrow: true, - } - .run(); - - // struct (different field type) - TestCase { - fields1: vec![&struct_t], - fields2: vec![&struct_f_type], - expected_dfschema: false, - expected_arrow: false, - } - .run(); - - #[derive(Debug)] - struct TestCase<'a> { - fields1: Vec<&'a DFField>, - fields2: Vec<&'a DFField>, - expected_dfschema: bool, - expected_arrow: bool, - } - - impl<'a> TestCase<'a> { - fn run(self) { - println!("Running {self:#?}"); - let schema1 = to_df_schema(self.fields1); - let schema2 = to_df_schema(self.fields2); - assert_eq!( - schema1.equivalent_names_and_types(&schema2), - self.expected_dfschema, - "Comparison did not match expected: {}\n\n\ - schema1:\n\n{:#?}\n\nschema2:\n\n{:#?}", - self.expected_dfschema, - schema1, - schema2 - ); - - let arrow_schema1 = Schema::from(schema1); - let arrow_schema2 = Schema::from(schema2); - assert_eq!( - arrow_schema1.equivalent_names_and_types(&arrow_schema2), - self.expected_arrow, - "Comparison did not match expected: {}\n\n\ - arrow schema1:\n\n{:#?}\n\n arrow schema2:\n\n{:#?}", - self.expected_arrow, - arrow_schema1, - arrow_schema2 - ); - } - } - - fn to_df_schema(fields: Vec<&DFField>) -> DFSchema { - let fields = fields.into_iter().cloned().collect(); - DFSchema::new_with_metadata(fields, 
HashMap::new()).unwrap() - } - } - #[test] fn into() { // Demonstrate how to convert back and forth between Schema, SchemaRef, DFSchema, and DFSchemaRef @@ -1424,11 +1340,11 @@ mod tests { ); let arrow_schema_ref = Arc::new(arrow_schema.clone()); - let df_schema = DFSchema::new_with_metadata( - vec![DFField::new_unqualified("c0", DataType::Int64, true)], - metadata, - ) - .unwrap(); + let df_schema = DFSchema { + inner: arrow_schema_ref.clone(), + field_qualifiers: vec![None; arrow_schema_ref.fields.len()], + functional_dependencies: FunctionalDependencies::empty(), + }; let df_schema_ref = Arc::new(df_schema.clone()); { @@ -1468,16 +1384,15 @@ mod tests { b_metadata.insert("key".to_string(), "value".to_string()); let b_field = Field::new("b", DataType::Int64, false).with_metadata(b_metadata); - let a: DFField = DFField::from_qualified("table1", a_field); - let b: DFField = DFField::from_qualified("table1", b_field); + let schema = Arc::new(Schema::new(vec![a_field, b_field])); - let df_schema = Arc::new( - DFSchema::new_with_metadata([a, b].to_vec(), HashMap::new()).unwrap(), - ); - let schema: Schema = df_schema.as_ref().clone().into(); - let a_df = df_schema.fields.first().unwrap().field(); - let a_arrow = schema.fields.first().unwrap(); - assert_eq!(a_df.metadata(), a_arrow.metadata()) + let df_schema = DFSchema { + inner: schema.clone(), + field_qualifiers: vec![None; schema.fields.len()], + functional_dependencies: FunctionalDependencies::empty(), + }; + + assert_eq!(df_schema.inner.metadata(), schema.metadata()) } #[test] diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs index d1e47b473499..2c0950dbaa90 100644 --- a/datafusion/common/src/error.rs +++ b/datafusion/common/src/error.rs @@ -612,11 +612,7 @@ pub fn field_not_found>( ) -> DataFusionError { schema_datafusion_err!(SchemaError::FieldNotFound { field: Box::new(Column::new(qualifier, name)), - valid_fields: schema - .fields() - .iter() - .map(|f| f.qualified_column()) - 
.collect(), + valid_fields: schema.columns().to_vec(), }) } @@ -624,11 +620,7 @@ pub fn field_not_found>( pub fn unqualified_field_not_found(name: &str, schema: &DFSchema) -> DataFusionError { schema_datafusion_err!(SchemaError::FieldNotFound { field: Box::new(Column::new_unqualified(name)), - valid_fields: schema - .fields() - .iter() - .map(|f| f.qualified_column()) - .collect(), + valid_fields: schema.columns().to_vec(), }) } diff --git a/datafusion/common/src/functional_dependencies.rs b/datafusion/common/src/functional_dependencies.rs index 1cb1751d713e..65cd0e8af3b2 100644 --- a/datafusion/common/src/functional_dependencies.rs +++ b/datafusion/common/src/functional_dependencies.rs @@ -78,11 +78,9 @@ impl Constraints { .iter() .map(|pk| { let idx = df_schema - .fields() + .field_names() .iter() - .position(|item| { - item.qualified_name() == pk.value.clone() - }) + .position(|item| *item == pk.value) .ok_or_else(|| { DataFusionError::Execution( "Primary key doesn't exist".to_string(), @@ -452,7 +450,7 @@ pub fn aggregate_functional_dependencies( aggr_schema: &DFSchema, ) -> FunctionalDependencies { let mut aggregate_func_dependencies = vec![]; - let aggr_input_fields = aggr_input_schema.fields(); + let aggr_input_fields = aggr_input_schema.field_names(); let aggr_fields = aggr_schema.fields(); // Association covers the whole table: let target_indices = (0..aggr_schema.fields().len()).collect::>(); @@ -470,14 +468,14 @@ pub fn aggregate_functional_dependencies( let mut new_source_field_names = vec![]; let source_field_names = source_indices .iter() - .map(|&idx| aggr_input_fields[idx].qualified_name()) + .map(|&idx| &aggr_input_fields[idx]) .collect::>(); for (idx, group_by_expr_name) in group_by_expr_names.iter().enumerate() { // When one of the input determinant expressions matches with // the GROUP BY expression, add the index of the GROUP BY // expression as a new determinant key: - if source_field_names.contains(group_by_expr_name) { + if 
source_field_names.contains(&group_by_expr_name) { new_source_indices.push(idx); new_source_field_names.push(group_by_expr_name.clone()); } @@ -538,11 +536,7 @@ pub fn get_target_functional_dependencies( ) -> Option> { let mut combined_target_indices = HashSet::new(); let dependencies = schema.functional_dependencies(); - let field_names = schema - .fields() - .iter() - .map(|item| item.qualified_name()) - .collect::>(); + let field_names = schema.field_names(); for FunctionalDependence { source_indices, target_indices, @@ -551,7 +545,7 @@ pub fn get_target_functional_dependencies( { let source_key_names = source_indices .iter() - .map(|id_key_idx| field_names[*id_key_idx].clone()) + .map(|id_key_idx| &field_names[*id_key_idx]) .collect::>(); // If the GROUP BY expression contains a determinant key, we can use // the associated fields after aggregation even if they are not part @@ -577,11 +571,7 @@ pub fn get_required_group_by_exprs_indices( group_by_expr_names: &[String], ) -> Option> { let dependencies = schema.functional_dependencies(); - let field_names = schema - .fields() - .iter() - .map(|item| item.qualified_name()) - .collect::>(); + let field_names = schema.field_names(); let mut groupby_expr_indices = group_by_expr_names .iter() .map(|group_by_expr_name| { diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs index da7d6579bfe6..c7954a083874 100644 --- a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -46,7 +46,9 @@ pub mod utils; /// Reexport arrow crate pub use arrow; pub use column::Column; -pub use dfschema::{DFField, DFSchema, DFSchemaRef, ExprSchema, SchemaExt, ToDFSchema}; +pub use dfschema::{ + DFFieldRef, DFSchema, DFSchemaRef, ExprSchema, SchemaExt, ToDFSchema, +}; pub use error::{ field_not_found, unqualified_field_not_found, DataFusionError, Result, SchemaError, SharedResult, diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index eea5fc1127ce..30818bae81e0 100644 
--- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -195,12 +195,13 @@ impl DataFrame { pub fn select_columns(self, columns: &[&str]) -> Result { let fields = columns .iter() - .map(|name| self.plan.schema().field_with_unqualified_name(name)) + .map(|name| { + self.plan + .schema() + .qualified_field_with_unqualified_name(name) + }) .collect::>>()?; - let expr: Vec = fields - .iter() - .map(|f| Expr::Column(f.qualified_column())) - .collect(); + let expr: Vec = fields.into_iter().map(col).collect(); self.select(expr) } @@ -1240,14 +1241,13 @@ impl DataFrame { let mut col_exists = false; let mut fields: Vec = plan .schema() - .fields() .iter() .map(|f| { if f.name() == name { col_exists = true; new_column.clone() } else { - col(f.qualified_column()) + col(f) } }) .collect(); @@ -1298,24 +1298,27 @@ impl DataFrame { Column::from_qualified_name_ignore_case(old_name) }; - let field_to_rename = match self.plan.schema().field_from_column(&old_column) { - Ok(field) => field, - // no-op if field not found - Err(DataFusionError::SchemaError(SchemaError::FieldNotFound { .. }, _)) => { - return Ok(self) - } - Err(err) => return Err(err), - }; + let rename_field = + match self.plan.schema().qualified_field_from_column(&old_column) { + Ok(qualifier_and_field) => qualifier_and_field, + // no-op if field not found + Err(DataFusionError::SchemaError( + SchemaError::FieldNotFound { .. 
}, + _, + )) => return Ok(self), + Err(err) => return Err(err), + }; let projection = self .plan .schema() - .fields() .iter() - .map(|f| { - if f == field_to_rename { - col(f.qualified_column()).alias(new_name) + .map(|dffield| { + if dffield.qualifier() == rename_field.qualifier() + && dffield.field() == rename_field.field() + { + col(dffield).alias(new_name) } else { - col(f.qualified_column()) + col(dffield) } }) .collect::>(); diff --git a/datafusion/core/src/datasource/listing/helpers.rs b/datafusion/core/src/datasource/listing/helpers.rs index c53e8df35de8..e9864876f937 100644 --- a/datafusion/core/src/datasource/listing/helpers.rs +++ b/datafusion/core/src/datasource/listing/helpers.rs @@ -31,13 +31,15 @@ use arrow::{ record_batch::RecordBatch, }; use arrow_schema::Fields; -use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; -use datafusion_common::{internal_err, Column, DFField, DFSchema, DataFusionError}; use datafusion_expr::execution_props::ExecutionProps; +use futures::stream::FuturesUnordered; +use futures::{stream::BoxStream, StreamExt, TryStreamExt}; +use log::{debug, trace}; + +use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; +use datafusion_common::{internal_err, Column, DFSchema, DataFusionError}; use datafusion_expr::{Expr, ScalarFunctionDefinition, Volatility}; use datafusion_physical_expr::create_physical_expr; -use futures::stream::{BoxStream, FuturesUnordered, StreamExt, TryStreamExt}; -use log::{debug, trace}; use object_store::path::Path; use object_store::{ObjectMeta, ObjectStore}; @@ -270,10 +272,10 @@ async fn prune_partitions( let df_schema = DFSchema::new_with_metadata( partition_cols .iter() - .map(|(n, d)| DFField::new_unqualified(n, d.clone(), true)) + .map(|(n, d)| Field::new(n, d.clone(), true)) .collect(), Default::default(), - )?; + ); let batch = RecordBatch::try_new(schema.clone(), arrays)?; diff --git a/datafusion/core/src/datasource/view.rs b/datafusion/core/src/datasource/view.rs index 
85fb8939886c..4cdf39b8cd51 100644 --- a/datafusion/core/src/datasource/view.rs +++ b/datafusion/core/src/datasource/view.rs @@ -21,7 +21,7 @@ use std::{any::Any, sync::Arc}; use arrow::datatypes::SchemaRef; use async_trait::async_trait; -use datafusion_expr::{LogicalPlanBuilder, TableProviderFilterPushDown}; +use datafusion_expr::{col, LogicalPlanBuilder, TableProviderFilterPushDown}; use crate::{ error::Result, @@ -126,9 +126,8 @@ impl TableProvider for ViewTable { let fields: Vec = projection .iter() .map(|i| { - Expr::Column( - self.logical_plan.schema().field(*i).qualified_column(), - ) + let dffield = self.logical_plan.schema().qualified_field(*i); + col(dffield) }) .collect(); plan.project(fields)? diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index ca708b05823e..a051214afda1 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -87,7 +87,7 @@ use datafusion_expr::expr::{ use datafusion_expr::expr_rewriter::unnormalize_cols; use datafusion_expr::logical_plan::builder::wrap_projection_for_join_if_necessary; use datafusion_expr::{ - DescribeTable, DmlStatement, RecursiveQuery, ScalarFunctionDefinition, + col, DescribeTable, DmlStatement, RecursiveQuery, ScalarFunctionDefinition, StringifiedPlan, WindowFrame, WindowFrameBound, WriteOp, }; use datafusion_physical_expr::expressions::Literal; @@ -1028,11 +1028,8 @@ impl DefaultPhysicalPlanner { // Remove temporary projected columns let join_plan = if added_project { let final_join_result = join_schema - .fields() .iter() - .map(|field| { - Expr::Column(field.qualified_column()) - }) + .map(col) .collect::>(); let projection = Projection::try_new( @@ -1095,14 +1092,14 @@ impl DefaultPhysicalPlanner { let (filter_df_fields, filter_fields): (Vec<_>, Vec<_>) = left_field_indices.clone() .into_iter() .map(|i| ( - left_df_schema.field(i).clone(), + left_df_schema.qualified_field(i), physical_left.schema().field(i).clone(), )) 
.chain( right_field_indices.clone() .into_iter() .map(|i| ( - right_df_schema.field(i).clone(), + right_df_schema.qualified_field(i), physical_right.schema().field(i).clone(), )) ) @@ -1110,7 +1107,7 @@ impl DefaultPhysicalPlanner { // Construct intermediate schemas used for filtering data and // convert logical expression to physical according to filter schema - let filter_df_schema = DFSchema::new_with_metadata(filter_df_fields, HashMap::new())?; + let filter_df_schema = DFSchema::from_qualified_fields(filter_df_fields, HashMap::new())?; let filter_schema = Schema::new_with_metadata(filter_fields, HashMap::new()); let filter_expr = create_physical_expr( expr, @@ -2021,9 +2018,7 @@ mod tests { use arrow::array::{ArrayRef, DictionaryArray, Int32Array}; use arrow::datatypes::{DataType, Field, Int32Type, SchemaRef}; use arrow::record_batch::RecordBatch; - use datafusion_common::{ - assert_contains, DFField, DFSchema, DFSchemaRef, TableReference, - }; + use datafusion_common::{assert_contains, DFSchema, DFSchemaRef, TableReference}; use datafusion_execution::runtime_env::RuntimeEnv; use datafusion_execution::TaskContext; use datafusion_expr::{ @@ -2266,25 +2261,23 @@ mod tests { .await; let expected_error: &str = "Error during planning: \ - Extension planner for NoOp created an ExecutionPlan with mismatched schema. \ - LogicalPlan schema: DFSchema { fields: [\ - DFField { qualifier: None, field: Field { \ - name: \"a\", \ + Extension planner for NoOp created an ExecutionPlan with mismatched schema. 
\ + LogicalPlan schema: \ + DFSchema { inner: Schema { fields: \ + [Field { name: \"a\", \ data_type: Int32, \ nullable: false, \ dict_id: 0, \ - dict_is_ordered: false, \ - metadata: {} } }\ - ], metadata: {}, functional_dependencies: FunctionalDependencies { deps: [] } }, \ - ExecutionPlan schema: Schema { fields: [\ - Field { \ - name: \"b\", \ + dict_is_ordered: false, metadata: {} }], \ + metadata: {} }, field_qualifiers: [None], \ + functional_dependencies: FunctionalDependencies { deps: [] } }, \ + ExecutionPlan schema: Schema { fields: \ + [Field { name: \"b\", \ data_type: Int32, \ nullable: false, \ dict_id: 0, \ - dict_is_ordered: false, \ - metadata: {} }\ - ], metadata: {} }"; + dict_is_ordered: false, metadata: {} }], \ + metadata: {} }"; match plan { Ok(_) => panic!("Expected planning failure"), Err(e) => assert!( @@ -2547,13 +2540,10 @@ mod tests { impl Default for NoOpExtensionNode { fn default() -> Self { Self { - schema: DFSchemaRef::new( - DFSchema::new_with_metadata( - vec![DFField::new_unqualified("a", DataType::Int32, false)], - HashMap::new(), - ) - .unwrap(), - ), + schema: DFSchemaRef::new(DFSchema::new_with_metadata( + vec![Field::new("a", DataType::Int32, false)], + HashMap::new(), + )), } } } diff --git a/datafusion/expr/src/expr_rewriter/mod.rs b/datafusion/expr/src/expr_rewriter/mod.rs index 7a227a91c455..8fcc7c5fc2ce 100644 --- a/datafusion/expr/src/expr_rewriter/mod.rs +++ b/datafusion/expr/src/expr_rewriter/mod.rs @@ -21,6 +21,7 @@ use std::collections::HashMap; use std::collections::HashSet; use std::sync::Arc; +use crate::col; use crate::expr::{Alias, Unnest}; use crate::logical_plan::Projection; use crate::{Expr, ExprSchemable, LogicalPlan, LogicalPlanBuilder}; @@ -210,12 +211,7 @@ pub fn coerce_plan_expr_for_schema( Ok(LogicalPlan::Projection(projection)) } _ => { - let exprs: Vec = plan - .schema() - .fields() - .iter() - .map(|field| Expr::Column(field.qualified_column())) - .collect(); + let exprs: Vec = 
plan.schema().iter().map(col).collect(); let new_exprs = coerce_exprs_for_schema(exprs, plan.schema(), schema)?; let add_project = new_exprs.iter().any(|expr| expr.try_into_col().is_err()); @@ -283,10 +279,9 @@ mod test { use super::*; use crate::expr::Sort; use crate::{col, lit, Cast}; - - use arrow::datatypes::DataType; + use arrow::datatypes::{DataType, Field, Schema}; use datafusion_common::tree_node::{TreeNode, TreeNodeRewriter}; - use datafusion_common::{DFField, DFSchema, ScalarValue}; + use datafusion_common::{DFSchema, OwnedTableReference, ScalarValue}; #[derive(Default)] struct RecordingRewriter { @@ -347,20 +342,21 @@ mod test { let expr = col("a") + col("b") + col("c"); // Schemas with some matching and some non matching cols - let schema_a = make_schema_with_empty_metadata(vec![ - make_field("tableA", "a"), - make_field("tableA", "aa"), - ]); - let schema_c = make_schema_with_empty_metadata(vec![ - make_field("tableC", "cc"), - make_field("tableC", "c"), - ]); - let schema_b = make_schema_with_empty_metadata(vec![make_field("tableB", "b")]); + let schema_a = make_schema_with_empty_metadata( + vec![Some("tableA".into()), Some("tableA".into())], + vec!["a", "aa"], + ); + let schema_c = make_schema_with_empty_metadata( + vec![Some("tableC".into()), Some("tableC".into())], + vec!["cc", "c"], + ); + let schema_b = + make_schema_with_empty_metadata(vec![Some("tableB".into())], vec!["b"]); // non matching - let schema_f = make_schema_with_empty_metadata(vec![ - make_field("tableC", "f"), - make_field("tableC", "ff"), - ]); + let schema_f = make_schema_with_empty_metadata( + vec![Some("tableC".into()), Some("tableC".into())], + vec!["f", "ff"], + ); let schemas = vec![schema_c, schema_f, schema_b, schema_a]; let schemas = schemas.iter().collect::>(); @@ -378,9 +374,8 @@ mod test { // test normalizing columns when the name doesn't exist let expr = col("a") + col("b"); let schema_a = - make_schema_with_empty_metadata(vec![make_field("\"tableA\"", "a")]); - let 
schemas = [schema_a]; - let schemas = schemas.iter().collect::>(); + make_schema_with_empty_metadata(vec![Some("\"tableA\"".into())], vec!["a"]); + let schemas = vec![&schema_a]; let error = normalize_col_with_schemas_and_ambiguity_check(expr, &[&schemas], &[]) @@ -399,12 +394,16 @@ mod test { assert_eq!(unnormalized_expr, col("a") + col("b")); } - fn make_schema_with_empty_metadata(fields: Vec) -> DFSchema { - DFSchema::new_with_metadata(fields, HashMap::new()).unwrap() - } - - fn make_field(relation: &str, column: &str) -> DFField { - DFField::new(Some(relation.to_string()), column, DataType::Int8, false) + fn make_schema_with_empty_metadata( + qualifiers: Vec>, + fields: Vec<&str>, + ) -> DFSchema { + let fields = fields + .iter() + .map(|f| Arc::new(Field::new(f.to_string(), DataType::Int8, false))) + .collect::>(); + let schema = Arc::new(Schema::new(fields)); + DFSchema::from_field_specific_qualified_schema(qualifiers, &schema).unwrap() } #[test] diff --git a/datafusion/expr/src/expr_rewriter/order_by.rs b/datafusion/expr/src/expr_rewriter/order_by.rs index b1bc11a83f90..7614ad014220 100644 --- a/datafusion/expr/src/expr_rewriter/order_by.rs +++ b/datafusion/expr/src/expr_rewriter/order_by.rs @@ -19,7 +19,7 @@ use crate::expr::{Alias, Sort}; use crate::expr_rewriter::normalize_col; -use crate::{Cast, Expr, ExprSchemable, LogicalPlan, TryCast}; +use crate::{col, Cast, Expr, ExprSchemable, LogicalPlan, TryCast}; use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; use datafusion_common::{Column, Result}; @@ -87,12 +87,9 @@ fn rewrite_in_terms_of_projection( expr.transform(&|expr| { // search for unnormalized names first such as "c1" (such as aliases) if let Some(found) = proj_exprs.iter().find(|a| (**a) == expr) { - let col = Expr::Column( - found - .to_field(input.schema()) - .map(|f| f.qualified_column())?, - ); - return Ok(Transformed::yes(col)); + let field = found.to_field(input.schema())?; + let dffield = 
found.to_dffield(&field)?; + return Ok(Transformed::yes(col(dffield))); } // if that doesn't work, try to match the expression as an diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index f1ac22d584ee..36a16c2d06a1 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -24,18 +24,32 @@ use crate::expr::{ use crate::field_util::GetFieldAccessSchema; use crate::type_coercion::binary::get_result_type; use crate::type_coercion::functions::data_types; -use crate::{utils, LogicalPlan, Projection, Subquery}; +use crate::{col, utils, LogicalPlan, Projection, Subquery}; use arrow::compute::can_cast_types; -use arrow::datatypes::{DataType, Field}; +use arrow::datatypes::{DataType, Field, FieldRef}; use datafusion_common::{ - internal_err, not_impl_err, plan_datafusion_err, plan_err, Column, DFField, - ExprSchema, Result, + internal_err, not_impl_err, plan_datafusion_err, plan_err, Column, DFFieldRef, + ExprSchema, OwnedTableReference, Result, }; use std::collections::HashMap; use std::sync::Arc; /// trait to allow expr to typable with respect to a schema pub trait ExprSchemable { + // TODO: merge these two? 
+ fn to_qualifers_and_fields( + &self, + input_schema: &dyn ExprSchema, + is_nullable: Option, + ) -> Result<(Option, Field)>; + + fn to_qualifers_and_fieldrefs( + &self, + input_schema: &dyn ExprSchema, + ) -> Result<(Option, FieldRef)>; + + fn to_field(&self, input_schema: &dyn ExprSchema) -> Result; + /// given a schema, return the type of the expr fn get_type(&self, schema: &dyn ExprSchema) -> Result; @@ -46,7 +60,7 @@ pub trait ExprSchemable { fn metadata(&self, schema: &dyn ExprSchema) -> Result>; /// convert to a field with respect to a schema - fn to_field(&self, input_schema: &dyn ExprSchema) -> Result; + fn to_dffield<'a>(&'a self, field: &'a Field) -> Result; /// cast to a type with respect to a schema fn cast_to(self, cast_to_type: &DataType, schema: &dyn ExprSchema) -> Result; @@ -70,8 +84,8 @@ impl ExprSchemable for Expr { /// ## and Float32 results in Float32 type /// /// ``` - /// # use arrow::datatypes::DataType; - /// # use datafusion_common::{DFField, DFSchema}; + /// # use arrow::datatypes::{DataType, Field}; + /// # use datafusion_common::DFSchema; /// # use datafusion_expr::{col, ExprSchemable}; /// # use std::collections::HashMap; /// @@ -79,12 +93,11 @@ impl ExprSchemable for Expr { /// let expr = col("c1") + col("c2"); /// let schema = DFSchema::new_with_metadata( /// vec![ - /// DFField::new_unqualified("c1", DataType::Int32, true), - /// DFField::new_unqualified("c2", DataType::Float32, true), + /// Field::new("c1", DataType::Int32, true), + /// Field::new("c2", DataType::Float32, true), /// ], /// HashMap::new(), - /// ) - /// .unwrap(); + /// ); /// assert_eq!("Float32", format!("{}", expr.get_type(&schema).unwrap())); /// } /// ``` @@ -374,6 +387,84 @@ impl ExprSchemable for Expr { } } + fn to_qualifers_and_fields( + &self, + input_schema: &dyn ExprSchema, + is_nullable: Option, + ) -> Result<(Option, Field)> { + let (data_type, is_nullable) = if let Some(is_nullable) = is_nullable { + (self.get_type(input_schema)?, is_nullable) + } 
else { + self.data_type_and_nullable(input_schema)? + }; + + let metadata = self.metadata(input_schema)?; + match self { + Expr::Column(c) => { + let field = Field::new(c.name.clone(), data_type, is_nullable) + .with_metadata(metadata); + Ok((c.relation.clone(), field)) + } + Expr::Alias(alias) => { + let field = Field::new(alias.name.clone(), data_type, is_nullable) + .with_metadata(metadata); + Ok((alias.relation.clone(), field)) + } + _ => { + let field = Field::new(self.display_name()?, data_type, is_nullable) + .with_metadata(metadata); + Ok((None, field)) + } + } + } + + fn to_qualifers_and_fieldrefs( + &self, + input_schema: &dyn ExprSchema, + ) -> Result<(Option, FieldRef)> { + let (data_type, is_nullable) = self.data_type_and_nullable(input_schema)?; + let metadata = self.metadata(input_schema)?; + match self { + Expr::Column(c) => { + let field = Field::new(c.name.clone(), data_type, is_nullable) + .with_metadata(metadata); + Ok((c.relation.clone(), field.into())) + } + Expr::Alias(alias) => { + let field = Field::new(alias.name.clone(), data_type, is_nullable) + .with_metadata(metadata); + Ok((alias.relation.clone(), field.into())) + } + _ => { + let field = Field::new(self.display_name()?, data_type, is_nullable) + .with_metadata(metadata); + Ok((None, field.into())) + } + } + } + + fn to_field(&self, input_schema: &dyn ExprSchema) -> Result { + let (data_type, is_nullable) = self.data_type_and_nullable(input_schema)?; + let metadata = self.metadata(input_schema)?; + match self { + Expr::Column(c) => { + let field = Field::new(c.name.clone(), data_type, is_nullable) + .with_metadata(metadata); + Ok(field) + } + Expr::Alias(alias) => { + let field = Field::new(alias.name.clone(), data_type, is_nullable) + .with_metadata(metadata); + Ok(field) + } + _ => { + let field = Field::new(self.display_name()?, data_type, is_nullable) + .with_metadata(metadata); + Ok(field) + } + } + } + /// Returns the datatype and nullability of the expression based on 
[ExprSchema]. /// /// Note: [`DFSchema`] implements [ExprSchema]. @@ -437,27 +528,11 @@ impl ExprSchemable for Expr { /// /// So for example, a projected expression `col(c1) + col(c2)` is /// placed in an output field **named** col("c1 + c2") - fn to_field(&self, input_schema: &dyn ExprSchema) -> Result { + fn to_dffield<'a>(&'a self, field: &'a Field) -> Result { match self { - Expr::Column(c) => { - let (data_type, nullable) = self.data_type_and_nullable(input_schema)?; - Ok( - DFField::new(c.relation.clone(), &c.name, data_type, nullable) - .with_metadata(self.metadata(input_schema)?), - ) - } - Expr::Alias(Alias { relation, name, .. }) => { - let (data_type, nullable) = self.data_type_and_nullable(input_schema)?; - Ok(DFField::new(relation.clone(), name, data_type, nullable) - .with_metadata(self.metadata(input_schema)?)) - } - _ => { - let (data_type, nullable) = self.data_type_and_nullable(input_schema)?; - Ok( - DFField::new_unqualified(&self.display_name()?, data_type, nullable) - .with_metadata(self.metadata(input_schema)?), - ) - } + Expr::Column(c) => Ok(DFFieldRef::new(c.relation.as_ref(), field)), + Expr::Alias(alias) => Ok(DFFieldRef::new(alias.relation.as_ref(), field)), + _ => Ok(DFFieldRef::new(None, field)), } } @@ -535,8 +610,9 @@ pub fn cast_subquery(subquery: Subquery, cast_to_type: &DataType) -> Result { - let cast_expr = Expr::Column(plan.schema().field(0).qualified_column()) - .cast_to(cast_to_type, subquery.subquery.schema())?; + let dffield = plan.schema().qualified_field(0); + let cast_expr = + col(dffield).cast_to(cast_to_type, subquery.subquery.schema())?; LogicalPlan::Projection(Projection::try_new( vec![cast_expr], subquery.subquery, @@ -553,8 +629,8 @@ pub fn cast_subquery(subquery: Subquery, cast_to_type: &DataType) -> Result {{ @@ -680,22 +756,20 @@ mod tests { ); let schema = DFSchema::new_with_metadata( - vec![DFField::new_unqualified("foo", DataType::Int32, true) - .with_metadata(meta.clone())], + vec![Field::new("foo", 
DataType::Int32, true).with_metadata(meta.clone())], HashMap::new(), - ) - .unwrap(); + ); // verify to_field method populates metadata - assert_eq!(&meta, expr.to_field(&schema).unwrap().metadata()); + let field = expr.to_field(&schema).unwrap(); + assert_eq!(&meta, expr.to_dffield(&field).unwrap().field().metadata()); } #[test] fn test_nested_schema_nullability() { - let fields = DFField::new( - Some(TableReference::Bare { - table: "table_name".into(), - }), + let mut builder = SchemaBuilder::new(); + builder.push(Field::new("foo", DataType::Int32, true)); + builder.push(Field::new( "parent", DataType::Struct(Fields::from(vec![Field::new( "child", @@ -703,12 +777,17 @@ mod tests { false, )])), true, - ); + )); + let schema = builder.finish(); - let schema = DFSchema::new_with_metadata(vec![fields], HashMap::new()).unwrap(); + let dfschema = DFSchema::from_field_specific_qualified_schema( + vec![Some("table_name"), None], + &Arc::new(schema), + ) + .unwrap(); let expr = col("parent").field("child"); - assert!(expr.nullable(&schema).unwrap()); + assert!(expr.nullable(&dfschema).unwrap()); } #[derive(Debug)] diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 01e6af948762..18a4aede22cc 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -43,17 +43,17 @@ use crate::utils::{ expand_wildcard, find_valid_equijoin_key_pair, group_window_expr_by_sort_keys, }; use crate::{ - and, binary_expr, DmlStatement, Expr, ExprSchemable, Operator, RecursiveQuery, + and, binary_expr, col, DmlStatement, Expr, ExprSchemable, Operator, RecursiveQuery, TableProviderFilterPushDown, TableSource, WriteOp, }; -use arrow::datatypes::{DataType, Schema, SchemaRef}; +use arrow::datatypes::{DataType, Field, FieldRef, Fields, Schema, SchemaRef}; use datafusion_common::config::FormatOptions; use datafusion_common::display::ToStringifiedPlan; use datafusion_common::{ - 
get_target_functional_dependencies, plan_datafusion_err, plan_err, Column, DFField, - DFSchema, DFSchemaRef, DataFusionError, OwnedTableReference, Result, ScalarValue, - TableReference, ToDFSchema, UnnestOptions, + get_target_functional_dependencies, plan_datafusion_err, plan_err, Column, + DFFieldRef, DFSchema, DFSchemaRef, DataFusionError, OwnedTableReference, Result, + ScalarValue, TableReference, ToDFSchema, UnnestOptions, }; /// Default table name for unnamed table @@ -202,18 +202,14 @@ impl LogicalPlanBuilder { .map(|(j, data_type)| { // naming is following convention https://www.postgresql.org/docs/current/queries-values.html let name = &format!("column{}", j + 1); - DFField::new_unqualified( - name, - data_type.clone().unwrap_or(DataType::Utf8), - true, - ) + Field::new(name, data_type.clone().unwrap_or(DataType::Utf8), true) }) .collect::>(); for (i, j) in nulls { values[i][j] = Expr::Literal(ScalarValue::try_from(fields[j].data_type())?); } - let schema = - DFSchemaRef::new(DFSchema::new_with_metadata(fields, HashMap::new())?); + let dfschema = DFSchema::new_with_metadata(fields, HashMap::new()); + let schema = DFSchemaRef::new(dfschema); Ok(Self::from(LogicalPlan::Values(Values { schema, values }))) } @@ -356,10 +352,12 @@ impl LogicalPlanBuilder { /// Select the given column indices pub fn select(self, indices: impl IntoIterator) -> Result { - let fields = self.plan.schema().fields(); let exprs: Vec<_> = indices .into_iter() - .map(|x| Expr::Column(fields[x].qualified_column())) + .map(|x| { + let dffield = self.plan.schema().qualified_field(x); + col(dffield) + }) .collect(); self.project(exprs) } @@ -545,11 +543,7 @@ impl LogicalPlanBuilder { } // remove pushed down sort columns - let new_expr = schema - .fields() - .iter() - .map(|f| Expr::Column(f.qualified_column())) - .collect(); + let new_expr = schema.columns().into_iter().map(Expr::Column).collect(); let is_distinct = false; let plan = Self::add_missing_columns(self.plan, &missing_cols, 
is_distinct)?; @@ -1125,7 +1119,7 @@ impl LogicalPlanBuilder { )?)) } } -pub fn change_redundant_column(fields: Vec) -> Vec { +pub fn change_redundant_column(fields: &Fields) -> Vec { let mut name_map = HashMap::new(); fields .into_iter() @@ -1134,18 +1128,14 @@ pub fn change_redundant_column(fields: Vec) -> Vec { *counter += 1; if *counter > 1 { let new_name = format!("{}:{}", field.name(), *counter - 1); - DFField::new( - field.qualifier().cloned(), - &new_name, - field.data_type().clone(), - field.is_nullable(), - ) + Field::new(new_name, field.data_type().clone(), field.is_nullable()) } else { - field + field.as_ref().clone() } }) .collect() } + /// Creates a schema for a join operation. /// The fields from the left side are first pub fn build_join_schema( @@ -1153,67 +1143,62 @@ pub fn build_join_schema( right: &DFSchema, join_type: &JoinType, ) -> Result { - fn nullify_fields(fields: &[DFField]) -> Vec { + fn nullify_feilds<'a>( + fields: impl Iterator>, + ) -> Vec<(Option, FieldRef)> { fields - .iter() - .map(|f| f.clone().with_nullable(true)) + .map(|dffield| { + let field = dffield.owned_field().with_nullable(true); + (dffield.owned_qualifier(), Arc::new(field)) + }) .collect() } - let right_fields = right.fields(); - let left_fields = left.fields(); - - let fields: Vec = match join_type { - JoinType::Inner => { - // left then right - left_fields - .iter() - .chain(right_fields.iter()) - .cloned() - .collect() - } - JoinType::Left => { - // left then right, right set to nullable in case of not matched scenario - left_fields - .iter() - .chain(&nullify_fields(right_fields)) - .cloned() - .collect() - } - JoinType::Right => { - // left then right, left set to nullable in case of not matched scenario - nullify_fields(left_fields) - .iter() - .chain(right_fields.iter()) - .cloned() - .collect() - } - JoinType::Full => { - // left then right, all set to nullable in case of not matched scenario - nullify_fields(left_fields) - .iter() - 
.chain(&nullify_fields(right_fields)) - .cloned() - .collect() - } - JoinType::LeftSemi | JoinType::LeftAnti => { - // Only use the left side for the schema - left_fields.clone() - } - JoinType::RightSemi | JoinType::RightAnti => { - // Only use the right side for the schema - right_fields.clone() - } - }; + let (field_qualifiers, fields): (Vec>, Vec) = + match join_type { + JoinType::Inner => { + // left then right + left.iter_owned().chain(right.iter_owned()).unzip() + } + JoinType::Left => { + // left then right, right set to nullable in case of not matched scenario + left.iter_owned() + .chain(nullify_feilds(right.iter())) + .unzip() + } + JoinType::Right => { + // left then right, left set to nullable in case of not matched scenario + nullify_feilds(left.iter()) + .into_iter() + .chain(right.iter_owned()) + .unzip() + } + JoinType::Full => { + // left then right, all set to nullable in case of not matched scenario + nullify_feilds(left.iter()) + .into_iter() + .chain(nullify_feilds(right.iter())) + .unzip() + } + JoinType::LeftSemi | JoinType::LeftAnti => { + // Only use the left side for the schema + left.iter_owned().unzip() + } + JoinType::RightSemi | JoinType::RightAnti => { + // Only use the right side for the schema + right.iter_owned().unzip() + } + }; let func_dependencies = left.functional_dependencies().join( right.functional_dependencies(), join_type, - left_fields.len(), + left.fields().len(), ); let mut metadata = left.metadata().clone(); metadata.extend(right.metadata().clone()); - let schema = DFSchema::new_with_metadata(fields, metadata)?; - schema.with_functional_dependencies(func_dependencies) + let dfschema = + DFSchema::from_owned_qualified_fields(field_qualifiers, fields, metadata)?; + dfschema.with_functional_dependencies(func_dependencies) } /// Add additional "synthetic" group by expressions based on functional @@ -1240,9 +1225,8 @@ fn add_group_by_exprs_from_dependencies( get_target_functional_dependencies(schema, 
&group_by_field_names) { for idx in target_indices { - let field = schema.field(idx); - let expr = - Expr::Column(Column::new(field.qualifier().cloned(), field.name())); + let dffield = schema.qualified_field(idx); + let expr = col(dffield); let expr_name = expr.display_name()?; if !group_by_field_names.contains(&expr_name) { group_by_field_names.push(expr_name); @@ -1312,34 +1296,42 @@ pub fn union(left_plan: LogicalPlan, right_plan: LogicalPlan) -> Result( + left_fields: impl Iterator>, + right_fields: impl Iterator>, + ) -> Result, FieldRef)>> { + zip(left_fields, right_fields) + .map(|(left_field, right_field)| { + let nullable = left_field.is_nullable() || right_field.is_nullable(); + let data_type = comparison_coercion(left_field.data_type(), right_field.data_type()) .ok_or_else(|| { plan_datafusion_err!( - "UNION Column {} (type: {}) is not compatible with column {} (type: {})", - right_field.name(), - right_field.data_type(), - left_field.name(), - left_field.data_type() - ) + "UNION Column {} (type: {}) is not compatible with column {} (type: {})", + right_field.name(), + right_field.data_type(), + left_field.name(), + left_field.data_type() + ) })?; - Ok(DFField::new( - left_field.qualifier().cloned(), - left_field.name(), - data_type, - nullable, - )) - }) - .collect::>>()? - .to_dfschema()?; + let field = Field::new(left_field.name(), data_type, nullable); + Ok((left_field.owned_qualifier(), Arc::new(field))) + }) + .collect::>>() + } + + // create union schema + let (union_qualifers, union_fields): ( + Vec>, + Vec, + ) = union_fields(left_plan.schema().iter(), right_plan.schema().iter())? 
+ .into_iter() + .unzip(); + let union_schema = DFSchema::from_owned_qualified_fields( + union_qualifers, + union_fields, + HashMap::new(), + )?; let inputs = vec![left_plan, right_plan] .into_iter() @@ -1539,18 +1531,18 @@ pub fn unnest_with_options( column: Column, options: UnnestOptions, ) -> Result { - let unnest_field = input.schema().field_from_column(&column)?; + let input_schema = input.schema(); + let unnest_field = input_schema.qualified_field_from_column(&column)?; // Extract the type of the nested field in the list. let unnested_field = match unnest_field.data_type() { DataType::List(field) | DataType::FixedSizeList(field, _) - | DataType::LargeList(field) => DFField::new( - unnest_field.qualifier().cloned(), + | DataType::LargeList(field) => Arc::new(Field::new( unnest_field.name(), field.data_type().clone(), unnest_field.is_nullable(), - ), + )), _ => { // If the unnest field is not a list type return the input plan. return Ok(input); @@ -1558,28 +1550,31 @@ pub fn unnest_with_options( }; // Update the schema with the unnest column type changed to contain the nested type. 
- let input_schema = input.schema(); - let fields = input_schema - .fields() - .iter() - .map(|f| { - if f == unnest_field { - unnested_field.clone() - } else { - f.clone() - } - }) - .collect::>(); + let (field_qualifiers, fields): (Vec>, Vec) = + input_schema + .iter_dffield_with_arc() + .map(|f| { + if f.qualifier() == unnest_field.qualifier() + && f.field().as_ref() == unnest_field.field() + { + (f.owned_qualifier(), unnested_field.clone()) + } else { + (f.owned_qualifier(), f.owned_field()) + } + }) + .unzip(); let metadata = input_schema.metadata().clone(); - let df_schema = DFSchema::new_with_metadata(fields, metadata)?; + let dfschema = + DFSchema::from_owned_qualified_fields(field_qualifiers, fields, metadata)?; // We can use the existing functional dependencies: let deps = input_schema.functional_dependencies().clone(); - let schema = Arc::new(df_schema.with_functional_dependencies(deps)?); + let schema = Arc::new(dfschema.with_functional_dependencies(deps)?); + let column = Column::new(unnest_field.owned_qualifier(), unnest_field.name()); Ok(LogicalPlan::Unnest(Unnest { input: Arc::new(input), - column: unnested_field.qualified_column(), + column, schema, options, })) @@ -2101,23 +2096,23 @@ mod tests { } #[test] fn test_change_redundant_column() -> Result<()> { - let t1_field_1 = DFField::new_unqualified("a", DataType::Int32, false); - let t2_field_1 = DFField::new_unqualified("a", DataType::Int32, false); - let t2_field_3 = DFField::new_unqualified("a", DataType::Int32, false); - let t1_field_2 = DFField::new_unqualified("b", DataType::Int32, false); - let t2_field_2 = DFField::new_unqualified("b", DataType::Int32, false); + let t1_field_1 = Field::new("a", DataType::Int32, false); + let t2_field_1 = Field::new("a", DataType::Int32, false); + let t2_field_3 = Field::new("a", DataType::Int32, false); + let t1_field_2 = Field::new("b", DataType::Int32, false); + let t2_field_2 = Field::new("b", DataType::Int32, false); let field_vec = vec![t1_field_1, 
t2_field_1, t1_field_2, t2_field_2, t2_field_3]; - let remove_redundant = change_redundant_column(field_vec); + let remove_redundant = change_redundant_column(&Fields::from(field_vec)); assert_eq!( remove_redundant, vec![ - DFField::new_unqualified("a", DataType::Int32, false), - DFField::new_unqualified("a:1", DataType::Int32, false), - DFField::new_unqualified("b", DataType::Int32, false), - DFField::new_unqualified("b:1", DataType::Int32, false), - DFField::new_unqualified("a:2", DataType::Int32, false), + Field::new("a", DataType::Int32, false), + Field::new("a:1", DataType::Int32, false), + Field::new("b", DataType::Int32, false), + Field::new("b:1", DataType::Int32, false), + Field::new("a:2", DataType::Int32, false), ] ); Ok(()) diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 05d7ac539458..4a3e88a0575d 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -33,23 +33,23 @@ use crate::logical_plan::display::{GraphvizVisitor, IndentVisitor}; use crate::logical_plan::extension::UserDefinedLogicalNode; use crate::logical_plan::{DmlStatement, Statement}; use crate::utils::{ - enumerate_grouping_sets, exprlist_to_fields, find_out_reference_exprs, - grouping_set_expr_count, grouping_set_to_exprlist, inspect_expr_pre, - split_conjunction, + enumerate_grouping_sets, exprlist_to_qualified_fieldrefs, + exprlist_to_qualified_fields, find_out_reference_exprs, grouping_set_expr_count, + grouping_set_to_exprlist, inspect_expr_pre, split_conjunction, }; use crate::{ - build_join_schema, expr_vec_fmt, BinaryExpr, BuiltInWindowFunction, + build_join_schema, col, expr_vec_fmt, BinaryExpr, BuiltInWindowFunction, CreateMemoryTable, CreateView, Expr, ExprSchemable, LogicalPlanBuilder, Operator, TableProviderFilterPushDown, TableSource, WindowFunctionDefinition, }; -use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use arrow::datatypes::{DataType, Field, FieldRef, 
Schema, SchemaRef}; use datafusion_common::tree_node::{ Transformed, TransformedResult, TreeNode, TreeNodeRecursion, TreeNodeVisitor, }; use datafusion_common::{ aggregate_functional_dependencies, internal_err, plan_err, Column, Constraints, - DFField, DFSchema, DFSchemaRef, DataFusionError, Dependency, FunctionalDependence, + DFFieldRef, DFSchema, DFSchemaRef, DataFusionError, Dependency, FunctionalDependence, FunctionalDependencies, OwnedTableReference, ParamValues, Result, UnnestOptions, }; @@ -486,12 +486,14 @@ impl LogicalPlan { LogicalPlan::RecursiveQuery(RecursiveQuery { static_term, .. }) => { static_term.head_output_expr() } - LogicalPlan::Union(union) => Ok(Some(Expr::Column( - union.schema.fields()[0].qualified_column(), - ))), - LogicalPlan::TableScan(table) => Ok(Some(Expr::Column( - table.projected_schema.fields()[0].qualified_column(), - ))), + LogicalPlan::Union(union) => { + let dffield = union.schema.qualified_field(0); + Ok(Some(col(dffield))) + } + LogicalPlan::TableScan(table) => { + let dffield = table.projected_schema.qualified_field(0); + Ok(Some(col(dffield))) + } LogicalPlan::SubqueryAlias(subquery_alias) => { let expr_opt = subquery_alias.input.head_output_expr()?; expr_opt @@ -866,23 +868,32 @@ impl LogicalPlan { }) => { // Update schema with unnested column type. 
let input = Arc::new(inputs.swap_remove(0)); - let nested_field = input.schema().field_from_column(column)?; - let unnested_field = schema.field_from_column(column)?; - let fields = input - .schema() - .fields() - .iter() - .map(|f| { - if f == nested_field { - unnested_field.clone() - } else { - f.clone() - } - }) - .collect::>(); + let nested_dffield = + input.schema().qualified_field_from_column(column)?; + + let unnested_dffield = schema.qualified_field_from_column(column)?; + + let (qualifers, fields): (Vec>, Vec) = + input + .schema() + .iter() + .map(|f| { + if f.qualifier().eq(&nested_dffield.qualifier()) + && f.field() == nested_dffield.field() + { + ( + unnested_dffield.owned_qualifier(), + unnested_dffield.owned_field(), + ) + } else { + (f.owned_qualifier(), f.owned_field()) + } + }) + .unzip(); let schema = Arc::new( - DFSchema::new_with_metadata( + DFSchema::from_owned_qualified_fields( + qualifers, fields, input.schema().metadata().clone(), )? @@ -1782,12 +1793,7 @@ impl Projection { /// Create a new Projection using the specified output schema pub fn new_from_schema(input: Arc, schema: DFSchemaRef) -> Self { - let expr: Vec = schema - .fields() - .iter() - .map(|field| field.qualified_column()) - .map(Expr::Column) - .collect(); + let expr: Vec = schema.columns().into_iter().map(Expr::Column).collect(); Self { expr, input, @@ -1810,13 +1816,18 @@ impl Projection { /// produced by the projection operation. If the schema computation is successful, /// the `Result` will contain the schema; otherwise, it will contain an error. pub fn projection_schema(input: &LogicalPlan, exprs: &[Expr]) -> Result> { - let mut schema = DFSchema::new_with_metadata( - exprlist_to_fields(exprs, input)?, + let (qualifiers, fields): (Vec>, Vec) = + exprlist_to_qualified_fields(exprs, input, None)? 
+ .into_iter() + .unzip(); + + let schema = DFSchema::from_owned_qualified_fields( + qualifiers, + fields, input.schema().metadata().clone(), - )?; - schema = schema.with_functional_dependencies(calc_func_dependencies_for_project( - exprs, input, - )?)?; + )? + .with_functional_dependencies(calc_func_dependencies_for_project(exprs, input)?)?; + Ok(Arc::new(schema)) } @@ -1839,9 +1850,9 @@ impl SubqueryAlias { alias: impl Into, ) -> Result { let alias = alias.into(); - let fields = change_redundant_column(plan.schema().fields().clone()); + let fields = change_redundant_column(plan.schema().fields()); let meta_data = plan.schema().as_ref().metadata().clone(); - let schema: Schema = DFSchema::new_with_metadata(fields, meta_data)?.into(); + let schema: Schema = DFSchema::new_with_metadata(fields, meta_data).into(); // Since schema is the same, other than qualifier, we can use existing // functional dependencies: let func_dependencies = plan.schema().functional_dependencies().clone(); @@ -1986,17 +1997,22 @@ pub struct Window { impl Window { /// Create a new window operator. 
pub fn try_new(window_expr: Vec, input: Arc) -> Result { - let fields = input.schema().fields(); + let fields: Vec<(Option, FieldRef)> = + input.schema().iter_owned().collect(); let input_len = fields.len(); - let mut window_fields = fields.clone(); - let expr_fields = exprlist_to_fields(window_expr.as_slice(), &input)?; - window_fields.extend_from_slice(expr_fields.as_slice()); + + let window_fields = exprlist_to_qualified_fieldrefs(&window_expr, &input)?; + let window_len = input_len + window_fields.len(); // all output columns + + let (qualifiers, fields): (Vec>, Vec) = + fields.into_iter().chain(window_fields).unzip(); + let metadata = input.schema().metadata().clone(); // Update functional dependencies for window: let mut window_func_dependencies = input.schema().functional_dependencies().clone(); - window_func_dependencies.extend_target_indices(window_fields.len()); + window_func_dependencies.extend_target_indices(window_len); // Since we know that ROW_NUMBER outputs will be unique (i.e. it consists // of consecutive numbers per partition), we can represent this fact with @@ -2031,20 +2047,22 @@ impl Window { if !new_dependencies.is_empty() { for dependence in new_dependencies.iter_mut() { - dependence.target_indices = (0..window_fields.len()).collect(); + dependence.target_indices = (0..window_len).collect(); } // Add the dependency introduced because of ROW_NUMBER window function to the functional dependency let new_deps = FunctionalDependencies::new(new_dependencies); window_func_dependencies.extend(new_deps); } + let schema = Arc::new( + DFSchema::from_owned_qualified_fields(qualifiers, fields, metadata)? + .with_functional_dependencies(window_func_dependencies)?, + ); + Ok(Window { input, window_expr, - schema: Arc::new( - DFSchema::new_with_metadata(window_fields, metadata)? 
- .with_functional_dependencies(window_func_dependencies)?, - ), + schema, }) } } @@ -2113,17 +2131,14 @@ impl TableScan { .map(|p| { let projected_func_dependencies = func_dependencies.project_functional_dependencies(p, p.len()); - let df_schema = DFSchema::new_with_metadata( - p.iter() - .map(|i| { - DFField::from_qualified( - table_name.clone(), - schema.field(*i).clone(), - ) - }) - .collect(), - schema.metadata().clone(), - )?; + + let dffields: Vec = p + .iter() + .map(|i| DFFieldRef::new(Some(&table_name), schema.field(*i))) + .collect(); + + let df_schema = + DFSchema::from_qualified_fields(dffields, schema.metadata.clone())?; df_schema.with_functional_dependencies(projected_func_dependencies) }) .unwrap_or_else(|| { @@ -2315,8 +2330,14 @@ impl DistinctOn { let on_expr = normalize_cols(on_expr, input.as_ref())?; - let schema = DFSchema::new_with_metadata( - exprlist_to_fields(select_expr.as_slice(), &input)?, + let (qualifiers, fields): (Vec>, Vec) = + exprlist_to_qualified_fieldrefs(&select_expr, &input)? + .into_iter() + .unzip(); + + let dfschema = DFSchema::from_owned_qualified_fields( + qualifiers, + fields, input.schema().metadata().clone(), )?; @@ -2325,7 +2346,7 @@ impl DistinctOn { select_expr, sort_expr: None, input, - schema: Arc::new(schema), + schema: Arc::new(dfschema), }; if let Some(sort_expr) = sort_expr { @@ -2390,25 +2411,22 @@ impl Aggregate { aggr_expr: Vec, ) -> Result { let group_expr = enumerate_grouping_sets(group_expr)?; - - let is_grouping_set = matches!(group_expr.as_slice(), [Expr::GroupingSet(_)]); - let grouping_expr: Vec = grouping_set_to_exprlist(group_expr.as_slice())?; - let mut fields = exprlist_to_fields(grouping_expr.as_slice(), &input)?; - // Even columns that cannot be null will become nullable when used in a grouping set. 
- if is_grouping_set { - fields = fields + let is_nullable = if matches!(group_expr.as_slice(), [Expr::GroupingSet(_)]) { + Some(true) + } else { + None + }; + let (qualifiers, fields): (Vec>, Vec) = + exprlist_to_qualified_fields(&grouping_expr, &input, is_nullable)? .into_iter() - .map(|field| field.with_nullable(true)) - .collect::>(); - } - - fields.extend(exprlist_to_fields(aggr_expr.as_slice(), &input)?); + .chain(exprlist_to_qualified_fields(&aggr_expr, &input, None)?) + .unzip(); let schema = - DFSchema::new_with_metadata(fields, input.schema().metadata().clone())?; + DFSchema::from_owned_qualified_fields(qualifiers, fields, input.schema().metadata().clone())?; Self::try_new_with_schema(input, group_expr, aggr_expr, Arc::new(schema)) } @@ -2503,7 +2521,7 @@ fn calc_func_dependencies_for_project( exprs: &[Expr], input: &LogicalPlan, ) -> Result { - let input_fields = input.schema().fields(); + let input_fields = input.schema().field_names(); // Calculate expression indices (if present) in the input schema. 
let proj_indices = exprs .iter() @@ -2514,9 +2532,7 @@ fn calc_func_dependencies_for_project( } _ => format!("{}", expr), }; - input_fields - .iter() - .position(|item| item.qualified_name() == expr_name) + input_fields.iter().position(|item| *item == expr_name) }) .collect::>(); Ok(input @@ -2652,7 +2668,6 @@ pub struct Unnest { #[cfg(test)] mod tests { - use std::collections::HashMap; use std::sync::Arc; use super::*; @@ -3002,7 +3017,7 @@ digraph { #[test] fn projection_expr_schema_mismatch() -> Result<()> { - let empty_schema = Arc::new(DFSchema::new_with_metadata(vec![], HashMap::new())?); + let empty_schema = Arc::new(DFSchema::empty()); let p = Projection::try_new_with_schema( vec![col("a")], Arc::new(LogicalPlan::EmptyRelation(EmptyRelation { @@ -3176,10 +3191,10 @@ digraph { filters: vec![], fetch: None, })); - let col = schema.field(0).qualified_column(); + let col = schema.field_names()[0].clone(); let filter = Filter::try_new( - Expr::Column(col).eq(Expr::Literal(ScalarValue::Int32(Some(1)))), + Expr::Column(col.into()).eq(Expr::Literal(ScalarValue::Int32(Some(1)))), scan, ) .unwrap(); @@ -3206,10 +3221,10 @@ digraph { filters: vec![], fetch: None, })); - let col = schema.field(0).qualified_column(); + let col = schema.field_names()[0].clone(); let filter = Filter::try_new( - Expr::Column(col).eq(Expr::Literal(ScalarValue::Int32(Some(1)))), + Expr::Column(col.into()).eq(Expr::Literal(ScalarValue::Int32(Some(1)))), scan, ) .unwrap(); diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index c7907d0db16a..ebc23cb89325 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -26,16 +26,16 @@ use crate::expr_rewriter::strip_outer_reference; use crate::logical_plan::Aggregate; use crate::signature::{Signature, TypeSignature}; use crate::{ - and, BinaryExpr, Cast, Expr, ExprSchemable, Filter, GroupingSet, LogicalPlan, + and, col, BinaryExpr, Cast, Expr, ExprSchemable, Filter, GroupingSet, LogicalPlan, Operator, 
TryCast, }; -use arrow::datatypes::{DataType, TimeUnit}; +use arrow::datatypes::{DataType, Field, FieldRef, Schema, TimeUnit}; use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; use datafusion_common::utils::get_at_indices; use datafusion_common::{ - internal_err, plan_datafusion_err, plan_err, Column, DFField, DFSchema, DFSchemaRef, - Result, ScalarValue, TableReference, + internal_err, plan_datafusion_err, plan_err, Column, DFSchema, DFSchemaRef, + OwnedTableReference, Result, ScalarValue, TableReference, }; use sqlparser::ast::{ExceptSelectItem, ExcludeSelectItem, WildcardAdditionalOptions}; @@ -342,12 +342,8 @@ fn get_excluded_columns( let mut result = vec![]; for ident in unique_idents.into_iter() { let col_name = ident.value.as_str(); - let field = if let Some(qualifier) = qualifier { - schema.field_with_qualified_name(qualifier, col_name)? - } else { - schema.field_with_unqualified_name(col_name)? - }; - result.push(field.qualified_column()) + let f = schema.qualified_field_with_name(qualifier.as_ref(), col_name)?; + result.push(f.into()); } Ok(result) } @@ -358,19 +354,14 @@ fn get_exprs_except_skipped( columns_to_skip: HashSet, ) -> Vec { if columns_to_skip.is_empty() { - schema - .fields() - .iter() - .map(|f| Expr::Column(f.qualified_column())) - .collect::>() + schema.iter().map(col).collect::>() } else { schema - .fields() + .columns() .iter() - .filter_map(|f| { - let col = f.qualified_column(); - if !columns_to_skip.contains(&col) { - Some(Expr::Column(col)) + .filter_map(|c| { + if !columns_to_skip.contains(c) { + Some(Expr::Column(c.clone())) } else { None } @@ -433,13 +424,14 @@ pub fn expand_qualified_wildcard( let projected_func_dependencies = schema .functional_dependencies() .project_functional_dependencies(&qualified_indices, qualified_indices.len()); - let qualified_fields = get_at_indices(schema.fields(), &qualified_indices)?; - if qualified_fields.is_empty() { + let fields_with_qualified = get_at_indices(schema.fields(), 
&qualified_indices)?; + if fields_with_qualified.is_empty() { return plan_err!("Invalid qualifier {qualifier}"); } - let qualified_schema = - DFSchema::new_with_metadata(qualified_fields, schema.metadata().clone())? - // We can use the functional dependencies as is, since it only stores indices: + + let qualified_schema = Arc::new(Schema::new(fields_with_qualified)); + let qualified_dfschema = + DFSchema::try_from_qualified_schema(qualifier.clone(), &qualified_schema)? .with_functional_dependencies(projected_func_dependencies)?; let excluded_columns = if let Some(WildcardAdditionalOptions { opt_exclude, @@ -459,7 +451,10 @@ pub fn expand_qualified_wildcard( // Add each excluded `Column` to columns_to_skip let mut columns_to_skip = HashSet::new(); columns_to_skip.extend(excluded_columns); - Ok(get_exprs_except_skipped(&qualified_schema, columns_to_skip)) + Ok(get_exprs_except_skipped( + &qualified_dfschema, + columns_to_skip, + )) } /// (expr, "is the SortExpr for window (either comes from PARTITION BY or ORDER BY columns)") @@ -737,37 +732,79 @@ fn agg_cols(agg: &Aggregate) -> Vec { .collect() } -fn exprlist_to_fields_aggregate(exprs: &[Expr], agg: &Aggregate) -> Result> { +fn exprlist_to_qualified_fields_aggregate( + exprs: &[Expr], + agg: &Aggregate, + is_nullable: Option, +) -> Result, Field)>> { let agg_cols = agg_cols(agg); - let mut fields = vec![]; - for expr in exprs { - match expr { + exprs + .iter() + .map(|expr| match expr { Expr::Column(c) if agg_cols.iter().any(|x| x == c) => { // resolve against schema of input to aggregate - fields.push(expr.to_field(agg.input.schema())?); + expr.to_qualifers_and_fields(agg.input.schema(), is_nullable) } - _ => fields.push(expr.to_field(&agg.schema)?), + _ => expr.to_qualifers_and_fields(&agg.schema, is_nullable), + }) + .collect() +} + +fn exprlist_to_qualified_fieldrefs_aggregate( + exprs: &[Expr], + agg: &Aggregate, +) -> Result, FieldRef)>> { + let agg_cols = agg_cols(agg); + exprs + .iter() + .map(|expr| match 
expr { + Expr::Column(c) if agg_cols.iter().any(|x| x == c) => { + // resolve against schema of input to aggregate + expr.to_qualifers_and_fieldrefs(agg.input.schema()) + } + _ => expr.to_qualifers_and_fieldrefs(&agg.schema), + }) + .collect() +} + +pub fn exprlist_to_qualified_fields( + exprs: &[Expr], + plan: &LogicalPlan, + is_nullable: Option, +) -> Result, Field)>> { + // when dealing with aggregate plans we cannot simply look in the aggregate output schema + // because it will contain columns representing complex expressions (such a column named + // `GROUPING(person.state)` so in order to resolve `person.state` in this case we need to + // look at the input to the aggregate instead. + + match plan { + LogicalPlan::Aggregate(agg) => { + exprlist_to_qualified_fields_aggregate(exprs, agg, is_nullable) } + _ => exprs + .iter() + .map(|e| e.to_qualifers_and_fields(plan.schema(), is_nullable)) + .collect(), } - Ok(fields) } -/// Create field meta-data from an expression, for use in a result set schema -pub fn exprlist_to_fields(exprs: &[Expr], plan: &LogicalPlan) -> Result> { +pub fn exprlist_to_qualified_fieldrefs( + exprs: &[Expr], + plan: &LogicalPlan, +) -> Result, FieldRef)>> { // when dealing with aggregate plans we cannot simply look in the aggregate output schema // because it will contain columns representing complex expressions (such a column named // `GROUPING(person.state)` so in order to resolve `person.state` in this case we need to // look at the input to the aggregate instead. 
- let fields = match plan { - LogicalPlan::Aggregate(agg) => Some(exprlist_to_fields_aggregate(exprs, agg)), - _ => None, - }; - if let Some(fields) = fields { - fields - } else { - // look for exact match in plan's output schema - let input_schema = &plan.schema(); - exprs.iter().map(|e| e.to_field(input_schema)).collect() + + match plan { + LogicalPlan::Aggregate(agg) => { + exprlist_to_qualified_fieldrefs_aggregate(exprs, agg) + } + _ => exprs + .iter() + .map(|e| e.to_qualifers_and_fieldrefs(plan.schema())) + .collect(), } } @@ -805,11 +842,13 @@ pub fn columnize_expr(e: Expr, input_schema: &DFSchema) -> Expr { )), Expr::ScalarSubquery(_) => e.clone(), _ => match e.display_name() { - Ok(name) => match input_schema.field_with_unqualified_name(&name) { - Ok(field) => Expr::Column(field.qualified_column()), - // expression not provided as input, do not convert to a column reference - Err(_) => e, - }, + Ok(name) => { + match input_schema.qualified_field_with_unqualified_name(&name) { + Ok(f) => col(f), + // expression not provided as input, do not convert to a column reference + Err(_) => e, + } + } Err(_) => e, }, } @@ -841,9 +880,9 @@ pub(crate) fn find_columns_referenced_by_expr(e: &Expr) -> Vec { /// Convert any `Expr` to an `Expr::Column`. 
pub fn expr_as_column_expr(expr: &Expr, plan: &LogicalPlan) -> Result { match expr { - Expr::Column(col) => { - let field = plan.schema().field_from_column(col)?; - Ok(Expr::Column(field.qualified_column())) + Expr::Column(c) => { + let f = plan.schema().qualified_field_from_column(c)?; + Ok(col(f)) } _ => Ok(Expr::Column(Column::from_name(expr.display_name()?))), } diff --git a/datafusion/optimizer/src/analyzer/inline_table_scan.rs b/datafusion/optimizer/src/analyzer/inline_table_scan.rs index b21ec851dfcd..79462e27cbe4 100644 --- a/datafusion/optimizer/src/analyzer/inline_table_scan.rs +++ b/datafusion/optimizer/src/analyzer/inline_table_scan.rs @@ -24,6 +24,7 @@ use crate::analyzer::AnalyzerRule; use datafusion_common::config::ConfigOptions; use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; use datafusion_common::Result; +use datafusion_expr::col; use datafusion_expr::expr::{Exists, InSubquery}; use datafusion_expr::{ logical_plan::LogicalPlan, Expr, Filter, LogicalPlanBuilder, TableScan, @@ -119,9 +120,8 @@ fn generate_projection_expr( let mut exprs = vec![]; if let Some(projection) = projection { for i in projection { - exprs.push(Expr::Column( - sub_plan.schema().fields()[*i].qualified_column(), - )); + let dffield = sub_plan.schema().qualified_field(*i); + exprs.push(col(dffield)); } } else { exprs.push(Expr::Wildcard { qualifier: None }); diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index f8dcf460a469..811e51d247e2 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -763,9 +763,9 @@ mod test { }; use crate::test::assert_analyzed_plan_eq; - use arrow::datatypes::{DataType, TimeUnit}; + use arrow::datatypes::{DataType, Field, TimeUnit}; use datafusion_common::tree_node::{TransformedResult, TreeNode}; - use datafusion_common::{DFField, DFSchema, DFSchemaRef, Result, ScalarValue}; + use 
datafusion_common::{DFSchema, DFSchemaRef, Result, ScalarValue}; use datafusion_expr::expr::{self, InSubquery, Like, ScalarFunction}; use datafusion_expr::logical_plan::{EmptyRelation, Projection}; use datafusion_expr::{ @@ -787,13 +787,10 @@ mod test { fn empty_with_type(data_type: DataType) -> Arc { Arc::new(LogicalPlan::EmptyRelation(EmptyRelation { produce_one_row: false, - schema: Arc::new( - DFSchema::new_with_metadata( - vec![DFField::new_unqualified("a", data_type, true)], - std::collections::HashMap::new(), - ) - .unwrap(), - ), + schema: Arc::new(DFSchema::new_with_metadata( + vec![Field::new("a", data_type, true)], + std::collections::HashMap::new(), + )), })) } @@ -1050,13 +1047,9 @@ mod test { let empty = Arc::new(LogicalPlan::EmptyRelation(EmptyRelation { produce_one_row: false, schema: Arc::new(DFSchema::new_with_metadata( - vec![DFField::new_unqualified( - "a", - DataType::Decimal128(12, 4), - true, - )], + vec![Field::new("a", DataType::Decimal128(12, 4), true)], std::collections::HashMap::new(), - )?), + )), })); let plan = LogicalPlan::Projection(Projection::try_new(vec![expr], empty)?); let expected = @@ -1259,9 +1252,9 @@ mod test { fn test_type_coercion_rewrite() -> Result<()> { // gt let schema = Arc::new(DFSchema::new_with_metadata( - vec![DFField::new_unqualified("a", DataType::Int64, true)], + vec![Field::new("a", DataType::Int64, true)], std::collections::HashMap::new(), - )?); + )); let mut rewriter = TypeCoercionRewriter { schema }; let expr = is_true(lit(12i32).gt(lit(13i64))); let expected = is_true(cast(lit(12i32), DataType::Int64).gt(lit(13i64))); @@ -1270,9 +1263,9 @@ mod test { // eq let schema = Arc::new(DFSchema::new_with_metadata( - vec![DFField::new_unqualified("a", DataType::Int64, true)], + vec![Field::new("a", DataType::Int64, true)], std::collections::HashMap::new(), - )?); + )); let mut rewriter = TypeCoercionRewriter { schema }; let expr = is_true(lit(12i32).eq(lit(13i64))); let expected = is_true(cast(lit(12i32), 
DataType::Int64).eq(lit(13i64))); @@ -1281,9 +1274,9 @@ mod test { // lt let schema = Arc::new(DFSchema::new_with_metadata( - vec![DFField::new_unqualified("a", DataType::Int64, true)], + vec![Field::new("a", DataType::Int64, true)], std::collections::HashMap::new(), - )?); + )); let mut rewriter = TypeCoercionRewriter { schema }; let expr = is_true(lit(12i32).lt(lit(13i64))); let expected = is_true(cast(lit(12i32), DataType::Int64).lt(lit(13i64))); @@ -1355,26 +1348,26 @@ mod test { fn test_case_expression_coercion() -> Result<()> { let schema = Arc::new(DFSchema::new_with_metadata( vec![ - DFField::new_unqualified("boolean", DataType::Boolean, true), - DFField::new_unqualified("integer", DataType::Int32, true), - DFField::new_unqualified("float", DataType::Float32, true), - DFField::new_unqualified( + Field::new("boolean", DataType::Boolean, true), + Field::new("integer", DataType::Int32, true), + Field::new("float", DataType::Float32, true), + Field::new( "timestamp", DataType::Timestamp(TimeUnit::Nanosecond, None), true, ), - DFField::new_unqualified("date", DataType::Date32, true), - DFField::new_unqualified( + Field::new("date", DataType::Date32, true), + Field::new( "interval", DataType::Interval(arrow::datatypes::IntervalUnit::MonthDayNano), true, ), - DFField::new_unqualified("binary", DataType::Binary, true), - DFField::new_unqualified("string", DataType::Utf8, true), - DFField::new_unqualified("decimal", DataType::Decimal128(10, 10), true), + Field::new("binary", DataType::Binary, true), + Field::new("string", DataType::Utf8, true), + Field::new("decimal", DataType::Decimal128(10, 10), true), ], std::collections::HashMap::new(), - )?); + )); let case = Case { expr: None, diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index 0c9064d0641f..fa7e853b212a 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -23,13 
+23,13 @@ use std::sync::Arc; use crate::utils::is_volatile_expression; use crate::{utils, OptimizerConfig, OptimizerRule}; -use arrow::datatypes::DataType; +use arrow::datatypes::{DataType, Field}; use datafusion_common::tree_node::{ Transformed, TransformedResult, TreeNode, TreeNodeRecursion, TreeNodeRewriter, TreeNodeVisitor, }; use datafusion_common::{ - internal_err, Column, DFField, DFSchema, DFSchemaRef, DataFusionError, Result, + internal_err, Column, DFSchema, DFSchemaRef, DataFusionError, Result, }; use datafusion_expr::expr::Alias; use datafusion_expr::logical_plan::{Aggregate, LogicalPlan, Projection, Window}; @@ -323,8 +323,11 @@ impl CommonSubexprEliminate { let id = ExprIdentifierVisitor::<'static>::expr_identifier( &expr_rewritten, ); - let out_name = - expr_rewritten.to_field(&new_input_schema)?.qualified_name(); + + let field = expr_rewritten.to_field(&new_input_schema)?; + let dffield = expr_rewritten.to_dffield(&field)?; + let out_name = dffield.qualified_name(); + agg_exprs.push(expr_rewritten.alias(&id)); proj_exprs .push(Expr::Column(Column::from_name(id)).alias(out_name)); @@ -483,7 +486,7 @@ fn build_common_expr_project_plan( match expr_set.get(&id) { Some((expr, _, data_type)) => { // todo: check `nullable` - let field = DFField::new_unqualified(&id, data_type.clone(), true); + let field = Field::new(&id, data_type.clone(), true); fields_set.insert(field.name().to_owned()); project_exprs.push(expr.clone().alias(&id)); } @@ -493,9 +496,10 @@ fn build_common_expr_project_plan( } } - for field in input.schema().fields() { - if fields_set.insert(field.qualified_name()) { - project_exprs.push(Expr::Column(field.qualified_column())); + for field in input.schema().iter() { + let name = field.qualified_name(); + if fields_set.insert(name) { + project_exprs.push(col(field)); } } @@ -513,11 +517,7 @@ fn build_recover_project_plan( schema: &DFSchema, input: LogicalPlan, ) -> Result { - let col_exprs = schema - .fields() - .iter() - .map(|field| 
Expr::Column(field.qualified_column())) - .collect(); + let col_exprs = schema.iter().map(col).collect(); Ok(LogicalPlan::Projection(Projection::try_new( col_exprs, Arc::new(input), @@ -531,10 +531,14 @@ fn extract_expressions( ) -> Result<()> { if let Expr::GroupingSet(groupings) = expr { for e in groupings.distinct_expr() { - result.push(Expr::Column(e.to_field(schema)?.qualified_column())) + let field = e.to_field(schema)?; + let dffield = e.to_dffield(&field)?; + result.push(col(dffield)) } } else { - result.push(Expr::Column(expr.to_field(schema)?.qualified_column())); + let field = expr.to_field(schema)?; + let dffield = expr.to_dffield(&field)?; + result.push(col(dffield)) } Ok(()) @@ -866,11 +870,11 @@ mod test { let schema = Arc::new(DFSchema::new_with_metadata( vec![ - DFField::new_unqualified("a", DataType::Int64, false), - DFField::new_unqualified("c", DataType::Int64, false), + Field::new("a", DataType::Int64, false), + Field::new("c", DataType::Int64, false), ], Default::default(), - )?); + )); // skip aggregates let mut id_array = vec![]; @@ -1196,8 +1200,8 @@ mod test { build_common_expr_project_plan(project, affected_id, &expr_set_2).unwrap(); let mut field_set = BTreeSet::new(); - for field in project_2.schema().fields() { - assert!(field_set.insert(field.qualified_name())); + for name in project_2.schema().field_names() { + assert!(field_set.insert(name)); } } @@ -1245,8 +1249,8 @@ mod test { build_common_expr_project_plan(project, affected_id, &expr_set_2).unwrap(); let mut field_set = BTreeSet::new(); - for field in project_2.schema().fields() { - assert!(field_set.insert(field.qualified_name())); + for name in project_2.schema().field_names() { + assert!(field_set.insert(name)); } } @@ -1324,12 +1328,12 @@ mod test { let grouping = grouping_set(vec![vec![col("a"), col("b")], vec![col("c")]]); let schema = DFSchema::new_with_metadata( vec![ - DFField::new_unqualified("a", DataType::Int32, false), - DFField::new_unqualified("b", DataType::Int32, 
false), - DFField::new_unqualified("c", DataType::Int32, false), + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Int32, false), + Field::new("c", DataType::Int32, false), ], HashMap::default(), - )?; + ); extract_expressions(&grouping, &schema, &mut result)?; assert!(result.len() == 3); @@ -1342,11 +1346,11 @@ mod test { let grouping = grouping_set(vec![vec![col("a"), col("b")], vec![col("a")]]); let schema = DFSchema::new_with_metadata( vec![ - DFField::new_unqualified("a", DataType::Int32, false), - DFField::new_unqualified("b", DataType::Int32, false), + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Int32, false), ], HashMap::default(), - )?; + ); extract_expressions(&grouping, &schema, &mut result)?; assert!(result.len() == 2); @@ -1357,9 +1361,9 @@ mod test { fn test_extract_expressions_from_col() -> Result<()> { let mut result = Vec::with_capacity(1); let schema = DFSchema::new_with_metadata( - vec![DFField::new_unqualified("a", DataType::Int32, false)], + vec![Field::new("a", DataType::Int32, false)], HashMap::default(), - )?; + ); extract_expressions(&col("a"), &schema, &mut result)?; assert!(result.len() == 1); diff --git a/datafusion/optimizer/src/optimize_projections.rs b/datafusion/optimizer/src/optimize_projections.rs index 08ee38f64abd..4577431c6e6a 100644 --- a/datafusion/optimizer/src/optimize_projections.rs +++ b/datafusion/optimizer/src/optimize_projections.rs @@ -33,6 +33,7 @@ use arrow::datatypes::SchemaRef; use datafusion_common::{ get_required_group_by_exprs_indices, Column, DFSchema, DFSchemaRef, JoinType, Result, }; +use datafusion_expr::col; use datafusion_expr::expr::{Alias, ScalarFunction}; use datafusion_expr::{ logical_plan::LogicalPlan, projection_schema, Aggregate, BinaryExpr, Cast, Distinct, @@ -638,10 +639,12 @@ fn outer_columns_helper_multi<'a>( /// /// A vector of `Expr::Column` expressions residing at `indices` of the `input_schema`. 
fn get_required_exprs(input_schema: &Arc, indices: &[usize]) -> Vec { - let fields = input_schema.fields(); indices .iter() - .map(|&idx| Expr::Column(fields[idx].qualified_column())) + .map(|&idx| { + let dffield = input_schema.qualified_field(idx); + col(dffield) + }) .collect() } diff --git a/datafusion/optimizer/src/optimizer.rs b/datafusion/optimizer/src/optimizer.rs index fe63766fc265..14e85df8eb5b 100644 --- a/datafusion/optimizer/src/optimizer.rs +++ b/datafusion/optimizer/src/optimizer.rs @@ -467,7 +467,10 @@ mod tests { use crate::test::test_table_scan; use crate::{OptimizerConfig, OptimizerContext, OptimizerRule}; - use datafusion_common::{plan_err, DFField, DFSchema, DFSchemaRef, Result}; + use arrow::datatypes::FieldRef; + use datafusion_common::{ + plan_err, DFSchema, DFSchemaRef, OwnedTableReference, Result, + }; use datafusion_expr::logical_plan::EmptyRelation; use datafusion_expr::{col, lit, LogicalPlan, LogicalPlanBuilder, Projection}; @@ -509,14 +512,18 @@ mod tests { let err = opt.optimize(&plan, &config, &observe).unwrap_err(); assert_eq!( "Optimizer rule 'get table_scan rule' failed\ncaused by\nget table_scan rule\ncaused by\n\ - Internal error: Failed due to a difference in schemas, \ - original schema: DFSchema { fields: [\ - DFField { qualifier: Some(Bare { table: \"test\" }), field: Field { name: \"a\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, \ - DFField { qualifier: Some(Bare { table: \"test\" }), field: Field { name: \"b\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, \ - DFField { qualifier: Some(Bare { table: \"test\" }), field: Field { name: \"c\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }], \ - metadata: {}, functional_dependencies: FunctionalDependencies { deps: [] } }, \ - new schema: DFSchema { fields: [], metadata: {}, functional_dependencies: FunctionalDependencies { deps: [] } }.\ - 
\nThis was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker", + Internal error: Failed due to a difference in schemas, original schema: \ + DFSchema { inner: Schema { fields: \ + [Field { name: \"a\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, \ + Field { name: \"b\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, \ + Field { name: \"c\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {} }, \ + field_qualifiers: [Some(Bare { table: \"test\" }), Some(Bare { table: \"test\" }), Some(Bare { table: \"test\" })], \ + functional_dependencies: FunctionalDependencies { deps: [] } }, \ + new schema: DFSchema { inner: Schema { \ + fields: [], metadata: {} }, \ + field_qualifiers: [], \ + functional_dependencies: FunctionalDependencies { deps: [] } }.\n\ + This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker", err.strip_backtrace() ); } @@ -609,25 +616,23 @@ mod tests { } fn add_metadata_to_fields(schema: &DFSchema) -> DFSchemaRef { - let new_fields = schema - .fields() - .iter() - .enumerate() - .map(|(i, f)| { - let metadata = - [("key".into(), format!("value {i}"))].into_iter().collect(); - - let new_arrow_field = f.field().as_ref().clone().with_metadata(metadata); - if let Some(qualifier) = f.qualifier() { - DFField::from_qualified(qualifier.clone(), new_arrow_field) - } else { - DFField::from(new_arrow_field) - } - }) - .collect::>(); + let (qualifiers, fields): (Vec>, Vec) = + schema + .iter() + .enumerate() + .map(|(i, dfffield)| { + let metadata = + [("key".into(), format!("value {i}"))].into_iter().collect(); + let field = dfffield.owned_field().with_metadata(metadata); + (dfffield.owned_qualifier(), Arc::new(field)) + }) + .unzip(); let new_metadata = schema.metadata().clone(); - 
Arc::new(DFSchema::new_with_metadata(new_fields, new_metadata).unwrap()) + Arc::new( + DFSchema::from_owned_qualified_fields(qualifiers, fields, new_metadata) + .unwrap(), + ) } fn observe(_plan: &LogicalPlan, _rule: &dyn OptimizerRule) {} diff --git a/datafusion/optimizer/src/propagate_empty_relation.rs b/datafusion/optimizer/src/propagate_empty_relation.rs index d1f9f87a32a3..2c08ae9bd2fb 100644 --- a/datafusion/optimizer/src/propagate_empty_relation.rs +++ b/datafusion/optimizer/src/propagate_empty_relation.rs @@ -188,7 +188,7 @@ mod tests { test_table_scan_fields, test_table_scan_with_name, }; use arrow::datatypes::{DataType, Field, Schema}; - use datafusion_common::{Column, DFField, DFSchema, ScalarValue}; + use datafusion_common::{Column, DFSchema, ScalarValue}; use datafusion_expr::logical_plan::table_scan; use datafusion_expr::{ binary_expr, col, lit, logical_plan::builder::LogicalPlanBuilder, Expr, JoinType, @@ -373,14 +373,11 @@ mod tests { fn test_empty_with_non_empty() -> Result<()> { let table_scan = test_table_scan()?; - let fields = test_table_scan_fields() - .into_iter() - .map(DFField::from) - .collect(); + let fields = test_table_scan_fields(); let empty = LogicalPlan::EmptyRelation(EmptyRelation { produce_one_row: false, - schema: Arc::new(DFSchema::new_with_metadata(fields, Default::default())?), + schema: Arc::new(DFSchema::new_with_metadata(fields, Default::default())), }); let one = LogicalPlanBuilder::from(empty.clone()).build()?; diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs index e93e171e0324..9532fdd279f1 100644 --- a/datafusion/optimizer/src/push_down_filter.rs +++ b/datafusion/optimizer/src/push_down_filter.rs @@ -36,8 +36,8 @@ use datafusion_expr::logical_plan::{ }; use datafusion_expr::utils::{conjunction, split_conjunction, split_conjunction_owned}; use datafusion_expr::{ - and, build_join_schema, or, BinaryExpr, Expr, Filter, LogicalPlanBuilder, Operator, - 
ScalarFunctionDefinition, TableProviderFilterPushDown, + and, build_join_schema, col, or, BinaryExpr, Expr, Filter, LogicalPlanBuilder, + Operator, ScalarFunctionDefinition, TableProviderFilterPushDown, }; use itertools::Itertools; @@ -198,13 +198,12 @@ fn on_lr_is_preserved(plan: &LogicalPlan) -> Result<(bool, bool)> { // relevant columns are contained on the relevant join side's schema. fn can_pushdown_join_predicate(predicate: &Expr, schema: &DFSchema) -> Result { let schema_columns = schema - .fields() .iter() - .flat_map(|f| { + .flat_map(|dffield| { [ - f.qualified_column(), // we need to push down filter using unqualified column as well - f.unqualified_column(), + Column::new_unqualified(dffield.name()), + dffield.into(), ] }) .collect::>(); @@ -305,13 +304,12 @@ fn extract_or_clauses_for_join<'a>( schema: &'a DFSchema, ) -> impl Iterator + 'a { let schema_columns = schema - .fields() .iter() - .flat_map(|f| { + .flat_map(|dffield| { [ - f.qualified_column(), // we need to push down filter using unqualified column as well - f.unqualified_column(), + Column::new_unqualified(dffield.name()), + dffield.into(), ] }) .collect::>(); @@ -672,18 +670,9 @@ impl OptimizerRule for PushDownFilter { } LogicalPlan::SubqueryAlias(subquery_alias) => { let mut replace_map = HashMap::new(); - for (i, field) in - subquery_alias.input.schema().fields().iter().enumerate() - { - replace_map.insert( - subquery_alias - .schema - .fields() - .get(i) - .unwrap() - .qualified_name(), - Expr::Column(field.qualified_column()), - ); + for (i, dffield) in subquery_alias.input.schema().iter().enumerate() { + let sub_dffield = subquery_alias.schema.qualified_field(i); + replace_map.insert(sub_dffield.qualified_name(), col(dffield)); } let new_predicate = replace_cols_by_name(filter.predicate.clone(), &replace_map)?; @@ -700,17 +689,16 @@ impl OptimizerRule for PushDownFilter { let (volatile_map, non_volatile_map): (HashMap<_, _>, HashMap<_, _>) = projection .schema - .fields() .iter() 
.enumerate() - .map(|(i, field)| { + .map(|(i, dffield)| { // strip alias, as they should not be part of filters let expr = match &projection.expr[i] { Expr::Alias(Alias { expr, .. }) => expr.as_ref().clone(), expr => expr.clone(), }; - (field.qualified_name(), expr) + (dffield.qualified_name(), expr) }) .partition(|(_, value)| { is_volatile_expression(value).unwrap_or(true) @@ -760,11 +748,9 @@ impl OptimizerRule for PushDownFilter { let mut inputs = Vec::with_capacity(union.inputs.len()); for input in &union.inputs { let mut replace_map = HashMap::new(); - for (i, field) in input.schema().fields().iter().enumerate() { - replace_map.insert( - union.schema.fields().get(i).unwrap().qualified_name(), - Expr::Column(field.qualified_column()), - ); + for (i, dffield) in input.schema().iter().enumerate() { + let union_field = union.schema.qualified_field(i); + replace_map.insert(union_field.qualified_name(), col(dffield)); } let push_predicate = diff --git a/datafusion/optimizer/src/push_down_projection.rs b/datafusion/optimizer/src/push_down_projection.rs index 28b3ff090fe6..8da353ecabb1 100644 --- a/datafusion/optimizer/src/push_down_projection.rs +++ b/datafusion/optimizer/src/push_down_projection.rs @@ -29,7 +29,9 @@ mod tests { use crate::test::*; use crate::OptimizerContext; use arrow::datatypes::{DataType, Field, Schema}; - use datafusion_common::{Column, DFField, DFSchema, Result}; + use datafusion_common::DFFieldRef; + use datafusion_common::OwnedTableReference; + use datafusion_common::{Column, DFSchema, Result}; use datafusion_expr::builder::table_scan_with_filters; use datafusion_expr::expr::{self, Cast}; use datafusion_expr::logical_plan::{ @@ -221,15 +223,22 @@ mod tests { // make sure schema for join node include both join columns let optimized_join = optimized_plan; + + let field_a = Field::new("a", DataType::UInt32, false); + let field_b = Field::new("b", DataType::UInt32, false); + let field_c1 = Field::new("c1", DataType::UInt32, true); + let 
qualifier_test: OwnedTableReference = "test".into(); + let qualifier_test2: OwnedTableReference = "test2".into(); + assert_eq!( **optimized_join.schema(), - DFSchema::new_with_metadata( + DFSchema::from_qualified_fields( vec![ - DFField::new(Some("test"), "a", DataType::UInt32, false), - DFField::new(Some("test"), "b", DataType::UInt32, false), - DFField::new(Some("test2"), "c1", DataType::UInt32, true), + DFFieldRef::new(Some(&qualifier_test), &field_a), + DFFieldRef::new(Some(&qualifier_test), &field_b), + DFFieldRef::new(Some(&qualifier_test2), &field_c1), ], - HashMap::new(), + HashMap::new() )?, ); @@ -264,15 +273,20 @@ mod tests { // make sure schema for join node include both join columns let optimized_join = optimized_plan.inputs()[0]; + let field_a = Field::new("a", DataType::UInt32, false); + let field_b = Field::new("b", DataType::UInt32, false); + let field_c1 = Field::new("c1", DataType::UInt32, true); + let qualifier_test: OwnedTableReference = "test".into(); + let qualifier_test2: OwnedTableReference = "test2".into(); assert_eq!( **optimized_join.schema(), - DFSchema::new_with_metadata( + DFSchema::from_qualified_fields( vec![ - DFField::new(Some("test"), "a", DataType::UInt32, false), - DFField::new(Some("test"), "b", DataType::UInt32, false), - DFField::new(Some("test2"), "c1", DataType::UInt32, true), + DFFieldRef::new(Some(&qualifier_test), &field_a), + DFFieldRef::new(Some(&qualifier_test), &field_b), + DFFieldRef::new(Some(&qualifier_test2), &field_c1), ], - HashMap::new(), + HashMap::new() )?, ); @@ -305,15 +319,20 @@ mod tests { // make sure schema for join node include both join columns let optimized_join = optimized_plan.inputs()[0]; + let field_a = Field::new("a", DataType::UInt32, false); + let field_a_nullable = Field::new("a", DataType::UInt32, true); + let field_b = Field::new("b", DataType::UInt32, false); + let qualifier_test: OwnedTableReference = "test".into(); + let qualifier_test2: OwnedTableReference = "test2".into(); 
assert_eq!( **optimized_join.schema(), - DFSchema::new_with_metadata( + DFSchema::from_qualified_fields( vec![ - DFField::new(Some("test"), "a", DataType::UInt32, false), - DFField::new(Some("test"), "b", DataType::UInt32, false), - DFField::new(Some("test2"), "a", DataType::UInt32, true), + DFFieldRef::new(Some(&qualifier_test), &field_a), + DFFieldRef::new(Some(&qualifier_test), &field_b), + DFFieldRef::new(Some(&qualifier_test2), &field_a_nullable), ], - HashMap::new(), + HashMap::new() )?, ); diff --git a/datafusion/optimizer/src/replace_distinct_aggregate.rs b/datafusion/optimizer/src/replace_distinct_aggregate.rs index 0666c324d12c..ceb229ba3afc 100644 --- a/datafusion/optimizer/src/replace_distinct_aggregate.rs +++ b/datafusion/optimizer/src/replace_distinct_aggregate.rs @@ -122,15 +122,12 @@ impl OptimizerRule for ReplaceDistinctWithAggregate { // expressions, for `DISTINCT ON` we only need to emit the original selection expressions. let project_exprs = plan .schema() - .fields() .iter() .skip(on_expr.len()) - .zip(schema.fields().iter()) - .map(|(new_field, old_field)| { - Ok(col(new_field.qualified_column()).alias_qualified( - old_field.qualifier().cloned(), - old_field.name(), - )) + .zip(schema.iter()) + .map(|(new_dffield, dffield)| { + Ok(col(new_dffield) + .alias_qualified(dffield.owned_qualifier(), dffield.name())) }) .collect::>>()?; diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 1cbe7decf15b..c7d33c9be40e 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -1729,7 +1729,7 @@ mod tests { use crate::test::test_table_scan_with_name; use arrow::datatypes::{DataType, Field, Schema}; - use datafusion_common::{assert_contains, DFField, ToDFSchema}; + use datafusion_common::{assert_contains, ToDFSchema}; use 
datafusion_expr::{interval_arithmetic::Interval, *}; use datafusion_physical_expr::execution_props::ExecutionProps; @@ -3084,22 +3084,19 @@ mod tests { } fn expr_test_schema() -> DFSchemaRef { - Arc::new( - DFSchema::new_with_metadata( - vec![ - DFField::new_unqualified("c1", DataType::Utf8, true), - DFField::new_unqualified("c2", DataType::Boolean, true), - DFField::new_unqualified("c3", DataType::Int64, true), - DFField::new_unqualified("c4", DataType::UInt32, true), - DFField::new_unqualified("c1_non_null", DataType::Utf8, false), - DFField::new_unqualified("c2_non_null", DataType::Boolean, false), - DFField::new_unqualified("c3_non_null", DataType::Int64, false), - DFField::new_unqualified("c4_non_null", DataType::UInt32, false), - ], - HashMap::new(), - ) - .unwrap(), - ) + Arc::new(DFSchema::new_with_metadata( + vec![ + Field::new("c1", DataType::Utf8, true), + Field::new("c2", DataType::Boolean, true), + Field::new("c3", DataType::Int64, true), + Field::new("c4", DataType::UInt32, true), + Field::new("c1_non_null", DataType::Utf8, false), + Field::new("c2_non_null", DataType::Boolean, false), + Field::new("c3_non_null", DataType::Int64, false), + Field::new("c4_non_null", DataType::UInt32, false), + ], + HashMap::new(), + )) } #[test] diff --git a/datafusion/optimizer/src/single_distinct_to_groupby.rs b/datafusion/optimizer/src/single_distinct_to_groupby.rs index 07a9d84f7d48..c434ef4aa6ab 100644 --- a/datafusion/optimizer/src/single_distinct_to_groupby.rs +++ b/datafusion/optimizer/src/single_distinct_to_groupby.rs @@ -118,7 +118,6 @@ impl OptimizerRule for SingleDistinctToGroupBy { .. }) => { if is_single_distinct_agg(plan)? 
&& !contains_grouping_set(group_expr) { - let fields = schema.fields(); // alias all original group_by exprs let (mut inner_group_exprs, out_group_expr_with_alias): ( Vec, @@ -150,9 +149,10 @@ impl OptimizerRule for SingleDistinctToGroupBy { // Second aggregate refers to the `test.a + Int32(1)` expression However, its input do not have `test.a` expression in it. let alias_str = format!("group_alias_{i}"); let alias_expr = group_expr.clone().alias(&alias_str); + let dffield = schema.qualified_field(i); ( alias_expr, - (col(alias_str), Some(fields[i].qualified_name())), + (col(alias_str), Some(dffield.qualified_name())), ) } }) @@ -230,8 +230,16 @@ impl OptimizerRule for SingleDistinctToGroupBy { .chain(inner_aggr_exprs.iter()) .map(|expr| expr.to_field(input.schema())) .collect::>>()?; - let inner_schema = DFSchema::new_with_metadata( - inner_fields, + + let inner_dffields = inner_group_exprs + .iter() + .chain(inner_aggr_exprs.iter()) + .zip(inner_fields.iter()) + .map(|(expr, field)| expr.to_dffield(field)) + .collect::>>()?; + + let inner_schema = DFSchema::from_qualified_fields( + inner_dffields, input.schema().metadata().clone(), )?; let inner_agg = LogicalPlan::Aggregate(Aggregate::try_new( @@ -245,8 +253,16 @@ impl OptimizerRule for SingleDistinctToGroupBy { .chain(outer_aggr_exprs.iter()) .map(|expr| expr.to_field(&inner_schema)) .collect::>>()?; - let outer_aggr_schema = Arc::new(DFSchema::new_with_metadata( - outer_fields, + + let outer_dffields = outer_group_exprs + .iter() + .chain(outer_aggr_exprs.iter()) + .zip(outer_fields.iter()) + .map(|(expr, field)| expr.to_dffield(field)) + .collect::>>()?; + + let outer_aggr_schema = Arc::new(DFSchema::from_qualified_fields( + outer_dffields, input.schema().metadata().clone(), )?); @@ -266,7 +282,8 @@ impl OptimizerRule for SingleDistinctToGroupBy { }) .chain(outer_aggr_exprs.iter().enumerate().map(|(idx, expr)| { let idx = idx + group_size; - let name = fields[idx].qualified_name(); + let field = 
schema.qualified_field(idx); + let name = field.qualified_name(); columnize_expr(expr.clone().alias(name), &outer_aggr_schema) })) .collect(); diff --git a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs index 196a35ee9ae8..ae93eb80f180 100644 --- a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs +++ b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs @@ -484,7 +484,7 @@ mod tests { use arrow::compute::{cast_with_options, CastOptions}; use arrow::datatypes::{DataType, Field}; use datafusion_common::tree_node::{TransformedResult, TreeNode}; - use datafusion_common::{DFField, DFSchema, DFSchemaRef, ScalarValue}; + use datafusion_common::{DFSchema, DFSchemaRef, ScalarValue}; use datafusion_expr::{cast, col, in_list, lit, try_cast, Expr}; #[test] @@ -739,30 +739,19 @@ mod tests { } fn expr_test_schema() -> DFSchemaRef { - Arc::new( - DFSchema::new_with_metadata( - vec![ - DFField::new_unqualified("c1", DataType::Int32, false), - DFField::new_unqualified("c2", DataType::Int64, false), - DFField::new_unqualified("c3", DataType::Decimal128(18, 2), false), - DFField::new_unqualified("c4", DataType::Decimal128(38, 37), false), - DFField::new_unqualified("c5", DataType::Float32, false), - DFField::new_unqualified("c6", DataType::UInt32, false), - DFField::new_unqualified( - "ts_nano_none", - timestamp_nano_none_type(), - false, - ), - DFField::new_unqualified( - "ts_nano_utf", - timestamp_nano_utc_type(), - false, - ), - ], - HashMap::new(), - ) - .unwrap(), - ) + Arc::new(DFSchema::new_with_metadata( + vec![ + Field::new("c1", DataType::Int32, false), + Field::new("c2", DataType::Int64, false), + Field::new("c3", DataType::Decimal128(18, 2), false), + Field::new("c4", DataType::Decimal128(38, 37), false), + Field::new("c5", DataType::Float32, false), + Field::new("c6", DataType::UInt32, false), + Field::new("ts_nano_none", timestamp_nano_none_type(), false), + Field::new("ts_nano_utf", 
timestamp_nano_utc_type(), false), + ], + HashMap::new(), + )) } fn null_i8() -> Expr { diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index 64ceb37d2961..53a1ac51d7bb 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -41,7 +41,7 @@ use arrow::{ use datafusion::execution::registry::FunctionRegistry; use datafusion_common::{ arrow_datafusion_err, internal_err, plan_datafusion_err, Column, Constraint, - Constraints, DFField, DFSchema, DFSchemaRef, DataFusionError, OwnedTableReference, + Constraints, DFFieldRef, DFSchema, DFSchemaRef, DataFusionError, OwnedTableReference, Result, ScalarValue, }; use datafusion_expr::expr::Unnest; @@ -173,13 +173,35 @@ impl TryFrom<&protobuf::DfSchema> for DFSchema { type Error = Error; fn try_from(df_schema: &protobuf::DfSchema) -> Result { - let fields = df_schema - .columns + let df_fields = df_schema.columns.clone(); + + let fields = df_fields + .iter() + .map(|df_field| { + let field: Result = + df_field.field.as_ref().required("field"); + field + }) + .collect::, Error>>()?; + + let qualifiers = df_fields + .iter() + .map(|df_field| { + df_field + .qualifier + .as_ref() + .map(|q| q.relation.clone().into()) + }) + .collect::>(); + + let qualifiers_and_fields = qualifiers .iter() - .map(|c| c.try_into()) - .collect::, _>>()?; - Ok(DFSchema::new_with_metadata( - fields, + .zip(fields.iter()) + .map(|(q, field)| Ok(DFFieldRef::new(q.as_ref(), field))) + .collect::, Error>>()?; + + Ok(DFSchema::from_qualified_fields( + qualifiers_and_fields, df_schema.metadata.clone(), )?) 
} @@ -194,19 +216,6 @@ impl TryFrom for DFSchemaRef { } } -impl TryFrom<&protobuf::DfField> for DFField { - type Error = Error; - - fn try_from(df_field: &protobuf::DfField) -> Result { - let field: Field = df_field.field.as_ref().required("field")?; - - Ok(match &df_field.qualifier { - Some(q) => DFField::from_qualified(q.relation.clone(), field), - None => DFField::from(field), - }) - } -} - impl From for WindowFrameUnits { fn from(units: protobuf::WindowFrameUnits) -> Self { match units { diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs index 89bd93550a04..fd37ed35edae 100644 --- a/datafusion/proto/src/logical_plan/to_proto.rs +++ b/datafusion/proto/src/logical_plan/to_proto.rs @@ -43,7 +43,7 @@ use arrow::{ record_batch::RecordBatch, }; use datafusion_common::{ - Column, Constraint, Constraints, DFField, DFSchema, DFSchemaRef, OwnedTableReference, + Column, Constraint, Constraints, DFSchema, DFSchemaRef, OwnedTableReference, ScalarValue, }; use datafusion_expr::expr::{ @@ -287,27 +287,20 @@ impl TryFrom for protobuf::Schema { } } -impl TryFrom<&DFField> for protobuf::DfField { - type Error = Error; - - fn try_from(f: &DFField) -> Result { - Ok(Self { - field: Some(f.field().as_ref().try_into()?), - qualifier: f.qualifier().map(|r| protobuf::ColumnRelation { - relation: r.to_string(), - }), - }) - } -} - impl TryFrom<&DFSchema> for protobuf::DfSchema { type Error = Error; fn try_from(s: &DFSchema) -> Result { let columns = s - .fields() .iter() - .map(|f| f.try_into()) + .map(|dffield| { + Ok(protobuf::DfField { + field: Some(dffield.field().try_into()?), + qualifier: dffield.qualifier().map(|r| protobuf::ColumnRelation { + relation: r.to_string(), + }), + }) + }) .collect::, Error>>()?; Ok(Self { columns, diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index 3c43f100750f..3002537682b2 100644 --- 
a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -19,6 +19,7 @@ use std::any::Any; use std::collections::HashMap; use std::fmt::{self, Debug, Formatter}; use std::sync::Arc; +use std::vec; use arrow::array::{ArrayRef, FixedSizeListArray}; use arrow::datatypes::{ @@ -34,8 +35,8 @@ use datafusion::test_util::{TestTableFactory, TestTableProvider}; use datafusion_common::config::{FormatOptions, TableOptions}; use datafusion_common::scalar::ScalarStructBuilder; use datafusion_common::{ - internal_err, not_impl_err, plan_err, DFField, DFSchema, DFSchemaRef, - DataFusionError, FileType, Result, ScalarValue, + internal_err, not_impl_err, plan_err, DFFieldRef, DFSchema, DFSchemaRef, + DataFusionError, FileType, OwnedTableReference, Result, ScalarValue, }; use datafusion_expr::dml::CopyTo; use datafusion_expr::expr::{ @@ -1410,11 +1411,15 @@ fn roundtrip_schema() { #[test] fn roundtrip_dfschema() { - let dfschema = DFSchema::new_with_metadata( + let field_a = Field::new("a", DataType::Int64, false); + let field_b = Field::new("b", DataType::Decimal128(15, 2), true) + .with_metadata(HashMap::from([(String::from("k1"), String::from("v1"))])); + let qualifier_t: OwnedTableReference = "t".into(); + + let dfschema = DFSchema::from_qualified_fields( vec![ - DFField::new_unqualified("a", DataType::Int64, false), - DFField::new(Some("t"), "b", DataType::Decimal128(15, 2), true) - .with_metadata(HashMap::from([(String::from("k1"), String::from("v1"))])), + DFFieldRef::new(None, &field_a), + DFFieldRef::new(Some(&qualifier_t), &field_b), ], HashMap::from([ (String::from("k2"), String::from("v2")), diff --git a/datafusion/sql/src/expr/identifier.rs b/datafusion/sql/src/expr/identifier.rs index 9f53ff579e7c..8424a67dc0f1 100644 --- a/datafusion/sql/src/expr/identifier.rs +++ b/datafusion/sql/src/expr/identifier.rs @@ -17,10 +17,10 @@ use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use 
datafusion_common::{ - internal_err, plan_datafusion_err, Column, DFField, DFSchema, DataFusionError, + internal_err, plan_datafusion_err, Column, DFFieldRef, DFSchema, DataFusionError, Result, TableReference, }; -use datafusion_expr::{Case, Expr}; +use datafusion_expr::{col, Case, Expr}; use sqlparser::ast::{Expr as SQLExpr, Ident}; impl<'a, S: ContextProvider> SqlToRel<'a, S> { @@ -57,13 +57,14 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { Err(_) => { // check the outer_query_schema and try to find a match if let Some(outer) = planner_context.outer_query_schema() { - match outer.field_with_unqualified_name(normalize_ident.as_str()) - { + match outer.qualified_field_with_unqualified_name( + normalize_ident.as_str(), + ) { Ok(field) => { // found an exact match on a qualified name in the outer plan schema, so this is an outer reference column Ok(Expr::OuterReferenceColumn( field.data_type().clone(), - field.qualified_column(), + field.into(), )) } Err(_) => Ok(Expr::Column(Column { @@ -122,21 +123,20 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let search_result = search_dfschema(&ids, schema); match search_result { // found matching field with spare identifier(s) for nested field(s) in structure - Some((field, nested_names)) if !nested_names.is_empty() => { + Some((dffield, nested_names)) if !nested_names.is_empty() => { // TODO: remove when can support multiple nested identifiers if nested_names.len() > 1 { + let col: Column = dffield.into(); return internal_err!( "Nested identifiers not yet supported for column {}", - field.qualified_column().quoted_flat_name() + col.quoted_flat_name() ); } let nested_name = nested_names[0].to_string(); - Ok(Expr::Column(field.qualified_column()).field(nested_name)) + Ok(col(dffield).field(nested_name)) } // found matching field with no spare identifier(s) - Some((field, _nested_names)) => { - Ok(Expr::Column(field.qualified_column())) - } + Some((dffield, _nested_names)) => Ok(col(dffield)), None => { // return default 
where use all identifiers to not have a nested field // this len check is because at 5 identifiers will have to have a nested field @@ -152,9 +152,10 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { if !nested_names.is_empty() => { // TODO: remove when can support nested identifiers for OuterReferenceColumn + let col: Column = field.into(); internal_err!( "Nested identifiers are not yet supported for OuterReferenceColumn {}", - field.qualified_column().quoted_flat_name() + col.quoted_flat_name() ) } // found matching field with no spare identifier(s) @@ -162,7 +163,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // found an exact match on a qualified name in the outer plan schema, so this is an outer reference column Ok(Expr::OuterReferenceColumn( field.data_type().clone(), - field.qualified_column(), + field.into(), )) } // found no matching field, will return a default @@ -269,11 +270,17 @@ fn form_identifier(idents: &[String]) -> Result<(Option, &String fn search_dfschema<'ids, 'schema>( ids: &'ids [String], schema: &'schema DFSchema, -) -> Option<(&'schema DFField, &'ids [String])> { - generate_schema_search_terms(ids).find_map(|(qualifier, column, nested_names)| { - let field = schema.field_with_name(qualifier.as_ref(), column).ok(); - field.map(|f| (f, nested_names)) - }) +) -> Option<(DFFieldRef<'schema>, &'ids [String])> { + let res = generate_schema_search_terms(ids).find_map( + |(qualifier, column, nested_names)| { + let qualifier_and_field = schema + .qualified_field_with_name(qualifier.as_ref(), column) + .ok(); + qualifier_and_field.map(|f| (f, nested_names)) + }, + ); + + res } // Possibilities we search with, in order from top to bottom for each len: diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index c34b42193cec..4e09b3bf3b30 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -30,8 +30,8 @@ use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use 
arrow_schema::DataType; use arrow_schema::TimeUnit; use datafusion_common::{ - internal_datafusion_err, internal_err, not_impl_err, plan_err, Column, DFSchema, - Result, ScalarValue, + internal_datafusion_err, internal_err, not_impl_err, plan_err, DFSchema, Result, + ScalarValue, }; use datafusion_expr::expr::AggregateFunctionDefinition; use datafusion_expr::expr::InList; @@ -133,20 +133,14 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { match expr { Expr::Column(col) => match &col.relation { Some(q) => { - match schema - .fields() - .iter() - .find(|field| match field.qualifier() { - Some(field_q) => { - field.name() == &col.name - && field_q.to_string().ends_with(&format!(".{q}")) - } - _ => false, - }) { - Some(df_field) => Expr::Column(Column { - relation: df_field.qualifier().cloned(), - name: df_field.name().clone(), - }), + match schema.iter().find(|f| match f.qualifier() { + Some(field_q) => { + f.name() == &col.name + && field_q.to_string().ends_with(&format!(".{q}")) + } + _ => false, + }) { + Some(dffield) => Expr::Column(dffield.into()), None => Expr::Column(col), } } diff --git a/datafusion/sql/src/expr/order_by.rs b/datafusion/sql/src/expr/order_by.rs index 46f19f436ccc..6ce4a26a11ab 100644 --- a/datafusion/sql/src/expr/order_by.rs +++ b/datafusion/sql/src/expr/order_by.rs @@ -17,8 +17,8 @@ use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use datafusion_common::{plan_datafusion_err, plan_err, DFSchema, Result}; -use datafusion_expr::expr::Sort; use datafusion_expr::Expr; +use datafusion_expr::{col, expr::Sort}; use sqlparser::ast::{Expr as SQLExpr, OrderByExpr, Value}; impl<'a, S: ContextProvider> SqlToRel<'a, S> { @@ -60,8 +60,8 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { ); } - let field = schema.field(field_index - 1); - Expr::Column(field.qualified_column()) + let dffield = schema.qualified_field(field_index - 1); + col(dffield) } e => self.sql_expr_to_logical_expr(e.clone(), schema, planner_context)?, }; diff --git 
a/datafusion/sql/src/relation/join.rs b/datafusion/sql/src/relation/join.rs index 4ba089f48630..262bae397cee 100644 --- a/datafusion/sql/src/relation/join.rs +++ b/datafusion/sql/src/relation/join.rs @@ -145,17 +145,13 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { .build() } JoinConstraint::Natural => { - let left_cols: HashSet<&String> = left - .schema() - .fields() - .iter() - .map(|f| f.field().name()) - .collect(); + let left_cols: HashSet<&String> = + left.schema().fields().iter().map(|f| f.name()).collect(); let keys: Vec = right .schema() .fields() .iter() - .map(|f| f.field().name()) + .map(|f| f.name()) .filter(|f| left_cols.contains(f)) .map(Column::from_name) .collect(); diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index e50aceb757df..08245620d752 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -29,11 +29,11 @@ use crate::planner::{ }; use crate::utils::normalize_ident; -use arrow_schema::DataType; +use arrow_schema::{DataType, Fields}; use datafusion_common::parsers::CompressionTypeVariant; use datafusion_common::{ exec_err, not_impl_err, plan_datafusion_err, plan_err, schema_err, - unqualified_field_not_found, Column, Constraints, DFField, DFSchema, DFSchemaRef, + unqualified_field_not_found, Column, Constraints, DFSchema, DFSchemaRef, DataFusionError, FileType, OwnedTableReference, Result, ScalarValue, SchemaError, SchemaReference, TableReference, ToDFSchema, }; @@ -1250,7 +1250,6 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // Build updated values for each column, using the previous value if not modified let exprs = table_schema - .fields() .iter() .map(|field| { let expr = match assign_map.remove(field.name()) { @@ -1279,7 +1278,10 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { field.name(), )) } else { - datafusion_expr::Expr::Column(field.qualified_column()) + datafusion_expr::Expr::Column(Column::new( + field.owned_qualifier(), + field.name(), + )) } } }; @@ 
-1345,8 +1347,8 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } Ok(table_schema.field(column_index).clone()) }) - .collect::>>()?; - (fields, value_indices) + .collect::>>()?; + (Fields::from(fields), value_indices) }; // infer types for Values clause... other types should be resolvable the regular way @@ -1365,7 +1367,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { idx + 1 ) })?; - let dt = field.field().data_type().clone(); + let dt = field.data_type().clone(); let _ = prepare_param_data_types.insert(name, dt); } } @@ -1388,9 +1390,8 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let target_field = table_schema.field(i); let expr = match value_index { Some(v) => { - let source_field = source.schema().field(v); - datafusion_expr::Expr::Column(source_field.qualified_column()) - .cast_to(target_field.data_type(), source.schema())? + let dffield = source.schema().qualified_field(v); + col(dffield).cast_to(target_field.data_type(), source.schema())? } // The value is not specified. Fill in the default value for the column. 
None => table_source diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs index abb896ab113e..b9212e777823 100644 --- a/datafusion/sql/src/utils.rs +++ b/datafusion/sql/src/utils.rs @@ -28,7 +28,7 @@ use datafusion_common::{ }; use datafusion_expr::expr::{Alias, GroupingSet, WindowFunction}; use datafusion_expr::utils::{expr_as_column_expr, find_column_exprs}; -use datafusion_expr::{expr_vec_fmt, Expr, LogicalPlan}; +use datafusion_expr::{col, expr_vec_fmt, Expr, LogicalPlan}; use sqlparser::ast::Ident; /// Make a best-effort attempt at resolving all columns in the expression tree @@ -36,9 +36,9 @@ pub(crate) fn resolve_columns(expr: &Expr, plan: &LogicalPlan) -> Result { expr.clone() .transform_up(&|nested_expr| { match nested_expr { - Expr::Column(col) => { - let field = plan.schema().field_from_column(&col)?; - Ok(Transformed::yes(Expr::Column(field.qualified_column()))) + Expr::Column(c) => { + let dffield = plan.schema().qualified_field_from_column(&c)?; + Ok(Transformed::yes(col(dffield))) } _ => { // keep recursing diff --git a/datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs b/datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs index c0db111bc60d..04e80b77bb9f 100644 --- a/datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs +++ b/datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs @@ -15,10 +15,10 @@ // specific language governing permissions and limitations // under the License. +use arrow::datatypes::Fields; use arrow::util::display::ArrayFormatter; use arrow::{array, array::ArrayRef, datatypes::DataType, record_batch::RecordBatch}; use datafusion_common::format::DEFAULT_FORMAT_OPTIONS; -use datafusion_common::DFField; use datafusion_common::DataFusionError; use std::path::PathBuf; use std::sync::OnceLock; @@ -239,7 +239,7 @@ pub fn cell_to_string(col: &ArrayRef, row: usize) -> Result { } /// Converts columns to a result as expected by sqllogicteset. 
-pub(crate) fn convert_schema_to_types(columns: &[DFField]) -> Vec { +pub(crate) fn convert_schema_to_types(columns: &Fields) -> Vec { columns .iter() .map(|f| f.data_type()) diff --git a/datafusion/substrait/src/logical_plan/consumer.rs b/datafusion/substrait/src/logical_plan/consumer.rs index ed1e48ca71a6..9348214434a5 100644 --- a/datafusion/substrait/src/logical_plan/consumer.rs +++ b/datafusion/substrait/src/logical_plan/consumer.rs @@ -18,7 +18,7 @@ use async_recursion::async_recursion; use datafusion::arrow::datatypes::{DataType, Field, TimeUnit}; use datafusion::common::{ - not_impl_err, substrait_datafusion_err, substrait_err, DFField, DFSchema, DFSchemaRef, + not_impl_err, substrait_datafusion_err, substrait_err, DFSchema, DFSchemaRef, }; use datafusion::execution::FunctionRegistry; @@ -27,7 +27,7 @@ use datafusion::logical_expr::{ Case, Expr, LogicalPlan, Operator, }; use datafusion::logical_expr::{ - expr, Cast, Extension, GroupingSet, Like, LogicalPlanBuilder, Partitioning, + col, expr, Cast, Extension, GroupingSet, Like, LogicalPlanBuilder, Partitioning, Repartition, ScalarUDF, Subquery, WindowFrameBound, WindowFrameUnits, }; use datafusion::prelude::JoinType; @@ -484,17 +484,20 @@ pub async fn from_substrait_rel( .collect(); match &t { LogicalPlan::TableScan(scan) => { - let fields: Vec = column_indices + let fields = column_indices .iter() - .map(|i| scan.projected_schema.field(*i).clone()) + .map(|i| { + scan.projected_schema.qualified_field(*i) + }) .collect(); let mut scan = scan.clone(); scan.projection = Some(column_indices); - scan.projected_schema = - DFSchemaRef::new(DFSchema::new_with_metadata( + scan.projected_schema = DFSchemaRef::new( + DFSchema::from_qualified_fields( fields, HashMap::new(), - )?); + )?, + ); Ok(LogicalPlan::TableScan(scan)) } _ => plan_err!("unexpected plan for table"), @@ -1390,11 +1393,8 @@ fn from_substrait_field_reference( "Direct reference StructField with child is not supported" ), None => { - let column = 
input_schema.field(x.field as usize).qualified_column(); - Ok(Expr::Column(Column { - relation: column.relation, - name: column.name, - })) + let dffield = input_schema.qualified_field(x.field as usize); + Ok(col(dffield)) } }, _ => not_impl_err!(