From ea83c37d3661d128045e7da3ff51e9bbea74d1bb Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Thu, 10 Aug 2023 14:44:46 +0300 Subject: [PATCH 01/29] separate implementation of oeq properties --- .../core/src/physical_plan/joins/utils.rs | 95 +++++++------- datafusion/physical-expr/src/equivalence.rs | 120 ++++++++++++------ datafusion/physical-expr/src/utils.rs | 34 +++-- 3 files changed, 152 insertions(+), 97 deletions(-) diff --git a/datafusion/core/src/physical_plan/joins/utils.rs b/datafusion/core/src/physical_plan/joins/utils.rs index abba191f047b..60497782dff3 100644 --- a/datafusion/core/src/physical_plan/joins/utils.rs +++ b/datafusion/core/src/physical_plan/joins/utils.rs @@ -322,20 +322,24 @@ pub fn cross_join_equivalence_properties( /// when join schema consist of combination of left and right schema (Inner, Left, Full, Right joins). fn get_updated_right_ordering_equivalence_properties( join_type: &JoinType, - right_oeq_classes: &[OrderingEquivalentClass], + right_oeq_classes: Option<&OrderingEquivalentClass>, left_columns_len: usize, -) -> Result> { - match join_type { - // In these modes, indices of the right schema should be offset by - // the left table size. - JoinType::Inner | JoinType::Left | JoinType::Full | JoinType::Right => { - add_offset_to_ordering_equivalence_classes( - right_oeq_classes, - left_columns_len, - ) - } - _ => Ok(right_oeq_classes.to_vec()), - } +) -> Result> { + right_oeq_classes + .map(|right_oeq_classes| { + match join_type { + // In these modes, indices of the right schema should be offset by + // the left table size. + JoinType::Inner | JoinType::Left | JoinType::Full | JoinType::Right => { + add_offset_to_ordering_equivalence_classes( + right_oeq_classes, + left_columns_len, + ) + } + _ => Ok(right_oeq_classes.clone()), + } + }) + .transpose() } /// Merge left and right sort expressions, checking for duplicates. @@ -353,26 +357,22 @@ fn merge_vectors( /// Prefix with existing ordering. fn prefix_ordering_equivalence_with_existing_ordering( existing_ordering: &[PhysicalSortExpr], - oeq_classes: &[OrderingEquivalentClass], + oeq_class: Option<&OrderingEquivalentClass>, eq_classes: &[EquivalentClass], -) -> Vec { - oeq_classes - .iter() - .map(|oeq_class| { - let normalized_head = normalize_sort_exprs(oeq_class.head(), eq_classes, &[]); - let updated_head = merge_vectors(existing_ordering, &normalized_head); - let updated_others = oeq_class - .others() - .iter() - .map(|ordering| { - let normalized_ordering = - normalize_sort_exprs(ordering, eq_classes, &[]); - merge_vectors(existing_ordering, &normalized_ordering) - }) - .collect(); - OrderingEquivalentClass::new(updated_head, updated_others) - }) - .collect() +) -> Option { + oeq_class.map(|oeq_class| { + let normalized_head = normalize_sort_exprs(oeq_class.head(), eq_classes, &[]); + let updated_head = merge_vectors(existing_ordering, &normalized_head); + let updated_others = oeq_class + .others() + .iter() + .map(|ordering| { + let normalized_ordering = normalize_sort_exprs(ordering, eq_classes, &[]); + merge_vectors(existing_ordering, &normalized_ordering) + }) + .collect(); + OrderingEquivalentClass::new(updated_head, updated_others) + }) } /// Calculate ordering equivalence properties for the given join operation. @@ -400,7 +400,7 @@ pub fn combine_join_ordering_equivalence_properties( )) } (true, false) => { - new_properties.extend(left_oeq_properties.classes().iter().cloned()); + new_properties.extend(left_oeq_properties.oeq_class().cloned()); // In this special case, right side ordering can be prefixed with left side ordering. if probe_side == Some(JoinSide::Left) && right.output_ordering().is_some() @@ -409,7 +409,7 @@ pub fn combine_join_ordering_equivalence_properties( let right_oeq_classes = get_updated_right_ordering_equivalence_properties( join_type, - right_oeq_properties.classes(), + right_oeq_properties.oeq_class(), left_columns_len, )?; let left_output_ordering = left.output_ordering().unwrap_or(&[]); @@ -424,7 +424,7 @@ pub fn combine_join_ordering_equivalence_properties( let updated_right_oeq_classes = prefix_ordering_equivalence_with_existing_ordering( left_output_ordering, - &right_oeq_classes, + right_oeq_classes.as_ref(), join_eq_properties.classes(), ); new_properties.extend(updated_right_oeq_classes); @@ -433,7 +433,7 @@ pub fn combine_join_ordering_equivalence_properties( (false, true) => { let right_oeq_classes = get_updated_right_ordering_equivalence_properties( join_type, - right_oeq_properties.classes(), + right_oeq_properties.oeq_class(), left_columns_len, )?; new_properties.extend(right_oeq_classes); @@ -442,7 +442,7 @@ pub fn combine_join_ordering_equivalence_properties( && left.output_ordering().is_some() && *join_type == JoinType::Inner { - let left_oeq_classes = right_oeq_properties.classes(); + let left_oeq_classes = right_oeq_properties.oeq_class(); let right_output_ordering = right.output_ordering().unwrap_or(&[]); // Left side ordering equivalence properties should be prepended with // those of the right side while constructing output ordering equivalence @@ -507,21 +507,16 @@ pub(crate) fn add_offset_to_lex_ordering( /// Adds the `offset` value to `Column` indices for all expressions inside the /// given `OrderingEquivalentClass`es. pub(crate) fn add_offset_to_ordering_equivalence_classes( - oeq_classes: &[OrderingEquivalentClass], + oeq_classes: &OrderingEquivalentClass, offset: usize, -) -> Result> { - oeq_classes +) -> Result { + let new_head = add_offset_to_lex_ordering(oeq_classes.head(), offset)?; + let new_others = oeq_classes + .others() .iter() - .map(|prop| { - let new_head = add_offset_to_lex_ordering(prop.head(), offset)?; - let new_others = prop - .others() - .iter() - .map(|ordering| add_offset_to_lex_ordering(ordering, offset)) - .collect::>>()?; - Ok(OrderingEquivalentClass::new(new_head, new_others)) - }) - .collect() + .map(|ordering| add_offset_to_lex_ordering(ordering, offset)) + .collect::>>()?; + Ok(OrderingEquivalentClass::new(new_head, new_others)) } impl Display for JoinSide { diff --git a/datafusion/physical-expr/src/equivalence.rs b/datafusion/physical-expr/src/equivalence.rs index a2477b054623..9cbf9cc212c2 100644 --- a/datafusion/physical-expr/src/equivalence.rs +++ b/datafusion/physical-expr/src/equivalence.rs @@ -17,8 +17,8 @@ use crate::expressions::{CastExpr, Column}; use crate::{ - normalize_expr_with_equivalence_properties, LexOrdering, PhysicalExpr, - PhysicalSortExpr, + normalize_expr_with_equivalence_properties, LexOrdering, LexOrderingRef, + PhysicalExpr, PhysicalSortExpr, }; use arrow::datatypes::SchemaRef; @@ -37,12 +37,12 @@ use std::sync::Arc; /// 1. Equality conditions (like `A=B`), when `T` = [`Column`] /// 2. Ordering (like `A ASC = B ASC`), when `T` = [`PhysicalSortExpr`] #[derive(Debug, Clone)] -pub struct EquivalenceProperties { - classes: Vec>, +pub struct EquivalenceProperties { + classes: Vec>, schema: SchemaRef, } -impl EquivalenceProperties { +impl EquivalenceProperties { pub fn new(schema: SchemaRef) -> Self { EquivalenceProperties { classes: vec![], @@ -51,7 +51,7 @@ impl EquivalenceProperties { } /// return the set of equivalences - pub fn classes(&self) -> &[EquivalentClass] { + pub fn classes(&self) -> &[EquivalentClass] { &self.classes } @@ -60,7 +60,7 @@ impl EquivalenceProperties { } /// Add the [`EquivalentClass`] from `iter` to this list - pub fn extend>>(&mut self, iter: I) { + pub fn extend>>(&mut self, iter: I) { for ec in iter { self.classes.push(ec) } @@ -68,7 +68,7 @@ impl EquivalenceProperties { /// Adds new equal conditions into the EquivalenceProperties. New equal /// conditions usually come from equality predicates in a join/filter. - pub fn add_equal_conditions(&mut self, new_conditions: (&T, &T)) { + pub fn add_equal_conditions(&mut self, new_conditions: (&Column, &Column)) { let mut idx1: Option = None; let mut idx2: Option = None; for (idx, class) in self.classes.iter_mut().enumerate() { @@ -106,7 +106,7 @@ impl EquivalenceProperties { } (None, None) => { // adding new pairs - self.classes.push(EquivalentClass::::new( + self.classes.push(EquivalentClass::::new( new_conditions.0.clone(), vec![new_conditions.1.clone()], )); @@ -131,7 +131,54 @@ impl EquivalenceProperties { /// where both `a ASC` and `b DESC` can describe the table ordering. With /// `OrderingEquivalenceProperties`, we can keep track of these equivalences /// and treat `a ASC` and `b DESC` as the same ordering requirement. -pub type OrderingEquivalenceProperties = EquivalenceProperties; +#[derive(Debug, Clone)] +pub struct OrderingEquivalenceProperties { + oeq_class: Option, + schema: SchemaRef, +} + +impl OrderingEquivalenceProperties { + pub fn new(schema: SchemaRef) -> Self { + Self { + oeq_class: None, + schema, + } + } +} + +impl OrderingEquivalenceProperties { + pub fn extend(&mut self, other: Option) { + if let Some(other) = other { + if let Some(class) = &mut self.oeq_class { + class.others.insert(other.head); + class.others.extend(other.others); + } else { + self.oeq_class = Some(other); + } + } + } + + pub fn oeq_class(&self) -> Option<&OrderingEquivalentClass> { + self.oeq_class.as_ref() + } + + /// Adds new equal conditions into the EquivalenceProperties. New equal + /// conditions usually come from equality predicates in a join/filter. + pub fn add_equal_conditions(&mut self, new_conditions: (&LexOrdering, &LexOrdering)) { + if let Some(class) = &mut self.oeq_class { + class.insert(new_conditions.0.clone()); + class.insert(new_conditions.1.clone()); + } else { + let head = new_conditions.0.clone(); + let others = vec![new_conditions.1.clone()]; + self.oeq_class = Some(OrderingEquivalentClass::new(head, others)) + } + } + + pub fn schema(&self) -> SchemaRef { + self.schema.clone() + } +} /// EquivalentClass is a set of [`Column`]s or [`PhysicalSortExpr`]s that are known /// to have the same value in all tuples in a relation. `EquivalentClass` @@ -290,7 +337,7 @@ impl OrderingEquivalenceBuilder { new_ordering_eq_properties: OrderingEquivalenceProperties, ) -> Self { self.ordering_eq_properties - .extend(new_ordering_eq_properties.classes().iter().cloned()); + .extend(new_ordering_eq_properties.oeq_class().cloned()); self } @@ -435,40 +482,41 @@ pub fn project_ordering_equivalence_properties( let schema = output_eq.schema(); let fields = schema.fields(); - let mut eq_classes = input_eq.classes().to_vec(); + let oeq_class = input_eq.oeq_class(); + let mut oeq_class = if let Some(oeq_class) = oeq_class { + oeq_class.clone() + } else { + return (); + }; let mut oeq_alias_map = vec![]; for (column, columns) in columns_map { if is_column_invalid_in_new_schema(column, fields) { oeq_alias_map.push((column.clone(), columns[0].clone())); } } - for class in eq_classes.iter_mut() { - class.update_with_aliases(&oeq_alias_map, fields); - } + oeq_class.update_with_aliases(&oeq_alias_map, fields); // Prune columns that no longer is in the schema from from the OrderingEquivalenceProperties. - for class in eq_classes.iter_mut() { - let sort_exprs_to_remove = class - .iter() - .filter(|sort_exprs| { - sort_exprs.iter().any(|sort_expr| { - let cols_in_expr = collect_columns(&sort_expr.expr); - // If any one of the columns, used in Expression is invalid, remove expression - // from ordering equivalences - cols_in_expr - .iter() - .any(|col| is_column_invalid_in_new_schema(col, fields)) - }) + let sort_exprs_to_remove = oeq_class + .iter() + .filter(|sort_exprs| { + sort_exprs.iter().any(|sort_expr| { + let cols_in_expr = collect_columns(&sort_expr.expr); + // If any one of the columns, used in Expression is invalid, remove expression + // from ordering equivalences + cols_in_expr + .iter() + .any(|col| is_column_invalid_in_new_schema(col, fields)) }) - .cloned() - .collect::>(); - for sort_exprs in sort_exprs_to_remove { - class.remove(&sort_exprs); - } + }) + .cloned() + .collect::>(); + for sort_exprs in sort_exprs_to_remove { + oeq_class.remove(&sort_exprs); + } + if oeq_class.len() > 1 { + output_eq.extend(Some(oeq_class)); } - eq_classes.retain(|props| props.len() > 1); - - output_eq.extend(eq_classes); } /// Update `ordering` if it contains cast expression with target column @@ -496,7 +544,7 @@ pub fn update_ordering_equivalence_with_cast( cast_exprs: &[(CastExpr, Column)], input_oeq: &mut OrderingEquivalenceProperties, ) { - for cls in input_oeq.classes.iter_mut() { + if let Some(cls) = &mut input_oeq.oeq_class { for ordering in std::iter::once(cls.head().clone()).chain(cls.others().clone().into_iter()) { diff --git a/datafusion/physical-expr/src/utils.rs b/datafusion/physical-expr/src/utils.rs index 8f63d94b3b34..16b62f18fffa 100644 --- a/datafusion/physical-expr/src/utils.rs +++ b/datafusion/physical-expr/src/utils.rs @@ -313,13 +313,16 @@ pub fn ordering_satisfy_concrete< ordering_equal_properties: F2, ) -> bool { let oeq_properties = ordering_equal_properties(); - let ordering_eq_classes = oeq_properties.classes(); + let ordering_eq_classes = oeq_properties + .oeq_class() + .map(|item| vec![item.clone()]) + .unwrap_or(vec![]); let eq_properties = equal_properties(); let eq_classes = eq_properties.classes(); let required_normalized = - normalize_sort_exprs(required, eq_classes, ordering_eq_classes); + normalize_sort_exprs(required, eq_classes, &ordering_eq_classes); let provided_normalized = - normalize_sort_exprs(provided, eq_classes, ordering_eq_classes); + normalize_sort_exprs(provided, eq_classes, &ordering_eq_classes); if required_normalized.len() > provided_normalized.len() { return false; } @@ -364,13 +367,16 @@ pub fn ordering_satisfy_requirement_concrete< ordering_equal_properties: F2, ) -> bool { let oeq_properties = ordering_equal_properties(); - let ordering_eq_classes = oeq_properties.classes(); + let ordering_eq_classes = oeq_properties + .oeq_class() + .map(|item| vec![item.clone()]) + .unwrap_or(vec![]); let eq_properties = equal_properties(); let eq_classes = eq_properties.classes(); let required_normalized = - normalize_sort_requirements(required, eq_classes, ordering_eq_classes); + normalize_sort_requirements(required, eq_classes, &ordering_eq_classes); let provided_normalized = - normalize_sort_exprs(provided, eq_classes, ordering_eq_classes); + normalize_sort_exprs(provided, eq_classes, &ordering_eq_classes); if required_normalized.len() > provided_normalized.len() { return false; } @@ -415,14 +421,17 @@ fn requirements_compatible_concrete< equal_properties: F2, ) -> bool { let oeq_properties = ordering_equal_properties(); - let ordering_eq_classes = oeq_properties.classes(); + let ordering_eq_classes = oeq_properties + .oeq_class() + .map(|item| vec![item.clone()]) + .unwrap_or(vec![]); let eq_properties = equal_properties(); let eq_classes = eq_properties.classes(); let required_normalized = - normalize_sort_requirements(required, eq_classes, ordering_eq_classes); + normalize_sort_requirements(required, eq_classes, &ordering_eq_classes); let provided_normalized = - normalize_sort_requirements(provided, eq_classes, ordering_eq_classes); + normalize_sort_requirements(provided, eq_classes, &ordering_eq_classes); if required_normalized.len() > provided_normalized.len() { return false; } @@ -1239,13 +1248,16 @@ mod tests { ]; let (_test_schema, eq_properties, ordering_eq_properties) = create_test_params()?; let eq_classes = eq_properties.classes(); - let ordering_eq_classes = ordering_eq_properties.classes(); + let ordering_eq_classes = ordering_eq_properties + .oeq_class() + .map(|item| vec![item.clone()]) + .unwrap_or(vec![]); for (reqs, expected_normalized) in requirements.into_iter() { let req = convert_to_requirement(&reqs); let expected_normalized = convert_to_requirement(&expected_normalized); assert_eq!( - normalize_sort_requirements(&req, eq_classes, ordering_eq_classes), + normalize_sort_requirements(&req, eq_classes, &ordering_eq_classes), expected_normalized ); } From 4ad5ec546421e6b16a08f46844a3fbc9da609b14 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Thu, 17 Aug 2023 14:46:11 +0300 Subject: [PATCH 02/29] Simplifications --- datafusion/core/src/physical_plan/memory.rs | 6 +++--- datafusion/physical-expr/src/equivalence.rs | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/datafusion/core/src/physical_plan/memory.rs b/datafusion/core/src/physical_plan/memory.rs index 877410c97ca5..1e35dbc0893f 100644 --- a/datafusion/core/src/physical_plan/memory.rs +++ b/datafusion/core/src/physical_plan/memory.rs @@ -296,9 +296,9 @@ mod tests { assert_eq!(mem_exec.output_ordering().unwrap(), expected_output_order); let order_eq = mem_exec.ordering_equivalence_properties(); assert!(order_eq - .classes() - .iter() - .any(|class| class.contains(&expected_order_eq))); + .oeq_class() + .map(|class| class.contains(&expected_order_eq)) + .unwrap_or(false)); Ok(()) } } diff --git a/datafusion/physical-expr/src/equivalence.rs b/datafusion/physical-expr/src/equivalence.rs index 9cbf9cc212c2..3ba719ef22f0 100644 --- a/datafusion/physical-expr/src/equivalence.rs +++ b/datafusion/physical-expr/src/equivalence.rs @@ -17,8 +17,8 @@ use crate::expressions::{CastExpr, Column}; use crate::{ - normalize_expr_with_equivalence_properties, LexOrdering, LexOrderingRef, - PhysicalExpr, PhysicalSortExpr, + normalize_expr_with_equivalence_properties, LexOrdering, PhysicalExpr, + PhysicalSortExpr, }; use arrow::datatypes::SchemaRef; @@ -486,7 +486,7 @@ pub fn project_ordering_equivalence_properties( let mut oeq_class = if let Some(oeq_class) = oeq_class { oeq_class.clone() } else { - return (); + return; }; let mut oeq_alias_map = vec![]; for (column, columns) in columns_map { From 016558b4951a999cc5a2f162311f32400de5d1c0 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Thu, 17 Aug 2023 16:09:22 +0300 Subject: [PATCH 03/29] Move utils to methods --- .../physical_optimizer/dist_enforcement.rs | 25 +- .../core/src/physical_plan/joins/utils.rs | 42 ++-- datafusion/physical-expr/src/equivalence.rs | 213 +++++++++++++++- datafusion/physical-expr/src/lib.rs | 3 +- datafusion/physical-expr/src/partitioning.rs | 19 +- datafusion/physical-expr/src/utils.rs | 228 ++---------------- 6 files changed, 262 insertions(+), 268 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/dist_enforcement.rs b/datafusion/core/src/physical_optimizer/dist_enforcement.rs index cb98e69d7afc..2b125ed2a09f 100644 --- a/datafusion/core/src/physical_optimizer/dist_enforcement.rs +++ b/datafusion/core/src/physical_optimizer/dist_enforcement.rs @@ -39,9 +39,7 @@ use datafusion_physical_expr::equivalence::EquivalenceProperties; use datafusion_physical_expr::expressions::Column; use datafusion_physical_expr::expressions::NoOp; use datafusion_physical_expr::utils::map_columns_before_projection; -use datafusion_physical_expr::{ - expr_list_eq_strict_order, normalize_expr_with_equivalence_properties, PhysicalExpr, -}; +use datafusion_physical_expr::{expr_list_eq_strict_order, PhysicalExpr}; use std::sync::Arc; /// The EnforceDistribution rule ensures that distribution requirements are met @@ -675,36 +673,21 @@ fn try_reorder( } else if !equivalence_properties.classes().is_empty() { normalized_expected = expected .iter() - .map(|e| { - normalize_expr_with_equivalence_properties( - e.clone(), - equivalence_properties.classes(), - ) - }) + .map(|e| equivalence_properties.normalize_expr(e.clone())) .collect::>(); assert_eq!(normalized_expected.len(), expected.len()); normalized_left_keys = join_keys .left_keys .iter() - .map(|e| { - normalize_expr_with_equivalence_properties( - e.clone(), - equivalence_properties.classes(), - ) - }) + .map(|e| equivalence_properties.normalize_expr(e.clone())) .collect::>(); assert_eq!(join_keys.left_keys.len(), normalized_left_keys.len()); normalized_right_keys = join_keys .right_keys .iter() - .map(|e| { - normalize_expr_with_equivalence_properties( - e.clone(), - equivalence_properties.classes(), - ) - }) + .map(|e| equivalence_properties.normalize_expr(e.clone())) .collect::>(); assert_eq!(join_keys.right_keys.len(), normalized_right_keys.len()); diff --git a/datafusion/core/src/physical_plan/joins/utils.rs b/datafusion/core/src/physical_plan/joins/utils.rs index 60497782dff3..a369b568cddf 100644 --- a/datafusion/core/src/physical_plan/joins/utils.rs +++ b/datafusion/core/src/physical_plan/joins/utils.rs @@ -49,8 +49,6 @@ use datafusion_physical_expr::{ OrderingEquivalentClass, PhysicalExpr, PhysicalSortExpr, }; -use datafusion_physical_expr::utils::normalize_sort_exprs; - use futures::future::{BoxFuture, Shared}; use futures::{ready, FutureExt}; use itertools::Itertools; @@ -322,21 +320,22 @@ pub fn cross_join_equivalence_properties( /// when join schema consist of combination of left and right schema (Inner, Left, Full, Right joins). fn get_updated_right_ordering_equivalence_properties( join_type: &JoinType, - right_oeq_classes: Option<&OrderingEquivalentClass>, + right_oeq_properties: OrderingEquivalenceProperties, left_columns_len: usize, ) -> Result> { - right_oeq_classes - .map(|right_oeq_classes| { + right_oeq_properties + .oeq_class() + .map(|oeq_class| { match join_type { // In these modes, indices of the right schema should be offset by // the left table size. JoinType::Inner | JoinType::Left | JoinType::Full | JoinType::Right => { add_offset_to_ordering_equivalence_classes( - right_oeq_classes, + oeq_class, left_columns_len, ) } - _ => Ok(right_oeq_classes.clone()), + _ => Ok(oeq_class.clone()), } }) .transpose() @@ -357,17 +356,18 @@ fn merge_vectors( /// Prefix with existing ordering. fn prefix_ordering_equivalence_with_existing_ordering( existing_ordering: &[PhysicalSortExpr], - oeq_class: Option<&OrderingEquivalentClass>, - eq_classes: &[EquivalentClass], + oeq_properties: &OrderingEquivalenceProperties, + eq_properties: &EquivalenceProperties, ) -> Option { - oeq_class.map(|oeq_class| { - let normalized_head = normalize_sort_exprs(oeq_class.head(), eq_classes, &[]); + oeq_properties.oeq_class().map(|oeq_class| { + let normalized_head = eq_properties.normalize_sort_exprs(oeq_class.head()); let updated_head = merge_vectors(existing_ordering, &normalized_head); + let updated_others = oeq_class .others() .iter() .map(|ordering| { - let normalized_ordering = normalize_sort_exprs(ordering, eq_classes, &[]); + let normalized_ordering = eq_properties.normalize_sort_exprs(ordering); merge_vectors(existing_ordering, &normalized_ordering) }) .collect(); @@ -409,9 +409,14 @@ pub fn combine_join_ordering_equivalence_properties( let right_oeq_classes = get_updated_right_ordering_equivalence_properties( join_type, - right_oeq_properties.oeq_class(), + right_oeq_properties, left_columns_len, )?; + // Create new ordering equivalence properties from updated class. + let mut right_oeq_properties = + OrderingEquivalenceProperties::new(join_eq_properties.schema()); + right_oeq_properties.extend(right_oeq_classes); + let left_output_ordering = left.output_ordering().unwrap_or(&[]); // Right side ordering equivalence properties should be prepended with // those of the left side while constructing output ordering equivalence @@ -424,8 +429,8 @@ pub fn combine_join_ordering_equivalence_properties( let updated_right_oeq_classes = prefix_ordering_equivalence_with_existing_ordering( left_output_ordering, - right_oeq_classes.as_ref(), - join_eq_properties.classes(), + &right_oeq_properties, + &join_eq_properties, ); new_properties.extend(updated_right_oeq_classes); } @@ -433,7 +438,7 @@ pub fn combine_join_ordering_equivalence_properties( (false, true) => { let right_oeq_classes = get_updated_right_ordering_equivalence_properties( join_type, - right_oeq_properties.oeq_class(), + right_oeq_properties, left_columns_len, )?; new_properties.extend(right_oeq_classes); @@ -442,7 +447,6 @@ pub fn combine_join_ordering_equivalence_properties( && left.output_ordering().is_some() && *join_type == JoinType::Inner { - let left_oeq_classes = right_oeq_properties.oeq_class(); let right_output_ordering = right.output_ordering().unwrap_or(&[]); // Left side ordering equivalence properties should be prepended with // those of the right side while constructing output ordering equivalence @@ -455,8 +459,8 @@ pub fn combine_join_ordering_equivalence_properties( let updated_left_oeq_classes = prefix_ordering_equivalence_with_existing_ordering( right_output_ordering, - left_oeq_classes, - join_eq_properties.classes(), + &left_oeq_properties, + &join_eq_properties, ); new_properties.extend(updated_left_oeq_classes); } diff --git a/datafusion/physical-expr/src/equivalence.rs b/datafusion/physical-expr/src/equivalence.rs index 3ba719ef22f0..41671eac5313 100644 --- a/datafusion/physical-expr/src/equivalence.rs +++ b/datafusion/physical-expr/src/equivalence.rs @@ -16,17 +16,16 @@ // under the License. use crate::expressions::{CastExpr, Column}; -use crate::{ - normalize_expr_with_equivalence_properties, LexOrdering, PhysicalExpr, - PhysicalSortExpr, -}; +use crate::{LexOrdering, PhysicalExpr, PhysicalSortExpr, PhysicalSortRequirement}; use arrow::datatypes::SchemaRef; use arrow_schema::Fields; use crate::utils::collect_columns; +use datafusion_common::tree_node::{Transformed, TreeNode}; use std::collections::{HashMap, HashSet}; use std::hash::Hash; +use std::ops::Range; use std::sync::Arc; /// Represents a collection of [`EquivalentClass`] (equivalences @@ -114,6 +113,69 @@ impl EquivalenceProperties { _ => {} } } + + pub fn normalize_expr(&self, expr: Arc) -> Arc { + expr.clone() + .transform(&|expr| { + let normalized_form = + expr.as_any().downcast_ref::().and_then(|column| { + for class in &self.classes { + if class.contains(column) { + return Some(Arc::new(class.head().clone()) as _); + } + } + None + }); + Ok(if let Some(normalized_form) = normalized_form { + Transformed::Yes(normalized_form) + } else { + Transformed::No(expr) + }) + }) + .unwrap_or(expr) + } + + pub fn normalize_exprs( + &self, + exprs: &[Arc], + ) -> Vec> { + exprs + .iter() + .map(|expr| self.normalize_expr(expr.clone())) + .collect::>() + } + + pub fn normalize_sort_requirement( + &self, + mut sort_requirement: PhysicalSortRequirement, + ) -> PhysicalSortRequirement { + sort_requirement.expr = self.normalize_expr(sort_requirement.expr); + sort_requirement + } + + pub fn normalize_sort_requirements( + &self, + sort_reqs: &[PhysicalSortRequirement], + ) -> Vec { + // TODO: Move collapse logic to here from normalize_sort_exprs + sort_reqs + .iter() + .map(|sort_req| self.normalize_sort_requirement(sort_req.clone())) + .collect::>() + } + + pub fn normalize_sort_exprs( + &self, + sort_exprs: &[PhysicalSortExpr], + ) -> Vec { + let sort_requirements = + PhysicalSortRequirement::from_sort_exprs(sort_exprs.iter()); + let normalized_sort_requirement = + self.normalize_sort_requirements(&sort_requirements); + let normalized_sort_exprs = + PhysicalSortRequirement::to_sort_exprs(normalized_sort_requirement); + collapse_vec(normalized_sort_exprs) + } } /// `OrderingEquivalenceProperties` keeps track of columns that describe the @@ -178,6 +240,46 @@ impl OrderingEquivalenceProperties { pub fn schema(&self) -> SchemaRef { self.schema.clone() } + + pub fn normalize_sort_requirements( + &self, + sort_reqs: &[PhysicalSortRequirement], + ) -> Vec { + let mut normalized_sort_reqs = sort_reqs.to_vec(); + if let Some(oeq_class) = &self.oeq_class { + for item in oeq_class.others() { + let item = item + .clone() + .into_iter() + .map(|elem| elem.into()) + .collect::>(); + let ranges = get_compatible_ranges(&normalized_sort_reqs, &item); + let mut offset: i64 = 0; + for Range { start, end } in ranges { + let mut head = oeq_class + .head() + .clone() + .into_iter() + .map(|elem| elem.into()) + .collect::>(); + let updated_start = (start as i64 + offset) as usize; + let updated_end = (end as i64 + offset) as usize; + let range = end - start; + offset += head.len() as i64 - range as i64; + let all_none = normalized_sort_reqs[updated_start..updated_end] + .iter() + .all(|req| req.options.is_none()); + if all_none { + for req in head.iter_mut() { + req.options = None; + } + } + normalized_sort_reqs.splice(updated_start..updated_end, head); + } + } + } + collapse_vec(normalized_sort_reqs) + } } /// EquivalentClass is a set of [`Column`]s or [`PhysicalSortExpr`]s that are known @@ -363,10 +465,7 @@ impl OrderingEquivalenceBuilder { let mut normalized_out_ordering = vec![]; for item in &self.existing_ordering { // To account for ordering equivalences, first normalize the expression: - let normalized = normalize_expr_with_equivalence_properties( - item.expr.clone(), - self.eq_properties.classes(), - ); + let normalized = self.eq_properties.normalize_expr(item.expr.clone()); normalized_out_ordering.push(PhysicalSortExpr { expr: normalized, options: item.options, @@ -576,6 +675,43 @@ pub fn ordering_equivalence_properties_helper( oep } +/// This function constructs a duplicate-free vector by filtering out duplicate +/// entries inside the given vector `input`. +fn collapse_vec(input: Vec) -> Vec { + let mut output = vec![]; + for item in input { + if !output.contains(&item) { + output.push(item); + } + } + output +} + +/// This function searches for the slice `section` inside the slice `given`. +/// It returns each range where `section` is compatible with the corresponding +/// slice in `given`. +fn get_compatible_ranges( + given: &[PhysicalSortRequirement], + section: &[PhysicalSortRequirement], +) -> Vec> { + let n_section = section.len(); + let n_end = if given.len() >= n_section { + given.len() - n_section + 1 + } else { + 0 + }; + (0..n_end) + .filter_map(|idx| { + let end = idx + n_section; + given[idx..end] + .iter() + .zip(section) + .all(|(req, given)| given.compatible(req)) + .then_some(Range { start: idx, end }) + }) + .collect() +} + #[cfg(test)] mod tests { use super::*; @@ -583,8 +719,20 @@ mod tests { use arrow::datatypes::{DataType, Field, Schema}; use datafusion_common::Result; + use arrow_schema::SortOptions; use std::sync::Arc; + fn convert_to_requirement( + in_data: &[(&Column, Option)], + ) -> Vec { + in_data + .iter() + .map(|(col, options)| { + PhysicalSortRequirement::new(Arc::new((*col).clone()) as _, *options) + }) + .collect::>() + } + #[test] fn add_equal_conditions_test() -> Result<()> { let schema = Arc::new(Schema::new(vec![ @@ -675,4 +823,53 @@ mod tests { Ok(()) } + + #[test] + fn test_collapse_vec() -> Result<()> { + assert_eq!(collapse_vec(vec![1, 2, 3]), vec![1, 2, 3]); + assert_eq!(collapse_vec(vec![1, 2, 3, 2, 3]), vec![1, 2, 3]); + assert_eq!(collapse_vec(vec![3, 1, 2, 3, 2, 3]), vec![3, 1, 2]); + Ok(()) + } + + #[test] + fn test_get_compatible_ranges() -> Result<()> { + let col_a = &Column::new("a", 0); + let col_b = &Column::new("b", 1); + let option1 = SortOptions { + descending: false, + nulls_first: false, + }; + let test_data = vec![ + ( + vec![(col_a, Some(option1)), (col_b, Some(option1))], + vec![(col_a, Some(option1))], + vec![(0, 1)], + ), + ( + vec![(col_a, None), (col_b, Some(option1))], + vec![(col_a, Some(option1))], + vec![(0, 1)], + ), + ( + vec![ + (col_a, None), + (col_b, Some(option1)), + (col_a, Some(option1)), + ], + vec![(col_a, Some(option1))], + vec![(0, 1), (2, 3)], + ), + ]; + for (searched, to_search, expected) in test_data { + let searched = convert_to_requirement(&searched); + let to_search = convert_to_requirement(&to_search); + let expected = expected + .into_iter() + .map(|(start, end)| Range { start, end }) + .collect::>(); + assert_eq!(get_compatible_ranges(&searched, &to_search), expected); + } + Ok(()) + } } diff --git a/datafusion/physical-expr/src/lib.rs b/datafusion/physical-expr/src/lib.rs index 168ee0138df8..ed8b60aa0044 100644 --- a/datafusion/physical-expr/src/lib.rs +++ b/datafusion/physical-expr/src/lib.rs @@ -69,6 +69,5 @@ pub use sort_expr::{ }; pub use utils::{ expr_list_eq_any_order, expr_list_eq_strict_order, - normalize_expr_with_equivalence_properties, normalize_out_expr_with_columns_map, - reverse_order_bys, split_conjunction, + normalize_out_expr_with_columns_map, reverse_order_bys, split_conjunction, }; diff --git a/datafusion/physical-expr/src/partitioning.rs b/datafusion/physical-expr/src/partitioning.rs index 991d6ba9826d..8687880ea50a 100644 --- a/datafusion/physical-expr/src/partitioning.rs +++ b/datafusion/physical-expr/src/partitioning.rs @@ -20,10 +20,7 @@ use std::fmt; use std::sync::Arc; -use crate::{ - expr_list_eq_strict_order, normalize_expr_with_equivalence_properties, - EquivalenceProperties, PhysicalExpr, -}; +use crate::{expr_list_eq_strict_order, EquivalenceProperties, PhysicalExpr}; /// Partitioning schemes supported by operators. #[derive(Debug, Clone)] @@ -90,21 +87,11 @@ impl Partitioning { if !eq_classes.is_empty() { let normalized_required_exprs = required_exprs .iter() - .map(|e| { - normalize_expr_with_equivalence_properties( - e.clone(), - eq_classes, - ) - }) + .map(|e| eq_properties.normalize_expr(e.clone())) .collect::>(); let normalized_partition_exprs = partition_exprs .iter() - .map(|e| { - normalize_expr_with_equivalence_properties( - e.clone(), - eq_classes, - ) - }) + .map(|e| eq_properties.normalize_expr(e.clone())) .collect::>(); expr_list_eq_strict_order( &normalized_required_exprs, diff --git a/datafusion/physical-expr/src/utils.rs b/datafusion/physical-expr/src/utils.rs index 16b62f18fffa..f469fc43a8ae 100644 --- a/datafusion/physical-expr/src/utils.rs +++ b/datafusion/physical-expr/src/utils.rs @@ -15,10 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::equivalence::{ - EquivalenceProperties, EquivalentClass, OrderingEquivalenceProperties, - OrderingEquivalentClass, -}; +use crate::equivalence::{EquivalenceProperties, OrderingEquivalenceProperties}; use crate::expressions::{BinaryExpr, Column, UnKnownColumn}; use crate::{PhysicalExpr, PhysicalSortExpr, PhysicalSortRequirement}; @@ -36,7 +33,6 @@ use petgraph::stable_graph::StableGraph; use std::borrow::Borrow; use std::collections::HashMap; use std::collections::HashSet; -use std::ops::Range; use std::sync::Arc; /// Compare the two expr lists are equal no matter the order. @@ -129,64 +125,6 @@ pub fn normalize_out_expr_with_columns_map( .unwrap_or(expr) } -pub fn normalize_expr_with_equivalence_properties( - expr: Arc, - eq_properties: &[EquivalentClass], -) -> Arc { - expr.clone() - .transform(&|expr| { - let normalized_form = - expr.as_any().downcast_ref::().and_then(|column| { - for class in eq_properties { - if class.contains(column) { - return Some(Arc::new(class.head().clone()) as _); - } - } - None - }); - Ok(if let Some(normalized_form) = normalized_form { - Transformed::Yes(normalized_form) - } else { - Transformed::No(expr) - }) - }) - .unwrap_or(expr) -} - -fn normalize_sort_requirement_with_equivalence_properties( - mut sort_requirement: PhysicalSortRequirement, - eq_properties: &[EquivalentClass], -) -> PhysicalSortRequirement { - sort_requirement.expr = - normalize_expr_with_equivalence_properties(sort_requirement.expr, eq_properties); - sort_requirement -} - -/// This function searches for the slice `section` inside the slice `given`. -/// It returns each range where `section` is compatible with the corresponding -/// slice in `given`. -fn get_compatible_ranges( - given: &[PhysicalSortRequirement], - section: &[PhysicalSortRequirement], -) -> Vec> { - let n_section = section.len(); - let n_end = if given.len() >= n_section { - given.len() - n_section + 1 - } else { - 0 - }; - (0..n_end) - .filter_map(|idx| { - let end = idx + n_section; - given[idx..end] - .iter() - .zip(section) - .all(|(req, given)| given.compatible(req)) - .then_some(Range { start: idx, end }) - }) - .collect() -} - /// This function constructs a duplicate-free vector by filtering out duplicate /// entries inside the given vector `input`. fn collapse_vec(input: Vec) -> Vec { @@ -209,8 +147,8 @@ fn collapse_vec(input: Vec) -> Vec { /// Standardized version `vec![d ASC]` is used in subsequent operations. pub fn normalize_sort_exprs( sort_exprs: &[PhysicalSortExpr], - eq_properties: &[EquivalentClass], - ordering_eq_properties: &[OrderingEquivalentClass], + eq_properties: &EquivalenceProperties, + ordering_eq_properties: &OrderingEquivalenceProperties, ) -> Vec { let sort_requirements = PhysicalSortRequirement::from_sort_exprs(sort_exprs.iter()); let normalized_exprs = normalize_sort_requirements( @@ -232,51 +170,11 @@ pub fn normalize_sort_exprs( /// Standardized version `vec![d Some(ASC)]` is used in subsequent operations. pub fn normalize_sort_requirements( sort_reqs: &[PhysicalSortRequirement], - eq_properties: &[EquivalentClass], - ordering_eq_properties: &[OrderingEquivalentClass], + eq_properties: &EquivalenceProperties, + ordering_eq_properties: &OrderingEquivalenceProperties, ) -> Vec { - let mut normalized_exprs = sort_reqs - .iter() - .map(|sort_req| { - normalize_sort_requirement_with_equivalence_properties( - sort_req.clone(), - eq_properties, - ) - }) - .collect::>(); - for ordering_eq_class in ordering_eq_properties { - for item in ordering_eq_class.others() { - let item = item - .clone() - .into_iter() - .map(|elem| elem.into()) - .collect::>(); - let ranges = get_compatible_ranges(&normalized_exprs, &item); - let mut offset: i64 = 0; - for Range { start, end } in ranges { - let mut head = ordering_eq_class - .head() - .clone() - .into_iter() - .map(|elem| elem.into()) - .collect::>(); - let updated_start = (start as i64 + offset) as usize; - let updated_end = (end as i64 + offset) as usize; - let range = end - start; - offset += head.len() as i64 - range as i64; - let all_none = normalized_exprs[updated_start..updated_end] - .iter() - .all(|req| req.options.is_none()); - if all_none { - for req in head.iter_mut() { - req.options = None; - } - } - normalized_exprs.splice(updated_start..updated_end, head); - } - } - } - collapse_vec(normalized_exprs) + let normalized_sort_reqs = eq_properties.normalize_sort_requirements(sort_reqs); + ordering_eq_properties.normalize_sort_requirements(&normalized_sort_reqs) } /// Checks whether given ordering requirements are satisfied by provided [PhysicalSortExpr]s. @@ -313,16 +211,11 @@ pub fn ordering_satisfy_concrete< ordering_equal_properties: F2, ) -> bool { let oeq_properties = ordering_equal_properties(); - let ordering_eq_classes = oeq_properties - .oeq_class() - .map(|item| vec![item.clone()]) - .unwrap_or(vec![]); let eq_properties = equal_properties(); - let eq_classes = eq_properties.classes(); let required_normalized = - normalize_sort_exprs(required, eq_classes, &ordering_eq_classes); + normalize_sort_exprs(required, &eq_properties, &oeq_properties); let provided_normalized = - normalize_sort_exprs(provided, eq_classes, &ordering_eq_classes); + normalize_sort_exprs(provided, &eq_properties, &oeq_properties); if required_normalized.len() > provided_normalized.len() { return false; } @@ -367,16 +260,11 @@ pub fn ordering_satisfy_requirement_concrete< ordering_equal_properties: F2, ) -> bool { let oeq_properties = ordering_equal_properties(); - let ordering_eq_classes = oeq_properties - .oeq_class() - .map(|item| vec![item.clone()]) - .unwrap_or(vec![]); let eq_properties = equal_properties(); - let eq_classes = eq_properties.classes(); let required_normalized = - normalize_sort_requirements(required, eq_classes, &ordering_eq_classes); + normalize_sort_requirements(required, &eq_properties, &oeq_properties); let provided_normalized = - normalize_sort_exprs(provided, eq_classes, &ordering_eq_classes); + normalize_sort_exprs(provided, &eq_properties, &oeq_properties); if required_normalized.len() > provided_normalized.len() { return false; } @@ -421,17 +309,12 @@ fn requirements_compatible_concrete< equal_properties: F2, ) -> bool { let oeq_properties = ordering_equal_properties(); - let ordering_eq_classes = oeq_properties - .oeq_class() - .map(|item| vec![item.clone()]) - .unwrap_or(vec![]); let eq_properties = equal_properties(); - let eq_classes = eq_properties.classes(); let required_normalized = - normalize_sort_requirements(required, eq_classes, &ordering_eq_classes); + normalize_sort_requirements(required, &eq_properties, &oeq_properties); let provided_normalized = - normalize_sort_requirements(provided, eq_classes, &ordering_eq_classes); + normalize_sort_requirements(provided, &eq_properties, &oeq_properties); if required_normalized.len() > provided_normalized.len() { return false; } @@ -485,26 +368,15 @@ pub fn convert_to_expr>( /// This function finds the indices of `targets` within `items`, taking into /// account equivalences according to `equal_properties`. -pub fn get_indices_of_matching_exprs< - T: Borrow>, - F: FnOnce() -> EquivalenceProperties, ->( - targets: impl IntoIterator, +pub fn get_indices_of_matching_exprs EquivalenceProperties>( + targets: &[Arc], items: &[Arc], equal_properties: F, ) -> Vec { - if let eq_classes @ [_, ..] = equal_properties().classes() { - let normalized_targets = targets.into_iter().map(|e| { - normalize_expr_with_equivalence_properties(e.borrow().clone(), eq_classes) - }); - let normalized_items = items - .iter() - .map(|e| normalize_expr_with_equivalence_properties(e.clone(), eq_classes)) - .collect::>(); - get_indices_of_exprs_strict(normalized_targets, &normalized_items) - } else { - get_indices_of_exprs_strict(targets, items) - } + let eq_properties = equal_properties(); + let normalized_items = eq_properties.normalize_exprs(items); + let normalized_targets = eq_properties.normalize_exprs(targets); + get_indices_of_exprs_strict(normalized_targets, &normalized_items) } /// This function finds the indices of `targets` within `items` using strict @@ -1247,17 +1119,16 @@ mod tests { (vec![(col_e, None), (col_b, None)], vec![(col_a, None)]), ]; let (_test_schema, eq_properties, ordering_eq_properties) = create_test_params()?; - let eq_classes = eq_properties.classes(); - let ordering_eq_classes = ordering_eq_properties - .oeq_class() - .map(|item| vec![item.clone()]) - .unwrap_or(vec![]); for (reqs, expected_normalized) in requirements.into_iter() { let req = convert_to_requirement(&reqs); let expected_normalized = convert_to_requirement(&expected_normalized); assert_eq!( - normalize_sort_requirements(&req, eq_classes, &ordering_eq_classes), + normalize_sort_requirements( + &req, + &eq_properties, + &ordering_eq_properties + ), expected_normalized ); } @@ -1326,10 +1197,7 @@ mod tests { ]; for (expr, expected_eq) in expressions { assert!( - expected_eq.eq(&normalize_expr_with_equivalence_properties( - expr.clone(), - eq_properties.classes() - )), + expected_eq.eq(&eq_properties.normalize_expr(expr.clone())), "error in test: expr: {expr:?}" ); } @@ -1373,10 +1241,7 @@ mod tests { sort_options, ); assert!( - expected.eq(&normalize_sort_requirement_with_equivalence_properties( - arg.clone(), - eq_properties.classes() - )), + expected.eq(&eq_properties.normalize_sort_requirement(arg.clone())), "error in test: expr: {expr:?}, sort_options: {sort_options:?}" ); } @@ -1475,47 +1340,6 @@ mod tests { Ok(()) } - #[test] - fn test_get_compatible_ranges() -> Result<()> { - let col_a = &Column::new("a", 0); - let col_b = &Column::new("b", 1); - let option1 = SortOptions { - descending: false, - nulls_first: false, - }; - let test_data = vec![ - ( - vec![(col_a, Some(option1)), (col_b, Some(option1))], - vec![(col_a, Some(option1))], - vec![(0, 1)], - ), - ( - vec![(col_a, None), (col_b, Some(option1))], - vec![(col_a, Some(option1))], - vec![(0, 1)], - ), - ( - vec![ - (col_a, None), - (col_b, Some(option1)), - (col_a, Some(option1)), - ], - vec![(col_a, Some(option1))], - vec![(0, 1), (2, 3)], - ), - ]; - for (searched, to_search, expected) in test_data { - let searched = convert_to_requirement(&searched); - let to_search = convert_to_requirement(&to_search); - let expected = expected - .into_iter() - .map(|(start, end)| Range { start, end }) - .collect::>(); - assert_eq!(get_compatible_ranges(&searched, &to_search), expected); - } - Ok(()) - } - #[test] fn test_collapse_vec() -> Result<()> { assert_eq!(collapse_vec(vec![1, 2, 3]), vec![1, 2, 3]); From 8007b1b22804e16ae744cdb3369e40d4f597131c Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Thu, 17 Aug 2023 17:07:47 +0300 Subject: [PATCH 04/29] Remove unnecesary code --- datafusion/physical-expr/src/utils.rs | 23 +---------------------- 1 file changed, 1 insertion(+), 22 deletions(-) diff --git a/datafusion/physical-expr/src/utils.rs b/datafusion/physical-expr/src/utils.rs index f469fc43a8ae..2849ab6bf2ee 100644 --- a/datafusion/physical-expr/src/utils.rs +++ b/datafusion/physical-expr/src/utils.rs @@ -125,18 +125,6 @@ pub fn normalize_out_expr_with_columns_map( .unwrap_or(expr) } -/// This function constructs a duplicate-free vector by filtering out duplicate -/// entries inside the given vector `input`. -fn collapse_vec(input: Vec) -> Vec { - let mut output = vec![]; - for item in input { - if !output.contains(&item) { - output.push(item); - } - } - output -} - /// Transform `sort_exprs` vector, to standardized version using `eq_properties` and `ordering_eq_properties` /// Assume `eq_properties` states that `Column a` and `Column b` are aliases. /// Also assume `ordering_eq_properties` states that ordering `vec![d ASC]` and `vec![a ASC, c ASC]` are @@ -156,8 +144,7 @@ pub fn normalize_sort_exprs( eq_properties, ordering_eq_properties, ); - let normalized_exprs = PhysicalSortRequirement::to_sort_exprs(normalized_exprs); - collapse_vec(normalized_exprs) + PhysicalSortRequirement::to_sort_exprs(normalized_exprs) } /// Transform `sort_reqs` vector, to standardized version using `eq_properties` and `ordering_eq_properties` @@ -1340,14 +1327,6 @@ mod tests { Ok(()) } - #[test] - fn test_collapse_vec() -> Result<()> { - assert_eq!(collapse_vec(vec![1, 2, 3]), vec![1, 2, 3]); - assert_eq!(collapse_vec(vec![1, 2, 3, 2, 3]), vec![1, 2, 3]); - assert_eq!(collapse_vec(vec![3, 1, 2, 3, 2, 3]), vec![3, 1, 2]); - Ok(()) - } - #[test] fn test_collect_columns() -> Result<()> { let expr1 = Arc::new(Column::new("col1", 2)) as _; From 5d896a8e67c8800ee218f7405c29650c5a76dd98 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Thu, 17 Aug 2023 17:10:44 +0300 Subject: [PATCH 05/29] Address todo --- datafusion/physical-expr/src/equivalence.rs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/datafusion/physical-expr/src/equivalence.rs b/datafusion/physical-expr/src/equivalence.rs index 41671eac5313..2b9f9110e30c 100644 --- a/datafusion/physical-expr/src/equivalence.rs +++ b/datafusion/physical-expr/src/equivalence.rs @@ -157,11 +157,11 @@ impl EquivalenceProperties { &self, sort_reqs: &[PhysicalSortRequirement], ) -> Vec { - // TODO: Move collapse logic to here from normalize_sort_exprs - sort_reqs + let normalized_sort_reqs = sort_reqs .iter() .map(|sort_req| self.normalize_sort_requirement(sort_req.clone())) - .collect::>() + .collect::>(); + collapse_vec(normalized_sort_reqs) } pub fn normalize_sort_exprs( @@ -172,9 +172,7 @@ impl EquivalenceProperties { PhysicalSortRequirement::from_sort_exprs(sort_exprs.iter()); let normalized_sort_requirement = self.normalize_sort_requirements(&sort_requirements); - let normalized_sort_exprs = - PhysicalSortRequirement::to_sort_exprs(normalized_sort_requirement); - collapse_vec(normalized_sort_exprs) + PhysicalSortRequirement::to_sort_exprs(normalized_sort_requirement) } } From b8def0a9d5e5f206984e91d65f3d14d7408d8487 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Thu, 17 Aug 2023 17:44:58 +0300 Subject: [PATCH 06/29] Buggy is_aggressive mod eklenecek --- datafusion/physical-expr/src/equivalence.rs | 7 +- datafusion/physical-expr/src/utils.rs | 100 ++++++++++++-------- 2 files changed, 62 insertions(+), 45 deletions(-) diff --git a/datafusion/physical-expr/src/equivalence.rs b/datafusion/physical-expr/src/equivalence.rs index 2b9f9110e30c..fbe4b3332244 100644 --- a/datafusion/physical-expr/src/equivalence.rs +++ b/datafusion/physical-expr/src/equivalence.rs @@ -242,15 +242,12 @@ impl OrderingEquivalenceProperties { pub fn normalize_sort_requirements( &self, sort_reqs: &[PhysicalSortRequirement], + is_aggressive: bool, ) -> Vec { let mut normalized_sort_reqs = sort_reqs.to_vec(); if let Some(oeq_class) = &self.oeq_class { for item in oeq_class.others() { - let item = item - .clone() - .into_iter() - .map(|elem| elem.into()) - .collect::>(); + let item = PhysicalSortRequirement::from_sort_exprs(item); let ranges = get_compatible_ranges(&normalized_sort_reqs, &item); let mut offset: i64 = 0; for Range { start, end } in ranges { diff --git a/datafusion/physical-expr/src/utils.rs b/datafusion/physical-expr/src/utils.rs index 2849ab6bf2ee..e707e5356ce3 100644 --- a/datafusion/physical-expr/src/utils.rs +++ b/datafusion/physical-expr/src/utils.rs @@ -137,6 +137,7 @@ pub fn normalize_sort_exprs( sort_exprs: &[PhysicalSortExpr], eq_properties: &EquivalenceProperties, ordering_eq_properties: &OrderingEquivalenceProperties, + is_aggressive: bool, ) -> Vec { let sort_requirements = PhysicalSortRequirement::from_sort_exprs(sort_exprs.iter()); let normalized_exprs = normalize_sort_requirements( @@ -159,9 +160,14 @@ pub fn normalize_sort_requirements( sort_reqs: &[PhysicalSortRequirement], eq_properties: &EquivalenceProperties, ordering_eq_properties: &OrderingEquivalenceProperties, + is_aggressive: bool, ) -> Vec { + println!("sort_reqs at the start:{:?}", sort_reqs); let normalized_sort_reqs = eq_properties.normalize_sort_requirements(sort_reqs); - ordering_eq_properties.normalize_sort_requirements(&normalized_sort_reqs) + println!("normalized_sort_reqs after eq:{:?}", normalized_sort_reqs); + let res = ordering_eq_properties.normalize_sort_requirements(&normalized_sort_reqs, is_aggressive); + println!("normalized_sort_reqs after oeq:{:?}", res); + res } /// Checks whether given ordering requirements are satisfied by provided [PhysicalSortExpr]s. @@ -206,6 +212,8 @@ pub fn ordering_satisfy_concrete< if required_normalized.len() > provided_normalized.len() { return false; } + println!("required_normalized: {:?}", required_normalized); + println!("provided_normalized: {:?}", provided_normalized); required_normalized .into_iter() .zip(provided_normalized) @@ -980,28 +988,58 @@ mod tests { let provided = Some(&provided[..]); let (_test_schema, eq_properties, ordering_eq_properties) = create_test_params()?; // First element in the tuple stores vector of requirement, second element is the expected return value for ordering_satisfy function + // let requirements = vec![ + // // `a ASC NULLS LAST`, expects `ordering_satisfy` to be `true`, since existing ordering `a ASC NULLS LAST, b ASC NULLS LAST` satisfies it + // (vec![(col_a, option1)], true), + // (vec![(col_a, option2)], false), + // // Test whether equivalence works as expected + // (vec![(col_c, option1)], true), + // (vec![(col_c, option2)], false), + // // Test whether ordering equivalence works as expected + // (vec![(col_d, option1)], false), + // (vec![(col_d, option1), (col_b, option1)], true), + // (vec![(col_d, option2), (col_b, option1)], false), + // (vec![(col_e, option2), (col_b, option1)], true), + // (vec![(col_e, option1), (col_b, option1)], false), + // ( + // vec![ + // (col_d, option1), + // (col_b, option1), + // (col_d, option1), + // (col_b, option1), + // ], + // true, + // ), + // ( + // vec![ + // (col_d, option1), + // (col_b, option1), + // (col_e, option2), + // (col_b, option1), + // ], + // true, + // ), + // ( + // vec![ + // (col_d, option1), + // (col_b, option1), + // (col_d, option2), + // (col_b, option1), + // ], + // false, + // ), + // ( + // vec![ + // (col_d, option1), + // (col_b, option1), + // (col_e, option1), + // (col_b, option1), + // ], + // false, + // ), + // ]; + let requirements = vec![ - // `a ASC NULLS LAST`, expects `ordering_satisfy` to be `true`, since existing ordering `a ASC NULLS LAST, b ASC NULLS LAST` satisfies it - (vec![(col_a, option1)], true), - (vec![(col_a, option2)], false), - // Test whether equivalence works as expected - (vec![(col_c, option1)], true), - (vec![(col_c, option2)], false), - // Test whether ordering equivalence works as expected - (vec![(col_d, option1)], false), - (vec![(col_d, option1), (col_b, option1)], true), - (vec![(col_d, option2), (col_b, option1)], false), - (vec![(col_e, option2), (col_b, option1)], true), - (vec![(col_e, option1), (col_b, option1)], false), - ( - vec![ - (col_d, option1), - (col_b, option1), - (col_d, option1), - (col_b, option1), - ], - true, - ), ( vec![ (col_d, option1), @@ -1011,24 +1049,6 @@ mod tests { ], true, ), - ( - vec![ - (col_d, option1), - (col_b, option1), - (col_d, option2), - (col_b, option1), - ], - false, - ), - ( - vec![ - (col_d, option1), - (col_b, option1), - (col_e, option1), - (col_b, option1), - ], - false, - ), ]; for (cols, expected) in requirements { let err_msg = format!("Error in test case:{cols:?}"); From 8850f33df9189c6e836d5dd74759c0fa0b850620 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Thu, 17 Aug 2023 18:00:24 +0300 Subject: [PATCH 07/29] start implementing aggressive mode --- datafusion/physical-expr/src/equivalence.rs | 46 ++++++++++++++------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/datafusion/physical-expr/src/equivalence.rs b/datafusion/physical-expr/src/equivalence.rs index fbe4b3332244..692dddb74f59 100644 --- a/datafusion/physical-expr/src/equivalence.rs +++ b/datafusion/physical-expr/src/equivalence.rs @@ -688,23 +688,39 @@ fn collapse_vec(input: Vec) -> Vec { fn get_compatible_ranges( given: &[PhysicalSortRequirement], section: &[PhysicalSortRequirement], + is_aggressive: bool, ) -> Vec> { - let n_section = section.len(); - let n_end = if given.len() >= n_section { - given.len() - n_section + 1 + if is_aggressive { + let mut res = vec![]; + for i in 0..section.len(){ + let mut count = 0; + let mut j = 0; + while given[j] == section[i+j] && j < given.len() && i+j < section.len(){ + count += 1; + } + if count > 0{ + res.push(Range{start:i, end:i+count}) + } + } + res } else { - 0 - }; - (0..n_end) - .filter_map(|idx| { - let end = idx + n_section; - given[idx..end] - .iter() - .zip(section) - .all(|(req, given)| given.compatible(req)) - .then_some(Range { start: idx, end }) - }) - .collect() + let n_section = section.len(); + let n_end = if given.len() >= n_section { + given.len() - n_section + 1 + } else { + 0 + }; + (0..n_end) + .filter_map(|idx| { + let end = idx + n_section; + given[idx..end] + .iter() + .zip(section) + .all(|(req, given)| given.compatible(req)) + .then_some(Range { start: idx, end }) + }) + .collect() + } } #[cfg(test)] From 0d32ca5409a2c7e953569afdca839fc0ddefe316 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Wed, 6 Sep 2023 15:36:03 +0300 Subject: [PATCH 08/29] all tests pass --- datafusion/physical-expr/src/equivalence.rs | 4 ++-- datafusion/physical-expr/src/utils.rs | 26 +++++++++++---------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/datafusion/physical-expr/src/equivalence.rs b/datafusion/physical-expr/src/equivalence.rs index 692dddb74f59..b6ba228091e9 100644 --- a/datafusion/physical-expr/src/equivalence.rs +++ b/datafusion/physical-expr/src/equivalence.rs @@ -248,7 +248,7 @@ impl OrderingEquivalenceProperties { if let Some(oeq_class) = &self.oeq_class { for item in oeq_class.others() { let item = PhysicalSortRequirement::from_sort_exprs(item); - let ranges = get_compatible_ranges(&normalized_sort_reqs, &item); + let ranges = get_compatible_ranges(&normalized_sort_reqs, &item, is_aggressive); let mut offset: i64 = 0; for Range { start, end } in ranges { let mut head = oeq_class @@ -879,7 +879,7 @@ mod tests { .into_iter() .map(|(start, end)| Range { start, end }) .collect::>(); - assert_eq!(get_compatible_ranges(&searched, &to_search), expected); + assert_eq!(get_compatible_ranges(&searched, &to_search, false), expected); } Ok(()) } diff --git a/datafusion/physical-expr/src/utils.rs b/datafusion/physical-expr/src/utils.rs index e707e5356ce3..d83d59377f21 100644 --- a/datafusion/physical-expr/src/utils.rs +++ b/datafusion/physical-expr/src/utils.rs @@ -144,6 +144,7 @@ pub fn normalize_sort_exprs( &sort_requirements, eq_properties, ordering_eq_properties, + is_aggressive ); PhysicalSortRequirement::to_sort_exprs(normalized_exprs) } @@ -162,11 +163,11 @@ pub fn normalize_sort_requirements( ordering_eq_properties: &OrderingEquivalenceProperties, is_aggressive: bool, ) -> Vec { - println!("sort_reqs at the start:{:?}", sort_reqs); + // println!("sort_reqs at the start:{:?}", sort_reqs); let normalized_sort_reqs = eq_properties.normalize_sort_requirements(sort_reqs); - println!("normalized_sort_reqs after eq:{:?}", normalized_sort_reqs); + // println!("normalized_sort_reqs after eq:{:?}", normalized_sort_reqs); let res = ordering_eq_properties.normalize_sort_requirements(&normalized_sort_reqs, is_aggressive); - println!("normalized_sort_reqs after oeq:{:?}", res); + // println!("normalized_sort_reqs after oeq:{:?}", res); res } @@ -206,14 +207,14 @@ pub fn ordering_satisfy_concrete< let oeq_properties = ordering_equal_properties(); let eq_properties = equal_properties(); let required_normalized = - normalize_sort_exprs(required, &eq_properties, &oeq_properties); + normalize_sort_exprs(required, &eq_properties, &oeq_properties, false); let provided_normalized = - normalize_sort_exprs(provided, &eq_properties, &oeq_properties); + normalize_sort_exprs(provided, &eq_properties, &oeq_properties, false); if required_normalized.len() > provided_normalized.len() { return false; } - println!("required_normalized: {:?}", required_normalized); - println!("provided_normalized: {:?}", provided_normalized); + // println!("required_normalized: {:?}", required_normalized); + // println!("provided_normalized: {:?}", provided_normalized); required_normalized .into_iter() .zip(provided_normalized) @@ -257,9 +258,9 @@ pub fn ordering_satisfy_requirement_concrete< let oeq_properties = ordering_equal_properties(); let eq_properties = equal_properties(); let required_normalized = - normalize_sort_requirements(required, &eq_properties, &oeq_properties); + normalize_sort_requirements(required, &eq_properties, &oeq_properties, false); let provided_normalized = - normalize_sort_exprs(provided, &eq_properties, &oeq_properties); + normalize_sort_exprs(provided, &eq_properties, &oeq_properties, false); if required_normalized.len() > provided_normalized.len() { return false; } @@ -307,9 +308,9 @@ fn requirements_compatible_concrete< let eq_properties = equal_properties(); let required_normalized = - normalize_sort_requirements(required, &eq_properties, &oeq_properties); + normalize_sort_requirements(required, &eq_properties, &oeq_properties, false); let provided_normalized = - normalize_sort_requirements(provided, &eq_properties, &oeq_properties); + normalize_sort_requirements(provided, &eq_properties, &oeq_properties, false); if required_normalized.len() > provided_normalized.len() { return false; } @@ -1134,7 +1135,8 @@ mod tests { normalize_sort_requirements( &req, &eq_properties, - &ordering_eq_properties + &ordering_eq_properties, + false ), expected_normalized ); From aac0a0cb4b141217bb58309b8fe400aec64b9f48 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Wed, 6 Sep 2023 16:13:04 +0300 Subject: [PATCH 09/29] minor changes --- datafusion/core/tests/sql/order.rs | 4 ++- .../simplify_expressions/expr_simplifier.rs | 4 ++- .../src/simplify_expressions/regex.rs | 4 ++- datafusion/physical-expr/src/equivalence.rs | 22 ++++++++----- datafusion/physical-expr/src/utils.rs | 31 +++++++++---------- datafusion/sql/src/statement.rs | 4 +-- .../substrait/src/logical_plan/consumer.rs | 26 ++++++++-------- .../substrait/src/logical_plan/producer.rs | 5 ++- 8 files changed, 58 insertions(+), 42 deletions(-) diff --git a/datafusion/core/tests/sql/order.rs b/datafusion/core/tests/sql/order.rs index 3981fbaa4d7a..a400a78fc914 100644 --- a/datafusion/core/tests/sql/order.rs +++ b/datafusion/core/tests/sql/order.rs @@ -48,7 +48,9 @@ async fn sort_with_lots_of_repetition_values() -> Result<()> { async fn create_external_table_with_order() -> Result<()> { let ctx = SessionContext::new(); let sql = "CREATE EXTERNAL TABLE dt (a_id integer, a_str string, a_bool boolean) STORED AS CSV WITH ORDER (a_id ASC) LOCATION 'file://path/to/table';"; - let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = ctx.state().create_logical_plan(sql).await? else { + let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = + ctx.state().create_logical_plan(sql).await? + else { panic!("Wrong command") }; diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index b7e8612d538e..1b7cc52549ef 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -412,7 +412,9 @@ impl<'a, S: SimplifyInfo> TreeNodeRewriter for Simplifier<'a, S> { }) if list.len() == 1 && matches!(list.first(), Some(Expr::ScalarSubquery { .. })) => { - let Expr::ScalarSubquery(subquery) = list.remove(0) else { unreachable!() }; + let Expr::ScalarSubquery(subquery) = list.remove(0) else { + unreachable!() + }; Expr::InSubquery(InSubquery::new(expr, subquery, negated)) } diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs index 5094623b82c0..b9d9821b43f0 100644 --- a/datafusion/optimizer/src/simplify_expressions/regex.rs +++ b/datafusion/optimizer/src/simplify_expressions/regex.rs @@ -203,7 +203,9 @@ fn anchored_literal_to_expr(v: &[Hir]) -> Option { match v.len() { 2 => Some(lit("")), 3 => { - let HirKind::Literal(l) = v[1].kind() else { return None }; + let HirKind::Literal(l) = v[1].kind() else { + return None; + }; like_str_from_literal(l).map(lit) } _ => None, diff --git a/datafusion/physical-expr/src/equivalence.rs b/datafusion/physical-expr/src/equivalence.rs index b6ba228091e9..ce4ea129ea6a 100644 --- a/datafusion/physical-expr/src/equivalence.rs +++ b/datafusion/physical-expr/src/equivalence.rs @@ -248,7 +248,8 @@ impl OrderingEquivalenceProperties { if let Some(oeq_class) = &self.oeq_class { for item in oeq_class.others() { let item = PhysicalSortRequirement::from_sort_exprs(item); - let ranges = get_compatible_ranges(&normalized_sort_reqs, &item, is_aggressive); + let ranges = + get_compatible_ranges(&normalized_sort_reqs, &item, is_aggressive); let mut offset: i64 = 0; for Range { start, end } in ranges { let mut head = oeq_class @@ -691,15 +692,19 @@ fn get_compatible_ranges( is_aggressive: bool, ) -> Vec> { if is_aggressive { + println!("given: {:?}", given); + println!("section: {:?}", section); let mut res = vec![]; - for i in 0..section.len(){ + for i in 0..given.len() { let mut count = 0; - let mut j = 0; - while given[j] == section[i+j] && j < given.len() && i+j < section.len(){ + while i + count < given.len() && count < section.len() && given[i + count] == section[count] { count += 1; } - if count > 0{ - res.push(Range{start:i, end:i+count}) + if count > 0 { + res.push(Range { + start: i, + end: i + count, + }) } } res @@ -879,7 +884,10 @@ mod tests { .into_iter() .map(|(start, end)| Range { start, end }) .collect::>(); - assert_eq!(get_compatible_ranges(&searched, &to_search, false), expected); + assert_eq!( + get_compatible_ranges(&searched, &to_search, false), + expected + ); } Ok(()) } diff --git a/datafusion/physical-expr/src/utils.rs b/datafusion/physical-expr/src/utils.rs index d83d59377f21..d542078e240e 100644 --- a/datafusion/physical-expr/src/utils.rs +++ b/datafusion/physical-expr/src/utils.rs @@ -144,7 +144,7 @@ pub fn normalize_sort_exprs( &sort_requirements, eq_properties, ordering_eq_properties, - is_aggressive + is_aggressive, ); PhysicalSortRequirement::to_sort_exprs(normalized_exprs) } @@ -166,7 +166,8 @@ pub fn normalize_sort_requirements( // println!("sort_reqs at the start:{:?}", sort_reqs); let normalized_sort_reqs = eq_properties.normalize_sort_requirements(sort_reqs); // println!("normalized_sort_reqs after eq:{:?}", normalized_sort_reqs); - let res = ordering_eq_properties.normalize_sort_requirements(&normalized_sort_reqs, is_aggressive); + let res = ordering_eq_properties + .normalize_sort_requirements(&normalized_sort_reqs, is_aggressive); // println!("normalized_sort_reqs after oeq:{:?}", res); res } @@ -207,14 +208,14 @@ pub fn ordering_satisfy_concrete< let oeq_properties = ordering_equal_properties(); let eq_properties = equal_properties(); let required_normalized = - normalize_sort_exprs(required, &eq_properties, &oeq_properties, false); + normalize_sort_exprs(required, &eq_properties, &oeq_properties, true); let provided_normalized = normalize_sort_exprs(provided, &eq_properties, &oeq_properties, false); if required_normalized.len() > provided_normalized.len() { return false; } - // println!("required_normalized: {:?}", required_normalized); - // println!("provided_normalized: {:?}", provided_normalized); + println!("required_normalized: {:?}", required_normalized); + println!("provided_normalized: {:?}", provided_normalized); required_normalized .into_iter() .zip(provided_normalized) @@ -1040,17 +1041,15 @@ mod tests { // ), // ]; - let requirements = vec![ - ( - vec![ - (col_d, option1), - (col_b, option1), - (col_e, option2), - (col_b, option1), - ], - true, - ), - ]; + let requirements = vec![( + vec![ + (col_d, option1), + (col_b, option1), + (col_e, option2), + (col_b, option1), + ], + true, + )]; for (cols, expected) in requirements { let err_msg = format!("Error in test case:{cols:?}"); let required = cols diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index 0f5dbb9ec0b7..fbe9da854e41 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -499,10 +499,10 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { "DELETE FROM only supports single table, got: joins".to_string(), )); } - let TableFactor::Table{name, ..} = table_factor.relation else { + let TableFactor::Table { name, .. } = table_factor.relation else { return Err(DataFusionError::NotImplemented(format!( "DELETE FROM only supports single table, got: {table_factor:?}" - ))) + ))); }; Ok(name) diff --git a/datafusion/substrait/src/logical_plan/consumer.rs b/datafusion/substrait/src/logical_plan/consumer.rs index 4e4d71ddb604..54f1facb4ada 100644 --- a/datafusion/substrait/src/logical_plan/consumer.rs +++ b/datafusion/substrait/src/logical_plan/consumer.rs @@ -573,9 +573,9 @@ pub async fn from_substrait_sorts( Some(k) => match k { Direction(d) => { let Some(direction) = SortDirection::from_i32(*d) else { - return Err(DataFusionError::NotImplemented( - format!("Unsupported Substrait SortDirection value {d}"), - )) + return Err(DataFusionError::NotImplemented(format!( + "Unsupported Substrait SortDirection value {d}" + ))); }; match direction { @@ -1313,27 +1313,27 @@ async fn make_datafusion_like( } let Some(ArgType::Value(expr_substrait)) = &f.arguments[0].arg_type else { - return Err(DataFusionError::NotImplemented( - format!("Invalid arguments type for `{fn_name}` expr") - )) + return Err(DataFusionError::NotImplemented(format!( + "Invalid arguments type for `{fn_name}` expr" + ))); }; let expr = from_substrait_rex(expr_substrait, input_schema, extensions) .await? .as_ref() .clone(); let Some(ArgType::Value(pattern_substrait)) = &f.arguments[1].arg_type else { - return Err(DataFusionError::NotImplemented( - format!("Invalid arguments type for `{fn_name}` expr") - )) + return Err(DataFusionError::NotImplemented(format!( + "Invalid arguments type for `{fn_name}` expr" + ))); }; let pattern = from_substrait_rex(pattern_substrait, input_schema, extensions) .await? .as_ref() .clone(); let Some(ArgType::Value(escape_char_substrait)) = &f.arguments[2].arg_type else { - return Err(DataFusionError::NotImplemented( - format!("Invalid arguments type for `{fn_name}` expr") - )) + return Err(DataFusionError::NotImplemented(format!( + "Invalid arguments type for `{fn_name}` expr" + ))); }; let escape_char_expr = from_substrait_rex(escape_char_substrait, input_schema, extensions) @@ -1343,7 +1343,7 @@ async fn make_datafusion_like( let Expr::Literal(ScalarValue::Utf8(escape_char)) = escape_char_expr else { return Err(DataFusionError::Substrait(format!( "Expect Utf8 literal for escape char, but found {escape_char_expr:?}", - ))) + ))); }; Ok(Arc::new(Expr::Like(Like { diff --git a/datafusion/substrait/src/logical_plan/producer.rs b/datafusion/substrait/src/logical_plan/producer.rs index 495ccf1e3527..bfa9465cad73 100644 --- a/datafusion/substrait/src/logical_plan/producer.rs +++ b/datafusion/substrait/src/logical_plan/producer.rs @@ -1666,7 +1666,10 @@ mod test { println!("Checking round trip of {scalar:?}"); let substrait = to_substrait_literal(&scalar)?; - let Expression { rex_type: Some(RexType::Literal(substrait_literal)) } = substrait else { + let Expression { + rex_type: Some(RexType::Literal(substrait_literal)), + } = substrait + else { panic!("Expected Literal expression, got {substrait:?}"); }; From f0dbd85ef7a1a12487de391dac784779892bd733 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Wed, 6 Sep 2023 17:29:45 +0300 Subject: [PATCH 10/29] All tests pass --- datafusion/physical-expr/src/equivalence.rs | 4 +- datafusion/physical-expr/src/utils.rs | 113 ++++++++++---------- 2 files changed, 56 insertions(+), 61 deletions(-) diff --git a/datafusion/physical-expr/src/equivalence.rs b/datafusion/physical-expr/src/equivalence.rs index ce4ea129ea6a..32eab6eae071 100644 --- a/datafusion/physical-expr/src/equivalence.rs +++ b/datafusion/physical-expr/src/equivalence.rs @@ -692,8 +692,8 @@ fn get_compatible_ranges( is_aggressive: bool, ) -> Vec> { if is_aggressive { - println!("given: {:?}", given); - println!("section: {:?}", section); + // println!("given: {:?}", given); + // println!("section: {:?}", section); let mut res = vec![]; for i in 0..given.len() { let mut count = 0; diff --git a/datafusion/physical-expr/src/utils.rs b/datafusion/physical-expr/src/utils.rs index d542078e240e..064edc04bac2 100644 --- a/datafusion/physical-expr/src/utils.rs +++ b/datafusion/physical-expr/src/utils.rs @@ -214,8 +214,8 @@ pub fn ordering_satisfy_concrete< if required_normalized.len() > provided_normalized.len() { return false; } - println!("required_normalized: {:?}", required_normalized); - println!("provided_normalized: {:?}", provided_normalized); + // println!("required_normalized: {:?}", required_normalized); + // println!("provided_normalized: {:?}", provided_normalized); required_normalized .into_iter() .zip(provided_normalized) @@ -990,66 +990,61 @@ mod tests { let provided = Some(&provided[..]); let (_test_schema, eq_properties, ordering_eq_properties) = create_test_params()?; // First element in the tuple stores vector of requirement, second element is the expected return value for ordering_satisfy function + let requirements = vec![ + // `a ASC NULLS LAST`, expects `ordering_satisfy` to be `true`, since existing ordering `a ASC NULLS LAST, b ASC NULLS LAST` satisfies it + (vec![(col_a, option1)], true), + (vec![(col_a, option2)], false), + // Test whether equivalence works as expected + (vec![(col_c, option1)], true), + (vec![(col_c, option2)], false), + // Test whether ordering equivalence works as expected + (vec![(col_d, option1)], true), + (vec![(col_d, option1), (col_b, option1)], true), + (vec![(col_d, option2), (col_b, option1)], false), + (vec![(col_e, option2), (col_b, option1)], true), + (vec![(col_e, option1), (col_b, option1)], false), + ( + vec![ + (col_d, option1), + (col_b, option1), + (col_d, option1), + (col_b, option1), + ], + true, + ), + ( + vec![ + (col_d, option1), + (col_b, option1), + (col_e, option2), + (col_b, option1), + ], + true, + ), + ( + vec![ + (col_d, option1), + (col_b, option1), + (col_d, option2), + (col_b, option1), + ], + false, + ), + ( + vec![ + (col_d, option1), + (col_b, option1), + (col_e, option1), + (col_b, option1), + ], + false, + ), + ]; + // let requirements = vec![ - // // `a ASC NULLS LAST`, expects `ordering_satisfy` to be `true`, since existing ordering `a ASC NULLS LAST, b ASC NULLS LAST` satisfies it - // (vec![(col_a, option1)], true), - // (vec![(col_a, option2)], false), - // // Test whether equivalence works as expected - // (vec![(col_c, option1)], true), - // (vec![(col_c, option2)], false), - // // Test whether ordering equivalence works as expected - // (vec![(col_d, option1)], false), - // (vec![(col_d, option1), (col_b, option1)], true), - // (vec![(col_d, option2), (col_b, option1)], false), - // (vec![(col_e, option2), (col_b, option1)], true), - // (vec![(col_e, option1), (col_b, option1)], false), - // ( - // vec![ - // (col_d, option1), - // (col_b, option1), - // (col_d, option1), - // (col_b, option1), - // ], - // true, - // ), - // ( - // vec![ - // (col_d, option1), - // (col_b, option1), - // (col_e, option2), - // (col_b, option1), - // ], - // true, - // ), - // ( - // vec![ - // (col_d, option1), - // (col_b, option1), - // (col_d, option2), - // (col_b, option1), - // ], - // false, - // ), - // ( - // vec![ - // (col_d, option1), - // (col_b, option1), - // (col_e, option1), - // (col_b, option1), - // ], - // false, - // ), + // (vec![(col_d, option1)], true), // ]; - let requirements = vec![( - vec![ - (col_d, option1), - (col_b, option1), - (col_e, option2), - (col_b, option1), - ], - true, - )]; for (cols, expected) in requirements { let err_msg = format!("Error in test case:{cols:?}"); let required = cols From 7112a2511a64d1a903a52be3a02ca713c9d9f39c Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Wed, 6 Sep 2023 18:20:00 +0300 Subject: [PATCH 11/29] Minor changes --- .../physical_optimizer/sort_enforcement.rs | 94 +++++++++++++++++++ datafusion/core/src/physical_plan/filter.rs | 44 ++++++++- datafusion/physical-expr/src/equivalence.rs | 44 ++++++++- datafusion/sqllogictest/test_files/select.slt | 17 ++++ 4 files changed, 195 insertions(+), 4 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/sort_enforcement.rs b/datafusion/core/src/physical_optimizer/sort_enforcement.rs index 644aea1f3812..67f55ac74043 100644 --- a/datafusion/core/src/physical_optimizer/sort_enforcement.rs +++ b/datafusion/core/src/physical_optimizer/sort_enforcement.rs @@ -550,6 +550,10 @@ fn analyze_immediate_sort_removal( ) -> Option { if let Some(sort_exec) = plan.as_any().downcast_ref::() { let sort_input = sort_exec.input().clone(); + println!("sort_input.output_ordering(): {:?}", sort_input.output_ordering()); + println!("sort_exec.output_ordering(): {:?}", sort_exec.output_ordering()); + println!("sort_input.equivalence_properties().classes(): {:?}", sort_input.equivalence_properties().classes()); + println!("sort_input.ordering_equivalence_properties().oeq_class(): {:?}", sort_input.ordering_equivalence_properties().oeq_class()); // If this sort is unnecessary, we should remove it: if ordering_satisfy( sort_input.output_ordering(), @@ -2855,3 +2859,93 @@ mod tests { Ok(()) } } + + +mod tmp_tests { + use crate::assert_batches_eq; + use crate::physical_plan::{collect, displayable, ExecutionPlan}; + use crate::prelude::SessionContext; + use arrow::util::pretty::print_batches; + use datafusion_common::Result; + use datafusion_execution::config::SessionConfig; + use std::sync::Arc; + + fn print_plan(plan: &Arc) -> Result<()> { + let formatted = displayable(plan.as_ref()).indent(true).to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + println!("{:#?}", actual); + Ok(()) + } + + #[tokio::test] + async fn test_first_value() -> Result<()> { + let config = SessionConfig::new().with_target_partitions(1); + let ctx = SessionContext::with_config(config); + + ctx.sql( + "CREATE EXTERNAL TABLE annotated_data_finite2 ( + a0 INTEGER, + a INTEGER, + b INTEGER, + c INTEGER, + d INTEGER + ) + STORED AS CSV + WITH HEADER ROW + WITH ORDER (a ASC, b ASC, c ASC) + LOCATION 'tests/data/window_2.csv'", + ) + .await?; + + + let sql = "SELECT * + FROM annotated_data_finite2 + WHERE a=0 + ORDER BY b, c"; + + let msg = format!("Creating logical plan for '{sql}'"); + let dataframe = ctx.sql(sql).await.expect(&msg); + let physical_plan = dataframe.create_physical_plan().await?; + print_plan(&physical_plan)?; + let actual = collect(physical_plan, ctx.task_ctx()).await?; + print_batches(&actual)?; + Ok(()) + } + + #[tokio::test] + async fn test_subquery() -> Result<()> { + let config = SessionConfig::new().with_target_partitions(2); + let ctx = SessionContext::with_config(config); + ctx.sql( + "CREATE TABLE sales_global (zip_code INT, + country VARCHAR(3), + sn INT, + ts TIMESTAMP, + currency VARCHAR(3), + amount FLOAT + ) as VALUES + (0, 'GRC', 0, '2022-01-01 06:00:00'::timestamp, 'EUR', 30.0), + (1, 'FRA', 1, '2022-01-01 08:00:00'::timestamp, 'EUR', 50.0), + (1, 'TUR', 2, '2022-01-01 11:30:00'::timestamp, 'TRY', 75.0), + (1, 'FRA', 3, '2022-01-02 12:00:00'::timestamp, 'EUR', 200.0), + (1, 'TUR', 4, '2022-01-03 10:00:00'::timestamp, 'TRY', 100.0), + (0, 'GRC', 4, '2022-01-03 10:00:00'::timestamp, 'EUR', 80.0)", + ) + .await?; + + let sql = "SELECT country, ARRAY_AGG(amount ORDER BY ts DESC) AS amounts, + FIRST_VALUE(amount ORDER BY ts ASC) AS fv1, + LAST_VALUE(amount ORDER BY ts DESC) AS fv2 + FROM sales_global + GROUP BY country"; + + let msg = format!("Creating logical plan for '{sql}'"); + let dataframe = ctx.sql(sql).await.expect(&msg); + let physical_plan = dataframe.create_physical_plan().await?; + print_plan(&physical_plan)?; + let batches = collect(physical_plan, ctx.task_ctx()).await?; + print_batches(&batches)?; + assert_eq!(0, 1); + Ok(()) + } +} diff --git a/datafusion/core/src/physical_plan/filter.rs b/datafusion/core/src/physical_plan/filter.rs index 084b0b15d102..bec097e7efed 100644 --- a/datafusion/core/src/physical_plan/filter.rs +++ b/datafusion/core/src/physical_plan/filter.rs @@ -40,7 +40,7 @@ use datafusion_common::cast::as_boolean_array; use datafusion_common::{plan_err, DataFusionError, Result}; use datafusion_execution::TaskContext; use datafusion_expr::Operator; -use datafusion_physical_expr::expressions::BinaryExpr; +use datafusion_physical_expr::expressions::{BinaryExpr, Literal}; use datafusion_physical_expr::intervals::check_support; use datafusion_physical_expr::{ analyze, split_conjunction, AnalysisContext, ExprBoundaries, @@ -153,7 +153,11 @@ impl ExecutionPlan for FilterExec { } fn ordering_equivalence_properties(&self) -> OrderingEquivalenceProperties { - self.input.ordering_equivalence_properties() + let mut res = self.input.ordering_equivalence_properties(); + let constants = collect_constants_from_predicate(self.predicate()); + println!("constants: {:?}", constants); + res.add_constants(constants); + res } fn with_new_children( @@ -374,6 +378,42 @@ fn collect_columns_from_predicate(predicate: &Arc) -> EqualAnd pub type EqualAndNonEqual<'a> = (Vec<(&'a Column, &'a Column)>, Vec<(&'a Column, &'a Column)>); +fn is_literal(expr: &Arc) -> bool { + expr.as_any().is::() +} + +fn is_column(expr: &Arc) -> bool { + expr.as_any().is::() +} + +fn get_constant_expr(binary: &BinaryExpr) -> Option> { + if binary.op() == &Operator::Eq + && is_literal(binary.left()) + && is_column(binary.right()) + { + Some(binary.right().clone()) + } else if binary.op() == &Operator::Eq + && is_literal(binary.right()) + && is_column(binary.left()) + { + Some(binary.left().clone()) + } else { + None + } +} + +fn collect_constants_from_predicate( + predicate: &Arc, +) -> Vec> { + let mut res = vec![]; + if let Some(binary) = predicate.as_any().downcast_ref::() { + if let Some(expr) = get_constant_expr(binary) { + res.push(expr); + } + } + res +} + #[cfg(test)] mod tests { diff --git a/datafusion/physical-expr/src/equivalence.rs b/datafusion/physical-expr/src/equivalence.rs index 32eab6eae071..e74703aa3b50 100644 --- a/datafusion/physical-expr/src/equivalence.rs +++ b/datafusion/physical-expr/src/equivalence.rs @@ -194,6 +194,7 @@ impl EquivalenceProperties { #[derive(Debug, Clone)] pub struct OrderingEquivalenceProperties { oeq_class: Option, + constants: Vec>, schema: SchemaRef, } @@ -201,6 +202,7 @@ impl OrderingEquivalenceProperties { pub fn new(schema: SchemaRef) -> Self { Self { oeq_class: None, + constants: vec![], schema, } } @@ -235,6 +237,13 @@ impl OrderingEquivalenceProperties { } } + pub fn add_constants(&mut self, constants: Vec>) { + for constant in constants { + // TODO: Add already inserted check + self.constants.push(constant); + } + } + pub fn schema(&self) -> SchemaRef { self.schema.clone() } @@ -247,7 +256,15 @@ impl OrderingEquivalenceProperties { let mut normalized_sort_reqs = sort_reqs.to_vec(); if let Some(oeq_class) = &self.oeq_class { for item in oeq_class.others() { + // // println!("item bef: {:?}", item); + // let item2 = prune_constants(item, &self.constants); + // // println!("item aft: {:?}", item2); + // if &item2 != item{ + // println!("item bef: {:?}", item); + // println!("item aft: {:?}", item2); + // } let item = PhysicalSortRequirement::from_sort_exprs(item); + let item = prune_constants(&item, &self.constants); let ranges = get_compatible_ranges(&normalized_sort_reqs, &item, is_aggressive); let mut offset: i64 = 0; @@ -274,7 +291,8 @@ impl OrderingEquivalenceProperties { } } } - collapse_vec(normalized_sort_reqs) + let res = collapse_vec(normalized_sort_reqs); + prune_constants(&res, &self.constants) } } @@ -697,7 +715,10 @@ fn get_compatible_ranges( let mut res = vec![]; for i in 0..given.len() { let mut count = 0; - while i + count < given.len() && count < section.len() && given[i + count] == section[count] { + while i + count < given.len() + && count < section.len() + && given[i + count] == section[count] + { count += 1; } if count > 0 { @@ -728,6 +749,25 @@ fn get_compatible_ranges( } } +fn expr_contains(constants: &[Arc], expr: &Arc) -> bool { + for constant in constants{ + if constant.eq(expr){ + return true; + } + } + false +} + +fn prune_constants(ordering: &[PhysicalSortRequirement], constants: &[Arc]) -> Vec { + let mut new_ordering = vec![]; + for order in ordering{ + if !expr_contains(constants, &order.expr){ + new_ordering.push(order.clone()) + } + } + new_ordering +} + #[cfg(test)] mod tests { use super::*; diff --git a/datafusion/sqllogictest/test_files/select.slt b/datafusion/sqllogictest/test_files/select.slt index 7b39bccf3f26..457f9fd953d8 100644 --- a/datafusion/sqllogictest/test_files/select.slt +++ b/datafusion/sqllogictest/test_files/select.slt @@ -882,6 +882,23 @@ physical_plan ProjectionExec: expr=[a@0 as a, b@1 as b, 2 as Int64(2)] --CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b], output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST], has_header=true +query TT +EXPLAIN SELECT * +FROM annotated_data_finite2 +WHERE a=0 +ORDER BY b, c; +---- +logical_plan +Sort: annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST +--Filter: annotated_data_finite2.a = Int32(0) +----TableScan: annotated_data_finite2 projection=[a0, a, b, c, d], partial_filters=[annotated_data_finite2.a = Int32(0)] +physical_plan +SortPreservingMergeExec: [b@2 ASC NULLS LAST,c@3 ASC NULLS LAST] +--CoalesceBatchesExec: target_batch_size=8192 +----FilterExec: a@1 = 0 +------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +--------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], has_header=true + statement ok drop table annotated_data_finite2; From ec411945f39f021e89189eaed105c7fd610e8e25 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Thu, 7 Sep 2023 10:33:57 +0300 Subject: [PATCH 12/29] All tests pass --- .../physical_optimizer/sort_enforcement.rs | 53 ++++++----- datafusion/core/src/physical_plan/filter.rs | 39 ++++---- datafusion/physical-expr/src/equivalence.rs | 33 +++---- datafusion/sqllogictest/test_files/select.slt | 89 +++++++++++++++++++ 4 files changed, 144 insertions(+), 70 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/sort_enforcement.rs b/datafusion/core/src/physical_optimizer/sort_enforcement.rs index 67f55ac74043..41fde5768be2 100644 --- a/datafusion/core/src/physical_optimizer/sort_enforcement.rs +++ b/datafusion/core/src/physical_optimizer/sort_enforcement.rs @@ -550,10 +550,10 @@ fn analyze_immediate_sort_removal( ) -> Option { if let Some(sort_exec) = plan.as_any().downcast_ref::() { let sort_input = sort_exec.input().clone(); - println!("sort_input.output_ordering(): {:?}", sort_input.output_ordering()); - println!("sort_exec.output_ordering(): {:?}", sort_exec.output_ordering()); - println!("sort_input.equivalence_properties().classes(): {:?}", sort_input.equivalence_properties().classes()); - println!("sort_input.ordering_equivalence_properties().oeq_class(): {:?}", sort_input.ordering_equivalence_properties().oeq_class()); + // println!("sort_input.output_ordering(): {:?}", sort_input.output_ordering()); + // println!("sort_exec.output_ordering(): {:?}", sort_exec.output_ordering()); + // println!("sort_input.equivalence_properties().classes(): {:?}", sort_input.equivalence_properties().classes()); + // println!("sort_input.ordering_equivalence_properties().oeq_class(): {:?}", sort_input.ordering_equivalence_properties().oeq_class()); // If this sort is unnecessary, we should remove it: if ordering_satisfy( sort_input.output_ordering(), @@ -2913,39 +2913,38 @@ mod tmp_tests { } #[tokio::test] - async fn test_subquery() -> Result<()> { - let config = SessionConfig::new().with_target_partitions(2); + async fn test_first_value2() -> Result<()> { + let config = SessionConfig::new().with_target_partitions(1); let ctx = SessionContext::with_config(config); + ctx.sql( - "CREATE TABLE sales_global (zip_code INT, - country VARCHAR(3), - sn INT, - ts TIMESTAMP, - currency VARCHAR(3), - amount FLOAT - ) as VALUES - (0, 'GRC', 0, '2022-01-01 06:00:00'::timestamp, 'EUR', 30.0), - (1, 'FRA', 1, '2022-01-01 08:00:00'::timestamp, 'EUR', 50.0), - (1, 'TUR', 2, '2022-01-01 11:30:00'::timestamp, 'TRY', 75.0), - (1, 'FRA', 3, '2022-01-02 12:00:00'::timestamp, 'EUR', 200.0), - (1, 'TUR', 4, '2022-01-03 10:00:00'::timestamp, 'TRY', 100.0), - (0, 'GRC', 4, '2022-01-03 10:00:00'::timestamp, 'EUR', 80.0)", + "CREATE EXTERNAL TABLE annotated_data_finite2 ( + a0 INTEGER, + a INTEGER, + b INTEGER, + c INTEGER, + d INTEGER + ) + STORED AS CSV + WITH HEADER ROW + WITH ORDER (a ASC, b ASC, c ASC) + LOCATION 'tests/data/window_2.csv'", ) .await?; - let sql = "SELECT country, ARRAY_AGG(amount ORDER BY ts DESC) AS amounts, - FIRST_VALUE(amount ORDER BY ts ASC) AS fv1, - LAST_VALUE(amount ORDER BY ts DESC) AS fv2 - FROM sales_global - GROUP BY country"; + + let sql = "SELECT * + FROM annotated_data_finite2 + WHERE a=0 and b=0 + ORDER BY c"; let msg = format!("Creating logical plan for '{sql}'"); let dataframe = ctx.sql(sql).await.expect(&msg); let physical_plan = dataframe.create_physical_plan().await?; print_plan(&physical_plan)?; - let batches = collect(physical_plan, ctx.task_ctx()).await?; - print_batches(&batches)?; - assert_eq!(0, 1); + let actual = collect(physical_plan, ctx.task_ctx()).await?; + print_batches(&actual)?; Ok(()) } + } diff --git a/datafusion/core/src/physical_plan/filter.rs b/datafusion/core/src/physical_plan/filter.rs index bec097e7efed..e6db6b4edaac 100644 --- a/datafusion/core/src/physical_plan/filter.rs +++ b/datafusion/core/src/physical_plan/filter.rs @@ -155,7 +155,6 @@ impl ExecutionPlan for FilterExec { fn ordering_equivalence_properties(&self) -> OrderingEquivalenceProperties { let mut res = self.input.ordering_equivalence_properties(); let constants = collect_constants_from_predicate(self.predicate()); - println!("constants: {:?}", constants); res.add_constants(constants); res } @@ -386,32 +385,30 @@ fn is_column(expr: &Arc) -> bool { expr.as_any().is::() } -fn get_constant_expr(binary: &BinaryExpr) -> Option> { - if binary.op() == &Operator::Eq - && is_literal(binary.left()) - && is_column(binary.right()) - { - Some(binary.right().clone()) - } else if binary.op() == &Operator::Eq - && is_literal(binary.right()) - && is_column(binary.left()) - { - Some(binary.left().clone()) - } else { - None +fn get_constant_expr_helper(expr: &Arc, mut res: Vec>) -> Vec>{ + if let Some(binary) = expr.as_any().downcast_ref::(){ + if binary.op() == &Operator::Eq + && is_literal(binary.left()) + && is_column(binary.right()) + { + res.push(binary.right().clone()) + } else if binary.op() == &Operator::Eq + && is_literal(binary.right()) + && is_column(binary.left()) + { + res.push(binary.left().clone()) + } else if binary.op() == &Operator::And { + res = get_constant_expr_helper(binary.left(), res); + res = get_constant_expr_helper(binary.right(), res); + } } + res } fn collect_constants_from_predicate( predicate: &Arc, ) -> Vec> { - let mut res = vec![]; - if let Some(binary) = predicate.as_any().downcast_ref::() { - if let Some(expr) = get_constant_expr(binary) { - res.push(expr); - } - } - res + get_constant_expr_helper(predicate, vec![]) } #[cfg(test)] diff --git a/datafusion/physical-expr/src/equivalence.rs b/datafusion/physical-expr/src/equivalence.rs index e74703aa3b50..5c3640640cac 100644 --- a/datafusion/physical-expr/src/equivalence.rs +++ b/datafusion/physical-expr/src/equivalence.rs @@ -239,8 +239,9 @@ impl OrderingEquivalenceProperties { pub fn add_constants(&mut self, constants: Vec>) { for constant in constants { - // TODO: Add already inserted check - self.constants.push(constant); + if !exprs_contains(&self.constants, &constant){ + self.constants.push(constant); + } } } @@ -253,28 +254,17 @@ impl OrderingEquivalenceProperties { sort_reqs: &[PhysicalSortRequirement], is_aggressive: bool, ) -> Vec { - let mut normalized_sort_reqs = sort_reqs.to_vec(); + let mut normalized_sort_reqs = prune_sort_reqs_with_constants(sort_reqs, &self.constants); if let Some(oeq_class) = &self.oeq_class { for item in oeq_class.others() { - // // println!("item bef: {:?}", item); - // let item2 = prune_constants(item, &self.constants); - // // println!("item aft: {:?}", item2); - // if &item2 != item{ - // println!("item bef: {:?}", item); - // println!("item aft: {:?}", item2); - // } let item = PhysicalSortRequirement::from_sort_exprs(item); - let item = prune_constants(&item, &self.constants); + let item = prune_sort_reqs_with_constants(&item, &self.constants); let ranges = get_compatible_ranges(&normalized_sort_reqs, &item, is_aggressive); let mut offset: i64 = 0; for Range { start, end } in ranges { - let mut head = oeq_class - .head() - .clone() - .into_iter() - .map(|elem| elem.into()) - .collect::>(); + let head = PhysicalSortRequirement::from_sort_exprs(oeq_class.head()); + let mut head = prune_sort_reqs_with_constants(&head, &self.constants); let updated_start = (start as i64 + offset) as usize; let updated_end = (end as i64 + offset) as usize; let range = end - start; @@ -291,8 +281,7 @@ impl OrderingEquivalenceProperties { } } } - let res = collapse_vec(normalized_sort_reqs); - prune_constants(&res, &self.constants) + collapse_vec(normalized_sort_reqs) } } @@ -749,7 +738,7 @@ fn get_compatible_ranges( } } -fn expr_contains(constants: &[Arc], expr: &Arc) -> bool { +fn exprs_contains(constants: &[Arc], expr: &Arc) -> bool { for constant in constants{ if constant.eq(expr){ return true; @@ -758,10 +747,10 @@ fn expr_contains(constants: &[Arc], expr: &Arc]) -> Vec { +fn prune_sort_reqs_with_constants(ordering: &[PhysicalSortRequirement], constants: &[Arc]) -> Vec { let mut new_ordering = vec![]; for order in ordering{ - if !expr_contains(constants, &order.expr){ + if !exprs_contains(constants, &order.expr){ new_ordering.push(order.clone()) } } diff --git a/datafusion/sqllogictest/test_files/select.slt b/datafusion/sqllogictest/test_files/select.slt index 457f9fd953d8..8d83cd0e8ec0 100644 --- a/datafusion/sqllogictest/test_files/select.slt +++ b/datafusion/sqllogictest/test_files/select.slt @@ -882,6 +882,10 @@ physical_plan ProjectionExec: expr=[a@0 as a, b@1 as b, 2 as Int64(2)] --CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b], output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST], has_header=true +# source is ordered by a,b,c +# when filter result is constant for column a +# ordering b, c is still satisfied. Final plan shouldn't have +# SortExec. query TT EXPLAIN SELECT * FROM annotated_data_finite2 @@ -899,6 +903,91 @@ SortPreservingMergeExec: [b@2 ASC NULLS LAST,c@3 ASC NULLS LAST] ------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 --------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], has_header=true +# source is ordered by a,b,c +# when filter result is constant for column a and b +# ordering c is still satisfied. Final plan shouldn't have +# SortExec. +query TT +EXPLAIN SELECT * +FROM annotated_data_finite2 +WHERE a=0 and b=0 +ORDER BY c; +---- +logical_plan +Sort: annotated_data_finite2.c ASC NULLS LAST +--Filter: annotated_data_finite2.a = Int32(0) AND annotated_data_finite2.b = Int32(0) +----TableScan: annotated_data_finite2 projection=[a0, a, b, c, d], partial_filters=[annotated_data_finite2.a = Int32(0), annotated_data_finite2.b = Int32(0)] +physical_plan +SortPreservingMergeExec: [c@3 ASC NULLS LAST] +--CoalesceBatchesExec: target_batch_size=8192 +----FilterExec: a@1 = 0 AND b@2 = 0 +------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +--------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], has_header=true + +# source is ordered by a,b,c +# when filter result is constant for column a and b +# ordering b, c is still satisfied. Final plan shouldn't have +# SortExec. +query TT +EXPLAIN SELECT * +FROM annotated_data_finite2 +WHERE a=0 and b=0 +ORDER BY b, c; +---- +logical_plan +Sort: annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST +--Filter: annotated_data_finite2.a = Int32(0) AND annotated_data_finite2.b = Int32(0) +----TableScan: annotated_data_finite2 projection=[a0, a, b, c, d], partial_filters=[annotated_data_finite2.a = Int32(0), annotated_data_finite2.b = Int32(0)] +physical_plan +SortPreservingMergeExec: [b@2 ASC NULLS LAST,c@3 ASC NULLS LAST] +--CoalesceBatchesExec: target_batch_size=8192 +----FilterExec: a@1 = 0 AND b@2 = 0 +------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +--------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], has_header=true + +# source is ordered by a,b,c +# when filter result is constant for column a and b +# ordering a, b, c is still satisfied. Final plan shouldn't have +# SortExec. +query TT +EXPLAIN SELECT * +FROM annotated_data_finite2 +WHERE a=0 and b=0 +ORDER BY a, b, c; +---- +logical_plan +Sort: annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST +--Filter: annotated_data_finite2.a = Int32(0) AND annotated_data_finite2.b = Int32(0) +----TableScan: annotated_data_finite2 projection=[a0, a, b, c, d], partial_filters=[annotated_data_finite2.a = Int32(0), annotated_data_finite2.b = Int32(0)] +physical_plan +SortPreservingMergeExec: [a@1 ASC NULLS LAST,b@2 ASC NULLS LAST,c@3 ASC NULLS LAST] +--CoalesceBatchesExec: target_batch_size=8192 +----FilterExec: a@1 = 0 AND b@2 = 0 +------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +--------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], has_header=true + +# source is ordered by a,b,c +# when filter result is when filter contains or +# column a, and b may not be constant. Hence final plan +# should contain SortExec +query TT +EXPLAIN SELECT * +FROM annotated_data_finite2 +WHERE a=0 or b=0 +ORDER BY c; +---- +logical_plan +Sort: annotated_data_finite2.c ASC NULLS LAST +--Filter: annotated_data_finite2.a = Int32(0) OR annotated_data_finite2.b = Int32(0) +----TableScan: annotated_data_finite2 projection=[a0, a, b, c, d], partial_filters=[annotated_data_finite2.a = Int32(0) OR annotated_data_finite2.b = Int32(0)] +physical_plan +SortPreservingMergeExec: [c@3 ASC NULLS LAST] +--SortExec: expr=[c@3 ASC NULLS LAST] +----CoalesceBatchesExec: target_batch_size=8192 +------FilterExec: a@1 = 0 OR b@2 = 0 +--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +----------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], has_header=true + statement ok drop table annotated_data_finite2; From b16ad154a35ab462ef6da083c4c0f18a7e9e387a Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Thu, 7 Sep 2023 10:58:14 +0300 Subject: [PATCH 13/29] minor changes --- datafusion/physical-expr/src/utils.rs | 134 ++++++++++++++------------ 1 file changed, 74 insertions(+), 60 deletions(-) diff --git a/datafusion/physical-expr/src/utils.rs b/datafusion/physical-expr/src/utils.rs index 064edc04bac2..8ed64919d3d2 100644 --- a/datafusion/physical-expr/src/utils.rs +++ b/datafusion/physical-expr/src/utils.rs @@ -163,12 +163,12 @@ pub fn normalize_sort_requirements( ordering_eq_properties: &OrderingEquivalenceProperties, is_aggressive: bool, ) -> Vec { - // println!("sort_reqs at the start:{:?}", sort_reqs); + println!("sort_reqs at the start:{:?}", sort_reqs); let normalized_sort_reqs = eq_properties.normalize_sort_requirements(sort_reqs); - // println!("normalized_sort_reqs after eq:{:?}", normalized_sort_reqs); + println!("normalized_sort_reqs after eq:{:?}", normalized_sort_reqs); let res = ordering_eq_properties .normalize_sort_requirements(&normalized_sort_reqs, is_aggressive); - // println!("normalized_sort_reqs after oeq:{:?}", res); + println!("normalized_sort_reqs after oeq:{:?}", res); res } @@ -214,8 +214,8 @@ pub fn ordering_satisfy_concrete< if required_normalized.len() > provided_normalized.len() { return false; } - // println!("required_normalized: {:?}", required_normalized); - // println!("provided_normalized: {:?}", provided_normalized); + println!("required_normalized: {:?}", required_normalized); + println!("provided_normalized: {:?}", provided_normalized); required_normalized .into_iter() .zip(provided_normalized) @@ -702,7 +702,8 @@ mod tests { let c = Field::new("c", DataType::Int32, true); let d = Field::new("d", DataType::Int32, true); let e = Field::new("e", DataType::Int32, true); - let schema = Arc::new(Schema::new(vec![a, b, c, d, e])); + let f = Field::new("f", DataType::Int32, true); + let schema = Arc::new(Schema::new(vec![a, b, c, d, e, f])); Ok(schema) } @@ -713,13 +714,14 @@ mod tests { OrderingEquivalenceProperties, )> { // Assume schema satisfies ordering a ASC NULLS LAST - // and d ASC NULLS LAST, b ASC NULLS LAST and e DESC NULLS FIRST, b ASC NULLS LAST + // and d ASC NULLS LAST, b ASC NULLS LAST and e DESC NULLS FIRST, f ASC NULLS LAST // Assume that column a and c are aliases. let col_a = &Column::new("a", 0); let col_b = &Column::new("b", 1); let col_c = &Column::new("c", 2); let col_d = &Column::new("d", 3); let col_e = &Column::new("e", 4); + let col_f = &Column::new("f", 5); let option1 = SortOptions { descending: false, nulls_first: false, @@ -760,7 +762,7 @@ mod tests { options: option2, }, PhysicalSortExpr { - expr: Arc::new(col_b.clone()), + expr: Arc::new(col_f.clone()), options: option1, }, ], @@ -968,6 +970,7 @@ mod tests { let col_c = &Column::new("c", 2); let col_d = &Column::new("d", 3); let col_e = &Column::new("e", 4); + let col_f = &Column::new("f", 4); let option1 = SortOptions { descending: false, nulls_first: false, @@ -989,62 +992,73 @@ mod tests { ]; let provided = Some(&provided[..]); let (_test_schema, eq_properties, ordering_eq_properties) = create_test_params()?; - // First element in the tuple stores vector of requirement, second element is the expected return value for ordering_satisfy function - let requirements = vec![ - // `a ASC NULLS LAST`, expects `ordering_satisfy` to be `true`, since existing ordering `a ASC NULLS LAST, b ASC NULLS LAST` satisfies it - (vec![(col_a, option1)], true), - (vec![(col_a, option2)], false), - // Test whether equivalence works as expected - (vec![(col_c, option1)], true), - (vec![(col_c, option2)], false), - // Test whether ordering equivalence works as expected - (vec![(col_d, option1)], true), - (vec![(col_d, option1), (col_b, option1)], true), - (vec![(col_d, option2), (col_b, option1)], false), - (vec![(col_e, option2), (col_b, option1)], true), - (vec![(col_e, option1), (col_b, option1)], false), - ( - vec![ - (col_d, option1), - (col_b, option1), - (col_d, option1), - (col_b, option1), - ], - true, - ), - ( - vec![ - (col_d, option1), - (col_b, option1), - (col_e, option2), - (col_b, option1), - ], - true, - ), - ( - vec![ - (col_d, option1), - (col_b, option1), - (col_d, option2), - (col_b, option1), - ], - false, - ), - ( - vec![ - (col_d, option1), - (col_b, option1), - (col_e, option1), - (col_b, option1), - ], - false, - ), - ]; - + // // First element in the tuple stores vector of requirement, second element is the expected return value for ordering_satisfy function // let requirements = vec![ + // // `a ASC NULLS LAST`, expects `ordering_satisfy` to be `true`, since existing ordering `a ASC NULLS LAST, b ASC NULLS LAST` satisfies it + // (vec![(col_a, option1)], true), + // (vec![(col_a, option2)], false), + // // Test whether equivalence works as expected + // (vec![(col_c, option1)], true), + // (vec![(col_c, option2)], false), + // // Test whether ordering equivalence works as expected // (vec![(col_d, option1)], true), + // (vec![(col_d, option1), (col_b, option1)], true), + // (vec![(col_d, option2), (col_b, option1)], false), + // (vec![(col_e, option2), (col_f, option1)], true), + // (vec![(col_e, option1), (col_f, option1)], false), + // (vec![(col_e, option2), (col_b, option1)], false), + // (vec![(col_e, option1), (col_b, option1)], false), + // ( + // vec![ + // (col_d, option1), + // (col_b, option1), + // (col_d, option1), + // (col_b, option1), + // ], + // true, + // ), + // ( + // vec![ + // (col_d, option1), + // (col_b, option1), + // (col_e, option2), + // (col_f, option1), + // ], + // true, + // ), + // ( + // vec![ + // (col_d, option1), + // (col_b, option1), + // (col_e, option2), + // (col_b, option1), + // ], + // true, + // ), + // ( + // vec![ + // (col_d, option1), + // (col_b, option1), + // (col_d, option2), + // (col_b, option1), + // ], + // true, + // ), + // ( + // vec![ + // (col_d, option1), + // (col_b, option1), + // (col_e, option1), + // (col_b, option1), + // ], + // false, + // ), // ]; + let requirements = vec![ + (vec![(col_e, option2), (col_f, option1)], true), + ]; + for (cols, expected) in requirements { let err_msg = format!("Error in test case:{cols:?}"); let required = cols From 717631ef571f859d66d01a733105a9ec0ef1d960 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Thu, 7 Sep 2023 15:08:52 +0300 Subject: [PATCH 14/29] all tests pass --- .../physical_optimizer/sort_enforcement.rs | 8 +- datafusion/core/src/physical_plan/filter.rs | 7 +- datafusion/physical-expr/src/equivalence.rs | 96 ++++++++-- datafusion/physical-expr/src/utils.rs | 172 +++++++++--------- 4 files changed, 182 insertions(+), 101 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/sort_enforcement.rs b/datafusion/core/src/physical_optimizer/sort_enforcement.rs index 41fde5768be2..2a0a7b2183b4 100644 --- a/datafusion/core/src/physical_optimizer/sort_enforcement.rs +++ b/datafusion/core/src/physical_optimizer/sort_enforcement.rs @@ -2860,7 +2860,6 @@ mod tests { } } - mod tmp_tests { use crate::assert_batches_eq; use crate::physical_plan::{collect, displayable, ExecutionPlan}; @@ -2895,8 +2894,7 @@ mod tmp_tests { WITH ORDER (a ASC, b ASC, c ASC) LOCATION 'tests/data/window_2.csv'", ) - .await?; - + .await?; let sql = "SELECT * FROM annotated_data_finite2 @@ -2930,8 +2928,7 @@ mod tmp_tests { WITH ORDER (a ASC, b ASC, c ASC) LOCATION 'tests/data/window_2.csv'", ) - .await?; - + .await?; let sql = "SELECT * FROM annotated_data_finite2 @@ -2946,5 +2943,4 @@ mod tmp_tests { print_batches(&actual)?; Ok(()) } - } diff --git a/datafusion/core/src/physical_plan/filter.rs b/datafusion/core/src/physical_plan/filter.rs index e6db6b4edaac..f2a63784969c 100644 --- a/datafusion/core/src/physical_plan/filter.rs +++ b/datafusion/core/src/physical_plan/filter.rs @@ -385,8 +385,11 @@ fn is_column(expr: &Arc) -> bool { expr.as_any().is::() } -fn get_constant_expr_helper(expr: &Arc, mut res: Vec>) -> Vec>{ - if let Some(binary) = expr.as_any().downcast_ref::(){ +fn get_constant_expr_helper( + expr: &Arc, + mut res: Vec>, +) -> Vec> { + if let Some(binary) = expr.as_any().downcast_ref::() { if binary.op() == &Operator::Eq && is_literal(binary.left()) && is_column(binary.right()) diff --git a/datafusion/physical-expr/src/equivalence.rs b/datafusion/physical-expr/src/equivalence.rs index 5c3640640cac..2cd5bbcd65e4 100644 --- a/datafusion/physical-expr/src/equivalence.rs +++ b/datafusion/physical-expr/src/equivalence.rs @@ -16,7 +16,9 @@ // under the License. use crate::expressions::{CastExpr, Column}; -use crate::{LexOrdering, PhysicalExpr, PhysicalSortExpr, PhysicalSortRequirement}; +use crate::{ + LexOrdering, LexOrderingReq, PhysicalExpr, PhysicalSortExpr, PhysicalSortRequirement, +}; use arrow::datatypes::SchemaRef; use arrow_schema::Fields; @@ -239,7 +241,7 @@ impl OrderingEquivalenceProperties { pub fn add_constants(&mut self, constants: Vec>) { for constant in constants { - if !exprs_contains(&self.constants, &constant){ + if !exprs_contains(&self.constants, &constant) { self.constants.push(constant); } } @@ -254,7 +256,9 @@ impl OrderingEquivalenceProperties { sort_reqs: &[PhysicalSortRequirement], is_aggressive: bool, ) -> Vec { - let mut normalized_sort_reqs = prune_sort_reqs_with_constants(sort_reqs, &self.constants); + let normalized_sort_reqs = + prune_sort_reqs_with_constants(sort_reqs, &self.constants); + let mut normalized_sort_reqs = collapse_sort_reqs(normalized_sort_reqs); if let Some(oeq_class) = &self.oeq_class { for item in oeq_class.others() { let item = PhysicalSortRequirement::from_sort_exprs(item); @@ -280,8 +284,9 @@ impl OrderingEquivalenceProperties { normalized_sort_reqs.splice(updated_start..updated_end, head); } } + normalized_sort_reqs = simplify_sort_reqs(normalized_sort_reqs, oeq_class); } - collapse_vec(normalized_sort_reqs) + collapse_sort_reqs(normalized_sort_reqs) } } @@ -690,6 +695,55 @@ fn collapse_vec(input: Vec) -> Vec { output } +/// This function constructs a duplicate-free vector by filtering out duplicate +/// entries inside the given vector `input`. +fn collapse_sort_reqs(input: LexOrderingReq) -> LexOrderingReq { + let mut output = vec![]; + for item in input { + if !sort_reqs_contains(&output, &item) { + output.push(item); + } + } + output +} + +fn is_global_req( + req: &PhysicalSortRequirement, + oeq_class: &OrderingEquivalentClass, +) -> bool { + for ordering in std::iter::once(oeq_class.head()).chain(oeq_class.others().iter()) { + let PhysicalSortRequirement { expr, options } = req; + let global_ordering = &ordering[0]; + if let Some(options) = options { + if options == &global_ordering.options && expr.eq(&global_ordering.expr) { + return true; + } + } else { + if expr.eq(&global_ordering.expr) { + return true; + } + } + } + false +} + +/// This function constructs a duplicate-free vector by filtering out duplicate +/// entries inside the given vector `input`. +fn simplify_sort_reqs( + input: LexOrderingReq, + oeq_class: &OrderingEquivalentClass, +) -> LexOrderingReq { + let mut last_idx = input.len(); + while last_idx > 0 { + if is_global_req(&input[last_idx - 1], oeq_class) { + last_idx -= 1; + } else { + break; + } + } + input[0..last_idx].to_vec() +} + /// This function searches for the slice `section` inside the slice `given`. /// It returns each range where `section` is compatible with the corresponding /// slice in `given`. @@ -698,7 +752,7 @@ fn get_compatible_ranges( section: &[PhysicalSortRequirement], is_aggressive: bool, ) -> Vec> { - if is_aggressive { + if is_aggressive && false { // println!("given: {:?}", given); // println!("section: {:?}", section); let mut res = vec![]; @@ -710,7 +764,7 @@ fn get_compatible_ranges( { count += 1; } - if count > 0 { + if count > 0 && (i + count == given.len() || count == section.len()) { res.push(Range { start: i, end: i + count, @@ -738,19 +792,37 @@ fn get_compatible_ranges( } } -fn exprs_contains(constants: &[Arc], expr: &Arc) -> bool { - for constant in constants{ - if constant.eq(expr){ +fn exprs_contains( + constants: &[Arc], + expr: &Arc, +) -> bool { + for constant in constants { + if constant.eq(expr) { + return true; + } + } + false +} + +fn sort_reqs_contains( + sort_reqs: &[PhysicalSortRequirement], + sort_req: &PhysicalSortRequirement, +) -> bool { + for constant in sort_reqs { + if constant.expr.eq(&sort_req.expr) { return true; } } false } -fn prune_sort_reqs_with_constants(ordering: &[PhysicalSortRequirement], constants: &[Arc]) -> Vec { +fn prune_sort_reqs_with_constants( + ordering: &[PhysicalSortRequirement], + constants: &[Arc], +) -> Vec { let mut new_ordering = vec![]; - for order in ordering{ - if !exprs_contains(constants, &order.expr){ + for order in ordering { + if !exprs_contains(constants, &order.expr) { new_ordering.push(order.clone()) } } diff --git a/datafusion/physical-expr/src/utils.rs b/datafusion/physical-expr/src/utils.rs index 8ed64919d3d2..64a0f319c172 100644 --- a/datafusion/physical-expr/src/utils.rs +++ b/datafusion/physical-expr/src/utils.rs @@ -163,12 +163,12 @@ pub fn normalize_sort_requirements( ordering_eq_properties: &OrderingEquivalenceProperties, is_aggressive: bool, ) -> Vec { - println!("sort_reqs at the start:{:?}", sort_reqs); + // println!("sort_reqs at the start:{:?}", sort_reqs); let normalized_sort_reqs = eq_properties.normalize_sort_requirements(sort_reqs); - println!("normalized_sort_reqs after eq:{:?}", normalized_sort_reqs); + // println!("normalized_sort_reqs after eq:{:?}", normalized_sort_reqs); let res = ordering_eq_properties .normalize_sort_requirements(&normalized_sort_reqs, is_aggressive); - println!("normalized_sort_reqs after oeq:{:?}", res); + // println!("normalized_sort_reqs after oeq:{:?}", res); res } @@ -214,8 +214,8 @@ pub fn ordering_satisfy_concrete< if required_normalized.len() > provided_normalized.len() { return false; } - println!("required_normalized: {:?}", required_normalized); - println!("provided_normalized: {:?}", provided_normalized); + // println!("required_normalized: {:?}", required_normalized); + // println!("provided_normalized: {:?}", provided_normalized); required_normalized .into_iter() .zip(provided_normalized) @@ -970,7 +970,7 @@ mod tests { let col_c = &Column::new("c", 2); let col_d = &Column::new("d", 3); let col_e = &Column::new("e", 4); - let col_f = &Column::new("f", 4); + let col_f = &Column::new("f", 5); let option1 = SortOptions { descending: false, nulls_first: false, @@ -992,73 +992,89 @@ mod tests { ]; let provided = Some(&provided[..]); let (_test_schema, eq_properties, ordering_eq_properties) = create_test_params()?; - // // First element in the tuple stores vector of requirement, second element is the expected return value for ordering_satisfy function + // First element in the tuple stores vector of requirement, second element is the expected return value for ordering_satisfy function + let requirements = vec![ + // `a ASC NULLS LAST`, expects `ordering_satisfy` to be `true`, since existing ordering `a ASC NULLS LAST, b ASC NULLS LAST` satisfies it + (vec![(col_a, option1)], true), + (vec![(col_a, option2)], false), + // Test whether equivalence works as expected + (vec![(col_c, option1)], true), + (vec![(col_c, option2)], false), + // Test whether ordering equivalence works as expected + (vec![(col_d, option1)], true), + (vec![(col_d, option1), (col_b, option1)], true), + (vec![(col_d, option2), (col_b, option1)], false), + (vec![(col_e, option2), (col_f, option1)], true), + (vec![(col_e, option1), (col_f, option1)], false), + (vec![(col_e, option2), (col_b, option1)], false), + (vec![(col_e, option1), (col_b, option1)], false), + ( + vec![ + (col_d, option1), + (col_b, option1), + (col_d, option1), + (col_b, option1), + ], + true, + ), + ( + vec![ + (col_d, option1), + (col_b, option1), + (col_e, option2), + (col_f, option1), + ], + true, + ), + ( + vec![ + (col_d, option1), + (col_b, option1), + (col_e, option2), + (col_b, option1), + ], + true, + ), + ( + vec![ + (col_d, option1), + (col_b, option1), + (col_d, option2), + (col_b, option1), + ], + true, + ), + ( + vec![ + (col_d, option1), + (col_b, option1), + (col_e, option1), + (col_f, option1), + ], + false, + ), + ( + vec![ + (col_d, option1), + (col_b, option1), + (col_e, option1), + (col_b, option1), + ], + false, + ), + (vec![(col_d, option1), (col_e, option2)], true), + ]; + // let requirements = vec![ - // // `a ASC NULLS LAST`, expects `ordering_satisfy` to be `true`, since existing ordering `a ASC NULLS LAST, b ASC NULLS LAST` satisfies it - // (vec![(col_a, option1)], true), - // (vec![(col_a, option2)], false), - // // Test whether equivalence works as expected - // (vec![(col_c, option1)], true), - // (vec![(col_c, option2)], false), - // // Test whether ordering equivalence works as expected - // (vec![(col_d, option1)], true), - // (vec![(col_d, option1), (col_b, option1)], true), - // (vec![(col_d, option2), (col_b, option1)], false), - // (vec![(col_e, option2), (col_f, option1)], true), - // (vec![(col_e, option1), (col_f, option1)], false), - // (vec![(col_e, option2), (col_b, option1)], false), - // (vec![(col_e, option1), (col_b, option1)], false), - // ( - // vec![ - // (col_d, option1), - // (col_b, option1), - // (col_d, option1), - // (col_b, option1), - // ], - // true, - // ), // ( // vec![ // (col_d, option1), - // (col_b, option1), // (col_e, option2), - // (col_f, option1), // ], // true, // ), - // ( - // vec![ - // (col_d, option1), - // (col_b, option1), - // (col_e, option2), - // (col_b, option1), - // ], - // true, - // ), - // ( - // vec![ - // (col_d, option1), - // (col_b, option1), - // (col_d, option2), - // (col_b, option1), - // ], - // true, - // ), - // ( - // vec![ - // (col_d, option1), - // (col_b, option1), - // (col_e, option1), - // (col_b, option1), - // ], - // false, - // ), // ]; - let requirements = vec![ - (vec![(col_e, option2), (col_f, option1)], true), - ]; - for (cols, expected) in requirements { let err_msg = format!("Error in test case:{cols:?}"); let required = cols @@ -1102,6 +1118,7 @@ mod tests { let col_c = &Column::new("c", 2); let col_d = &Column::new("d", 3); let col_e = &Column::new("e", 4); + let col_f = &Column::new("f", 5); let option1 = SortOptions { descending: false, nulls_first: false, @@ -1112,28 +1129,21 @@ mod tests { }; // First element in the tuple stores vector of requirement, second element is the expected return value for ordering_satisfy function let requirements = vec![ - (vec![(col_a, Some(option1))], vec![(col_a, Some(option1))]), - (vec![(col_a, None)], vec![(col_a, None)]), + (vec![(col_a, Some(option1))], vec![]), + (vec![(col_a, Some(option2))], vec![(col_a, Some(option2))]), + (vec![(col_a, None)], vec![]), // Test whether equivalence works as expected - (vec![(col_c, Some(option1))], vec![(col_a, Some(option1))]), - (vec![(col_c, None)], vec![(col_a, None)]), + (vec![(col_c, Some(option1))], vec![]), + (vec![(col_c, None)], vec![]), // Test whether ordering equivalence works as expected - ( - vec![(col_d, Some(option1)), (col_b, Some(option1))], - vec![(col_a, Some(option1))], - ), - (vec![(col_d, None), (col_b, None)], vec![(col_a, None)]), - ( - vec![(col_e, Some(option2)), (col_b, Some(option1))], - vec![(col_a, Some(option1))], - ), + (vec![(col_d, Some(option1)), (col_b, Some(option1))], vec![]), + (vec![(col_d, None), (col_b, None)], vec![]), + (vec![(col_e, Some(option2)), (col_f, Some(option1))], vec![]), // We should be able to normalize in compatible requirements also (not exactly equal) - ( - vec![(col_e, Some(option2)), (col_b, None)], - vec![(col_a, Some(option1))], - ), - (vec![(col_e, None), (col_b, None)], vec![(col_a, None)]), + (vec![(col_e, Some(option2)), (col_f, None)], vec![]), + (vec![(col_e, None), (col_f, None)], vec![]), ]; + let (_test_schema, eq_properties, ordering_eq_properties) = create_test_params()?; for (reqs, expected_normalized) in requirements.into_iter() { let req = convert_to_requirement(&reqs); From b93cc5d55765f2ccf82492f0bc42a23a05cfea16 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Thu, 7 Sep 2023 15:12:40 +0300 Subject: [PATCH 15/29] Simplifications --- datafusion/physical-expr/src/equivalence.rs | 62 ++++++--------------- datafusion/physical-expr/src/utils.rs | 24 +++----- 2 files changed, 24 insertions(+), 62 deletions(-) diff --git a/datafusion/physical-expr/src/equivalence.rs b/datafusion/physical-expr/src/equivalence.rs index 2cd5bbcd65e4..3d88f42b453c 100644 --- a/datafusion/physical-expr/src/equivalence.rs +++ b/datafusion/physical-expr/src/equivalence.rs @@ -254,7 +254,6 @@ impl OrderingEquivalenceProperties { pub fn normalize_sort_requirements( &self, sort_reqs: &[PhysicalSortRequirement], - is_aggressive: bool, ) -> Vec { let normalized_sort_reqs = prune_sort_reqs_with_constants(sort_reqs, &self.constants); @@ -263,8 +262,7 @@ impl OrderingEquivalenceProperties { for item in oeq_class.others() { let item = PhysicalSortRequirement::from_sort_exprs(item); let item = prune_sort_reqs_with_constants(&item, &self.constants); - let ranges = - get_compatible_ranges(&normalized_sort_reqs, &item, is_aggressive); + let ranges = get_compatible_ranges(&normalized_sort_reqs, &item); let mut offset: i64 = 0; for Range { start, end } in ranges { let head = PhysicalSortRequirement::from_sort_exprs(oeq_class.head()); @@ -750,46 +748,23 @@ fn simplify_sort_reqs( fn get_compatible_ranges( given: &[PhysicalSortRequirement], section: &[PhysicalSortRequirement], - is_aggressive: bool, ) -> Vec> { - if is_aggressive && false { - // println!("given: {:?}", given); - // println!("section: {:?}", section); - let mut res = vec![]; - for i in 0..given.len() { - let mut count = 0; - while i + count < given.len() - && count < section.len() - && given[i + count] == section[count] - { - count += 1; - } - if count > 0 && (i + count == given.len() || count == section.len()) { - res.push(Range { - start: i, - end: i + count, - }) - } - } - res + let n_section = section.len(); + let n_end = if given.len() >= n_section { + given.len() - n_section + 1 } else { - let n_section = section.len(); - let n_end = if given.len() >= n_section { - given.len() - n_section + 1 - } else { - 0 - }; - (0..n_end) - .filter_map(|idx| { - let end = idx + n_section; - given[idx..end] - .iter() - .zip(section) - .all(|(req, given)| given.compatible(req)) - .then_some(Range { start: idx, end }) - }) - .collect() - } + 0 + }; + (0..n_end) + .filter_map(|idx| { + let end = idx + n_section; + given[idx..end] + .iter() + .zip(section) + .all(|(req, given)| given.compatible(req)) + .then_some(Range { start: idx, end }) + }) + .collect() } fn exprs_contains( @@ -985,10 +960,7 @@ mod tests { .into_iter() .map(|(start, end)| Range { start, end }) .collect::>(); - assert_eq!( - get_compatible_ranges(&searched, &to_search, false), - expected - ); + assert_eq!(get_compatible_ranges(&searched, &to_search), expected); } Ok(()) } diff --git a/datafusion/physical-expr/src/utils.rs b/datafusion/physical-expr/src/utils.rs index 64a0f319c172..f0644f9ab621 100644 --- a/datafusion/physical-expr/src/utils.rs +++ b/datafusion/physical-expr/src/utils.rs @@ -137,14 +137,12 @@ pub fn normalize_sort_exprs( sort_exprs: &[PhysicalSortExpr], eq_properties: &EquivalenceProperties, ordering_eq_properties: &OrderingEquivalenceProperties, - is_aggressive: bool, ) -> Vec { let sort_requirements = PhysicalSortRequirement::from_sort_exprs(sort_exprs.iter()); let normalized_exprs = normalize_sort_requirements( &sort_requirements, eq_properties, ordering_eq_properties, - is_aggressive, ); PhysicalSortRequirement::to_sort_exprs(normalized_exprs) } @@ -161,14 +159,9 @@ pub fn normalize_sort_requirements( sort_reqs: &[PhysicalSortRequirement], eq_properties: &EquivalenceProperties, ordering_eq_properties: &OrderingEquivalenceProperties, - is_aggressive: bool, ) -> Vec { - // println!("sort_reqs at the start:{:?}", sort_reqs); let normalized_sort_reqs = eq_properties.normalize_sort_requirements(sort_reqs); - // println!("normalized_sort_reqs after eq:{:?}", normalized_sort_reqs); - let res = ordering_eq_properties - .normalize_sort_requirements(&normalized_sort_reqs, is_aggressive); - // println!("normalized_sort_reqs after oeq:{:?}", res); + let res = ordering_eq_properties.normalize_sort_requirements(&normalized_sort_reqs); res } @@ -208,14 +201,12 @@ pub fn ordering_satisfy_concrete< let oeq_properties = ordering_equal_properties(); let eq_properties = equal_properties(); let required_normalized = - normalize_sort_exprs(required, &eq_properties, &oeq_properties, true); + normalize_sort_exprs(required, &eq_properties, &oeq_properties); let provided_normalized = - normalize_sort_exprs(provided, &eq_properties, &oeq_properties, false); + normalize_sort_exprs(provided, &eq_properties, &oeq_properties); if required_normalized.len() > provided_normalized.len() { return false; } - // println!("required_normalized: {:?}", required_normalized); - // println!("provided_normalized: {:?}", provided_normalized); required_normalized .into_iter() .zip(provided_normalized) @@ -259,9 +250,9 @@ pub fn ordering_satisfy_requirement_concrete< let oeq_properties = ordering_equal_properties(); let eq_properties = equal_properties(); let required_normalized = - normalize_sort_requirements(required, &eq_properties, &oeq_properties, false); + normalize_sort_requirements(required, &eq_properties, &oeq_properties); let provided_normalized = - normalize_sort_exprs(provided, &eq_properties, &oeq_properties, false); + normalize_sort_exprs(provided, &eq_properties, &oeq_properties); if required_normalized.len() > provided_normalized.len() { return false; } @@ -309,9 +300,9 @@ fn requirements_compatible_concrete< let eq_properties = equal_properties(); let required_normalized = - normalize_sort_requirements(required, &eq_properties, &oeq_properties, false); + normalize_sort_requirements(required, &eq_properties, &oeq_properties); let provided_normalized = - normalize_sort_requirements(provided, &eq_properties, &oeq_properties, false); + normalize_sort_requirements(provided, &eq_properties, &oeq_properties); if required_normalized.len() > provided_normalized.len() { return false; } @@ -1154,7 +1145,6 @@ mod tests { &req, &eq_properties, &ordering_eq_properties, - false ), expected_normalized ); From b832b2dea5d9c6a682c6b0b489811cd7ce9b1a41 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Thu, 7 Sep 2023 15:19:58 +0300 Subject: [PATCH 16/29] minor changes --- datafusion/physical-expr/src/utils.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/physical-expr/src/utils.rs b/datafusion/physical-expr/src/utils.rs index f0644f9ab621..97ba6a5a0283 100644 --- a/datafusion/physical-expr/src/utils.rs +++ b/datafusion/physical-expr/src/utils.rs @@ -133,7 +133,7 @@ pub fn normalize_out_expr_with_columns_map( /// This function converts `sort_exprs` `vec![b ASC, c ASC]` to first `vec![a ASC, c ASC]` after considering `eq_properties` /// Then converts `vec![a ASC, c ASC]` to `vec![d ASC]` after considering `ordering_eq_properties`. /// Standardized version `vec![d ASC]` is used in subsequent operations. -pub fn normalize_sort_exprs( +fn normalize_sort_exprs( sort_exprs: &[PhysicalSortExpr], eq_properties: &EquivalenceProperties, ordering_eq_properties: &OrderingEquivalenceProperties, @@ -155,7 +155,7 @@ pub fn normalize_sort_exprs( /// This function converts `sort_exprs` `vec![b Some(ASC), c None]` to first `vec![a Some(ASC), c None]` after considering `eq_properties` /// Then converts `vec![a Some(ASC), c None]` to `vec![d Some(ASC)]` after considering `ordering_eq_properties`. /// Standardized version `vec![d Some(ASC)]` is used in subsequent operations. -pub fn normalize_sort_requirements( +fn normalize_sort_requirements( sort_reqs: &[PhysicalSortRequirement], eq_properties: &EquivalenceProperties, ordering_eq_properties: &OrderingEquivalenceProperties, From 858576b744cfe4cbda3cbd9c4bba36d895866d30 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Thu, 7 Sep 2023 16:59:01 +0300 Subject: [PATCH 17/29] Resolve linter error --- .../physical_optimizer/sort_enforcement.rs | 89 ------------------- .../core/src/physical_plan/joins/utils.rs | 54 ----------- datafusion/physical-expr/src/equivalence.rs | 12 +-- datafusion/physical-expr/src/utils.rs | 19 ++-- 4 files changed, 9 insertions(+), 165 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/sort_enforcement.rs b/datafusion/core/src/physical_optimizer/sort_enforcement.rs index 0b3b077312a0..261bec723296 100644 --- a/datafusion/core/src/physical_optimizer/sort_enforcement.rs +++ b/datafusion/core/src/physical_optimizer/sort_enforcement.rs @@ -550,10 +550,6 @@ fn analyze_immediate_sort_removal( ) -> Option { if let Some(sort_exec) = plan.as_any().downcast_ref::() { let sort_input = sort_exec.input().clone(); - // println!("sort_input.output_ordering(): {:?}", sort_input.output_ordering()); - // println!("sort_exec.output_ordering(): {:?}", sort_exec.output_ordering()); - // println!("sort_input.equivalence_properties().classes(): {:?}", sort_input.equivalence_properties().classes()); - // println!("sort_input.ordering_equivalence_properties().oeq_class(): {:?}", sort_input.ordering_equivalence_properties().oeq_class()); // If this sort is unnecessary, we should remove it: if ordering_satisfy( sort_input.output_ordering(), @@ -2769,88 +2765,3 @@ mod tests { Ok(()) } } - -mod tmp_tests { - use crate::assert_batches_eq; - use crate::physical_plan::{collect, displayable, ExecutionPlan}; - use crate::prelude::SessionContext; - use arrow::util::pretty::print_batches; - use datafusion_common::Result; - use datafusion_execution::config::SessionConfig; - use std::sync::Arc; - - fn print_plan(plan: &Arc) -> Result<()> { - let formatted = displayable(plan.as_ref()).indent(true).to_string(); - let actual: Vec<&str> = formatted.trim().lines().collect(); - println!("{:#?}", actual); - Ok(()) - } - - #[tokio::test] - async fn test_first_value() -> Result<()> { - let config = SessionConfig::new().with_target_partitions(1); - let ctx = SessionContext::with_config(config); - - ctx.sql( - "CREATE EXTERNAL TABLE annotated_data_finite2 ( - a0 INTEGER, - a INTEGER, - b INTEGER, - c INTEGER, - d INTEGER - ) - STORED AS CSV - WITH HEADER ROW - WITH ORDER (a ASC, b ASC, c ASC) - LOCATION 'tests/data/window_2.csv'", - ) - .await?; - - let sql = "SELECT * - FROM annotated_data_finite2 - WHERE a=0 - ORDER BY b, c"; - - let msg = format!("Creating logical plan for '{sql}'"); - let dataframe = ctx.sql(sql).await.expect(&msg); - let physical_plan = dataframe.create_physical_plan().await?; - print_plan(&physical_plan)?; - let actual = collect(physical_plan, ctx.task_ctx()).await?; - print_batches(&actual)?; - Ok(()) - } - - #[tokio::test] - async fn test_first_value2() -> Result<()> { - let config = SessionConfig::new().with_target_partitions(1); - let ctx = SessionContext::with_config(config); - - ctx.sql( - "CREATE EXTERNAL TABLE annotated_data_finite2 ( - a0 INTEGER, - a INTEGER, - b INTEGER, - c INTEGER, - d INTEGER - ) - STORED AS CSV - WITH HEADER ROW - WITH ORDER (a ASC, b ASC, c ASC) - LOCATION 'tests/data/window_2.csv'", - ) - .await?; - - let sql = "SELECT * - FROM annotated_data_finite2 - WHERE a=0 and b=0 - ORDER BY c"; - - let msg = format!("Creating logical plan for '{sql}'"); - let dataframe = ctx.sql(sql).await.expect(&msg); - let physical_plan = dataframe.create_physical_plan().await?; - print_plan(&physical_plan)?; - let actual = collect(physical_plan, ctx.task_ctx()).await?; - print_batches(&actual)?; - Ok(()) - } -} diff --git a/datafusion/core/src/physical_plan/joins/utils.rs b/datafusion/core/src/physical_plan/joins/utils.rs index 836f59ddaa51..f8cea1d129e7 100644 --- a/datafusion/core/src/physical_plan/joins/utils.rs +++ b/datafusion/core/src/physical_plan/joins/utils.rs @@ -457,7 +457,6 @@ pub fn combine_join_ordering_equivalence_properties( && left.output_ordering().is_some() && *join_type == JoinType::Inner { - let left_oeq_classes = left_oeq_properties.oeq_class(); let right_output_ordering = right.output_ordering().unwrap_or(&[]); let right_output_ordering = add_offset_to_lex_ordering(right_output_ordering, left_columns_len)?; @@ -483,59 +482,6 @@ pub fn combine_join_ordering_equivalence_properties( Ok(new_properties) } -// /// Adds the `offset` value to `Column` indices inside `expr`. This function is -// /// generally used during the update of the right table schema in join operations. -// pub(crate) fn add_offset_to_expr( -// expr: Arc, -// offset: usize, -// ) -> Result> { -// expr.transform_down(&|e| match e.as_any().downcast_ref::() { -// Some(col) => Ok(Transformed::Yes(Arc::new(Column::new( -// col.name(), -// offset + col.index(), -// )))), -// None => Ok(Transformed::No(e)), -// }) -// } -// -// /// Adds the `offset` value to `Column` indices inside `sort_expr.expr`. -// pub(crate) fn add_offset_to_sort_expr( -// sort_expr: &PhysicalSortExpr, -// offset: usize, -// ) -> Result { -// Ok(PhysicalSortExpr { -// expr: add_offset_to_expr(sort_expr.expr.clone(), offset)?, -// options: sort_expr.options, -// }) -// } - -// /// Adds the `offset` value to `Column` indices for each `sort_expr.expr` -// /// inside `sort_exprs`. -// pub(crate) fn add_offset_to_lex_ordering( -// sort_exprs: LexOrderingRef, -// offset: usize, -// ) -> Result { -// sort_exprs -// .iter() -// .map(|sort_expr| add_offset_to_sort_expr(sort_expr, offset)) -// .collect() -// } - -// /// Adds the `offset` value to `Column` indices for all expressions inside the -// /// given `OrderingEquivalentClass`es. -// pub(crate) fn add_offset_to_ordering_equivalence_classes( -// oeq_classes: &OrderingEquivalentClass, -// offset: usize, -// ) -> Result { -// let new_head = add_offset_to_lex_ordering(oeq_classes.head(), offset)?; -// let new_others = oeq_classes -// .others() -// .iter() -// .map(|ordering| add_offset_to_lex_ordering(ordering, offset)) -// .collect::>>()?; -// Ok(OrderingEquivalentClass::new(new_head, new_others)) -// } - impl Display for JoinSide { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match self { diff --git a/datafusion/physical-expr/src/equivalence.rs b/datafusion/physical-expr/src/equivalence.rs index 1c3223b5988e..c5ae5d411df9 100644 --- a/datafusion/physical-expr/src/equivalence.rs +++ b/datafusion/physical-expr/src/equivalence.rs @@ -338,12 +338,6 @@ impl OrderingEquivalenceProperties { } collapse_sort_reqs(normalized_sort_reqs) } - - pub fn normalize_with_equivalence_properties( - &mut self, - eq_properties: &EquivalenceProperties, - ) { - } } impl OrderingEquivalenceProperties { @@ -802,10 +796,8 @@ fn is_global_req( if options == &global_ordering.options && expr.eq(&global_ordering.expr) { return true; } - } else { - if expr.eq(&global_ordering.expr) { - return true; - } + } else if expr.eq(&global_ordering.expr) { + return true; } } false diff --git a/datafusion/physical-expr/src/utils.rs b/datafusion/physical-expr/src/utils.rs index b21820e89ffe..9549b8e73a1b 100644 --- a/datafusion/physical-expr/src/utils.rs +++ b/datafusion/physical-expr/src/utils.rs @@ -244,8 +244,7 @@ fn normalize_sort_requirements( ordering_eq_properties: &OrderingEquivalenceProperties, ) -> Vec { let normalized_sort_reqs = eq_properties.normalize_sort_requirements(sort_reqs); - let res = ordering_eq_properties.normalize_sort_requirements(&normalized_sort_reqs); - res + ordering_eq_properties.normalize_sort_requirements(&normalized_sort_reqs) } /// Checks whether given ordering requirements are satisfied by provided [PhysicalSortExpr]s. @@ -1832,17 +1831,13 @@ mod tests { let normalized_class = normalize_ordering_equivalence_classes( ordering_equal_properties.oeq_class(), &equal_properties, + ) + .unwrap(); + let expected = expected_oeq.oeq_class().unwrap(); + assert!( + normalized_class.head().eq(expected.head()) + && normalized_class.others().eq(expected.others()) ); - let expected = expected_oeq.oeq_class(); - match (normalized_class, expected) { - (Some(normalized), Some(expected)) => { - assert!( - normalized.head().eq(expected.head()) - && normalized.others().eq(expected.others()) - ) - } - (_, _) => assert!(false), - } Ok(()) } From 09aa6c8d648c01938199b7724f64befd1ecff453 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Thu, 7 Sep 2023 17:33:29 +0300 Subject: [PATCH 18/29] Minor changes --- datafusion/core/src/physical_plan/joins/utils.rs | 5 ----- datafusion/physical-expr/src/utils.rs | 10 ---------- 2 files changed, 15 deletions(-) diff --git a/datafusion/core/src/physical_plan/joins/utils.rs b/datafusion/core/src/physical_plan/joins/utils.rs index f8cea1d129e7..0dec91fa6829 100644 --- a/datafusion/core/src/physical_plan/joins/utils.rs +++ b/datafusion/core/src/physical_plan/joins/utils.rs @@ -333,10 +333,6 @@ fn get_updated_right_ordering_equivalence_properties( // In these modes, indices of the right schema should be offset by // the left table size. JoinType::Inner | JoinType::Left | JoinType::Full | JoinType::Right => { - // add_offset_to_ordering_equivalence_classes( - // right_oeq_classes, - // left_columns_len, - // )? right_oeq_properties.add_offset(left_columns_len)?; } _ => {} @@ -346,7 +342,6 @@ fn get_updated_right_ordering_equivalence_properties( right_oeq_properties.oeq_class(), join_eq_properties, )) - // Ok(vec![]) } /// Merge left and right sort expressions, checking for duplicates. diff --git a/datafusion/physical-expr/src/utils.rs b/datafusion/physical-expr/src/utils.rs index 9549b8e73a1b..a804a7b090e0 100644 --- a/datafusion/physical-expr/src/utils.rs +++ b/datafusion/physical-expr/src/utils.rs @@ -1294,16 +1294,6 @@ mod tests { (vec![(col_d, option1), (col_e, option2)], true), ]; - // let requirements = vec![ - // ( - // vec![ - // (col_d, option1), - // (col_e, option2), - // ], - // true, - // ), - // ]; - for (cols, expected) in requirements { let err_msg = format!("Error in test case:{cols:?}"); let required = cols From 7212e56e3dc725261b777f1c7904fc5cc6714cd3 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Fri, 8 Sep 2023 15:12:43 +0300 Subject: [PATCH 19/29] minor changes --- datafusion/physical-expr/src/equivalence.rs | 61 +++++++++++++++------ datafusion/physical-expr/src/utils.rs | 45 +++++++++++---- 2 files changed, 79 insertions(+), 27 deletions(-) diff --git a/datafusion/physical-expr/src/equivalence.rs b/datafusion/physical-expr/src/equivalence.rs index c5ae5d411df9..247b9849ce60 100644 --- a/datafusion/physical-expr/src/equivalence.rs +++ b/datafusion/physical-expr/src/equivalence.rs @@ -27,6 +27,7 @@ use arrow_schema::Fields; use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::Result; +use itertools::izip; use std::collections::{HashMap, HashSet}; use std::hash::Hash; use std::ops::Range; @@ -785,22 +786,43 @@ fn collapse_sort_reqs(input: LexOrderingReq) -> LexOrderingReq { output } -fn is_global_req( - req: &PhysicalSortRequirement, +fn req_satisfied(given: LexOrderingRef, req: &[PhysicalSortRequirement]) -> bool { + if given.len() != req.len() { + false + } else { + for (given, req) in izip!(given.iter(), req.iter()) { + let PhysicalSortRequirement { expr, options } = req; + if let Some(options) = options { + if options != &given.options || !expr.eq(&given.expr) { + return false; + } + } else if !expr.eq(&given.expr) { + return false; + } + } + true + } +} + +fn prune_last_n_that_is_in_oeq( + input: &[PhysicalSortRequirement], oeq_class: &OrderingEquivalentClass, -) -> bool { +) -> usize { + let input_len = input.len(); for ordering in std::iter::once(oeq_class.head()).chain(oeq_class.others().iter()) { - let PhysicalSortRequirement { expr, options } = req; - let global_ordering = &ordering[0]; - if let Some(options) = options { - if options == &global_ordering.options && expr.eq(&global_ordering.expr) { - return true; + let ordering_len = ordering.len(); + let mut search_range = std::cmp::min(ordering_len, input_len); + while search_range > 0 { + let req_section = &input[input_len - search_range..]; + let given_section = &ordering[0..search_range]; + if req_satisfied(given_section, req_section) { + return search_range; + } else { + search_range -= 1; } - } else if expr.eq(&global_ordering.expr) { - return true; } } - false + 0 } /// This function constructs a duplicate-free vector by filtering out duplicate @@ -809,15 +831,20 @@ fn simplify_sort_reqs( input: LexOrderingReq, oeq_class: &OrderingEquivalentClass, ) -> LexOrderingReq { - let mut last_idx = input.len(); - while last_idx > 0 { - if is_global_req(&input[last_idx - 1], oeq_class) { - last_idx -= 1; - } else { + let mut section = &input[..]; + loop { + let n_prune = prune_last_n_that_is_in_oeq(section, oeq_class); + // Cannot prune entries from the end of requirement + if n_prune == 0 { break; } + section = §ion[0..section.len() - n_prune]; + } + if section.is_empty() { + PhysicalSortRequirement::from_sort_exprs(oeq_class.head()) + } else { + section.to_vec() } - input[0..last_idx].to_vec() } /// This function searches for the slice `section` inside the slice `given`. diff --git a/datafusion/physical-expr/src/utils.rs b/datafusion/physical-expr/src/utils.rs index a804a7b090e0..8219f640c97c 100644 --- a/datafusion/physical-expr/src/utils.rs +++ b/datafusion/physical-expr/src/utils.rs @@ -943,7 +943,7 @@ mod tests { OrderingEquivalenceProperties, )> { // Assume schema satisfies ordering a ASC NULLS LAST - // and d ASC NULLS LAST, b ASC NULLS LAST and e DESC NULLS FIRST, f ASC NULLS LAST + // and d ASC NULLS LAST, b ASC NULLS LAST and e DESC NULLS FIRST, f ASC NULLS LAST, g ASC NULLS LAST // Assume that column a and c are aliases. let col_a = &Column::new("a", 0); let col_b = &Column::new("b", 1); @@ -951,6 +951,7 @@ mod tests { let col_d = &Column::new("d", 3); let col_e = &Column::new("e", 4); let col_f = &Column::new("f", 5); + let col_g = &Column::new("g", 6); let option1 = SortOptions { descending: false, nulls_first: false, @@ -994,6 +995,10 @@ mod tests { expr: Arc::new(col_f.clone()), options: option1, }, + PhysicalSortExpr { + expr: Arc::new(col_g.clone()), + options: option1, + }, ], )); Ok((test_schema, eq_properties, ordering_eq_properties)) @@ -1200,6 +1205,7 @@ mod tests { let col_d = &Column::new("d", 3); let col_e = &Column::new("e", 4); let col_f = &Column::new("f", 5); + let col_g = &Column::new("g", 6); let option1 = SortOptions { descending: false, nulls_first: false, @@ -1233,6 +1239,10 @@ mod tests { (vec![(col_d, option1)], true), (vec![(col_d, option1), (col_b, option1)], true), (vec![(col_d, option2), (col_b, option1)], false), + ( + vec![(col_e, option2), (col_f, option1), (col_g, option1)], + true, + ), (vec![(col_e, option2), (col_f, option1)], true), (vec![(col_e, option1), (col_f, option1)], false), (vec![(col_e, option2), (col_b, option1)], false), @@ -1348,19 +1358,34 @@ mod tests { }; // First element in the tuple stores vector of requirement, second element is the expected return value for ordering_satisfy function let requirements = vec![ - (vec![(col_a, Some(option1))], vec![]), + (vec![(col_a, Some(option1))], vec![(col_a, Some(option1))]), (vec![(col_a, Some(option2))], vec![(col_a, Some(option2))]), - (vec![(col_a, None)], vec![]), + (vec![(col_a, None)], vec![(col_a, Some(option1))]), // Test whether equivalence works as expected - (vec![(col_c, Some(option1))], vec![]), - (vec![(col_c, None)], vec![]), + (vec![(col_c, Some(option1))], vec![(col_a, Some(option1))]), + (vec![(col_c, None)], vec![(col_a, Some(option1))]), // Test whether ordering equivalence works as expected - (vec![(col_d, Some(option1)), (col_b, Some(option1))], vec![]), - (vec![(col_d, None), (col_b, None)], vec![]), - (vec![(col_e, Some(option2)), (col_f, Some(option1))], vec![]), + ( + vec![(col_d, Some(option1)), (col_b, Some(option1))], + vec![(col_a, Some(option1))], + ), + ( + vec![(col_d, None), (col_b, None)], + vec![(col_a, Some(option1))], + ), + ( + vec![(col_e, Some(option2)), (col_f, Some(option1))], + vec![(col_a, Some(option1))], + ), // We should be able to normalize in compatible requirements also (not exactly equal) - (vec![(col_e, Some(option2)), (col_f, None)], vec![]), - (vec![(col_e, None), (col_f, None)], vec![]), + ( + vec![(col_e, Some(option2)), (col_f, None)], + vec![(col_a, Some(option1))], + ), + ( + vec![(col_e, None), (col_f, None)], + vec![(col_a, Some(option1))], + ), ]; let (_test_schema, eq_properties, ordering_eq_properties) = create_test_params()?; From 49ea33358eb81a3d0a28fbf4ae8ce36281a5cd1b Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Fri, 8 Sep 2023 15:22:58 +0300 Subject: [PATCH 20/29] Update plan --- datafusion/sqllogictest/test_files/window.slt | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/datafusion/sqllogictest/test_files/window.slt b/datafusion/sqllogictest/test_files/window.slt index 207bf211014b..a559ac50d25a 100644 --- a/datafusion/sqllogictest/test_files/window.slt +++ b/datafusion/sqllogictest/test_files/window.slt @@ -2341,11 +2341,10 @@ Limit: skip=0, fetch=5 ----------TableScan: aggregate_test_100 projection=[c9] physical_plan GlobalLimitExec: skip=0, fetch=5 ---SortExec: fetch=5, expr=[rn1@1 ASC NULLS LAST,c9@0 ASC NULLS LAST] -----ProjectionExec: expr=[c9@0 as c9, ROW_NUMBER() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as rn1] -------BoundedWindowAggExec: wdw=[ROW_NUMBER() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "ROW_NUMBER() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow }], mode=[Sorted] ---------SortExec: expr=[c9@0 DESC] -----------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], has_header=true +--ProjectionExec: expr=[c9@0 as c9, ROW_NUMBER() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as rn1] +----BoundedWindowAggExec: wdw=[ROW_NUMBER() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "ROW_NUMBER() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow }], mode=[Sorted] +------SortExec: expr=[c9@0 DESC] +--------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], has_header=true query II SELECT c9, rn1 FROM (SELECT c9, From 18c4bab2b9d2d03faf79e6dc38c23f46d6fcee28 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Fri, 8 Sep 2023 15:37:56 +0300 Subject: [PATCH 21/29] Simplifications, update comments --- datafusion/core/src/physical_plan/filter.rs | 22 ++- .../core/src/physical_plan/joins/utils.rs | 149 +++++++----------- datafusion/physical-expr/src/equivalence.rs | 142 ++++++++++------- datafusion/physical-expr/src/utils.rs | 131 ++++----------- 4 files changed, 190 insertions(+), 254 deletions(-) diff --git a/datafusion/core/src/physical_plan/filter.rs b/datafusion/core/src/physical_plan/filter.rs index d5f320122e38..295da145cff4 100644 --- a/datafusion/core/src/physical_plan/filter.rs +++ b/datafusion/core/src/physical_plan/filter.rs @@ -377,25 +377,39 @@ fn collect_columns_from_predicate(predicate: &Arc) -> EqualAnd pub type EqualAndNonEqual<'a> = (Vec<(&'a Column, &'a Column)>, Vec<(&'a Column, &'a Column)>); +/// Checks whether physical expression is literal fn is_literal(expr: &Arc) -> bool { expr.as_any().is::() } +/// Checks whether physical expression is column fn is_column(expr: &Arc) -> bool { expr.as_any().is::() } +/// Collects physical expressions that have constant result according to +/// predicate expression. +fn collect_constants_from_predicate( + predicate: &Arc, +) -> Vec> { + get_constant_expr_helper(predicate, vec![]) +} + +/// Helper function to collect expression have constant values fn get_constant_expr_helper( expr: &Arc, mut res: Vec>, ) -> Vec> { if let Some(binary) = expr.as_any().downcast_ref::() { + // in the 3=a form if binary.op() == &Operator::Eq && is_literal(binary.left()) && is_column(binary.right()) { res.push(binary.right().clone()) - } else if binary.op() == &Operator::Eq + } + // In the a = 3 form + else if binary.op() == &Operator::Eq && is_literal(binary.right()) && is_column(binary.left()) { @@ -408,12 +422,6 @@ fn get_constant_expr_helper( res } -fn collect_constants_from_predicate( - predicate: &Arc, -) -> Vec> { - get_constant_expr_helper(predicate, vec![]) -} - #[cfg(test)] mod tests { diff --git a/datafusion/core/src/physical_plan/joins/utils.rs b/datafusion/core/src/physical_plan/joins/utils.rs index 0dec91fa6829..ceb401b032b5 100644 --- a/datafusion/core/src/physical_plan/joins/utils.rs +++ b/datafusion/core/src/physical_plan/joins/utils.rs @@ -44,16 +44,15 @@ use datafusion_common::{ exec_err, plan_err, DataFusionError, JoinType, Result, ScalarValue, SharedResult, }; use datafusion_physical_expr::expressions::Column; -use datafusion_physical_expr::utils::normalize_ordering_equivalence_classes; use datafusion_physical_expr::{ add_offset_to_lex_ordering, EquivalentClass, LexOrdering, LexOrderingRef, OrderingEquivalenceProperties, OrderingEquivalentClass, PhysicalExpr, PhysicalSortExpr, }; +use datafusion_physical_expr::utils::merge_vectors; use futures::future::{BoxFuture, Shared}; use futures::{ready, FutureExt}; -use itertools::Itertools; use parking_lot::Mutex; /// The on clause of the join, as vector of (left, right) columns. @@ -323,59 +322,21 @@ pub fn cross_join_equivalence_properties( /// /// This way; once we normalize an expression according to equivalence properties, /// it can thereafter safely be used for ordering equivalence normalization. -fn get_updated_right_ordering_equivalence_properties( +fn get_updated_right_ordering_equivalent_class( join_type: &JoinType, - mut right_oeq_properties: OrderingEquivalenceProperties, + mut right_oeq_class: OrderingEquivalentClass, left_columns_len: usize, join_eq_properties: &EquivalenceProperties, -) -> Result> { +) -> Result { match join_type { // In these modes, indices of the right schema should be offset by // the left table size. JoinType::Inner | JoinType::Left | JoinType::Full | JoinType::Right => { - right_oeq_properties.add_offset(left_columns_len)?; + right_oeq_class = right_oeq_class.add_offset(left_columns_len)?; } _ => {} }; - - Ok(normalize_ordering_equivalence_classes( - right_oeq_properties.oeq_class(), - join_eq_properties, - )) -} - -/// Merge left and right sort expressions, checking for duplicates. -fn merge_vectors( - left: &[PhysicalSortExpr], - right: &[PhysicalSortExpr], -) -> Vec { - left.iter() - .cloned() - .chain(right.iter().cloned()) - .unique() - .collect() -} - -/// Prefix with existing ordering. -fn prefix_ordering_equivalence_with_existing_ordering( - existing_ordering: &[PhysicalSortExpr], - oeq_properties: &OrderingEquivalenceProperties, - eq_properties: &EquivalenceProperties, -) -> Option { - let existing_ordering = eq_properties.normalize_sort_exprs(existing_ordering); - oeq_properties.oeq_class().map(|oeq_class| { - let normalized_head = eq_properties.normalize_sort_exprs(oeq_class.head()); - let updated_head = merge_vectors(&existing_ordering, &normalized_head); - let updated_others = oeq_class - .others() - .iter() - .map(|ordering| { - let normalized_ordering = eq_properties.normalize_sort_exprs(ordering); - merge_vectors(&existing_ordering, &normalized_ordering) - }) - .collect(); - OrderingEquivalentClass::new(updated_head, updated_others) - }) + Ok(right_oeq_class.normalize_with_equivalence_properties(join_eq_properties)) } /// Calculate ordering equivalence properties for the given join operation. @@ -405,23 +366,26 @@ pub fn combine_join_ordering_equivalence_properties( (true, false) => { new_properties.extend(left_oeq_properties.oeq_class().cloned()); // In this special case, right side ordering can be prefixed with left side ordering. - if probe_side == Some(JoinSide::Left) - && right.output_ordering().is_some() - && *join_type == JoinType::Inner - { - let right_oeq_classes = - get_updated_right_ordering_equivalence_properties( - join_type, - right_oeq_properties, - left_columns_len, - &join_eq_properties, - )?; - // Create new ordering equivalence properties from updated class. - let mut right_oeq_properties = - OrderingEquivalenceProperties::new(join_eq_properties.schema()); - right_oeq_properties.extend(right_oeq_classes); - + if let ( + Some(JoinSide::Left), + Some(_right_ordering), + JoinType::Inner, + Some(oeq_class), + ) = ( + probe_side, + right.output_ordering(), + join_type, + right_oeq_properties.oeq_class(), + ) { let left_output_ordering = left.output_ordering().unwrap_or(&[]); + + let updated_right_oeq = get_updated_right_ordering_equivalent_class( + join_type, + oeq_class.clone(), + left_columns_len, + &join_eq_properties, + )?; + // Right side ordering equivalence properties should be prepended with // those of the left side while constructing output ordering equivalence // properties since stream side is the left side. @@ -430,31 +394,43 @@ pub fn combine_join_ordering_equivalence_properties( // ordering of the left table is `a ASC`, then the ordering equivalence `b ASC` // for the right table should be converted to `a ASC, b ASC` before it is added // to the ordering equivalences of the join. - let updated_right_oeq_classes = - prefix_ordering_equivalence_with_existing_ordering( + let updated_right_oeq_class = updated_right_oeq + .prefix_ordering_equivalent_class_with_existing_ordering( left_output_ordering, - &right_oeq_properties, &join_eq_properties, ); - new_properties.extend(updated_right_oeq_classes); + new_properties.extend(Some(updated_right_oeq_class)); } } (false, true) => { - let right_oeq_classes = get_updated_right_ordering_equivalence_properties( - join_type, - right_oeq_properties, - left_columns_len, - &join_eq_properties, - )?; - new_properties.extend(right_oeq_classes); + let updated_right_oeq = right_oeq_properties + .oeq_class() + .map(|right_oeq_class| { + get_updated_right_ordering_equivalent_class( + join_type, + right_oeq_class.clone(), + left_columns_len, + &join_eq_properties, + ) + }) + .transpose()?; + new_properties.extend(updated_right_oeq); // In this special case, left side ordering can be prefixed with right side ordering. - if probe_side == Some(JoinSide::Right) - && left.output_ordering().is_some() - && *join_type == JoinType::Inner - { + if let ( + Some(JoinSide::Right), + Some(_left_ordering), + JoinType::Inner, + Some(left_oeq_class), + ) = ( + probe_side, + left.output_ordering(), + join_type, + left_oeq_properties.oeq_class(), + ) { let right_output_ordering = right.output_ordering().unwrap_or(&[]); let right_output_ordering = add_offset_to_lex_ordering(right_output_ordering, left_columns_len)?; + // Left side ordering equivalence properties should be prepended with // those of the right side while constructing output ordering equivalence // properties since stream side is the right side. @@ -463,13 +439,12 @@ pub fn combine_join_ordering_equivalence_properties( // ordering of the left table is `a ASC`, then the ordering equivalence `b ASC` // for the right table should be converted to `a ASC, b ASC` before it is added // to the ordering equivalences of the join. - let updated_left_oeq_classes = - prefix_ordering_equivalence_with_existing_ordering( + let updated_left_oeq_class = left_oeq_class + .prefix_ordering_equivalent_class_with_existing_ordering( &right_output_ordering, - &left_oeq_properties, &join_eq_properties, ); - new_properties.extend(updated_left_oeq_classes); + new_properties.extend(Some(updated_left_oeq_class)); } } (false, false) => {} @@ -1858,12 +1833,6 @@ mod tests { let join_type = JoinType::Inner; let options = SortOptions::default(); - let fields: Fields = ["x", "y", "z", "w"] - .into_iter() - .map(|name| Field::new(name, DataType::Int32, true)) - .collect(); - let mut right_oeq_properties = - OrderingEquivalenceProperties::new(Arc::new(Schema::new(fields))); let right_oeq_class = OrderingEquivalentClass::new( vec![ PhysicalSortExpr { @@ -1886,7 +1855,6 @@ mod tests { }, ]], ); - right_oeq_properties.extend(Some(right_oeq_class)); let left_columns_len = 4; @@ -1902,13 +1870,12 @@ mod tests { join_eq_properties .add_equal_conditions((&Column::new("d", 3), &Column::new("w", 7))); - let result = get_updated_right_ordering_equivalence_properties( + let result = get_updated_right_ordering_equivalent_class( &join_type, - right_oeq_properties, + right_oeq_class.clone(), left_columns_len, &join_eq_properties, - )? - .unwrap(); + )?; let expected = OrderingEquivalentClass::new( vec![ diff --git a/datafusion/physical-expr/src/equivalence.rs b/datafusion/physical-expr/src/equivalence.rs index 247b9849ce60..da65e08a7225 100644 --- a/datafusion/physical-expr/src/equivalence.rs +++ b/datafusion/physical-expr/src/equivalence.rs @@ -16,7 +16,7 @@ // under the License. use crate::expressions::{CastExpr, Column}; -use crate::utils::collect_columns; +use crate::utils::{collect_columns, merge_vectors}; use crate::{ LexOrdering, LexOrderingRef, LexOrderingReq, PhysicalExpr, PhysicalSortExpr, PhysicalSortRequirement, @@ -211,47 +211,7 @@ impl OrderingEquivalenceProperties { schema, } } -} - -/// Adds the `offset` value to `Column` indices inside `expr`. This function is -/// generally used during the update of the right table schema in join operations. -pub(crate) fn add_offset_to_expr( - expr: Arc, - offset: usize, -) -> Result> { - expr.transform_down(&|e| match e.as_any().downcast_ref::() { - Some(col) => Ok(Transformed::Yes(Arc::new(Column::new( - col.name(), - offset + col.index(), - )))), - None => Ok(Transformed::No(e)), - }) -} - -/// Adds the `offset` value to `Column` indices inside `sort_expr.expr`. -pub(crate) fn add_offset_to_sort_expr( - sort_expr: &PhysicalSortExpr, - offset: usize, -) -> Result { - Ok(PhysicalSortExpr { - expr: add_offset_to_expr(sort_expr.expr.clone(), offset)?, - options: sort_expr.options, - }) -} -/// Adds the `offset` value to `Column` indices for each `sort_expr.expr` -/// inside `sort_exprs`. -pub fn add_offset_to_lex_ordering( - sort_exprs: LexOrderingRef, - offset: usize, -) -> Result { - sort_exprs - .iter() - .map(|sort_expr| add_offset_to_sort_expr(sort_expr, offset)) - .collect() -} - -impl OrderingEquivalenceProperties { pub fn extend(&mut self, other: Option) { if let Some(other) = other { if let Some(class) = &mut self.oeq_class { @@ -280,18 +240,6 @@ impl OrderingEquivalenceProperties { } } - pub fn add_offset(&mut self, offset: usize) -> Result<()> { - if let Some(oeq_class) = &mut self.oeq_class { - oeq_class.head = add_offset_to_lex_ordering(oeq_class.head(), offset)?; - oeq_class.others = oeq_class - .others() - .iter() - .map(|ordering| add_offset_to_lex_ordering(ordering, offset)) - .collect::>>()?; - } - Ok(()) - } - pub fn add_constants(&mut self, constants: Vec>) { for constant in constants { if !exprs_contains(&self.constants, &constant) { @@ -339,9 +287,7 @@ impl OrderingEquivalenceProperties { } collapse_sort_reqs(normalized_sort_reqs) } -} -impl OrderingEquivalenceProperties { /// Checks whether `leading_ordering` is contained in any of the ordering /// equivalence classes. pub fn satisfies_leading_ordering( @@ -492,6 +438,54 @@ impl OrderingEquivalentClass { self.insert(update_with_alias(ordering, oeq_alias_map)); } } + + pub fn add_offset(&self, offset: usize) -> Result { + let head = add_offset_to_lex_ordering(self.head(), offset)?; + let others = self + .others() + .iter() + .map(|ordering| add_offset_to_lex_ordering(ordering, offset)) + .collect::>>()?; + Ok(OrderingEquivalentClass::new(head, others)) + } + + /// This function normalizes `OrderingEquivalenceProperties` according to `eq_properties`. + /// More explicitly, it makes sure that expressions in `oeq_class` are head entries + /// in `eq_properties`, replacing any non-head entries with head entries if necessary. + pub fn normalize_with_equivalence_properties( + &self, + eq_properties: &EquivalenceProperties, + ) -> OrderingEquivalentClass { + let head = eq_properties.normalize_sort_exprs(self.head()); + + let others = self + .others() + .iter() + .map(|other| eq_properties.normalize_sort_exprs(other)) + .collect(); + + EquivalentClass::new(head, others) + } + + /// Prefix with existing ordering. + pub fn prefix_ordering_equivalent_class_with_existing_ordering( + &self, + existing_ordering: &[PhysicalSortExpr], + eq_properties: &EquivalenceProperties, + ) -> OrderingEquivalentClass { + let existing_ordering = eq_properties.normalize_sort_exprs(existing_ordering); + let normalized_head = eq_properties.normalize_sort_exprs(self.head()); + let updated_head = merge_vectors(&existing_ordering, &normalized_head); + let updated_others = self + .others() + .iter() + .map(|ordering| { + let normalized_ordering = eq_properties.normalize_sort_exprs(ordering); + merge_vectors(&existing_ordering, &normalized_ordering) + }) + .collect(); + OrderingEquivalentClass::new(updated_head, updated_others) + } } /// This is a builder object facilitating incremental construction @@ -909,6 +903,44 @@ fn prune_sort_reqs_with_constants( new_ordering } +/// Adds the `offset` value to `Column` indices inside `expr`. This function is +/// generally used during the update of the right table schema in join operations. +pub(crate) fn add_offset_to_expr( + expr: Arc, + offset: usize, +) -> Result> { + expr.transform_down(&|e| match e.as_any().downcast_ref::() { + Some(col) => Ok(Transformed::Yes(Arc::new(Column::new( + col.name(), + offset + col.index(), + )))), + None => Ok(Transformed::No(e)), + }) +} + +/// Adds the `offset` value to `Column` indices inside `sort_expr.expr`. +pub(crate) fn add_offset_to_sort_expr( + sort_expr: &PhysicalSortExpr, + offset: usize, +) -> Result { + Ok(PhysicalSortExpr { + expr: add_offset_to_expr(sort_expr.expr.clone(), offset)?, + options: sort_expr.options, + }) +} + +/// Adds the `offset` value to `Column` indices for each `sort_expr.expr` +/// inside `sort_exprs`. +pub fn add_offset_to_lex_ordering( + sort_exprs: LexOrderingRef, + offset: usize, +) -> Result { + sort_exprs + .iter() + .map(|sort_expr| add_offset_to_sort_expr(sort_expr, offset)) + .collect() +} + #[cfg(test)] mod tests { use super::*; diff --git a/datafusion/physical-expr/src/utils.rs b/datafusion/physical-expr/src/utils.rs index 8219f640c97c..b2a6bb5ca6d2 100644 --- a/datafusion/physical-expr/src/utils.rs +++ b/datafusion/physical-expr/src/utils.rs @@ -18,10 +18,8 @@ use crate::equivalence::{EquivalenceProperties, OrderingEquivalenceProperties}; use crate::expressions::{BinaryExpr, Column, UnKnownColumn}; use crate::sort_properties::{ExprOrdering, SortProperties}; -use crate::{update_ordering, EquivalentClass, OrderingEquivalentClass}; -use crate::{ - LexOrdering, LexOrderingRef, PhysicalExpr, PhysicalSortExpr, PhysicalSortRequirement, -}; +use crate::update_ordering; +use crate::{PhysicalExpr, PhysicalSortExpr, PhysicalSortRequirement}; use arrow::array::{make_array, Array, ArrayRef, BooleanArray, MutableArrayData}; use arrow::compute::{and_kleene, is_not_null, SlicesIterator}; @@ -34,6 +32,7 @@ use datafusion_common::utils::longest_consecutive_prefix; use datafusion_common::Result; use datafusion_expr::Operator; +use itertools::Itertools; use petgraph::graph::NodeIndex; use petgraph::stable_graph::StableGraph; use std::borrow::Borrow; @@ -153,83 +152,6 @@ fn normalize_sort_exprs( PhysicalSortRequirement::to_sort_exprs(normalized_exprs) } -pub fn normalize_expr_with_equivalence_properties( - expr: Arc, - eq_properties: &[EquivalentClass], -) -> Arc { - expr.clone() - .transform(&|expr| { - let normalized_form = - expr.as_any().downcast_ref::().and_then(|column| { - for class in eq_properties { - if class.contains(column) { - return Some(Arc::new(class.head().clone()) as _); - } - } - None - }); - Ok(if let Some(normalized_form) = normalized_form { - Transformed::Yes(normalized_form) - } else { - Transformed::No(expr) - }) - }) - .unwrap_or(expr) -} - -/// This function normalizes `sort_expr` according to `eq_properties`. If the -/// given sort expression doesn't belong to equivalence set `eq_properties`, -/// it returns `sort_expr` as is. -fn normalize_sort_expr_with_equivalence_properties( - mut sort_expr: PhysicalSortExpr, - eq_properties: &[EquivalentClass], -) -> PhysicalSortExpr { - sort_expr.expr = - normalize_expr_with_equivalence_properties(sort_expr.expr, eq_properties); - sort_expr -} - -/// This function applies the [`normalize_sort_expr_with_equivalence_properties`] -/// function for all sort expressions in `sort_exprs` and returns a vector of -/// normalized sort expressions. -pub fn normalize_sort_exprs_with_equivalence_properties( - sort_exprs: LexOrderingRef, - eq_properties: &EquivalenceProperties, -) -> LexOrdering { - sort_exprs - .iter() - .map(|expr| { - normalize_sort_expr_with_equivalence_properties( - expr.clone(), - eq_properties.classes(), - ) - }) - .collect() -} - -/// This function normalizes `oeq_classes` expressions according to `eq_properties`. -/// More explicitly, it makes sure that expressions in `oeq_classes` are head entries -/// in `eq_properties`, replacing any non-head entries with head entries if necessary. -pub fn normalize_ordering_equivalence_classes( - oeq_class: Option<&OrderingEquivalentClass>, - eq_properties: &EquivalenceProperties, -) -> Option { - oeq_class.map(|class| { - let head = - normalize_sort_exprs_with_equivalence_properties(class.head(), eq_properties); - - let others = class - .others() - .iter() - .map(|other| { - normalize_sort_exprs_with_equivalence_properties(other, eq_properties) - }) - .collect(); - - EquivalentClass::new(head, others) - }) -} - /// Transform `sort_reqs` vector, to standardized version using `eq_properties` and `ordering_eq_properties` /// Assume `eq_properties` states that `Column a` and `Column b` are aliases. /// Also assume `ordering_eq_properties` states that ordering `vec![d ASC]` and `vec![a ASC, c ASC]` are @@ -866,6 +788,18 @@ pub fn find_orderings_of_exprs( Ok(orderings) } +/// Merge left and right sort expressions, checking for duplicates. +pub fn merge_vectors( + left: &[PhysicalSortExpr], + right: &[PhysicalSortExpr], +) -> Vec { + left.iter() + .cloned() + .chain(right.iter().cloned()) + .unique() + .collect() +} + #[cfg(test)] mod tests { use std::fmt::{Display, Formatter}; @@ -875,7 +809,7 @@ mod tests { use super::*; use crate::equivalence::OrderingEquivalenceProperties; use crate::expressions::{binary, cast, col, in_list, lit, Column, Literal}; - use crate::PhysicalSortExpr; + use crate::{OrderingEquivalentClass, PhysicalSortExpr}; use arrow::compute::SortOptions; use arrow_array::Int32Array; @@ -1816,22 +1750,20 @@ mod tests { Field::new("c", DataType::Int32, true), ]); let mut equal_properties = EquivalenceProperties::new(Arc::new(schema.clone())); - let mut ordering_equal_properties = - OrderingEquivalenceProperties::new(Arc::new(schema.clone())); let mut expected_oeq = OrderingEquivalenceProperties::new(Arc::new(schema)); equal_properties .add_equal_conditions((&Column::new("a", 0), &Column::new("c", 2))); - ordering_equal_properties.add_equal_conditions(( - &vec![PhysicalSortExpr { - expr: Arc::new(Column::new("b", 1)), - options: sort_options, - }], - &vec![PhysicalSortExpr { - expr: Arc::new(Column::new("c", 2)), - options: sort_options, - }], - )); + let head = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("b", 1)), + options: sort_options, + }]; + let others = vec![vec![PhysicalSortExpr { + expr: Arc::new(Column::new("c", 2)), + options: sort_options, + }]]; + let oeq_class = OrderingEquivalentClass::new(head, others); + expected_oeq.add_equal_conditions(( &vec![PhysicalSortExpr { expr: Arc::new(Column::new("b", 1)), @@ -1843,15 +1775,12 @@ mod tests { }], )); - let normalized_class = normalize_ordering_equivalence_classes( - ordering_equal_properties.oeq_class(), - &equal_properties, - ) - .unwrap(); + let normalized_oeq_class = + oeq_class.normalize_with_equivalence_properties(&equal_properties); let expected = expected_oeq.oeq_class().unwrap(); assert!( - normalized_class.head().eq(expected.head()) - && normalized_class.others().eq(expected.others()) + normalized_oeq_class.head().eq(expected.head()) + && normalized_oeq_class.others().eq(expected.others()) ); Ok(()) From fe322b4dc951c4c46d6b97ebab2f787f7e06ba1c Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Fri, 8 Sep 2023 17:36:35 +0300 Subject: [PATCH 22/29] Update comments, Use existing stats to find constants --- datafusion/common/src/stats.rs | 9 ++ datafusion/core/src/physical_plan/filter.rs | 62 ++------ .../core/src/physical_plan/joins/utils.rs | 13 +- datafusion/physical-expr/src/analysis.rs | 3 +- datafusion/physical-expr/src/equivalence.rs | 138 +++++++++++------- 5 files changed, 116 insertions(+), 109 deletions(-) diff --git a/datafusion/common/src/stats.rs b/datafusion/common/src/stats.rs index db788efef7cd..f396c5c1afa5 100644 --- a/datafusion/common/src/stats.rs +++ b/datafusion/common/src/stats.rs @@ -70,3 +70,12 @@ pub struct ColumnStatistics { /// Number of distinct values pub distinct_count: Option, } + +impl ColumnStatistics { + pub fn is_singleton(&self) -> bool { + match (&self.min_value, &self.max_value) { + (Some(min), Some(max)) => !min.is_null() && !max.is_null() && (min == max), + (_, _) => false, + } + } +} diff --git a/datafusion/core/src/physical_plan/filter.rs b/datafusion/core/src/physical_plan/filter.rs index 295da145cff4..7cdeb1f18375 100644 --- a/datafusion/core/src/physical_plan/filter.rs +++ b/datafusion/core/src/physical_plan/filter.rs @@ -40,13 +40,14 @@ use datafusion_common::cast::as_boolean_array; use datafusion_common::{plan_err, DataFusionError, Result}; use datafusion_execution::TaskContext; use datafusion_expr::Operator; -use datafusion_physical_expr::expressions::{BinaryExpr, Literal}; +use datafusion_physical_expr::expressions::BinaryExpr; use datafusion_physical_expr::{ analyze, split_conjunction, AnalysisContext, ExprBoundaries, OrderingEquivalenceProperties, PhysicalExpr, }; use datafusion_physical_expr::intervals::utils::check_support; +use datafusion_physical_expr::utils::collect_columns; use futures::stream::{Stream, StreamExt}; use log::trace; @@ -154,8 +155,18 @@ impl ExecutionPlan for FilterExec { fn ordering_equivalence_properties(&self) -> OrderingEquivalenceProperties { let mut res = self.input.ordering_equivalence_properties(); - let constants = collect_constants_from_predicate(self.predicate()); - res.add_constants(constants); + let stats = self.statistics(); + + if let Some(col_stats) = stats.column_statistics { + let columns = collect_columns(self.predicate()); + let mut constants = vec![]; + for column in columns { + if col_stats[column.index()].is_singleton() { + constants.push(Arc::new(column) as Arc); + } + } + res.add_constants(constants); + } res } @@ -377,51 +388,6 @@ fn collect_columns_from_predicate(predicate: &Arc) -> EqualAnd pub type EqualAndNonEqual<'a> = (Vec<(&'a Column, &'a Column)>, Vec<(&'a Column, &'a Column)>); -/// Checks whether physical expression is literal -fn is_literal(expr: &Arc) -> bool { - expr.as_any().is::() -} - -/// Checks whether physical expression is column -fn is_column(expr: &Arc) -> bool { - expr.as_any().is::() -} - -/// Collects physical expressions that have constant result according to -/// predicate expression. -fn collect_constants_from_predicate( - predicate: &Arc, -) -> Vec> { - get_constant_expr_helper(predicate, vec![]) -} - -/// Helper function to collect expression have constant values -fn get_constant_expr_helper( - expr: &Arc, - mut res: Vec>, -) -> Vec> { - if let Some(binary) = expr.as_any().downcast_ref::() { - // in the 3=a form - if binary.op() == &Operator::Eq - && is_literal(binary.left()) - && is_column(binary.right()) - { - res.push(binary.right().clone()) - } - // In the a = 3 form - else if binary.op() == &Operator::Eq - && is_literal(binary.right()) - && is_column(binary.left()) - { - res.push(binary.left().clone()) - } else if binary.op() == &Operator::And { - res = get_constant_expr_helper(binary.left(), res); - res = get_constant_expr_helper(binary.right(), res); - } - } - res -} - #[cfg(test)] mod tests { diff --git a/datafusion/core/src/physical_plan/joins/utils.rs b/datafusion/core/src/physical_plan/joins/utils.rs index ceb401b032b5..be091ae36590 100644 --- a/datafusion/core/src/physical_plan/joins/utils.rs +++ b/datafusion/core/src/physical_plan/joins/utils.rs @@ -324,7 +324,7 @@ pub fn cross_join_equivalence_properties( /// it can thereafter safely be used for ordering equivalence normalization. fn get_updated_right_ordering_equivalent_class( join_type: &JoinType, - mut right_oeq_class: OrderingEquivalentClass, + right_oeq_class: &OrderingEquivalentClass, left_columns_len: usize, join_eq_properties: &EquivalenceProperties, ) -> Result { @@ -332,7 +332,10 @@ fn get_updated_right_ordering_equivalent_class( // In these modes, indices of the right schema should be offset by // the left table size. JoinType::Inner | JoinType::Left | JoinType::Full | JoinType::Right => { - right_oeq_class = right_oeq_class.add_offset(left_columns_len)?; + let right_oeq_class = right_oeq_class.add_offset(left_columns_len)?; + return Ok( + right_oeq_class.normalize_with_equivalence_properties(join_eq_properties) + ); } _ => {} }; @@ -381,7 +384,7 @@ pub fn combine_join_ordering_equivalence_properties( let updated_right_oeq = get_updated_right_ordering_equivalent_class( join_type, - oeq_class.clone(), + oeq_class, left_columns_len, &join_eq_properties, )?; @@ -408,7 +411,7 @@ pub fn combine_join_ordering_equivalence_properties( .map(|right_oeq_class| { get_updated_right_ordering_equivalent_class( join_type, - right_oeq_class.clone(), + right_oeq_class, left_columns_len, &join_eq_properties, ) @@ -1872,7 +1875,7 @@ mod tests { let result = get_updated_right_ordering_equivalent_class( &join_type, - right_oeq_class.clone(), + &right_oeq_class, left_columns_len, &join_eq_properties, )?; diff --git a/datafusion/physical-expr/src/analysis.rs b/datafusion/physical-expr/src/analysis.rs index 7d9163763e3f..416fcaabd5a8 100644 --- a/datafusion/physical-expr/src/analysis.rs +++ b/datafusion/physical-expr/src/analysis.rs @@ -231,7 +231,8 @@ fn calculate_selectivity( 1.0, |acc, (i, ExprBoundaries { interval, .. })| { let temp = - cardinality_ratio(&initial_boundaries[i].interval, interval)?; + cardinality_ratio(&initial_boundaries[i].interval, interval) + .unwrap_or(1.0); Ok(acc * temp) }, ) diff --git a/datafusion/physical-expr/src/equivalence.rs b/datafusion/physical-expr/src/equivalence.rs index da65e08a7225..4b8980293b00 100644 --- a/datafusion/physical-expr/src/equivalence.rs +++ b/datafusion/physical-expr/src/equivalence.rs @@ -119,6 +119,8 @@ impl EquivalenceProperties { } } + /// Normalizes physical expression according to `EquivalentClass`es inside `self.classes`. + /// expression is replaced with `EquivalentClass::head` expression if it is among `EquivalentClass::others`. pub fn normalize_expr(&self, expr: Arc) -> Arc { expr.clone() .transform(&|expr| { @@ -140,6 +142,9 @@ impl EquivalenceProperties { .unwrap_or(expr) } + /// This function applies the \[`normalize_expr`] + /// function for all expression in `exprs` and returns a vector of + /// normalized physical expressions. pub fn normalize_exprs( &self, exprs: &[Arc], @@ -150,6 +155,9 @@ impl EquivalenceProperties { .collect::>() } + /// This function normalizes `sort_requirement` according to `EquivalenceClasses` in the `self`. + /// If the given sort requirement doesn't belong to equivalence set inside + /// `self`, it returns `sort_requirement` as is. pub fn normalize_sort_requirement( &self, mut sort_requirement: PhysicalSortRequirement, @@ -158,6 +166,9 @@ impl EquivalenceProperties { sort_requirement } + /// This function applies the \[`normalize_sort_requirement`] + /// function for all sort requirements in `sort_reqs` and returns a vector of + /// normalized sort expressions. pub fn normalize_sort_requirements( &self, sort_reqs: &[PhysicalSortRequirement], @@ -169,6 +180,9 @@ impl EquivalenceProperties { collapse_vec(normalized_sort_reqs) } + /// Similar to the \[`normalize_sort_requirements`] this function normalizes + /// sort expressions in `sort_exprs` and returns a vector of + /// normalized sort expressions. pub fn normalize_sort_exprs( &self, sort_exprs: &[PhysicalSortExpr], @@ -199,11 +213,13 @@ impl EquivalenceProperties { #[derive(Debug, Clone)] pub struct OrderingEquivalenceProperties { oeq_class: Option, + /// Keeps track of expressions that have constant value. constants: Vec>, schema: SchemaRef, } impl OrderingEquivalenceProperties { + /// Create an empty `OrderingEquivalenceProperties` pub fn new(schema: SchemaRef) -> Self { Self { oeq_class: None, @@ -212,6 +228,8 @@ impl OrderingEquivalenceProperties { } } + /// Extends `OrderingEquivalenceProperties` by adding ordering inside the `other` + /// to the `self.oeq_class`. pub fn extend(&mut self, other: Option) { if let Some(other) = other { if let Some(class) = &mut self.oeq_class { @@ -240,6 +258,7 @@ impl OrderingEquivalenceProperties { } } + /// Add physical expression that have constant value to the `self.constants` pub fn add_constants(&mut self, constants: Vec>) { for constant in constants { if !exprs_contains(&self.constants, &constant) { @@ -258,7 +277,7 @@ impl OrderingEquivalenceProperties { ) -> Vec { let normalized_sort_reqs = prune_sort_reqs_with_constants(sort_reqs, &self.constants); - let mut normalized_sort_reqs = collapse_sort_reqs(normalized_sort_reqs); + let mut normalized_sort_reqs = collapse_lex_req(normalized_sort_reqs); if let Some(oeq_class) = &self.oeq_class { for item in oeq_class.others() { let item = PhysicalSortRequirement::from_sort_exprs(item); @@ -283,9 +302,9 @@ impl OrderingEquivalenceProperties { normalized_sort_reqs.splice(updated_start..updated_end, head); } } - normalized_sort_reqs = simplify_sort_reqs(normalized_sort_reqs, oeq_class); + normalized_sort_reqs = simplify_lex_req(normalized_sort_reqs, oeq_class); } - collapse_sort_reqs(normalized_sort_reqs) + collapse_lex_req(normalized_sort_reqs) } /// Checks whether `leading_ordering` is contained in any of the ordering @@ -439,6 +458,7 @@ impl OrderingEquivalentClass { } } + /// Adds `offset` value to the index of each expression inside `self.head` and `self.others`. pub fn add_offset(&self, offset: usize) -> Result { let head = add_offset_to_lex_ordering(self.head(), offset)?; let others = self @@ -768,48 +788,75 @@ fn collapse_vec(input: Vec) -> Vec { output } -/// This function constructs a duplicate-free vector by filtering out duplicate -/// entries inside the given vector `input`. -fn collapse_sort_reqs(input: LexOrderingReq) -> LexOrderingReq { +/// This function constructs a duplicate-free `LexOrderingReq` by filtering out duplicate +/// entries that have same physical expression inside the given vector `input`. +/// `vec![a Some(Asc), a Some(Desc)]` is collapsed to the `vec![a Some(Asc)]`. Since +/// when same expression is already seen before, following expressions are redundant. +fn collapse_lex_req(input: LexOrderingReq) -> LexOrderingReq { let mut output = vec![]; for item in input { - if !sort_reqs_contains(&output, &item) { + if !lex_req_contains(&output, &item) { output.push(item); } } output } -fn req_satisfied(given: LexOrderingRef, req: &[PhysicalSortRequirement]) -> bool { - if given.len() != req.len() { - false - } else { - for (given, req) in izip!(given.iter(), req.iter()) { - let PhysicalSortRequirement { expr, options } = req; - if let Some(options) = options { - if options != &given.options || !expr.eq(&given.expr) { - return false; - } - } else if !expr.eq(&given.expr) { - return false; - } +/// Check whether `sort_req.expr` is among the expressions of `lex_req`. +fn lex_req_contains( + lex_req: &[PhysicalSortRequirement], + sort_req: &PhysicalSortRequirement, +) -> bool { + for constant in lex_req { + if constant.expr.eq(&sort_req.expr) { + return true; } - true } + false } +/// This function simplifies lexicographical ordering requirement +/// inside `input` by removing postfix lexicographical requirements +/// that satisfy global ordering (occurs inside the ordering equivalent class) +fn simplify_lex_req( + input: LexOrderingReq, + oeq_class: &OrderingEquivalentClass, +) -> LexOrderingReq { + let mut section = &input[..]; + loop { + let n_prune = prune_last_n_that_is_in_oeq(section, oeq_class); + // Cannot prune entries from the end of requirement + if n_prune == 0 { + break; + } + section = §ion[0..section.len() - n_prune]; + } + if section.is_empty() { + PhysicalSortRequirement::from_sort_exprs(oeq_class.head()) + } else { + section.to_vec() + } +} + +/// Determines how many entries from the end can be deleted. +/// Last n entry satisfies global ordering, hence having them +/// as postfix in the lexicographical requirement is unnecessary. +/// Assume requirement is [a ASC, b ASC, c ASC], also assume that +/// existing ordering is [c ASC, d ASC]. In this case, since [c ASC] +/// is satisfied by the existing ordering (e.g corresponding section is global ordering), +/// [c ASC] can be pruned from the requirement: [a ASC, b ASC, c ASC]. In this case, +/// this function will return 1, to indicate last element can be removed from the requirement fn prune_last_n_that_is_in_oeq( input: &[PhysicalSortRequirement], oeq_class: &OrderingEquivalentClass, ) -> usize { let input_len = input.len(); for ordering in std::iter::once(oeq_class.head()).chain(oeq_class.others().iter()) { - let ordering_len = ordering.len(); - let mut search_range = std::cmp::min(ordering_len, input_len); + let mut search_range = std::cmp::min(ordering.len(), input_len); while search_range > 0 { let req_section = &input[input_len - search_range..]; - let given_section = &ordering[0..search_range]; - if req_satisfied(given_section, req_section) { + // let given_section = &ordering[0..search_range]; + if req_satisfied(ordering, req_section) { return search_range; } else { search_range -= 1; @@ -819,26 +866,19 @@ fn prune_last_n_that_is_in_oeq( 0 } -/// This function constructs a duplicate-free vector by filtering out duplicate -/// entries inside the given vector `input`. -fn simplify_sort_reqs( - input: LexOrderingReq, - oeq_class: &OrderingEquivalentClass, -) -> LexOrderingReq { - let mut section = &input[..]; - loop { - let n_prune = prune_last_n_that_is_in_oeq(section, oeq_class); - // Cannot prune entries from the end of requirement - if n_prune == 0 { - break; +/// Checks whether given section satisfies req. +fn req_satisfied(given: LexOrderingRef, req: &[PhysicalSortRequirement]) -> bool { + for (given, req) in izip!(given.iter(), req.iter()) { + let PhysicalSortRequirement { expr, options } = req; + if let Some(options) = options { + if options != &given.options || !expr.eq(&given.expr) { + return false; + } + } else if !expr.eq(&given.expr) { + return false; } - section = §ion[0..section.len() - n_prune]; - } - if section.is_empty() { - PhysicalSortRequirement::from_sort_exprs(oeq_class.head()) - } else { - section.to_vec() } + true } /// This function searches for the slice `section` inside the slice `given`. @@ -878,18 +918,6 @@ fn exprs_contains( false } -fn sort_reqs_contains( - sort_reqs: &[PhysicalSortRequirement], - sort_req: &PhysicalSortRequirement, -) -> bool { - for constant in sort_reqs { - if constant.expr.eq(&sort_req.expr) { - return true; - } - } - false -} - fn prune_sort_reqs_with_constants( ordering: &[PhysicalSortRequirement], constants: &[Arc], From cff6f2f697e187615313adff6f2cba98c1640587 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Tue, 12 Sep 2023 14:43:59 +0300 Subject: [PATCH 23/29] Simplifications --- datafusion/common/src/stats.rs | 1 + datafusion/core/src/physical_plan/filter.rs | 16 ++++++++-------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/datafusion/common/src/stats.rs b/datafusion/common/src/stats.rs index f396c5c1afa5..f9c0fd1581e8 100644 --- a/datafusion/common/src/stats.rs +++ b/datafusion/common/src/stats.rs @@ -74,6 +74,7 @@ pub struct ColumnStatistics { impl ColumnStatistics { pub fn is_singleton(&self) -> bool { match (&self.min_value, &self.max_value) { + // Min and max value are same and both are not infinity. (Some(min), Some(max)) => !min.is_null() && !max.is_null() && (min == max), (_, _) => false, } diff --git a/datafusion/core/src/physical_plan/filter.rs b/datafusion/core/src/physical_plan/filter.rs index 476c2948b87e..dc165804e7d0 100644 --- a/datafusion/core/src/physical_plan/filter.rs +++ b/datafusion/core/src/physical_plan/filter.rs @@ -157,14 +157,14 @@ impl ExecutionPlan for FilterExec { let mut res = self.input.ordering_equivalence_properties(); let stats = self.statistics(); - if let Some(col_stats) = stats.column_statistics { - let columns = collect_columns(self.predicate()); - let mut constants = vec![]; - for column in columns { - if col_stats[column.index()].is_singleton() { - constants.push(Arc::new(column) as Arc); - } - } + // Add columns that have fixed value (singleton) after filtering, to the constants. + if let Some(constants) = stats.column_statistics.map(|col_stats| { + collect_columns(self.predicate()) + .into_iter() + .filter(|column| col_stats[column.index()].is_singleton()) + .map(|column| Arc::new(column) as Arc) + .collect::>() + }) { res.add_constants(constants); } res From 6cb4d5a8795ae25a24198485e7a65554fc7a39f7 Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Tue, 12 Sep 2023 16:18:36 +0300 Subject: [PATCH 24/29] Unknown input stats are handled --- datafusion/common/src/stats.rs | 28 ++++++++++++++++++- datafusion/core/src/physical_plan/filter.rs | 4 +-- .../sqllogictest/test_files/subquery.slt | 25 +++++++++-------- 3 files changed, 42 insertions(+), 15 deletions(-) diff --git a/datafusion/common/src/stats.rs b/datafusion/common/src/stats.rs index f9c0fd1581e8..630d1d274d3f 100644 --- a/datafusion/common/src/stats.rs +++ b/datafusion/common/src/stats.rs @@ -19,6 +19,8 @@ use std::fmt::Display; +use arrow::datatypes::SchemaRef; + use crate::ScalarValue; /// Statistics for a relation @@ -74,9 +76,33 @@ pub struct ColumnStatistics { impl ColumnStatistics { pub fn is_singleton(&self) -> bool { match (&self.min_value, &self.max_value) { - // Min and max value are same and both are not infinity. + // Min and max values are the same and not infinity. (Some(min), Some(max)) => !min.is_null() && !max.is_null() && (min == max), (_, _) => false, } } + + /// Returns the [`Vec`] corresponding to the given schema by assigning + /// infinite bounds to each column in the schema. This is useful when even the input statistics + /// are not known, as the current executor can shrink the bounds of some columns. + pub fn new_with_unbounded_columns(schema: SchemaRef) -> Vec { + let data_types = schema + .fields() + .iter() + .map(|field| field.data_type()) + .collect::>(); + + data_types + .into_iter() + .map(|data_type| { + let dt = ScalarValue::try_from(data_type.clone()).ok(); + ColumnStatistics { + null_count: None, + max_value: dt.clone(), + min_value: dt, + distinct_count: None, + } + }) + .collect() + } } diff --git a/datafusion/core/src/physical_plan/filter.rs b/datafusion/core/src/physical_plan/filter.rs index dc165804e7d0..719b307d9fb4 100644 --- a/datafusion/core/src/physical_plan/filter.rs +++ b/datafusion/core/src/physical_plan/filter.rs @@ -157,7 +157,7 @@ impl ExecutionPlan for FilterExec { let mut res = self.input.ordering_equivalence_properties(); let stats = self.statistics(); - // Add columns that have fixed value (singleton) after filtering, to the constants. + // Add the columns that have only one value (singleton) after filtering to constants. if let Some(constants) = stats.column_statistics.map(|col_stats| { collect_columns(self.predicate()) .into_iter() @@ -211,7 +211,7 @@ impl ExecutionPlan for FilterExec { let input_stats = self.input.statistics(); let input_column_stats = match input_stats.column_statistics { Some(stats) => stats, - None => return Statistics::default(), + None => ColumnStatistics::new_with_unbounded_columns(self.schema()), }; let starter_ctx = diff --git a/datafusion/sqllogictest/test_files/subquery.slt b/datafusion/sqllogictest/test_files/subquery.slt index fe074da1bba0..2eccb60aad3e 100644 --- a/datafusion/sqllogictest/test_files/subquery.slt +++ b/datafusion/sqllogictest/test_files/subquery.slt @@ -284,19 +284,20 @@ Projection: t1.t1_id, __scalar_sq_1.SUM(t2.t2_int) AS t2_sum ------------TableScan: t2 projection=[t2_id, t2_int] physical_plan ProjectionExec: expr=[t1_id@0 as t1_id, SUM(t2.t2_int)@1 as t2_sum] ---CoalesceBatchesExec: target_batch_size=8192 -----HashJoinExec: mode=Partitioned, join_type=Left, on=[(t1_id@0, t2_id@1)] -------CoalesceBatchesExec: target_batch_size=8192 ---------RepartitionExec: partitioning=Hash([t1_id@0], 4), input_partitions=4 -----------MemoryExec: partitions=4, partition_sizes=[1, 0, 0, 0] -------ProjectionExec: expr=[SUM(t2.t2_int)@1 as SUM(t2.t2_int), t2_id@0 as t2_id] +--ProjectionExec: expr=[t1_id@2 as t1_id, SUM(t2.t2_int)@0 as SUM(t2.t2_int), t2_id@1 as t2_id] +----CoalesceBatchesExec: target_batch_size=8192 +------HashJoinExec: mode=Partitioned, join_type=Right, on=[(t2_id@1, t1_id@0)] +--------ProjectionExec: expr=[SUM(t2.t2_int)@1 as SUM(t2.t2_int), t2_id@0 as t2_id] +----------CoalesceBatchesExec: target_batch_size=8192 +------------FilterExec: SUM(t2.t2_int)@1 < 3 +--------------AggregateExec: mode=FinalPartitioned, gby=[t2_id@0 as t2_id], aggr=[SUM(t2.t2_int)] +----------------CoalesceBatchesExec: target_batch_size=8192 +------------------RepartitionExec: partitioning=Hash([t2_id@0], 4), input_partitions=4 +--------------------AggregateExec: mode=Partial, gby=[t2_id@0 as t2_id], aggr=[SUM(t2.t2_int)] +----------------------MemoryExec: partitions=4, partition_sizes=[1, 0, 0, 0] --------CoalesceBatchesExec: target_batch_size=8192 -----------FilterExec: SUM(t2.t2_int)@1 < 3 -------------AggregateExec: mode=FinalPartitioned, gby=[t2_id@0 as t2_id], aggr=[SUM(t2.t2_int)] ---------------CoalesceBatchesExec: target_batch_size=8192 -----------------RepartitionExec: partitioning=Hash([t2_id@0], 4), input_partitions=4 -------------------AggregateExec: mode=Partial, gby=[t2_id@0 as t2_id], aggr=[SUM(t2.t2_int)] ---------------------MemoryExec: partitions=4, partition_sizes=[1, 0, 0, 0] +----------RepartitionExec: partitioning=Hash([t1_id@0], 4), input_partitions=4 +------------MemoryExec: partitions=4, partition_sizes=[1, 0, 0, 0] query II rowsort SELECT t1_id, (SELECT sum(t2_int) FROM t2 WHERE t2.t2_id = t1.t1_id having sum(t2_int) < 3) as t2_sum from t1 From ef994fb11bebc9db4b73a4b995e1efc0d2d411ca Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Wed, 13 Sep 2023 15:23:15 +0300 Subject: [PATCH 25/29] Address reviews --- .../core/src/physical_plan/joins/utils.rs | 6 ++- datafusion/physical-expr/src/analysis.rs | 8 ++-- datafusion/physical-expr/src/equivalence.rs | 38 ++++++++++--------- 3 files changed, 29 insertions(+), 23 deletions(-) diff --git a/datafusion/core/src/physical_plan/joins/utils.rs b/datafusion/core/src/physical_plan/joins/utils.rs index be091ae36590..84416948577c 100644 --- a/datafusion/core/src/physical_plan/joins/utils.rs +++ b/datafusion/core/src/physical_plan/joins/utils.rs @@ -371,7 +371,8 @@ pub fn combine_join_ordering_equivalence_properties( // In this special case, right side ordering can be prefixed with left side ordering. if let ( Some(JoinSide::Left), - Some(_right_ordering), + // right side have an ordering + Some(_), JoinType::Inner, Some(oeq_class), ) = ( @@ -421,7 +422,8 @@ pub fn combine_join_ordering_equivalence_properties( // In this special case, left side ordering can be prefixed with right side ordering. if let ( Some(JoinSide::Right), - Some(_left_ordering), + // left side have an ordering + Some(_), JoinType::Inner, Some(left_oeq_class), ) = ( diff --git a/datafusion/physical-expr/src/analysis.rs b/datafusion/physical-expr/src/analysis.rs index 416fcaabd5a8..bd8c79d94efb 100644 --- a/datafusion/physical-expr/src/analysis.rs +++ b/datafusion/physical-expr/src/analysis.rs @@ -191,12 +191,15 @@ fn shrink_boundaries( })?; let final_result = graph.get_interval(*root_index); + // If during selectivity calculation we encounter an error, use 1.0 as cardinality estimate + // safest estimate(e.q largest possible value). let selectivity = calculate_selectivity( &final_result.lower.value, &final_result.upper.value, &target_boundaries, &initial_boundaries, - )?; + ) + .unwrap_or(1.0); if !(0.0..=1.0).contains(&selectivity) { return internal_err!("Selectivity is out of limit: {}", selectivity); @@ -231,8 +234,7 @@ fn calculate_selectivity( 1.0, |acc, (i, ExprBoundaries { interval, .. })| { let temp = - cardinality_ratio(&initial_boundaries[i].interval, interval) - .unwrap_or(1.0); + cardinality_ratio(&initial_boundaries[i].interval, interval)?; Ok(acc * temp) }, ) diff --git a/datafusion/physical-expr/src/equivalence.rs b/datafusion/physical-expr/src/equivalence.rs index 4b8980293b00..f62fb19a1241 100644 --- a/datafusion/physical-expr/src/equivalence.rs +++ b/datafusion/physical-expr/src/equivalence.rs @@ -260,17 +260,21 @@ impl OrderingEquivalenceProperties { /// Add physical expression that have constant value to the `self.constants` pub fn add_constants(&mut self, constants: Vec>) { - for constant in constants { - if !exprs_contains(&self.constants, &constant) { + constants.into_iter().for_each(|constant| { + if !physical_exprs_contains(&self.constants, &constant) { self.constants.push(constant); } - } + }); } pub fn schema(&self) -> SchemaRef { self.schema.clone() } + /// This function normalizes `sort_reqs` by + /// - removing expressions that have constant value from requirement + /// - replacing sections that are in the `self.oeq_class.others` with `self.oeq_class.head` + /// - removing sections that satisfies global ordering that are in the post fix of requirement pub fn normalize_sort_requirements( &self, sort_reqs: &[PhysicalSortRequirement], @@ -906,29 +910,27 @@ fn get_compatible_ranges( .collect() } -fn exprs_contains( - constants: &[Arc], +/// It is similar to contains method of vector. +/// Finds whether `expr` is among `physical_exprs`. +pub fn physical_exprs_contains( + physical_exprs: &[Arc], expr: &Arc, ) -> bool { - for constant in constants { - if constant.eq(expr) { - return true; - } - } - false + physical_exprs + .iter() + .any(|physical_expr| physical_expr.eq(expr)) } +/// Remove ordering requirements that have constant value fn prune_sort_reqs_with_constants( ordering: &[PhysicalSortRequirement], constants: &[Arc], ) -> Vec { - let mut new_ordering = vec![]; - for order in ordering { - if !exprs_contains(constants, &order.expr) { - new_ordering.push(order.clone()) - } - } - new_ordering + ordering + .iter() + .filter(|&order| !physical_exprs_contains(constants, &order.expr)) + .cloned() + .collect() } /// Adds the `offset` value to `Column` indices inside `expr`. This function is From 4fe9c0d8f818e71616a1dfbbcb0fbc6852d30150 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Fri, 15 Sep 2023 10:40:32 +0300 Subject: [PATCH 26/29] Simplifications --- datafusion/common/src/stats.rs | 33 +++++++-------------- datafusion/core/src/physical_plan/filter.rs | 9 +++++- 2 files changed, 18 insertions(+), 24 deletions(-) diff --git a/datafusion/common/src/stats.rs b/datafusion/common/src/stats.rs index 630d1d274d3f..8ad053f1073f 100644 --- a/datafusion/common/src/stats.rs +++ b/datafusion/common/src/stats.rs @@ -19,7 +19,7 @@ use std::fmt::Display; -use arrow::datatypes::SchemaRef; +use arrow::datatypes::DataType; use crate::ScalarValue; @@ -82,27 +82,14 @@ impl ColumnStatistics { } } - /// Returns the [`Vec`] corresponding to the given schema by assigning - /// infinite bounds to each column in the schema. This is useful when even the input statistics - /// are not known, as the current executor can shrink the bounds of some columns. - pub fn new_with_unbounded_columns(schema: SchemaRef) -> Vec { - let data_types = schema - .fields() - .iter() - .map(|field| field.data_type()) - .collect::>(); - - data_types - .into_iter() - .map(|data_type| { - let dt = ScalarValue::try_from(data_type.clone()).ok(); - ColumnStatistics { - null_count: None, - max_value: dt.clone(), - min_value: dt, - distinct_count: None, - } - }) - .collect() + /// Returns the [`ColumnStatistics`] corresponding to the given datatype by assigning infinite bounds. + pub fn new_with_unbounded_column(dt: &DataType) -> ColumnStatistics { + let null = ScalarValue::try_from(dt.clone()).ok(); + ColumnStatistics { + null_count: None, + max_value: null.clone(), + min_value: null, + distinct_count: None, + } } } diff --git a/datafusion/core/src/physical_plan/filter.rs b/datafusion/core/src/physical_plan/filter.rs index 719b307d9fb4..57374c4e1526 100644 --- a/datafusion/core/src/physical_plan/filter.rs +++ b/datafusion/core/src/physical_plan/filter.rs @@ -211,7 +211,14 @@ impl ExecutionPlan for FilterExec { let input_stats = self.input.statistics(); let input_column_stats = match input_stats.column_statistics { Some(stats) => stats, - None => ColumnStatistics::new_with_unbounded_columns(self.schema()), + None => self + .schema() + .fields + .iter() + .map(|field| { + ColumnStatistics::new_with_unbounded_column(field.data_type()) + }) + .collect::>(), }; let starter_ctx = From 1c1de0d8d111d281392019c2e35d5c99694d0fb8 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Fri, 15 Sep 2023 10:54:48 +0300 Subject: [PATCH 27/29] Simplifications --- datafusion/core/src/physical_plan/filter.rs | 15 +++++++-------- datafusion/physical-expr/src/equivalence.rs | 3 ++- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/datafusion/core/src/physical_plan/filter.rs b/datafusion/core/src/physical_plan/filter.rs index 57374c4e1526..2a42d7bbf1bc 100644 --- a/datafusion/core/src/physical_plan/filter.rs +++ b/datafusion/core/src/physical_plan/filter.rs @@ -154,20 +154,19 @@ impl ExecutionPlan for FilterExec { } fn ordering_equivalence_properties(&self) -> OrderingEquivalenceProperties { - let mut res = self.input.ordering_equivalence_properties(); let stats = self.statistics(); - // Add the columns that have only one value (singleton) after filtering to constants. - if let Some(constants) = stats.column_statistics.map(|col_stats| { - collect_columns(self.predicate()) + if let Some(col_stats) = stats.column_statistics { + let constants = collect_columns(self.predicate()) .into_iter() .filter(|column| col_stats[column.index()].is_singleton()) .map(|column| Arc::new(column) as Arc) - .collect::>() - }) { - res.add_constants(constants); + .collect::>(); + let filter_oeq = self.input.ordering_equivalence_properties(); + filter_oeq.with_constants(constants) + } else { + self.input.ordering_equivalence_properties() } - res } fn with_new_children( diff --git a/datafusion/physical-expr/src/equivalence.rs b/datafusion/physical-expr/src/equivalence.rs index f62fb19a1241..016d8d8deccb 100644 --- a/datafusion/physical-expr/src/equivalence.rs +++ b/datafusion/physical-expr/src/equivalence.rs @@ -259,12 +259,13 @@ impl OrderingEquivalenceProperties { } /// Add physical expression that have constant value to the `self.constants` - pub fn add_constants(&mut self, constants: Vec>) { + pub fn with_constants(mut self, constants: Vec>) -> Self { constants.into_iter().for_each(|constant| { if !physical_exprs_contains(&self.constants, &constant) { self.constants.push(constant); } }); + self } pub fn schema(&self) -> SchemaRef { From abe0f31fd986858d325b7f57ce97eac4b51b96bf Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Mon, 18 Sep 2023 10:40:00 +0300 Subject: [PATCH 28/29] Address reviews --- datafusion/common/src/stats.rs | 1 + datafusion/physical-expr/src/equivalence.rs | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/common/src/stats.rs b/datafusion/common/src/stats.rs index 8ad053f1073f..ca76e14cb8ab 100644 --- a/datafusion/common/src/stats.rs +++ b/datafusion/common/src/stats.rs @@ -74,6 +74,7 @@ pub struct ColumnStatistics { } impl ColumnStatistics { + /// Column contains a single non null value (e.g constant). pub fn is_singleton(&self) -> bool { match (&self.min_value, &self.max_value) { // Min and max values are the same and not infinity. diff --git a/datafusion/physical-expr/src/equivalence.rs b/datafusion/physical-expr/src/equivalence.rs index 016d8d8deccb..369c139aa30b 100644 --- a/datafusion/physical-expr/src/equivalence.rs +++ b/datafusion/physical-expr/src/equivalence.rs @@ -36,10 +36,9 @@ use std::sync::Arc; /// Represents a collection of [`EquivalentClass`] (equivalences /// between columns in relations) /// -/// This is used to represent both: +/// This is used to represent: /// /// 1. Equality conditions (like `A=B`), when `T` = [`Column`] -/// 2. Ordering (like `A ASC = B ASC`), when `T` = [`PhysicalSortExpr`] #[derive(Debug, Clone)] pub struct EquivalenceProperties { classes: Vec>, From 2eaf755a608da31e99803535433dfe407497b051 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Mon, 18 Sep 2023 11:45:03 +0300 Subject: [PATCH 29/29] Fix subdirectories --- parquet-testing | 2 +- testing | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet-testing b/parquet-testing index a11fc8f148f8..d79a0101d90d 160000 --- a/parquet-testing +++ b/parquet-testing @@ -1 +1 @@ -Subproject commit a11fc8f148f8a7a89d9281cc0da3eb9d56095fbf +Subproject commit d79a0101d90dfa3bbb10337626f57a3e8c4b5363 diff --git a/testing b/testing index e81d0c6de359..2c84953c8c27 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit e81d0c6de35948b3be7984af8e00413b314cde6e +Subproject commit 2c84953c8c2779a0dc86ef9ebe8a6cd978125bfe