From aedec6a34cbee1456d3918c7bfd43219a7075586 Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Sat, 30 Aug 2025 21:50:14 +0800 Subject: [PATCH 1/2] fix: lazy evaluation for coalesce --- datafusion/functions/src/core/coalesce.rs | 88 +++++++------------ datafusion/sqllogictest/test_files/select.slt | 12 ++- 2 files changed, 40 insertions(+), 60 deletions(-) diff --git a/datafusion/functions/src/core/coalesce.rs b/datafusion/functions/src/core/coalesce.rs index b0f3483513ed..3fba539dd04b 100644 --- a/datafusion/functions/src/core/coalesce.rs +++ b/datafusion/functions/src/core/coalesce.rs @@ -15,14 +15,13 @@ // specific language governing permissions and limitations // under the License. -use arrow::array::{new_null_array, BooleanArray}; -use arrow::compute::kernels::zip::zip; -use arrow::compute::{and, is_not_null, is_null}; use arrow::datatypes::{DataType, Field, FieldRef}; -use datafusion_common::{exec_err, internal_err, Result}; +use datafusion_common::{exec_err, internal_err, plan_err, Result}; use datafusion_expr::binary::try_type_union_resolution; +use datafusion_expr::conditional_expressions::CaseBuilder; +use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; use datafusion_expr::{ - ColumnarValue, Documentation, ReturnFieldArgs, ScalarFunctionArgs, + ColumnarValue, Documentation, Expr, ReturnFieldArgs, ScalarFunctionArgs, }; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use datafusion_macros::user_doc; @@ -95,61 +94,36 @@ impl ScalarUDFImpl for CoalesceFunc { Ok(Field::new(self.name(), return_type, nullable).into()) } - /// coalesce evaluates to the first value which is not NULL - fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { - let args = args.args; - // do not accept 0 arguments. + fn simplify( + &self, + args: Vec, + _info: &dyn SimplifyInfo, + ) -> Result { if args.is_empty() { - return exec_err!( - "coalesce was called with {} arguments. It requires at least 1.", - args.len() - ); + return plan_err!("coalesce must have at least one argument"); } - - let return_type = args[0].data_type(); - let mut return_array = args.iter().filter_map(|x| match x { - ColumnarValue::Array(array) => Some(array.len()), - _ => None, - }); - - if let Some(size) = return_array.next() { - // start with nulls as default output - let mut current_value = new_null_array(&return_type, size); - let mut remainder = BooleanArray::from(vec![true; size]); - - for arg in args { - match arg { - ColumnarValue::Array(ref array) => { - let to_apply = and(&remainder, &is_not_null(array.as_ref())?)?; - current_value = zip(&to_apply, array, ¤t_value)?; - remainder = and(&remainder, &is_null(array)?)?; - } - ColumnarValue::Scalar(value) => { - if value.is_null() { - continue; - } else { - let last_value = value.to_scalar()?; - current_value = zip(&remainder, &last_value, ¤t_value)?; - break; - } - } - } - if remainder.iter().all(|x| x == Some(false)) { - break; - } - } - Ok(ColumnarValue::Array(current_value)) - } else { - let result = args - .iter() - .filter_map(|x| match x { - ColumnarValue::Scalar(s) if !s.is_null() => Some(x.clone()), - _ => None, - }) - .next() - .unwrap_or_else(|| args[0].clone()); - Ok(result) + if args.len() == 1 { + return Ok(ExprSimplifyResult::Simplified( + args.into_iter().next().unwrap(), + )); } + + let n = args.len(); + let (init, last_elem) = args.split_at(n - 1); + let whens = init + .iter() + .map(|x| x.clone().is_not_null()) + .collect::>(); + let cases = init.to_vec(); + Ok(ExprSimplifyResult::Simplified( + CaseBuilder::new(None, whens, cases, Some(Box::new(last_elem[0].clone()))) + .end()?, + )) + } + + /// coalesce evaluates to the first value which is not NULL + fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result { + internal_err!("coalesce should have been simplified to case") } fn short_circuits(&self) -> bool { diff --git a/datafusion/sqllogictest/test_files/select.slt b/datafusion/sqllogictest/test_files/select.slt index 109c2f209ad9..4b24c9951871 100644 --- a/datafusion/sqllogictest/test_files/select.slt +++ b/datafusion/sqllogictest/test_files/select.slt @@ -1656,10 +1656,10 @@ query TT explain select coalesce(1, y/x), coalesce(2, y/x) from t; ---- logical_plan -01)Projection: coalesce(Int64(1), CAST(t.y / t.x AS Int64)), coalesce(Int64(2), CAST(t.y / t.x AS Int64)) +01)Projection: CASE WHEN Boolean(true) THEN Int64(1) ELSE CAST(t.y / t.x AS Int64) END AS coalesce(Int64(1),t.y / t.x), CASE WHEN Boolean(true) THEN Int64(2) ELSE CAST(t.y / t.x AS Int64) END AS coalesce(Int64(2),t.y / t.x) 02)--TableScan: t projection=[x, y] physical_plan -01)ProjectionExec: expr=[coalesce(1, CAST(y@1 / x@0 AS Int64)) as coalesce(Int64(1),t.y / t.x), coalesce(2, CAST(y@1 / x@0 AS Int64)) as coalesce(Int64(2),t.y / t.x)] +01)ProjectionExec: expr=[CASE WHEN true THEN 1 ELSE CAST(y@1 / x@0 AS Int64) END as coalesce(Int64(1),t.y / t.x), CASE WHEN true THEN 2 ELSE CAST(y@1 / x@0 AS Int64) END as coalesce(Int64(2),t.y / t.x)] 02)--DataSourceExec: partitions=1, partition_sizes=[1] query TT @@ -1688,8 +1688,14 @@ physical_plan # due to the reason describe in https://github.com/apache/datafusion/issues/8927, # the following queries will fail -query error +query II select coalesce(1, y/x), coalesce(2, y/x) from t; +---- +1 2 +1 2 +1 2 +1 2 +1 2 query error SELECT y > 0 and 1 / y < 1, x > 0 and y > 0 and 1 / y < 1 / x from t; From 6fcd4f8206705234414eb8eb1d5140d72b2725ff Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Tue, 2 Sep 2025 08:36:28 +0800 Subject: [PATCH 2/2] update test --- datafusion/sqllogictest/test_files/select.slt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/sqllogictest/test_files/select.slt b/datafusion/sqllogictest/test_files/select.slt index 4b24c9951871..989f7df7b49e 100644 --- a/datafusion/sqllogictest/test_files/select.slt +++ b/datafusion/sqllogictest/test_files/select.slt @@ -1686,8 +1686,6 @@ physical_plan 02)--ProjectionExec: expr=[y@1 = 0 as __common_expr_1, x@0 as x, y@1 as y] 03)----DataSourceExec: partitions=1, partition_sizes=[1] -# due to the reason describe in https://github.com/apache/datafusion/issues/8927, -# the following queries will fail query II select coalesce(1, y/x), coalesce(2, y/x) from t; ---- @@ -1697,6 +1695,8 @@ select coalesce(1, y/x), coalesce(2, y/x) from t; 1 2 1 2 +# due to the reason describe in https://github.com/apache/datafusion/issues/8927, +# the following queries will fail query error SELECT y > 0 and 1 / y < 1, x > 0 and y > 0 and 1 / y < 1 / x from t;