Skip to content

Commit 0c311e2

Browse files
dharanad authored and findepi committed
Support Time Parquet Data Page Statistics (apache#11187)
* add parquet page stats for time
* return empty array instead of null
* fix typos
1 parent 25f03c4 commit 0c311e2

File tree

2 files changed

+39
-11
lines changed

2 files changed

+39
-11
lines changed

datafusion/core/src/datasource/physical_plan/parquet/statistics.rs

Lines changed: 35 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -22,13 +22,13 @@
2222
use arrow::datatypes::i256;
2323
use arrow::{array::ArrayRef, datatypes::DataType};
2424
use arrow_array::{
25-
new_null_array, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal128Array,
26-
Decimal256Array, FixedSizeBinaryArray, Float16Array, Float32Array, Float64Array,
27-
Int16Array, Int32Array, Int64Array, Int8Array, LargeBinaryArray, LargeStringArray,
28-
StringArray, Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray,
29-
Time64NanosecondArray, TimestampMicrosecondArray, TimestampMillisecondArray,
30-
TimestampNanosecondArray, TimestampSecondArray, UInt16Array, UInt32Array,
31-
UInt64Array, UInt8Array,
25+
new_empty_array, new_null_array, BinaryArray, BooleanArray, Date32Array, Date64Array,
26+
Decimal128Array, Decimal256Array, FixedSizeBinaryArray, Float16Array, Float32Array,
27+
Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, LargeBinaryArray,
28+
LargeStringArray, StringArray, Time32MillisecondArray, Time32SecondArray,
29+
Time64MicrosecondArray, Time64NanosecondArray, TimestampMicrosecondArray,
30+
TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray,
31+
UInt16Array, UInt32Array, UInt64Array, UInt8Array,
3232
};
3333
use arrow_schema::{Field, FieldRef, Schema, TimeUnit};
3434
use datafusion_common::{internal_datafusion_err, internal_err, plan_err, Result};
@@ -873,6 +873,34 @@ macro_rules! get_data_page_statistics {
873873
Decimal128Array::from_iter([<$stat_type_prefix Decimal128DataPageStatsIterator>]::new($iterator).flatten()).with_precision_and_scale(*precision, *scale)?)),
874874
Some(DataType::Decimal256(precision, scale)) => Ok(Arc::new(
875875
Decimal256Array::from_iter([<$stat_type_prefix Decimal256DataPageStatsIterator>]::new($iterator).flatten()).with_precision_and_scale(*precision, *scale)?)),
876+
Some(DataType::Time32(unit)) => {
877+
Ok(match unit {
878+
TimeUnit::Second => Arc::new(Time32SecondArray::from_iter(
879+
[<$stat_type_prefix Int32DataPageStatsIterator>]::new($iterator).flatten(),
880+
)),
881+
TimeUnit::Millisecond => Arc::new(Time32MillisecondArray::from_iter(
882+
[<$stat_type_prefix Int32DataPageStatsIterator>]::new($iterator).flatten(),
883+
)),
884+
_ => {
885+
// don't know how to extract statistics, so return an empty array
886+
new_empty_array(&DataType::Time32(unit.clone()))
887+
}
888+
})
889+
}
890+
Some(DataType::Time64(unit)) => {
891+
Ok(match unit {
892+
TimeUnit::Microsecond => Arc::new(Time64MicrosecondArray::from_iter(
893+
[<$stat_type_prefix Int64DataPageStatsIterator>]::new($iterator).flatten(),
894+
)),
895+
TimeUnit::Nanosecond => Arc::new(Time64NanosecondArray::from_iter(
896+
[<$stat_type_prefix Int64DataPageStatsIterator>]::new($iterator).flatten(),
897+
)),
898+
_ => {
899+
// don't know how to extract statistics, so return an empty array
900+
new_empty_array(&DataType::Time64(unit.clone()))
901+
}
902+
})
903+
}
876904
_ => unimplemented!()
877905
}
878906
}

datafusion/core/tests/parquet/arrow_statistics.rs

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1204,7 +1204,7 @@ async fn test_time32_second_diff_rg_sizes() {
12041204
expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]), // Assuming 1 null per row group for simplicity
12051205
expected_row_counts: Some(UInt64Array::from(vec![4, 4, 4, 4])),
12061206
column_name: "second",
1207-
check: Check::RowGroup,
1207+
check: Check::Both,
12081208
}
12091209
.run();
12101210
}
@@ -1231,7 +1231,7 @@ async fn test_time32_millisecond_diff_rg_sizes() {
12311231
expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]), // Assuming 1 null per row group for simplicity
12321232
expected_row_counts: Some(UInt64Array::from(vec![4, 4, 4, 4])),
12331233
column_name: "millisecond",
1234-
check: Check::RowGroup,
1234+
check: Check::Both,
12351235
}
12361236
.run();
12371237
}
@@ -1264,7 +1264,7 @@ async fn test_time64_microsecond_diff_rg_sizes() {
12641264
expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]), // Assuming 1 null per row group for simplicity
12651265
expected_row_counts: Some(UInt64Array::from(vec![4, 4, 4, 4])),
12661266
column_name: "microsecond",
1267-
check: Check::RowGroup,
1267+
check: Check::Both,
12681268
}
12691269
.run();
12701270
}
@@ -1297,7 +1297,7 @@ async fn test_time64_nanosecond_diff_rg_sizes() {
12971297
expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]), // Assuming 1 null per row group for simplicity
12981298
expected_row_counts: Some(UInt64Array::from(vec![4, 4, 4, 4])),
12991299
column_name: "nanosecond",
1300-
check: Check::RowGroup,
1300+
check: Check::Both,
13011301
}
13021302
.run();
13031303
}

0 commit comments

Comments
 (0)