Skip to content

Commit 8c6feae

Browse files
author
Eric Fredine
committed
feat: Add support for Timestamp data types in data page statistics.
1 parent 58e2904 commit 8c6feae

File tree

2 files changed

+45
-16
lines changed

2 files changed

+45
-16
lines changed

datafusion/core/src/datasource/physical_plan/parquet/statistics.rs

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -713,6 +713,35 @@ macro_rules! get_data_page_statistics {
713713
)),
714714
Some(DataType::Float32) => Ok(Arc::new(Float32Array::from_iter([<$stat_type_prefix Float32DataPageStatsIterator>]::new($iterator).flatten()))),
715715
Some(DataType::Float64) => Ok(Arc::new(Float64Array::from_iter([<$stat_type_prefix Float64DataPageStatsIterator>]::new($iterator).flatten()))),
716+
Some(DataType::Timestamp(unit, timezone)) => {
717+
let iter = [<$stat_type_prefix Int64DataPageStatsIterator>]::new($iterator).flatten();
718+
Ok(match unit {
719+
TimeUnit::Second => {
720+
Arc::new(match timezone {
721+
Some(tz) => TimestampSecondArray::from_iter(iter).with_timezone(tz.clone()),
722+
None => TimestampSecondArray::from_iter(iter),
723+
})
724+
}
725+
TimeUnit::Millisecond => {
726+
Arc::new(match timezone {
727+
Some(tz) => TimestampMillisecondArray::from_iter(iter).with_timezone(tz.clone()),
728+
None => TimestampMillisecondArray::from_iter(iter),
729+
})
730+
}
731+
TimeUnit::Microsecond => {
732+
Arc::new(match timezone {
733+
Some(tz) => TimestampMicrosecondArray::from_iter(iter).with_timezone(tz.clone()),
734+
None => TimestampMicrosecondArray::from_iter(iter),
735+
})
736+
}
737+
TimeUnit::Nanosecond => {
738+
Arc::new(match timezone {
739+
Some(tz) => TimestampNanosecondArray::from_iter(iter).with_timezone(tz.clone()),
740+
None => TimestampNanosecondArray::from_iter(iter),
741+
})
742+
}
743+
})
744+
},
716745
_ => unimplemented!()
717746
}
718747
}

datafusion/core/tests/parquet/arrow_statistics.rs

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -747,7 +747,7 @@ async fn test_timestamp() {
747747
// row counts are [5, 5, 5, 5]
748748
expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])),
749749
column_name: "nanos",
750-
check: Check::RowGroup,
750+
check: Check::Both,
751751
}
752752
.run();
753753

@@ -776,7 +776,7 @@ async fn test_timestamp() {
776776
// row counts are [5, 5, 5, 5]
777777
expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])),
778778
column_name: "nanos_timezoned",
779-
check: Check::RowGroup,
779+
check: Check::Both,
780780
}
781781
.run();
782782

@@ -798,7 +798,7 @@ async fn test_timestamp() {
798798
expected_null_counts: UInt64Array::from(vec![1, 1, 1, 1]),
799799
expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])),
800800
column_name: "micros",
801-
check: Check::RowGroup,
801+
check: Check::Both,
802802
}
803803
.run();
804804

@@ -827,7 +827,7 @@ async fn test_timestamp() {
827827
// row counts are [5, 5, 5, 5]
828828
expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])),
829829
column_name: "micros_timezoned",
830-
check: Check::RowGroup,
830+
check: Check::Both,
831831
}
832832
.run();
833833

@@ -849,7 +849,7 @@ async fn test_timestamp() {
849849
expected_null_counts: UInt64Array::from(vec![1, 1, 1, 1]),
850850
expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])),
851851
column_name: "millis",
852-
check: Check::RowGroup,
852+
check: Check::Both,
853853
}
854854
.run();
855855

@@ -878,7 +878,7 @@ async fn test_timestamp() {
878878
// row counts are [5, 5, 5, 5]
879879
expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])),
880880
column_name: "millis_timezoned",
881-
check: Check::RowGroup,
881+
check: Check::Both,
882882
}
883883
.run();
884884

@@ -900,7 +900,7 @@ async fn test_timestamp() {
900900
expected_null_counts: UInt64Array::from(vec![1, 1, 1, 1]),
901901
expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])),
902902
column_name: "seconds",
903-
check: Check::RowGroup,
903+
check: Check::Both,
904904
}
905905
.run();
906906

@@ -929,7 +929,7 @@ async fn test_timestamp() {
929929
// row counts are [5, 5, 5, 5]
930930
expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])),
931931
column_name: "seconds_timezoned",
932-
check: Check::RowGroup,
932+
check: Check::Both,
933933
}
934934
.run();
935935
}
@@ -975,7 +975,7 @@ async fn test_timestamp_diff_rg_sizes() {
975975
// row counts are [8, 8, 4]
976976
expected_row_counts: Some(UInt64Array::from(vec![8, 8, 4])),
977977
column_name: "nanos",
978-
check: Check::RowGroup,
978+
check: Check::Both,
979979
}
980980
.run();
981981

@@ -1002,7 +1002,7 @@ async fn test_timestamp_diff_rg_sizes() {
10021002
// row counts are [8, 8, 4]
10031003
expected_row_counts: Some(UInt64Array::from(vec![8, 8, 4])),
10041004
column_name: "nanos_timezoned",
1005-
check: Check::RowGroup,
1005+
check: Check::Both,
10061006
}
10071007
.run();
10081008

@@ -1022,7 +1022,7 @@ async fn test_timestamp_diff_rg_sizes() {
10221022
expected_null_counts: UInt64Array::from(vec![1, 2, 1]),
10231023
expected_row_counts: Some(UInt64Array::from(vec![8, 8, 4])),
10241024
column_name: "micros",
1025-
check: Check::RowGroup,
1025+
check: Check::Both,
10261026
}
10271027
.run();
10281028

@@ -1049,7 +1049,7 @@ async fn test_timestamp_diff_rg_sizes() {
10491049
// row counts are [8, 8, 4]
10501050
expected_row_counts: Some(UInt64Array::from(vec![8, 8, 4])),
10511051
column_name: "micros_timezoned",
1052-
check: Check::RowGroup,
1052+
check: Check::Both,
10531053
}
10541054
.run();
10551055

@@ -1069,7 +1069,7 @@ async fn test_timestamp_diff_rg_sizes() {
10691069
expected_null_counts: UInt64Array::from(vec![1, 2, 1]),
10701070
expected_row_counts: Some(UInt64Array::from(vec![8, 8, 4])),
10711071
column_name: "millis",
1072-
check: Check::RowGroup,
1072+
check: Check::Both,
10731073
}
10741074
.run();
10751075

@@ -1096,7 +1096,7 @@ async fn test_timestamp_diff_rg_sizes() {
10961096
// row counts are [8, 8, 4]
10971097
expected_row_counts: Some(UInt64Array::from(vec![8, 8, 4])),
10981098
column_name: "millis_timezoned",
1099-
check: Check::RowGroup,
1099+
check: Check::Both,
11001100
}
11011101
.run();
11021102

@@ -1116,7 +1116,7 @@ async fn test_timestamp_diff_rg_sizes() {
11161116
expected_null_counts: UInt64Array::from(vec![1, 2, 1]),
11171117
expected_row_counts: Some(UInt64Array::from(vec![8, 8, 4])),
11181118
column_name: "seconds",
1119-
check: Check::RowGroup,
1119+
check: Check::Both,
11201120
}
11211121
.run();
11221122

@@ -1143,7 +1143,7 @@ async fn test_timestamp_diff_rg_sizes() {
11431143
// row counts are [8, 8, 4]
11441144
expected_row_counts: Some(UInt64Array::from(vec![8, 8, 4])),
11451145
column_name: "seconds_timezoned",
1146-
check: Check::RowGroup,
1146+
check: Check::Both,
11471147
}
11481148
.run();
11491149
}

0 commit comments

Comments
 (0)