@@ -661,7 +661,7 @@ where
661661/// of parquet page [`Index`]'es to a [`UInt64Array`]
662662///
663663/// The returned Array is an [`UInt64Array`]
664- pub ( crate ) fn null_counts_page_statistics < ' a , I > ( iterator : I ) -> Result < ArrayRef >
664+ pub ( crate ) fn null_counts_page_statistics < ' a , I > ( iterator : I ) -> Result < UInt64Array >
665665where
666666 I : Iterator < Item = ( usize , & ' a Index ) > ,
667667{
@@ -680,7 +680,7 @@ where
680680 _ => unimplemented ! ( ) ,
681681 } ) ;
682682
683- Ok ( Arc :: new ( UInt64Array :: from_iter ( iter) ) )
683+ Ok ( UInt64Array :: from_iter ( iter) )
684684}
685685
686686/// Extracts Parquet statistics as Arrow arrays
@@ -874,21 +874,22 @@ impl<'a> StatisticsConverter<'a> {
874874 /// Extract the null counts from row group statistics in [`RowGroupMetaData`]
875875 ///
876876 /// See docs on [`Self::row_group_mins`] for details
877- pub fn row_group_null_counts < I > ( & self , metadatas : I ) -> Result < ArrayRef >
877+ pub fn row_group_null_counts < I > ( & self , metadatas : I ) -> Result < UInt64Array >
878878 where
879879 I : IntoIterator < Item = & ' a RowGroupMetaData > ,
880880 {
881- let data_type = self . arrow_field . data_type ( ) ;
882-
883881 let Some ( parquet_index) = self . parquet_index else {
884- return Ok ( self . make_null_array ( data_type, metadatas) ) ;
882+ let num_row_groups = metadatas. into_iter ( ) . count ( ) ;
883+ return Ok ( UInt64Array :: from_iter (
884+ std:: iter:: repeat ( None ) . take ( num_row_groups) ,
885+ ) ) ;
885886 } ;
886887
887888 let null_counts = metadatas
888889 . into_iter ( )
889890 . map ( |x| x. column ( parquet_index) . statistics ( ) )
890891 . map ( |s| s. map ( |s| s. null_count ( ) ) ) ;
891- Ok ( Arc :: new ( UInt64Array :: from_iter ( null_counts) ) )
892+ Ok ( UInt64Array :: from_iter ( null_counts) )
892893 }
893894
894895 /// Extract the minimum values from Data Page statistics.
@@ -1007,14 +1008,15 @@ impl<'a> StatisticsConverter<'a> {
10071008 column_page_index : & ParquetColumnIndex ,
10081009 column_offset_index : & ParquetOffsetIndex ,
10091010 row_group_indices : I ,
1010- ) -> Result < ArrayRef >
1011+ ) -> Result < UInt64Array >
10111012 where
10121013 I : IntoIterator < Item = & ' a usize > ,
10131014 {
1014- let data_type = self . arrow_field . data_type ( ) ;
1015-
10161015 let Some ( parquet_index) = self . parquet_index else {
1017- return Ok ( self . make_null_array ( data_type, row_group_indices) ) ;
1016+ let num_row_groups = row_group_indices. into_iter ( ) . count ( ) ;
1017+ return Ok ( UInt64Array :: from_iter (
1018+ std:: iter:: repeat ( None ) . take ( num_row_groups) ,
1019+ ) ) ;
10181020 } ;
10191021
10201022 let iter = row_group_indices. into_iter ( ) . map ( |rg_index| {
@@ -1047,21 +1049,19 @@ impl<'a> StatisticsConverter<'a> {
10471049 pub fn data_page_row_counts < I > (
10481050 & self ,
10491051 column_offset_index : & ParquetOffsetIndex ,
1050- row_group_metadatas : & [ RowGroupMetaData ] ,
1052+ row_group_metadatas : & ' a [ RowGroupMetaData ] ,
10511053 row_group_indices : I ,
1052- ) -> Result < ArrayRef >
1054+ ) -> Result < Option < UInt64Array > >
10531055 where
10541056 I : IntoIterator < Item = & ' a usize > ,
10551057 {
1056- let data_type = self . arrow_field . data_type ( ) ;
1057-
10581058 let Some ( parquet_index) = self . parquet_index else {
1059- return Ok ( self . make_null_array ( data_type, row_group_indices) ) ;
1059+ // `parquet_index` is `None`: this field has no matching parquet column,
1060+ // so there are no page locations from which to determine
1061+ // the row count on a per-DataPage basis.
1062+ return Ok ( None ) ;
10601063 } ;
10611064
1062- // `offset_index[row_group_number][column_number][page_number]` holds
1063- // the [`PageLocation`] corresponding to page `page_number` of column
1064- // `column_number`of row group `row_group_number`.
10651065 let mut row_count_total = Vec :: new ( ) ;
10661066 for rg_idx in row_group_indices {
10671067 let page_locations = & column_offset_index[ * rg_idx] [ parquet_index] ;
@@ -1070,9 +1070,8 @@ impl<'a> StatisticsConverter<'a> {
10701070 Some ( loc[ 1 ] . first_row_index as u64 - loc[ 0 ] . first_row_index as u64 )
10711071 } ) ;
10721072
1073- let num_rows_in_row_group = & row_group_metadatas[ * rg_idx] . num_rows ( ) ;
1074-
10751073 // append the last page row count
1074+ let num_rows_in_row_group = & row_group_metadatas[ * rg_idx] . num_rows ( ) ;
10761075 let row_count_per_page = row_count_per_page
10771076 . chain ( std:: iter:: once ( Some (
10781077 * num_rows_in_row_group as u64
@@ -1083,7 +1082,7 @@ impl<'a> StatisticsConverter<'a> {
10831082 row_count_total. extend ( row_count_per_page) ;
10841083 }
10851084
1086- Ok ( Arc :: new ( UInt64Array :: from_iter ( row_count_total) ) )
1085+ Ok ( Some ( UInt64Array :: from_iter ( row_count_total) ) )
10871086 }
10881087
10891088 /// Returns a null array of data_type with one element per row group
0 commit comments