@@ -355,23 +355,22 @@ fn format_arrow_value(array: &dyn Array, idx: usize) -> String {
355355 } else {
356356 "false" . to_string( )
357357 } ) ,
358- DataType :: List ( _field) => {
359- let list_array = array
360- . as_any ( )
361- . downcast_ref :: < arrow_array:: ListArray > ( )
362- . expect ( "Expected ListArray" ) ;
363-
364- let child_array = list_array. values ( ) ;
365- let offsets = list_array. value_offsets ( ) ;
366- let start = offsets[ idx] as usize ;
367- let end = offsets[ idx + 1 ] as usize ;
368-
369- let formatted_values: Vec < String > = ( start..end)
370- . map ( |i| format_arrow_value ( child_array. as_ref ( ) , i) )
371- . collect ( ) ;
372-
373- format ! ( "[{}]" , formatted_values. join( ", " ) )
374- }
358+ DataType :: List ( _field) => try_downcast ! (
359+ arrow_array:: ListArray ,
360+ array,
361+ |list_array: & arrow_array:: ListArray | {
362+ let child_array = list_array. values( ) ;
363+ let offsets = list_array. value_offsets( ) ;
364+ let start = offsets[ idx] as usize ;
365+ let end = offsets[ idx + 1 ] as usize ;
366+
367+ let formatted_values: Vec <String > = ( start..end)
368+ . map( |i| format_arrow_value( child_array. as_ref( ) , i) )
369+ . collect( ) ;
370+
371+ format!( "[{}]" , formatted_values. join( ", " ) )
372+ }
373+ ) ,
375374 DataType :: Null => "NULL" . to_string ( ) ,
376375 _ => {
377376 warn ! (
@@ -387,8 +386,10 @@ fn format_arrow_value(array: &dyn Array, idx: usize) -> String {
387386mod tests {
388387 use std:: { fs:: OpenOptions , sync:: Arc } ;
389388
389+ use arrow:: buffer:: OffsetBuffer ;
390390 use arrow_array:: {
391- BooleanArray , Float64Array , Int64Array , RecordBatch , StringArray , TimestampMillisecondArray ,
391+ BooleanArray , Float64Array , Int64Array , ListArray , RecordBatch , StringArray ,
392+ TimestampMillisecondArray ,
392393 } ;
393394 use arrow_schema:: { DataType , Field , Schema , TimeUnit } ;
394395 use datafusion:: prelude:: { ParquetReadOptions , SessionContext } ;
@@ -467,6 +468,35 @@ mod tests {
467468 Some ( "constant" ) ,
468469 ] ) ;
469470
471+ // Create List<Int64> field
472+ let int_list_data = Int64Array :: from ( vec ! [
473+ 1 , 2 , 3 , 4 , 5 , 1 , 2 , 3 , 6 , 7 , 8 , 9 , 1 , 4 , 5 , 10 , 11 , 12 , 13 , 14 , 1 , 2 , 12 , 13 , 14 , 1 , 2 ,
474+ ] ) ;
475+ let int_list_offsets =
476+ OffsetBuffer :: new ( vec ! [ 0 , 3 , 5 , 8 , 12 , 13 , 15 , 17 , 17 , 20 , 22 ] . into ( ) ) ;
477+ let int_list_field = Arc :: new ( Field :: new ( "item" , DataType :: Int64 , false ) ) ;
478+ let int_list_array = ListArray :: new (
479+ int_list_field,
480+ int_list_offsets,
481+ Arc :: new ( int_list_data) ,
482+ None ,
483+ ) ;
484+
485+ // Create List<Float64> field
486+ let float_list_data = Float64Array :: from ( vec ! [
487+ 1.1 , 2.2 , 3.3 , 4.4 , 5.5 , 1.1 , 2.2 , 6.6 , 7.7 , 8.8 , 9.9 , 3.3 , 4.4 , 5.5 , 10.0 , 11.1 , 12.2 ,
488+ 13.3 ,
489+ ] ) ;
490+ let float_list_offsets =
491+ OffsetBuffer :: new ( vec ! [ 0 , 2 , 5 , 7 , 8 , 11 , 14 , 15 , 15 , 17 , 18 ] . into ( ) ) ;
492+ let float_list_field = Arc :: new ( Field :: new ( "item" , DataType :: Float64 , false ) ) ;
493+ let float_list_array = ListArray :: new (
494+ float_list_field,
495+ float_list_offsets,
496+ Arc :: new ( float_list_data) ,
497+ None ,
498+ ) ;
499+
470500 let batch = RecordBatch :: try_new (
471501 schema. clone ( ) ,
472502 vec ! [
@@ -476,6 +506,8 @@ mod tests {
476506 Arc :: new( active_array) ,
477507 Arc :: new( timestamp_array) ,
478508 Arc :: new( single_value_array) ,
509+ Arc :: new( int_list_array) ,
510+ Arc :: new( float_list_array) ,
479511 ] ,
480512 )
481513 . unwrap ( ) ;
@@ -505,6 +537,16 @@ mod tests {
505537 true ,
506538 ) ,
507539 Field :: new( "single_value" , DataType :: Utf8 , true ) ,
540+ Field :: new_list(
541+ "int_list" ,
542+ Arc :: new( Field :: new( "item" , DataType :: Int64 , false ) ) ,
543+ true ,
544+ ) ,
545+ Field :: new_list(
546+ "float_list" ,
547+ Arc :: new( Field :: new( "item" , DataType :: Float64 , false ) ) ,
548+ true ,
549+ ) ,
508550 ] )
509551 }
510552
@@ -910,4 +952,121 @@ mod tests {
910952 assert_eq ! ( distinct_stat. count, 100 ) ;
911953 }
912954 }
955+
956+ #[ tokio:: test]
957+ async fn test_calculate_single_field_stats_with_int_list_field ( ) {
958+ let ( _temp_dir, parquet_path) = create_test_parquet_with_data ( ) . await ;
959+
960+ let ctx = SessionContext :: new ( ) ;
961+ let table_name = ulid:: Ulid :: new ( ) . to_string ( ) ;
962+
963+ ctx. register_parquet (
964+ & table_name,
965+ parquet_path. to_str ( ) . unwrap ( ) ,
966+ ParquetReadOptions :: default ( ) ,
967+ )
968+ . await
969+ . unwrap ( ) ;
970+
971+ // Test int_list field (List<Int64>)
972+ let result = calculate_single_field_stats ( ctx. clone ( ) , & table_name, "int_list" , 50 ) . await ;
973+
974+ assert ! ( result. is_some( ) ) ;
975+ let stats = result. unwrap ( ) ;
976+
977+ assert_eq ! ( stats. field_name, "int_list" ) ;
978+ assert_eq ! ( stats. count, 10 ) ;
979+
980+ // Verify we have the expected distinct lists
981+ // Expected: [1, 2, 3], [4, 5], [6, 7, 8, 9], [1], [10, 11], [], [12, 13, 14], [1, 2]
982+ assert_eq ! ( stats. distinct_count, 8 ) ;
983+
984+ // Check for duplicate lists - [1, 2, 3] appears twice, [4, 5] appears twice
985+ let list_123_stat = stats
986+ . distinct_stats
987+ . iter ( )
988+ . find ( |s| s. distinct_value == "[1, 2, 3]" ) ;
989+ assert ! ( list_123_stat. is_some( ) ) ;
990+ assert_eq ! ( list_123_stat. unwrap( ) . count, 2 ) ;
991+
992+ let list_45_stat = stats
993+ . distinct_stats
994+ . iter ( )
995+ . find ( |s| s. distinct_value == "[4, 5]" ) ;
996+ assert ! ( list_45_stat. is_some( ) ) ;
997+ assert_eq ! ( list_45_stat. unwrap( ) . count, 2 ) ;
998+
999+ // Check single occurrence lists
1000+ let list_6789_stat = stats
1001+ . distinct_stats
1002+ . iter ( )
1003+ . find ( |s| s. distinct_value == "[6, 7, 8, 9]" ) ;
1004+ assert ! ( list_6789_stat. is_some( ) ) ;
1005+ assert_eq ! ( list_6789_stat. unwrap( ) . count, 1 ) ;
1006+
1007+ let empty_list_stat = stats
1008+ . distinct_stats
1009+ . iter ( )
1010+ . find ( |s| s. distinct_value == "[]" ) ;
1011+ assert ! ( empty_list_stat. is_some( ) ) ;
1012+ assert_eq ! ( empty_list_stat. unwrap( ) . count, 1 ) ;
1013+ }
1014+
1015+ #[ tokio:: test]
1016+ async fn test_calculate_single_field_stats_with_float_list_field ( ) {
1017+ let ( _temp_dir, parquet_path) = create_test_parquet_with_data ( ) . await ;
1018+
1019+ let ctx = SessionContext :: new ( ) ;
1020+ let table_name = ulid:: Ulid :: new ( ) . to_string ( ) ;
1021+
1022+ ctx. register_parquet (
1023+ & table_name,
1024+ parquet_path. to_str ( ) . unwrap ( ) ,
1025+ ParquetReadOptions :: default ( ) ,
1026+ )
1027+ . await
1028+ . unwrap ( ) ;
1029+
1030+ // Test float_list field (List<Float64>)
1031+ let result = calculate_single_field_stats ( ctx. clone ( ) , & table_name, "float_list" , 50 ) . await ;
1032+
1033+ assert ! ( result. is_some( ) ) ;
1034+ let stats = result. unwrap ( ) ;
1035+
1036+ assert_eq ! ( stats. field_name, "float_list" ) ;
1037+ assert_eq ! ( stats. count, 10 ) ;
1038+
1039+ // Expected distinct lists: [1.1, 2.2], [3.3, 4.4, 5.5], [6.6], [7.7, 8.8, 9.9], [10.0], [], [11.1, 12.2], [13.3]
1040+ assert_eq ! ( stats. distinct_count, 8 ) ;
1041+
1042+ // Check for duplicate lists - [1.1, 2.2] appears twice, [3.3, 4.4, 5.5] appears twice
1043+ let list_11_22_stat = stats
1044+ . distinct_stats
1045+ . iter ( )
1046+ . find ( |s| s. distinct_value == "[1.1, 2.2]" ) ;
1047+ assert ! ( list_11_22_stat. is_some( ) ) ;
1048+ assert_eq ! ( list_11_22_stat. unwrap( ) . count, 2 ) ;
1049+
1050+ let list_33_44_55_stat = stats
1051+ . distinct_stats
1052+ . iter ( )
1053+ . find ( |s| s. distinct_value == "[3.3, 4.4, 5.5]" ) ;
1054+ assert ! ( list_33_44_55_stat. is_some( ) ) ;
1055+ assert_eq ! ( list_33_44_55_stat. unwrap( ) . count, 2 ) ;
1056+
1057+ // Check single occurrence lists
1058+ let list_66_stat = stats
1059+ . distinct_stats
1060+ . iter ( )
1061+ . find ( |s| s. distinct_value == "[6.6]" ) ;
1062+ assert ! ( list_66_stat. is_some( ) ) ;
1063+ assert_eq ! ( list_66_stat. unwrap( ) . count, 1 ) ;
1064+
1065+ let empty_list_stat = stats
1066+ . distinct_stats
1067+ . iter ( )
1068+ . find ( |s| s. distinct_value == "[]" ) ;
1069+ assert ! ( empty_list_stat. is_some( ) ) ;
1070+ assert_eq ! ( empty_list_stat. unwrap( ) . count, 1 ) ;
1071+ }
9131072}
0 commit comments