From 92473ddbafe39a40c7bfd9ea3ccdb6a43c51bc41 Mon Sep 17 00:00:00 2001 From: Alex Stephen Date: Mon, 13 Oct 2025 14:42:58 -0700 Subject: [PATCH 1/8] Assorted panics we've found --- parquet/src/column/reader.rs | 14 +++++++++----- parquet/src/encodings/decoding.rs | 6 ++++++ parquet/src/encodings/rle.rs | 5 ++++- parquet/src/file/reader.rs | 15 +++++++++++++++ parquet/src/file/serialized_reader.rs | 5 ++++- parquet/src/schema/types.rs | 2 ++ parquet/tests/arrow_reader/bad_data.rs | 6 ++++-- 7 files changed, 44 insertions(+), 9 deletions(-) diff --git a/parquet/src/column/reader.rs b/parquet/src/column/reader.rs index b8ff38efa3c4..20f7be975ebc 100644 --- a/parquet/src/column/reader.rs +++ b/parquet/src/column/reader.rs @@ -569,11 +569,15 @@ fn parse_v1_level( match encoding { Encoding::RLE => { let i32_size = std::mem::size_of::(); - let data_size = read_num_bytes::(i32_size, buf.as_ref()) as usize; - Ok(( - i32_size + data_size, - buf.slice(i32_size..i32_size + data_size), - )) + if i32_size <= buf.len() { + let data_size = read_num_bytes::(i32_size, buf.as_ref()) as usize; + let end = + i32_size.checked_add(data_size).ok_or(general_err!("invalid level length"))?; + if end <= buf.len() { + return Ok((end, buf.slice(i32_size..end))); + } + } + Err(general_err!("not enough data to read levels")) } #[allow(deprecated)] Encoding::BIT_PACKED => { diff --git a/parquet/src/encodings/decoding.rs b/parquet/src/encodings/decoding.rs index 91b31dbdfcd2..3bfbd89a315f 100644 --- a/parquet/src/encodings/decoding.rs +++ b/parquet/src/encodings/decoding.rs @@ -382,6 +382,12 @@ impl Decoder for DictDecoder { fn set_data(&mut self, data: Bytes, num_values: usize) -> Result<()> { // First byte in `data` is bit width let bit_width = data.as_ref()[0]; + if bit_width > 32 { + return Err(general_err!( + "Invalid or corrupted Bit width {}. Max allowed is 32", + bit_width + )); + } let mut rle_decoder = RleDecoder::new(bit_width); rle_decoder.set_data(data.slice(1..)); self.num_values = num_values; diff --git a/parquet/src/encodings/rle.rs b/parquet/src/encodings/rle.rs index db8227fcac3a..41c050132064 100644 --- a/parquet/src/encodings/rle.rs +++ b/parquet/src/encodings/rle.rs @@ -513,7 +513,10 @@ impl RleDecoder { self.rle_left = (indicator_value >> 1) as u32; let value_width = bit_util::ceil(self.bit_width as usize, 8); self.current_value = bit_reader.get_aligned::(value_width); - assert!(self.current_value.is_some()); + assert!( + self.current_value.is_some(), + "parquet_data_error: not enough data for RLE decoding" + ); } true } else { diff --git a/parquet/src/file/reader.rs b/parquet/src/file/reader.rs index 61af21a68ec1..5bf318d5e344 100644 --- a/parquet/src/file/reader.rs +++ b/parquet/src/file/reader.rs @@ -124,11 +124,26 @@ impl ChunkReader for Bytes { fn get_read(&self, start: u64) -> Result { let start = start as usize; + if start > self.len() { + return Err(eof_err!( + "Expected to read at offset {}, while file has length {}", + start, + self.len() + )); + } Ok(self.slice(start..).reader()) } fn get_bytes(&self, start: u64, length: usize) -> Result { let start = start as usize; + if start > self.len() || start + length > self.len() { + return Err(eof_err!( + "Expected to read {} bytes at offset {}, while file has length {}", + length, + start, + self.len() + )); + } Ok(self.slice(start..start + length)) } } diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 6da5c39d745b..035113774fd2 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -392,6 +392,9 @@ pub(crate) fn decode_page( let buffer = match decompressor { Some(decompressor) if can_decompress => { let uncompressed_page_size = usize::try_from(page_header.uncompressed_page_size)?; + if offset > buffer.len() || offset > uncompressed_page_size { + return Err(general_err!("Invalid page header")); + } let decompressed_size = uncompressed_page_size - offset; let mut decompressed = Vec::with_capacity(uncompressed_page_size); decompressed.extend_from_slice(&buffer.as_ref()[..offset]); @@ -458,7 +461,7 @@ pub(crate) fn decode_page( } _ => { // For unknown page type (e.g., INDEX_PAGE), skip and read next. - unimplemented!("Page type {:?} is not supported", page_header.r#type) + return Err(general_err!("Page type {:?} is not supported", page_header.r#type)); } }; diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs index 1ae37d0a462f..779e27709ce7 100644 --- a/parquet/src/schema/types.rs +++ b/parquet/src/schema/types.rs @@ -1359,6 +1359,8 @@ fn schema_from_array_helper<'a>( if !is_root_node { builder = builder.with_repetition(rep); } + } else if !is_root_node { + return Err(general_err!("Repetition level must be defined for non-root types")); } Ok((next_index, Arc::new(builder.build().unwrap()))) } diff --git a/parquet/tests/arrow_reader/bad_data.rs b/parquet/tests/arrow_reader/bad_data.rs index 235f81812468..84dbfa47cf1f 100644 --- a/parquet/tests/arrow_reader/bad_data.rs +++ b/parquet/tests/arrow_reader/bad_data.rs @@ -84,10 +84,12 @@ fn test_parquet_1481() { } #[test] -#[should_panic(expected = "assertion failed: self.current_value.is_some()")] fn test_arrow_gh_41321() { let err = read_file("ARROW-GH-41321.parquet").unwrap_err(); - assert_eq!(err.to_string(), "TBD (currently panics)"); + assert_eq!( + err.to_string(), + "External: Parquet argument error: Parquet error: Invalid or corrupted Bit width 254. Max allowed is 32" + ); } #[test] From ad8ee66a5bc109aa1b7aa3ab187ded241d751829 Mon Sep 17 00:00:00 2001 From: Alex Stephen <1325798+rambleraptor@users.noreply.github.com> Date: Tue, 14 Oct 2025 13:05:42 -0700 Subject: [PATCH 2/8] Update parquet/src/encodings/decoding.rs Co-authored-by: Ed Seidl --- parquet/src/encodings/decoding.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/encodings/decoding.rs b/parquet/src/encodings/decoding.rs index 3bfbd89a315f..c5fabbc588a2 100644 --- a/parquet/src/encodings/decoding.rs +++ b/parquet/src/encodings/decoding.rs @@ -384,7 +384,7 @@ impl Decoder for DictDecoder { let bit_width = data.as_ref()[0]; if bit_width > 32 { return Err(general_err!( - "Invalid or corrupted Bit width {}. Max allowed is 32", + "Invalid or corrupted RLE bit width {}. Max allowed is 32", bit_width )); } From 3a91747d10835e86b624e97fa1d271839fadae5b Mon Sep 17 00:00:00 2001 From: Alex Stephen <1325798+rambleraptor@users.noreply.github.com> Date: Tue, 14 Oct 2025 13:05:50 -0700 Subject: [PATCH 3/8] Update parquet/src/file/reader.rs Co-authored-by: Ryan Johnson --- parquet/src/file/reader.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/parquet/src/file/reader.rs b/parquet/src/file/reader.rs index 5bf318d5e344..d4639095aef2 100644 --- a/parquet/src/file/reader.rs +++ b/parquet/src/file/reader.rs @@ -126,8 +126,7 @@ impl ChunkReader for Bytes { let start = start as usize; if start > self.len() { return Err(eof_err!( - "Expected to read at offset {}, while file has length {}", - start, + "Expected to read at offset {start}, while file has length {}", self.len() )); } From 775bddaa06054ce7c981a5fd0531211ffae00772 Mon Sep 17 00:00:00 2001 From: Alex Stephen Date: Tue, 14 Oct 2025 13:26:50 -0700 Subject: [PATCH 4/8] PR comments --- parquet/src/encodings/decoding.rs | 4 ++++ parquet/src/schema/types.rs | 26 +++++++++++++------------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/parquet/src/encodings/decoding.rs b/parquet/src/encodings/decoding.rs index c5fabbc588a2..411c28759010 100644 --- a/parquet/src/encodings/decoding.rs +++ b/parquet/src/encodings/decoding.rs @@ -381,6 +381,10 @@ impl DictDecoder { impl Decoder for DictDecoder { fn set_data(&mut self, data: Bytes, num_values: usize) -> Result<()> { // First byte in `data` is bit width + if data.len() < 1 { + return Err(eof_err!("Not enough bytes to decode bit_width")); + } + let bit_width = data.as_ref()[0]; if bit_width > 32 { return Err(general_err!( diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs index 779e27709ce7..9754a0e79bc7 100644 --- a/parquet/src/schema/types.rs +++ b/parquet/src/schema/types.rs @@ -1348,19 +1348,19 @@ fn schema_from_array_helper<'a>( .with_logical_type(logical_type) .with_fields(fields) .with_id(field_id); - if let Some(rep) = repetition { - // Sometimes parquet-cpp and parquet-mr set repetition level REQUIRED or - // REPEATED for root node. - // - // We only set repetition for group types that are not top-level message - // type. According to parquet-format: - // Root of the schema does not have a repetition_type. - // All other types must have one. - if !is_root_node { - builder = builder.with_repetition(rep); - } - } else if !is_root_node { - return Err(general_err!("Repetition level must be defined for non-root types")); + + // Sometimes parquet-cpp and parquet-mr set repetition level REQUIRED or + // REPEATED for root node. + // + // We only set repetition for group types that are not top-level message + // type. According to parquet-format: + // Root of the schema does not have a repetition_type. + // All other types must have one. + if !is_root_node { + let Some(rep) = repetition else { + return Err(general_err!("Repetition level must be defined for non-root types")); + }; + builder = builder.with_repetition(rep); } Ok((next_index, Arc::new(builder.build().unwrap()))) } From b223c818f6eebba95873809bcad13a9ff2882c60 Mon Sep 17 00:00:00 2001 From: Alex Stephen Date: Fri, 17 Oct 2025 15:31:35 -0700 Subject: [PATCH 5/8] add tests --- parquet/src/column/reader.rs | 19 ++++++++++++++++ parquet/src/encodings/decoding.rs | 10 +++++++++ parquet/src/file/reader.rs | 31 ++++++++++++++++++++++++++ parquet/src/file/serialized_reader.rs | 29 ++++++++++++++++++++++++ parquet/tests/arrow_reader/bad_data.rs | 2 +- 5 files changed, 90 insertions(+), 1 deletion(-) diff --git a/parquet/src/column/reader.rs b/parquet/src/column/reader.rs index 20f7be975ebc..a6a2ed7e78d1 100644 --- a/parquet/src/column/reader.rs +++ b/parquet/src/column/reader.rs @@ -601,6 +601,25 @@ mod tests { use crate::util::test_common::page_util::InMemoryPageReader; use crate::util::test_common::rand_gen::make_pages; + #[test] + fn test_parse_v1_level_invalid_length() { + // Say length is 10, but buffer is only 4 + let buf = Bytes::from(vec![10, 0, 0, 0]); + let err = parse_v1_level(1, 100, Encoding::RLE, buf).unwrap_err(); + assert_eq!( + err.to_string(), + "Parquet error: not enough data to read levels" + ); + + // Say length is 4, but buffer is only 3 + let buf = Bytes::from(vec![4, 0, 0]); + let err = parse_v1_level(1, 100, Encoding::RLE, buf).unwrap_err(); + assert_eq!( + err.to_string(), + "Parquet error: not enough data to read levels" + ); + } + const NUM_LEVELS: usize = 128; const NUM_PAGES: usize = 2; const MAX_DEF_LEVEL: i16 = 5; diff --git a/parquet/src/encodings/decoding.rs b/parquet/src/encodings/decoding.rs index 411c28759010..e467d694d5f7 100644 --- a/parquet/src/encodings/decoding.rs +++ b/parquet/src/encodings/decoding.rs @@ -1390,6 +1390,16 @@ mod tests { test_plain_skip::(Bytes::from(data_bytes), 3, 6, 4, &[]); } + #[test] + fn test_dict_decoder_empty_data() { + let mut decoder = DictDecoder::::new(); + let err = decoder.set_data(Bytes::new(), 10).unwrap_err(); + assert_eq!( + err.to_string(), + "Parquet error: Not enough bytes to decode bit_width" + ); + } + fn test_plain_decode( data: Bytes, num_values: usize, diff --git a/parquet/src/file/reader.rs b/parquet/src/file/reader.rs index d4639095aef2..03c2c291b712 100644 --- a/parquet/src/file/reader.rs +++ b/parquet/src/file/reader.rs @@ -288,3 +288,34 @@ impl Iterator for FilePageIterator { } impl PageIterator for FilePageIterator {} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_bytes_chunk_reader_get_read_out_of_bounds() { + let data = Bytes::from(vec![0, 1, 2, 3]); + let err = data.get_read(5).unwrap_err(); + assert_eq!( + err.to_string(), + "Parquet error: Expected to read at offset 5, while file has length 4" + ); + } + + #[test] + fn test_bytes_chunk_reader_get_bytes_out_of_bounds() { + let data = Bytes::from(vec![0, 1, 2, 3]); + let err = data.get_bytes(5, 1).unwrap_err(); + assert_eq!( + err.to_string(), + "Parquet error: Expected to read 1 bytes at offset 5, while file has length 4" + ); + + let err = data.get_bytes(2, 3).unwrap_err(); + assert_eq!( + err.to_string(), + "Parquet error: Expected to read 3 bytes at offset 2, while file has length 4" + ); + } +} diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 035113774fd2..90d8bb4b7bc4 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -1142,6 +1142,35 @@ mod tests { use super::*; + #[test] + fn test_decode_page_invalid_offset() { + use crate::file::metadata::thrift_gen::DataPageHeaderV2; + + let mut page_header = PageHeader::default(); + page_header.r#type = PageType::DATA_PAGE_V2; + page_header.uncompressed_page_size = 10; + page_header.compressed_page_size = 10; + let mut data_page_header_v2 = DataPageHeaderV2::default(); + data_page_header_v2.definition_levels_byte_length = 11; // offset > uncompressed_page_size + page_header.data_page_header_v2 = Some(data_page_header_v2); + + let buffer = Bytes::new(); + let err = decode_page(page_header, buffer, Type::INT32, None).unwrap_err(); + assert_eq!(err.to_string(), "Parquet error: Invalid page header"); + } + + #[test] + fn test_decode_unsupported_page() { + let mut page_header = PageHeader::default(); + page_header.r#type = PageType::INDEX_PAGE; + let buffer = Bytes::new(); + let err = decode_page(page_header, buffer, Type::INT32, None).unwrap_err(); + assert_eq!( + err.to_string(), + "Parquet error: Page type INDEX_PAGE is not supported" + ); + } + #[test] fn test_cursor_and_file_has_the_same_behaviour() { let mut buf: Vec = Vec::new(); diff --git a/parquet/tests/arrow_reader/bad_data.rs b/parquet/tests/arrow_reader/bad_data.rs index 84dbfa47cf1f..54c92976e41c 100644 --- a/parquet/tests/arrow_reader/bad_data.rs +++ b/parquet/tests/arrow_reader/bad_data.rs @@ -88,7 +88,7 @@ fn test_arrow_gh_41321() { let err = read_file("ARROW-GH-41321.parquet").unwrap_err(); assert_eq!( err.to_string(), - "External: Parquet argument error: Parquet error: Invalid or corrupted Bit width 254. Max allowed is 32" + "External: Parquet argument error: Parquet error: Invalid or corrupted RLE bit width 254. Max allowed is 32" ); } From 85e8b82b1cf5ce8f22c41d364ff9017437625516 Mon Sep 17 00:00:00 2001 From: Alex Stephen Date: Wed, 22 Oct 2025 13:40:18 -0700 Subject: [PATCH 6/8] PR comments --- parquet/src/column/reader.rs | 5 +++-- parquet/src/file/serialized_reader.rs | 31 ++++++++++++++++++++------- parquet/src/schema/types.rs | 6 ++++-- 3 files changed, 30 insertions(+), 12 deletions(-) diff --git a/parquet/src/column/reader.rs b/parquet/src/column/reader.rs index a6a2ed7e78d1..ebde79e6a7f2 100644 --- a/parquet/src/column/reader.rs +++ b/parquet/src/column/reader.rs @@ -571,8 +571,9 @@ fn parse_v1_level( let i32_size = std::mem::size_of::(); if i32_size <= buf.len() { let data_size = read_num_bytes::(i32_size, buf.as_ref()) as usize; - let end = - i32_size.checked_add(data_size).ok_or(general_err!("invalid level length"))?; + let end = i32_size + .checked_add(data_size) + .ok_or(general_err!("invalid level length"))?; if end <= buf.len() { return Ok((end, buf.slice(i32_size..end))); } diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 90d8bb4b7bc4..cc545eb8b826 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -461,7 +461,10 @@ pub(crate) fn decode_page( } _ => { // For unknown page type (e.g., INDEX_PAGE), skip and read next. - return Err(general_err!("Page type {:?} is not supported", page_header.r#type)); + return Err(general_err!( + "Page type {:?} is not supported", + page_header.r#type + )); } }; @@ -1146,13 +1149,25 @@ mod tests { fn test_decode_page_invalid_offset() { use crate::file::metadata::thrift_gen::DataPageHeaderV2; - let mut page_header = PageHeader::default(); - page_header.r#type = PageType::DATA_PAGE_V2; - page_header.uncompressed_page_size = 10; - page_header.compressed_page_size = 10; - let mut data_page_header_v2 = DataPageHeaderV2::default(); - data_page_header_v2.definition_levels_byte_length = 11; // offset > uncompressed_page_size - page_header.data_page_header_v2 = Some(data_page_header_v2); + let page_header = PageHeader { + r#type: PageType::DATA_PAGE_V2, + uncompressed_page_size: 10, + compressed_page_size: 10, + data_page_header: None, + index_page_header: None, + dictionary_page_header: None, + crc: None, + data_page_header_v2: Some(DataPageHeaderV2 { + num_nulls: 0, + num_rows: 0, + num_values: 0, + encoding: Encoding::PLAIN, + definition_levels_byte_length: 11, + repetition_levels_byte_length: 0, + is_compressed: None, + statistics: None, + }), + }; let buffer = Bytes::new(); let err = decode_page(page_header, buffer, Type::INT32, None).unwrap_err(); diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs index 9754a0e79bc7..de6f855685a6 100644 --- a/parquet/src/schema/types.rs +++ b/parquet/src/schema/types.rs @@ -1358,11 +1358,13 @@ fn schema_from_array_helper<'a>( // All other types must have one. if !is_root_node { let Some(rep) = repetition else { - return Err(general_err!("Repetition level must be defined for non-root types")); + return Err(general_err!( + "Repetition level must be defined for non-root types" + )); }; builder = builder.with_repetition(rep); } - Ok((next_index, Arc::new(builder.build().unwrap()))) + Ok((next_index, Arc::new(builder.build()?))) } } } From 72e30b05b7ce130a0e07351d1c9d23ee1cd301f8 Mon Sep 17 00:00:00 2001 From: Alex Stephen Date: Wed, 22 Oct 2025 13:55:29 -0700 Subject: [PATCH 7/8] PR comments --- parquet/src/column/page.rs | 2 +- parquet/src/encodings/decoding.rs | 5 +--- parquet/src/file/reader.rs | 6 ++-- parquet/src/file/serialized_reader.rs | 41 +++++++++++++++++++++------ 4 files changed, 37 insertions(+), 17 deletions(-) diff --git a/parquet/src/column/page.rs b/parquet/src/column/page.rs index 23517f05df11..f18b296c1c65 100644 --- a/parquet/src/column/page.rs +++ b/parquet/src/column/page.rs @@ -31,7 +31,7 @@ use crate::file::statistics::{Statistics, page_stats_to_thrift}; /// List of supported pages. /// These are 1-to-1 mapped from the equivalent Thrift definitions, except `buf` which /// used to store uncompressed bytes of the page. -#[derive(Clone)] +#[derive(Clone, Debug)] pub enum Page { /// Data page Parquet format v1. DataPage { diff --git a/parquet/src/encodings/decoding.rs b/parquet/src/encodings/decoding.rs index e467d694d5f7..7b5b8a94ae9a 100644 --- a/parquet/src/encodings/decoding.rs +++ b/parquet/src/encodings/decoding.rs @@ -1394,10 +1394,7 @@ mod tests { fn test_dict_decoder_empty_data() { let mut decoder = DictDecoder::::new(); let err = decoder.set_data(Bytes::new(), 10).unwrap_err(); - assert_eq!( - err.to_string(), - "Parquet error: Not enough bytes to decode bit_width" - ); + assert_eq!(err.to_string(), "EOF: Not enough bytes to decode bit_width"); } fn test_plain_decode( diff --git a/parquet/src/file/reader.rs b/parquet/src/file/reader.rs index 03c2c291b712..3adf10fac220 100644 --- a/parquet/src/file/reader.rs +++ b/parquet/src/file/reader.rs @@ -299,7 +299,7 @@ mod tests { let err = data.get_read(5).unwrap_err(); assert_eq!( err.to_string(), - "Parquet error: Expected to read at offset 5, while file has length 4" + "EOF: Expected to read at offset 5, while file has length 4" ); } @@ -309,13 +309,13 @@ mod tests { let err = data.get_bytes(5, 1).unwrap_err(); assert_eq!( err.to_string(), - "Parquet error: Expected to read 1 bytes at offset 5, while file has length 4" + "EOF: Expected to read 1 bytes at offset 5, while file has length 4" ); let err = data.get_bytes(2, 3).unwrap_err(); assert_eq!( err.to_string(), - "Parquet error: Expected to read 3 bytes at offset 2, while file has length 4" + "EOF: Expected to read 3 bytes at offset 2, while file has length 4" ); } } diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index cc545eb8b826..ee9a69b1dcf7 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -25,7 +25,7 @@ use crate::compression::{Codec, create_codec}; #[cfg(feature = "encryption")] use crate::encryption::decrypt::{CryptoContext, read_and_decrypt}; use crate::errors::{ParquetError, Result}; -use crate::file::metadata::thrift::PageHeader; +use crate::file::metadata::thrift::{DataPageHeaderV2, PageHeader}; use crate::file::page_index::offset_index::{OffsetIndexMetaData, PageLocation}; use crate::file::statistics; use crate::file::{ @@ -1145,10 +1145,7 @@ mod tests { use super::*; - #[test] fn test_decode_page_invalid_offset() { - use crate::file::metadata::thrift_gen::DataPageHeaderV2; - let page_header = PageHeader { r#type: PageType::DATA_PAGE_V2, uncompressed_page_size: 10, @@ -1171,19 +1168,45 @@ mod tests { let buffer = Bytes::new(); let err = decode_page(page_header, buffer, Type::INT32, None).unwrap_err(); - assert_eq!(err.to_string(), "Parquet error: Invalid page header"); + assert!( + err.to_string() + .contains("DataPage v2 header contains implausible values") + ); } - #[test] fn test_decode_unsupported_page() { - let mut page_header = PageHeader::default(); - page_header.r#type = PageType::INDEX_PAGE; + let mut page_header = PageHeader { + r#type: PageType::INDEX_PAGE, + uncompressed_page_size: 10, + compressed_page_size: 10, + data_page_header: None, + index_page_header: None, + dictionary_page_header: None, + crc: None, + data_page_header_v2: None, + }; let buffer = Bytes::new(); - let err = decode_page(page_header, buffer, Type::INT32, None).unwrap_err(); + let err = decode_page(page_header.clone(), buffer.clone(), Type::INT32, None).unwrap_err(); assert_eq!( err.to_string(), "Parquet error: Page type INDEX_PAGE is not supported" ); + + page_header.data_page_header_v2 = Some(DataPageHeaderV2 { + num_nulls: 0, + num_rows: 0, + num_values: 0, + encoding: Encoding::PLAIN, + definition_levels_byte_length: 11, + repetition_levels_byte_length: 0, + is_compressed: None, + statistics: None, + }); + let err = decode_page(page_header, buffer, Type::INT32, None).unwrap_err(); + assert!( + err.to_string() + .contains("DataPage v2 header contains implausible values") + ); } #[test] From 07983df8ff0909781cd79fc33b93d68cf9152691 Mon Sep 17 00:00:00 2001 From: Alex Stephen Date: Wed, 22 Oct 2025 14:54:16 -0700 Subject: [PATCH 8/8] Fix clippy --- parquet/src/encodings/decoding.rs | 2 +- parquet/src/file/serialized_reader.rs | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/parquet/src/encodings/decoding.rs b/parquet/src/encodings/decoding.rs index 7b5b8a94ae9a..b5ec5d1c1af5 100644 --- a/parquet/src/encodings/decoding.rs +++ b/parquet/src/encodings/decoding.rs @@ -381,7 +381,7 @@ impl DictDecoder { impl Decoder for DictDecoder { fn set_data(&mut self, data: Bytes, num_values: usize) -> Result<()> { // First byte in `data` is bit width - if data.len() < 1 { + if data.is_empty() { return Err(eof_err!("Not enough bytes to decode bit_width")); } diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index ee9a69b1dcf7..3f95ea9d4982 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -25,7 +25,7 @@ use crate::compression::{Codec, create_codec}; #[cfg(feature = "encryption")] use crate::encryption::decrypt::{CryptoContext, read_and_decrypt}; use crate::errors::{ParquetError, Result}; -use crate::file::metadata::thrift::{DataPageHeaderV2, PageHeader}; +use crate::file::metadata::thrift::PageHeader; use crate::file::page_index::offset_index::{OffsetIndexMetaData, PageLocation}; use crate::file::statistics; use crate::file::{ @@ -1136,6 +1136,7 @@ mod tests { use crate::column::reader::ColumnReader; use crate::data_type::private::ParquetValueType; use crate::data_type::{AsBytes, FixedLenByteArrayType, Int32Type}; + use crate::file::metadata::thrift::DataPageHeaderV2; #[allow(deprecated)] use crate::file::page_index::index_reader::{read_columns_indexes, read_offset_indexes}; use crate::file::writer::SerializedFileWriter; @@ -1145,6 +1146,7 @@ mod tests { use super::*; + #[test] fn test_decode_page_invalid_offset() { let page_header = PageHeader { r#type: PageType::DATA_PAGE_V2, @@ -1174,6 +1176,7 @@ mod tests { ); } + #[test] fn test_decode_unsupported_page() { let mut page_header = PageHeader { r#type: PageType::INDEX_PAGE,