diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 5dc59d790d53..5aec476a6ee9 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -297,6 +297,14 @@ impl ArrowWriter { Ok(()) } + /// Writes the given `buf` bytes to the internal buffer. + /// + /// It's safe to use this method to write data to the underlying writer, + /// because it will ensure that the buffering and byte-counting layers are used. + pub fn write_all(&mut self, buf: &[u8]) -> std::io::Result<()> { + self.writer.write_all(buf) + } + /// Flushes all buffered rows into a new row group pub fn flush(&mut self) -> Result<()> { let in_progress = match self.in_progress.take() { @@ -326,8 +334,12 @@ impl ArrowWriter { /// Returns a mutable reference to the underlying writer. /// - /// It is inadvisable to directly write to the underlying writer, doing so - /// will likely result in a corrupt parquet file + /// **Warning**: if you write directly to this writer, you will skip + /// the `TrackedWrite` buffering and byte-counting layers. That will cause + /// the file footer's recorded offsets and sizes to diverge from reality, + /// resulting in an unreadable or corrupted Parquet file. + /// + /// If you want to write safely to the underlying writer, use [`Self::write_all`]. pub fn inner_mut(&mut self) -> &mut W { self.writer.inner_mut() } diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 0589d0933056..31a3344db66c 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -394,9 +394,26 @@ impl SerializedFileWriter { self.buf.inner() } + /// Writes the given `buf` bytes to the internal buffer. + /// + /// This can be used to write raw data to an in-progress Parquet file, for + /// example, custom index structures or other payloads. Other Parquet readers + /// will skip this data when reading the file. 
+ /// It's safe to use this method to write data to the underlying writer, + /// because it will ensure that the buffering and byte-counting layers are used. + pub fn write_all(&mut self, buf: &[u8]) -> std::io::Result<()> { + self.buf.write_all(buf) + } + /// Returns a mutable reference to the underlying writer. /// - /// It is inadvisable to directly write to the underlying writer. + /// **Warning**: if you write directly to this writer, you will skip + /// the `TrackedWrite` buffering and byte-counting layers. That will cause + /// the file footer's recorded offsets and sizes to diverge from reality, + /// resulting in an unreadable or corrupted Parquet file. + /// + /// If you want to write safely to the underlying writer, use [`Self::write_all`]. pub fn inner_mut(&mut self) -> &mut W { self.buf.inner_mut() }