Coverage Report

Created: 2025-08-26 07:03

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/Users/andrewlamb/Software/arrow-rs/arrow-avro/src/writer/format.rs
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
use crate::compression::{CompressionCodec, CODEC_METADATA_KEY};
19
use crate::schema::{AvroSchema, SCHEMA_METADATA_KEY};
20
use crate::writer::encoder::{write_long, EncoderOptions};
21
use arrow_schema::{ArrowError, Schema};
22
use rand::RngCore;
23
use serde_json::{Map as JsonMap, Value as JsonValue};
24
use std::fmt::Debug;
25
use std::io::Write;
26
27
/// Format abstraction implemented by each container‐level writer.
28
pub trait AvroFormat: Debug + Default {
29
    /// Write any bytes required at the very beginning of the output stream
30
    /// (file header, etc.).
31
    /// Implementations **must not** write any record data.
32
    fn start_stream<W: Write>(
33
        &mut self,
34
        writer: &mut W,
35
        schema: &Schema,
36
        compression: Option<CompressionCodec>,
37
    ) -> Result<(), ArrowError>;
38
39
    /// Return the 16‑byte sync marker (OCF) or `None` (binary stream).
40
    fn sync_marker(&self) -> Option<&[u8; 16]>;
41
}
42
43
/// Avro Object Container File (OCF) format writer.
44
#[derive(Debug, Default)]
45
pub struct AvroOcfFormat {
46
    sync_marker: [u8; 16],
47
    /// Optional encoder behavior hints to keep file header schema ordering
48
    /// consistent with value encoding (e.g. Impala null-second).
49
    encoder_options: EncoderOptions,
50
}
51
52
impl AvroOcfFormat {
53
    /// Optional helper to attach encoder options (i.e., Impala null-second) to the format.
54
    #[allow(dead_code)]
55
0
    pub fn with_encoder_options(mut self, opts: EncoderOptions) -> Self {
56
0
        self.encoder_options = opts;
57
0
        self
58
0
    }
59
60
    /// Access the options used by this format.
61
    #[allow(dead_code)]
62
0
    pub fn encoder_options(&self) -> &EncoderOptions {
63
0
        &self.encoder_options
64
0
    }
65
}
66
67
impl AvroFormat for AvroOcfFormat {
68
9
    fn start_stream<W: Write>(
69
9
        &mut self,
70
9
        writer: &mut W,
71
9
        schema: &Schema,
72
9
        compression: Option<CompressionCodec>,
73
9
    ) -> Result<(), ArrowError> {
74
9
        let mut rng = rand::rng();
75
9
        rng.fill_bytes(&mut self.sync_marker);
76
9
        let avro_schema = AvroSchema::try_from(schema)
?0
;
77
9
        writer
78
9
            .write_all(b"Obj\x01")
79
9
            .map_err(|e| ArrowError::IoError(
format!0
(
"write OCF magic: {e}"0
),
e0
))
?0
;
80
9
        let codec_str = match 
compression4
{
81
0
            Some(CompressionCodec::Deflate) => "deflate",
82
1
            Some(CompressionCodec::Snappy) => "snappy",
83
1
            Some(CompressionCodec::ZStandard) => "zstandard",
84
1
            Some(CompressionCodec::Bzip2) => "bzip2",
85
1
            Some(CompressionCodec::Xz) => "xz",
86
5
            None => "null",
87
        };
88
9
        write_long(writer, 2)
?0
;
89
9
        write_string(writer, SCHEMA_METADATA_KEY)
?0
;
90
9
        write_bytes(writer, avro_schema.json_string.as_bytes())
?0
;
91
9
        write_string(writer, CODEC_METADATA_KEY)
?0
;
92
9
        write_bytes(writer, codec_str.as_bytes())
?0
;
93
9
        write_long(writer, 0)
?0
;
94
        // Sync marker (16 bytes)
95
9
        writer
96
9
            .write_all(&self.sync_marker)
97
9
            .map_err(|e| ArrowError::IoError(
format!0
(
"write OCF sync marker: {e}"0
),
e0
))
?0
;
98
99
9
        Ok(())
100
9
    }
101
102
9
    fn sync_marker(&self) -> Option<&[u8; 16]> {
103
9
        Some(&self.sync_marker)
104
9
    }
105
}
106
107
/// Raw Avro binary streaming format (no header or footer).
108
#[derive(Debug, Default)]
109
pub struct AvroBinaryFormat;
110
111
impl AvroFormat for AvroBinaryFormat {
112
0
    fn start_stream<W: Write>(
113
0
        &mut self,
114
0
        _writer: &mut W,
115
0
        _schema: &Schema,
116
0
        _compression: Option<CompressionCodec>,
117
0
    ) -> Result<(), ArrowError> {
118
0
        Err(ArrowError::NotYetImplemented(
119
0
            "avro binary format not yet implemented".to_string(),
120
0
        ))
121
0
    }
122
123
0
    fn sync_marker(&self) -> Option<&[u8; 16]> {
124
0
        None
125
0
    }
126
}
127
128
#[inline]
129
18
fn write_string<W: Write>(writer: &mut W, s: &str) -> Result<(), ArrowError> {
130
18
    write_bytes(writer, s.as_bytes())
131
18
}
132
133
#[inline]
134
36
fn write_bytes<W: Write>(writer: &mut W, bytes: &[u8]) -> Result<(), ArrowError> {
135
36
    write_long(writer, bytes.len() as i64)
?0
;
136
36
    writer
137
36
        .write_all(bytes)
138
36
        .map_err(|e| ArrowError::IoError(
format!0
(
"write bytes: {e}"0
),
e0
))
139
36
}