Coverage Report

Created: 2025-08-26 07:03

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/Users/andrewlamb/Software/arrow-rs/arrow-avro/src/writer/encoder.rs
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
//! Avro Encoder for Arrow types.
19
20
use arrow_array::cast::AsArray;
21
use arrow_array::types::{
22
    ArrowPrimitiveType, Float32Type, Float64Type, Int32Type, Int64Type, TimestampMicrosecondType,
23
};
24
use arrow_array::OffsetSizeTrait;
25
use arrow_array::{Array, GenericBinaryArray, PrimitiveArray, RecordBatch};
26
use arrow_buffer::NullBuffer;
27
use arrow_schema::{ArrowError, DataType, FieldRef, TimeUnit};
28
use std::io::Write;
29
30
/// Behavior knobs for the Avro encoder.
///
/// The only knob today is `impala_mode`, which selects where `null` sits in
/// the two-branch union used for optional/nullable values:
///
/// - `false` (the [`Default`]): **null first**, i.e. `["null", T]`.
/// - `true`: **null second**, i.e. `[T, "null"]`.
///
/// The field is private, so in the code visible here the only way to obtain
/// a value is `EncoderOptions::default()`.
#[derive(Debug, Clone, Copy, Default)]
pub struct EncoderOptions {
    /// `true` puts the `null` branch second in optional unions.
    impala_mode: bool, // Will be fully implemented in a follow-up PR
}
39
40
/// Encode a single Avro-`long` using ZigZag + variable length, buffered.
41
///
42
/// Spec: <https://avro.apache.org/docs/1.11.1/specification/#binary-encoding>
43
#[inline]
44
853
pub fn write_long<W: Write + ?Sized>(writer: &mut W, value: i64) -> Result<(), ArrowError> {
45
853
    let mut zz = ((value << 1) ^ (value >> 63)) as u64;
46
    // At most 10 bytes for 64-bit varint
47
853
    let mut buf = [0u8; 10];
48
853
    let mut i = 0;
49
1.15k
    while (zz & !0x7F) != 0 {
50
298
        buf[i] = ((zz & 0x7F) as u8) | 0x80;
51
298
        i += 1;
52
298
        zz >>= 7;
53
298
    }
54
853
    buf[i] = (zz & 0x7F) as u8;
55
853
    i += 1;
56
853
    writer
57
853
        .write_all(&buf[..i])
58
853
        .map_err(|e| ArrowError::IoError(
format!0
(
"write long: {e}"0
),
e0
))
59
853
}
60
61
#[inline]
62
609
fn write_int<W: Write + ?Sized>(writer: &mut W, value: i32) -> Result<(), ArrowError> {
63
609
    write_long(writer, value as i64)
64
609
}
65
66
#[inline]
67
89
fn write_len_prefixed<W: Write + ?Sized>(writer: &mut W, bytes: &[u8]) -> Result<(), ArrowError> {
68
89
    write_long(writer, bytes.len() as i64)
?0
;
69
89
    writer
70
89
        .write_all(bytes)
71
89
        .map_err(|e| ArrowError::IoError(
format!0
(
"write bytes: {e}"0
),
e0
))
72
89
}
73
74
#[inline]
75
40
fn write_bool<W: Write + ?Sized>(writer: &mut W, v: bool) -> Result<(), ArrowError> {
76
40
    writer
77
40
        .write_all(&[if v { 
120
} else {
020
}])
78
40
        .map_err(|e| ArrowError::IoError(
format!0
(
"write bool: {e}"0
),
e0
))
79
40
}
80
81
/// Write the union branch index for an optional field.
82
///
83
/// Branch index is 0-based per Avro unions:
84
/// - Null-first (default): null => 0, value => 1
85
/// - Null-second (Impala): value => 0, null => 1
86
#[inline]
87
440
fn write_optional_branch<W: Write + ?Sized>(
88
440
    writer: &mut W,
89
440
    is_null: bool,
90
440
    impala_mode: bool,
91
440
) -> Result<(), ArrowError> {
92
440
    let branch = if impala_mode == is_null { 1 } else { 
00
};
93
440
    write_int(writer, branch)
94
440
}
95
96
/// Encode a `RecordBatch` in Avro binary format using **default options**.
97
8
pub fn encode_record_batch<W: Write>(batch: &RecordBatch, out: &mut W) -> Result<(), ArrowError> {
98
8
    encode_record_batch_with_options(batch, out, &EncoderOptions::default())
99
8
}
100
101
/// Encode a `RecordBatch` with explicit `EncoderOptions`.
102
8
pub fn encode_record_batch_with_options<W: Write>(
103
8
    batch: &RecordBatch,
104
8
    out: &mut W,
105
8
    opts: &EncoderOptions,
106
8
) -> Result<(), ArrowError> {
107
8
    let mut encoders = batch
108
8
        .schema()
109
8
        .fields()
110
8
        .iter()
111
8
        .zip(batch.columns())
112
61
        .
map8
(|(field, array)| Ok((field.is_nullable(), make_encoder(array.as_ref())
?0
)))
113
8
        .collect::<Result<Vec<_>, ArrowError>>()
?0
;
114
49
    
(0..batch.num_rows())8
.
try_for_each8
(|row| {
115
458
        
encoders.iter_mut()49
.
try_for_each49
(|(is_nullable, enc)| {
116
458
            if *is_nullable {
117
440
                let is_null = enc.is_null(row);
118
440
                write_optional_branch(out, is_null, opts.impala_mode)
?0
;
119
440
                if is_null {
120
0
                    return Ok(());
121
440
                }
122
18
            }
123
458
            enc.encode(row, out)
124
458
        })
125
49
    })
126
8
}
127
128
/// Enum for static dispatch of concrete encoders.
129
enum Encoder<'a> {
130
    Boolean(BooleanEncoder<'a>),
131
    Int(IntEncoder<'a, Int32Type>),
132
    Long(LongEncoder<'a, Int64Type>),
133
    Timestamp(LongEncoder<'a, TimestampMicrosecondType>),
134
    Float32(F32Encoder<'a>),
135
    Float64(F64Encoder<'a>),
136
    Binary(BinaryEncoder<'a, i32>),
137
}
138
139
impl<'a> Encoder<'a> {
140
    /// Encode the value at `idx`.
141
    #[inline]
142
458
    fn encode<W: Write + ?Sized>(&mut self, idx: usize, out: &mut W) -> Result<(), ArrowError> {
143
458
        match self {
144
40
            Encoder::Boolean(e) => e.encode(idx, out),
145
169
            Encoder::Int(e) => e.encode(idx, out),
146
40
            Encoder::Long(e) => e.encode(idx, out),
147
40
            Encoder::Timestamp(e) => e.encode(idx, out),
148
40
            Encoder::Float32(e) => e.encode(idx, out),
149
40
            Encoder::Float64(e) => e.encode(idx, out),
150
89
            Encoder::Binary(e) => e.encode(idx, out),
151
        }
152
458
    }
153
}
154
155
/// An encoder + a null buffer for nullable fields.
156
pub struct NullableEncoder<'a> {
157
    encoder: Encoder<'a>,
158
    nulls: Option<NullBuffer>,
159
}
160
161
impl<'a> NullableEncoder<'a> {
162
    /// Create a new nullable encoder, wrapping a non-null encoder and a null buffer.
163
    #[inline]
164
61
    fn new(encoder: Encoder<'a>, nulls: Option<NullBuffer>) -> Self {
165
61
        Self { encoder, nulls }
166
61
    }
167
168
    /// Encode the value at `idx`, assuming it's not-null.
169
    #[inline]
170
458
    fn encode<W: Write + ?Sized>(&mut self, idx: usize, out: &mut W) -> Result<(), ArrowError> {
171
458
        self.encoder.encode(idx, out)
172
458
    }
173
174
    /// Check if the value at `idx` is null.
175
    #[inline]
176
440
    fn is_null(&self, idx: usize) -> bool {
177
440
        self.nulls.as_ref().is_some_and(|nulls| 
nulls0
.
is_null0
(
idx0
))
178
440
    }
179
}
180
181
/// Creates an Avro encoder for the given `array`.
182
61
pub fn make_encoder<'a>(array: &'a dyn Array) -> Result<NullableEncoder<'a>, ArrowError> {
183
61
    let nulls = array.nulls().cloned();
184
61
    let enc = match array.data_type() {
185
        DataType::Boolean => {
186
5
            let arr = array.as_boolean();
187
5
            NullableEncoder::new(Encoder::Boolean(BooleanEncoder(arr)), nulls)
188
        }
189
        DataType::Int32 => {
190
23
            let arr = array.as_primitive::<Int32Type>();
191
23
            NullableEncoder::new(Encoder::Int(IntEncoder(arr)), nulls)
192
        }
193
        DataType::Int64 => {
194
5
            let arr = array.as_primitive::<Int64Type>();
195
5
            NullableEncoder::new(Encoder::Long(LongEncoder(arr)), nulls)
196
        }
197
        DataType::Float32 => {
198
5
            let arr = array.as_primitive::<Float32Type>();
199
5
            NullableEncoder::new(Encoder::Float32(F32Encoder(arr)), nulls)
200
        }
201
        DataType::Float64 => {
202
5
            let arr = array.as_primitive::<Float64Type>();
203
5
            NullableEncoder::new(Encoder::Float64(F64Encoder(arr)), nulls)
204
        }
205
        DataType::Binary => {
206
13
            let arr = array.as_binary::<i32>();
207
13
            NullableEncoder::new(Encoder::Binary(BinaryEncoder(arr)), nulls)
208
        }
209
        DataType::Timestamp(TimeUnit::Microsecond, _) => {
210
5
            let arr = array.as_primitive::<TimestampMicrosecondType>();
211
5
            NullableEncoder::new(Encoder::Timestamp(LongEncoder(arr)), nulls)
212
        }
213
0
        other => {
214
0
            return Err(ArrowError::NotYetImplemented(format!(
215
0
                "Unsupported data type for Avro encoding in slim build: {other:?}"
216
0
            )))
217
        }
218
    };
219
61
    Ok(enc)
220
61
}
221
222
struct BooleanEncoder<'a>(&'a arrow_array::BooleanArray);
223
impl BooleanEncoder<'_> {
224
    #[inline]
225
40
    fn encode<W: Write + ?Sized>(&mut self, idx: usize, out: &mut W) -> Result<(), ArrowError> {
226
40
        write_bool(out, self.0.value(idx))
227
40
    }
228
}
229
230
/// Generic Avro `int` encoder for primitive arrays with `i32` native values.
231
struct IntEncoder<'a, P: ArrowPrimitiveType<Native = i32>>(&'a PrimitiveArray<P>);
232
impl<'a, P: ArrowPrimitiveType<Native = i32>> IntEncoder<'a, P> {
233
    #[inline]
234
169
    fn encode<W: Write + ?Sized>(&mut self, idx: usize, out: &mut W) -> Result<(), ArrowError> {
235
169
        write_int(out, self.0.value(idx))
236
169
    }
237
}
238
239
/// Generic Avro `long` encoder for primitive arrays with `i64` native values.
240
struct LongEncoder<'a, P: ArrowPrimitiveType<Native = i64>>(&'a PrimitiveArray<P>);
241
impl<'a, P: ArrowPrimitiveType<Native = i64>> LongEncoder<'a, P> {
242
    #[inline]
243
80
    fn encode<W: Write + ?Sized>(&mut self, idx: usize, out: &mut W) -> Result<(), ArrowError> {
244
80
        write_long(out, self.0.value(idx))
245
80
    }
246
}
247
248
/// Unified binary encoder generic over offset size (i32/i64).
249
struct BinaryEncoder<'a, O: OffsetSizeTrait>(&'a GenericBinaryArray<O>);
250
impl<'a, O: OffsetSizeTrait> BinaryEncoder<'a, O> {
251
    #[inline]
252
89
    fn encode<W: Write + ?Sized>(&mut self, idx: usize, out: &mut W) -> Result<(), ArrowError> {
253
89
        write_len_prefixed(out, self.0.value(idx))
254
89
    }
255
}
256
257
struct F32Encoder<'a>(&'a arrow_array::Float32Array);
258
impl F32Encoder<'_> {
259
    #[inline]
260
40
    fn encode<W: Write + ?Sized>(&mut self, idx: usize, out: &mut W) -> Result<(), ArrowError> {
261
        // Avro float: 4 bytes, IEEE-754 little-endian
262
40
        let bits = self.0.value(idx).to_bits();
263
40
        out.write_all(&bits.to_le_bytes())
264
40
            .map_err(|e| ArrowError::IoError(
format!0
(
"write f32: {e}"0
),
e0
))
265
40
    }
266
}
267
268
struct F64Encoder<'a>(&'a arrow_array::Float64Array);
269
impl F64Encoder<'_> {
270
    #[inline]
271
40
    fn encode<W: Write + ?Sized>(&mut self, idx: usize, out: &mut W) -> Result<(), ArrowError> {
272
        // Avro double: 8 bytes, IEEE-754 little-endian
273
40
        let bits = self.0.value(idx).to_bits();
274
40
        out.write_all(&bits.to_le_bytes())
275
40
            .map_err(|e| ArrowError::IoError(
format!0
(
"write f64: {e}"0
),
e0
))
276
40
    }
277
}