/Users/andrewlamb/Software/arrow-rs/arrow-avro/src/writer/encoder.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! Avro Encoder for Arrow types. |
19 | | |
20 | | use arrow_array::cast::AsArray; |
21 | | use arrow_array::types::{ |
22 | | ArrowPrimitiveType, Float32Type, Float64Type, Int32Type, Int64Type, TimestampMicrosecondType, |
23 | | }; |
24 | | use arrow_array::OffsetSizeTrait; |
25 | | use arrow_array::{Array, GenericBinaryArray, PrimitiveArray, RecordBatch}; |
26 | | use arrow_buffer::NullBuffer; |
27 | | use arrow_schema::{ArrowError, DataType, FieldRef, TimeUnit}; |
28 | | use std::io::Write; |
29 | | |
/// Configuration switches for the Avro encoder.
///
/// The only switch today is `impala_mode`, which selects where the `null`
/// branch sits in the Avro union used for optional/nullable values:
///
/// - `false` (the default): **null first**, i.e. `["null", T]`.
/// - `true`: **null second**, i.e. `[T, "null"]`.
#[derive(Default, Clone, Copy, Debug)]
pub struct EncoderOptions {
    /// Selects the null-second union layout when set.
    // Will be fully implemented in a follow-up PR
    impala_mode: bool,
}
39 | | |
40 | | /// Encode a single Avro-`long` using ZigZag + variable length, buffered. |
41 | | /// |
42 | | /// Spec: <https://avro.apache.org/docs/1.11.1/specification/#binary-encoding> |
43 | | #[inline] |
44 | 853 | pub fn write_long<W: Write + ?Sized>(writer: &mut W, value: i64) -> Result<(), ArrowError> { |
45 | 853 | let mut zz = ((value << 1) ^ (value >> 63)) as u64; |
46 | | // At most 10 bytes for 64-bit varint |
47 | 853 | let mut buf = [0u8; 10]; |
48 | 853 | let mut i = 0; |
49 | 1.15k | while (zz & !0x7F) != 0 { |
50 | 298 | buf[i] = ((zz & 0x7F) as u8) | 0x80; |
51 | 298 | i += 1; |
52 | 298 | zz >>= 7; |
53 | 298 | } |
54 | 853 | buf[i] = (zz & 0x7F) as u8; |
55 | 853 | i += 1; |
56 | 853 | writer |
57 | 853 | .write_all(&buf[..i]) |
58 | 853 | .map_err(|e| ArrowError::IoError(format!0 ("write long: {e}"0 ), e0 )) |
59 | 853 | } |
60 | | |
61 | | #[inline] |
62 | 609 | fn write_int<W: Write + ?Sized>(writer: &mut W, value: i32) -> Result<(), ArrowError> { |
63 | 609 | write_long(writer, value as i64) |
64 | 609 | } |
65 | | |
66 | | #[inline] |
67 | 89 | fn write_len_prefixed<W: Write + ?Sized>(writer: &mut W, bytes: &[u8]) -> Result<(), ArrowError> { |
68 | 89 | write_long(writer, bytes.len() as i64)?0 ; |
69 | 89 | writer |
70 | 89 | .write_all(bytes) |
71 | 89 | .map_err(|e| ArrowError::IoError(format!0 ("write bytes: {e}"0 ), e0 )) |
72 | 89 | } |
73 | | |
74 | | #[inline] |
75 | 40 | fn write_bool<W: Write + ?Sized>(writer: &mut W, v: bool) -> Result<(), ArrowError> { |
76 | 40 | writer |
77 | 40 | .write_all(&[if v { 120 } else { 020 }]) |
78 | 40 | .map_err(|e| ArrowError::IoError(format!0 ("write bool: {e}"0 ), e0 )) |
79 | 40 | } |
80 | | |
81 | | /// Write the union branch index for an optional field. |
82 | | /// |
83 | | /// Branch index is 0-based per Avro unions: |
84 | | /// - Null-first (default): null => 0, value => 1 |
85 | | /// - Null-second (Impala): value => 0, null => 1 |
86 | | #[inline] |
87 | 440 | fn write_optional_branch<W: Write + ?Sized>( |
88 | 440 | writer: &mut W, |
89 | 440 | is_null: bool, |
90 | 440 | impala_mode: bool, |
91 | 440 | ) -> Result<(), ArrowError> { |
92 | 440 | let branch = if impala_mode == is_null { 1 } else { 00 }; |
93 | 440 | write_int(writer, branch) |
94 | 440 | } |
95 | | |
96 | | /// Encode a `RecordBatch` in Avro binary format using **default options**. |
97 | 8 | pub fn encode_record_batch<W: Write>(batch: &RecordBatch, out: &mut W) -> Result<(), ArrowError> { |
98 | 8 | encode_record_batch_with_options(batch, out, &EncoderOptions::default()) |
99 | 8 | } |
100 | | |
101 | | /// Encode a `RecordBatch` with explicit `EncoderOptions`. |
102 | 8 | pub fn encode_record_batch_with_options<W: Write>( |
103 | 8 | batch: &RecordBatch, |
104 | 8 | out: &mut W, |
105 | 8 | opts: &EncoderOptions, |
106 | 8 | ) -> Result<(), ArrowError> { |
107 | 8 | let mut encoders = batch |
108 | 8 | .schema() |
109 | 8 | .fields() |
110 | 8 | .iter() |
111 | 8 | .zip(batch.columns()) |
112 | 61 | .map8 (|(field, array)| Ok((field.is_nullable(), make_encoder(array.as_ref())?0 ))) |
113 | 8 | .collect::<Result<Vec<_>, ArrowError>>()?0 ; |
114 | 49 | (0..batch.num_rows())8 .try_for_each8 (|row| { |
115 | 458 | encoders.iter_mut()49 .try_for_each49 (|(is_nullable, enc)| { |
116 | 458 | if *is_nullable { |
117 | 440 | let is_null = enc.is_null(row); |
118 | 440 | write_optional_branch(out, is_null, opts.impala_mode)?0 ; |
119 | 440 | if is_null { |
120 | 0 | return Ok(()); |
121 | 440 | } |
122 | 18 | } |
123 | 458 | enc.encode(row, out) |
124 | 458 | }) |
125 | 49 | }) |
126 | 8 | } |
127 | | |
128 | | /// Enum for static dispatch of concrete encoders. |
129 | | enum Encoder<'a> { |
130 | | Boolean(BooleanEncoder<'a>), |
131 | | Int(IntEncoder<'a, Int32Type>), |
132 | | Long(LongEncoder<'a, Int64Type>), |
133 | | Timestamp(LongEncoder<'a, TimestampMicrosecondType>), |
134 | | Float32(F32Encoder<'a>), |
135 | | Float64(F64Encoder<'a>), |
136 | | Binary(BinaryEncoder<'a, i32>), |
137 | | } |
138 | | |
139 | | impl<'a> Encoder<'a> { |
140 | | /// Encode the value at `idx`. |
141 | | #[inline] |
142 | 458 | fn encode<W: Write + ?Sized>(&mut self, idx: usize, out: &mut W) -> Result<(), ArrowError> { |
143 | 458 | match self { |
144 | 40 | Encoder::Boolean(e) => e.encode(idx, out), |
145 | 169 | Encoder::Int(e) => e.encode(idx, out), |
146 | 40 | Encoder::Long(e) => e.encode(idx, out), |
147 | 40 | Encoder::Timestamp(e) => e.encode(idx, out), |
148 | 40 | Encoder::Float32(e) => e.encode(idx, out), |
149 | 40 | Encoder::Float64(e) => e.encode(idx, out), |
150 | 89 | Encoder::Binary(e) => e.encode(idx, out), |
151 | | } |
152 | 458 | } |
153 | | } |
154 | | |
155 | | /// An encoder + a null buffer for nullable fields. |
156 | | pub struct NullableEncoder<'a> { |
157 | | encoder: Encoder<'a>, |
158 | | nulls: Option<NullBuffer>, |
159 | | } |
160 | | |
161 | | impl<'a> NullableEncoder<'a> { |
162 | | /// Create a new nullable encoder, wrapping a non-null encoder and a null buffer. |
163 | | #[inline] |
164 | 61 | fn new(encoder: Encoder<'a>, nulls: Option<NullBuffer>) -> Self { |
165 | 61 | Self { encoder, nulls } |
166 | 61 | } |
167 | | |
168 | | /// Encode the value at `idx`, assuming it's not-null. |
169 | | #[inline] |
170 | 458 | fn encode<W: Write + ?Sized>(&mut self, idx: usize, out: &mut W) -> Result<(), ArrowError> { |
171 | 458 | self.encoder.encode(idx, out) |
172 | 458 | } |
173 | | |
174 | | /// Check if the value at `idx` is null. |
175 | | #[inline] |
176 | 440 | fn is_null(&self, idx: usize) -> bool { |
177 | 440 | self.nulls.as_ref().is_some_and(|nulls| nulls0 .is_null0 (idx0 )) |
178 | 440 | } |
179 | | } |
180 | | |
181 | | /// Creates an Avro encoder for the given `array`. |
182 | 61 | pub fn make_encoder<'a>(array: &'a dyn Array) -> Result<NullableEncoder<'a>, ArrowError> { |
183 | 61 | let nulls = array.nulls().cloned(); |
184 | 61 | let enc = match array.data_type() { |
185 | | DataType::Boolean => { |
186 | 5 | let arr = array.as_boolean(); |
187 | 5 | NullableEncoder::new(Encoder::Boolean(BooleanEncoder(arr)), nulls) |
188 | | } |
189 | | DataType::Int32 => { |
190 | 23 | let arr = array.as_primitive::<Int32Type>(); |
191 | 23 | NullableEncoder::new(Encoder::Int(IntEncoder(arr)), nulls) |
192 | | } |
193 | | DataType::Int64 => { |
194 | 5 | let arr = array.as_primitive::<Int64Type>(); |
195 | 5 | NullableEncoder::new(Encoder::Long(LongEncoder(arr)), nulls) |
196 | | } |
197 | | DataType::Float32 => { |
198 | 5 | let arr = array.as_primitive::<Float32Type>(); |
199 | 5 | NullableEncoder::new(Encoder::Float32(F32Encoder(arr)), nulls) |
200 | | } |
201 | | DataType::Float64 => { |
202 | 5 | let arr = array.as_primitive::<Float64Type>(); |
203 | 5 | NullableEncoder::new(Encoder::Float64(F64Encoder(arr)), nulls) |
204 | | } |
205 | | DataType::Binary => { |
206 | 13 | let arr = array.as_binary::<i32>(); |
207 | 13 | NullableEncoder::new(Encoder::Binary(BinaryEncoder(arr)), nulls) |
208 | | } |
209 | | DataType::Timestamp(TimeUnit::Microsecond, _) => { |
210 | 5 | let arr = array.as_primitive::<TimestampMicrosecondType>(); |
211 | 5 | NullableEncoder::new(Encoder::Timestamp(LongEncoder(arr)), nulls) |
212 | | } |
213 | 0 | other => { |
214 | 0 | return Err(ArrowError::NotYetImplemented(format!( |
215 | 0 | "Unsupported data type for Avro encoding in slim build: {other:?}" |
216 | 0 | ))) |
217 | | } |
218 | | }; |
219 | 61 | Ok(enc) |
220 | 61 | } |
221 | | |
222 | | struct BooleanEncoder<'a>(&'a arrow_array::BooleanArray); |
223 | | impl BooleanEncoder<'_> { |
224 | | #[inline] |
225 | 40 | fn encode<W: Write + ?Sized>(&mut self, idx: usize, out: &mut W) -> Result<(), ArrowError> { |
226 | 40 | write_bool(out, self.0.value(idx)) |
227 | 40 | } |
228 | | } |
229 | | |
230 | | /// Generic Avro `int` encoder for primitive arrays with `i32` native values. |
231 | | struct IntEncoder<'a, P: ArrowPrimitiveType<Native = i32>>(&'a PrimitiveArray<P>); |
232 | | impl<'a, P: ArrowPrimitiveType<Native = i32>> IntEncoder<'a, P> { |
233 | | #[inline] |
234 | 169 | fn encode<W: Write + ?Sized>(&mut self, idx: usize, out: &mut W) -> Result<(), ArrowError> { |
235 | 169 | write_int(out, self.0.value(idx)) |
236 | 169 | } |
237 | | } |
238 | | |
239 | | /// Generic Avro `long` encoder for primitive arrays with `i64` native values. |
240 | | struct LongEncoder<'a, P: ArrowPrimitiveType<Native = i64>>(&'a PrimitiveArray<P>); |
241 | | impl<'a, P: ArrowPrimitiveType<Native = i64>> LongEncoder<'a, P> { |
242 | | #[inline] |
243 | 80 | fn encode<W: Write + ?Sized>(&mut self, idx: usize, out: &mut W) -> Result<(), ArrowError> { |
244 | 80 | write_long(out, self.0.value(idx)) |
245 | 80 | } |
246 | | } |
247 | | |
248 | | /// Unified binary encoder generic over offset size (i32/i64). |
249 | | struct BinaryEncoder<'a, O: OffsetSizeTrait>(&'a GenericBinaryArray<O>); |
250 | | impl<'a, O: OffsetSizeTrait> BinaryEncoder<'a, O> { |
251 | | #[inline] |
252 | 89 | fn encode<W: Write + ?Sized>(&mut self, idx: usize, out: &mut W) -> Result<(), ArrowError> { |
253 | 89 | write_len_prefixed(out, self.0.value(idx)) |
254 | 89 | } |
255 | | } |
256 | | |
257 | | struct F32Encoder<'a>(&'a arrow_array::Float32Array); |
258 | | impl F32Encoder<'_> { |
259 | | #[inline] |
260 | 40 | fn encode<W: Write + ?Sized>(&mut self, idx: usize, out: &mut W) -> Result<(), ArrowError> { |
261 | | // Avro float: 4 bytes, IEEE-754 little-endian |
262 | 40 | let bits = self.0.value(idx).to_bits(); |
263 | 40 | out.write_all(&bits.to_le_bytes()) |
264 | 40 | .map_err(|e| ArrowError::IoError(format!0 ("write f32: {e}"0 ), e0 )) |
265 | 40 | } |
266 | | } |
267 | | |
268 | | struct F64Encoder<'a>(&'a arrow_array::Float64Array); |
269 | | impl F64Encoder<'_> { |
270 | | #[inline] |
271 | 40 | fn encode<W: Write + ?Sized>(&mut self, idx: usize, out: &mut W) -> Result<(), ArrowError> { |
272 | | // Avro double: 8 bytes, IEEE-754 little-endian |
273 | 40 | let bits = self.0.value(idx).to_bits(); |
274 | 40 | out.write_all(&bits.to_le_bytes()) |
275 | 40 | .map_err(|e| ArrowError::IoError(format!0 ("write f64: {e}"0 ), e0 )) |
276 | 40 | } |
277 | | } |