/Users/andrewlamb/Software/arrow-rs/arrow-avro/src/reader/record.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use crate::codec::{AvroDataType, Codec, Nullability, Promotion, ResolutionInfo}; |
19 | | use crate::reader::block::{Block, BlockDecoder}; |
20 | | use crate::reader::cursor::AvroCursor; |
21 | | use crate::reader::header::Header; |
22 | | use crate::schema::*; |
23 | | use arrow_array::builder::{ |
24 | | ArrayBuilder, Decimal128Builder, Decimal256Builder, IntervalMonthDayNanoBuilder, |
25 | | PrimitiveBuilder, |
26 | | }; |
27 | | use arrow_array::types::*; |
28 | | use arrow_array::*; |
29 | | use arrow_buffer::*; |
30 | | use arrow_schema::{ |
31 | | ArrowError, DataType, Field as ArrowField, FieldRef, Fields, IntervalUnit, |
32 | | Schema as ArrowSchema, SchemaRef, DECIMAL128_MAX_PRECISION, DECIMAL256_MAX_PRECISION, |
33 | | }; |
34 | | use std::cmp::Ordering; |
35 | | use std::collections::HashMap; |
36 | | use std::io::Read; |
37 | | use std::sync::Arc; |
38 | | use uuid::Uuid; |
39 | | |
/// Initial capacity handed to the value vectors, offset builders and null
/// buffers that `Decoder::try_new` allocates.
const DEFAULT_CAPACITY: usize = 1024;
41 | | |
42 | | #[derive(Debug)] |
43 | | pub(crate) struct RecordDecoderBuilder<'a> { |
44 | | data_type: &'a AvroDataType, |
45 | | use_utf8view: bool, |
46 | | } |
47 | | |
48 | | impl<'a> RecordDecoderBuilder<'a> { |
49 | 0 | pub(crate) fn new(data_type: &'a AvroDataType) -> Self { |
50 | 0 | Self { |
51 | 0 | data_type, |
52 | 0 | use_utf8view: false, |
53 | 0 | } |
54 | 0 | } |
55 | | |
56 | 0 | pub(crate) fn with_utf8_view(mut self, use_utf8view: bool) -> Self { |
57 | 0 | self.use_utf8view = use_utf8view; |
58 | 0 | self |
59 | 0 | } |
60 | | |
61 | | /// Builds the `RecordDecoder`. |
62 | 0 | pub(crate) fn build(self) -> Result<RecordDecoder, ArrowError> { |
63 | 0 | RecordDecoder::try_new_with_options(self.data_type, self.use_utf8view) |
64 | 0 | } |
65 | | } |
66 | | |
/// Decodes avro encoded data into [`RecordBatch`]
///
/// Rows are accumulated by `decode` into one [`Decoder`] per reader field and
/// turned into a batch by `flush`.
#[derive(Debug)]
pub(crate) struct RecordDecoder {
    /// Arrow schema assembled from the reader record's fields; attached to every flushed batch.
    schema: SchemaRef,
    /// One decoder per reader field, in the order the fields appear in the record.
    fields: Vec<Decoder>,
    /// Flag supplied at construction time; it is only stored by the code in this type.
    use_utf8view: bool,
    /// Writer-to-reader projection helpers, present only when the record carries
    /// `ResolutionInfo::Record` metadata.
    resolved: Option<ResolvedRuntime>,
}
75 | | |
/// Per-record helpers used by `RecordDecoder::decode` when the top-level record
/// carries schema-resolution metadata.
#[derive(Debug)]
struct ResolvedRuntime {
    /// writer field index -> reader field index (or None if writer-only)
    writer_to_reader: Arc<[Option<usize>]>,
    /// per-writer-field skipper (Some only when writer-only)
    skip_decoders: Vec<Option<Skipper>>,
}
83 | | |
84 | | impl RecordDecoder { |
85 | | /// Creates a new `RecordDecoderBuilder` for configuring a `RecordDecoder`. |
86 | 0 | pub(crate) fn new(data_type: &'_ AvroDataType) -> Self { |
87 | 0 | RecordDecoderBuilder::new(data_type).build().unwrap() |
88 | 0 | } |
89 | | |
90 | | /// Create a new [`RecordDecoder`] from the provided [`AvroDataType`] with default options |
91 | 0 | pub(crate) fn try_new(data_type: &AvroDataType) -> Result<Self, ArrowError> { |
92 | 0 | RecordDecoderBuilder::new(data_type) |
93 | 0 | .with_utf8_view(true) |
94 | 0 | .build() |
95 | 0 | } |
96 | | |
97 | | /// Creates a new [`RecordDecoder`] from the provided [`AvroDataType`] with additional options. |
98 | | /// |
99 | | /// This method allows you to customize how the Avro data is decoded into Arrow arrays. |
100 | | /// |
101 | | /// # Arguments |
102 | | /// * `data_type` - The Avro data type to decode. |
103 | | /// * `use_utf8view` - A flag indicating whether to use `Utf8View` for string types. |
104 | | /// * `strict_mode` - A flag to enable strict decoding, returning an error if the data |
105 | | /// does not conform to the schema. |
106 | | /// |
107 | | /// # Errors |
108 | | /// This function will return an error if the provided `data_type` is not a `Record`. |
109 | 90 | pub(crate) fn try_new_with_options( |
110 | 90 | data_type: &AvroDataType, |
111 | 90 | use_utf8view: bool, |
112 | 90 | ) -> Result<Self, ArrowError> { |
113 | 90 | match data_type.codec() { |
114 | 90 | Codec::Struct(reader_fields) => { |
115 | | // Build Arrow schema fields and per-child decoders |
116 | 90 | let mut arrow_fields = Vec::with_capacity(reader_fields.len()); |
117 | 90 | let mut encodings = Vec::with_capacity(reader_fields.len()); |
118 | 521 | for avro_field in reader_fields90 .iter90 () { |
119 | 521 | arrow_fields.push(avro_field.field()); |
120 | 521 | encodings.push(Decoder::try_new(avro_field.data_type())?0 ); |
121 | | } |
122 | | // If this record carries resolution metadata, prepare top-level runtime helpers |
123 | 90 | let resolved = match data_type.resolution.as_ref() { |
124 | 29 | Some(ResolutionInfo::Record(rec)) => { |
125 | 29 | let skip_decoders = build_skip_decoders(&rec.skip_fields)?0 ; |
126 | 29 | Some(ResolvedRuntime { |
127 | 29 | writer_to_reader: rec.writer_to_reader.clone(), |
128 | 29 | skip_decoders, |
129 | 29 | }) |
130 | | } |
131 | 61 | _ => None, |
132 | | }; |
133 | 90 | Ok(Self { |
134 | 90 | schema: Arc::new(ArrowSchema::new(arrow_fields)), |
135 | 90 | fields: encodings, |
136 | 90 | use_utf8view, |
137 | 90 | resolved, |
138 | 90 | }) |
139 | | } |
140 | 0 | other => Err(ArrowError::ParseError(format!( |
141 | 0 | "Expected record got {other:?}" |
142 | 0 | ))), |
143 | | } |
144 | 90 | } |
145 | | |
146 | | /// Returns the decoder's `SchemaRef` |
147 | 73 | pub(crate) fn schema(&self) -> &SchemaRef { |
148 | 73 | &self.schema |
149 | 73 | } |
150 | | |
151 | | /// Decode `count` records from `buf` |
152 | 135 | pub(crate) fn decode(&mut self, buf: &[u8], count: usize) -> Result<usize, ArrowError> { |
153 | 135 | let mut cursor = AvroCursor::new(buf); |
154 | 135 | match self.resolved.as_mut() { |
155 | 22 | Some(runtime) => { |
156 | | // Top-level resolved record: read writer fields in writer order, |
157 | | // project into reader fields, and skip writer-only fields |
158 | 22 | for _ in 0..count { |
159 | 141 | decode_with_resolution( |
160 | 141 | &mut cursor, |
161 | 141 | &mut self.fields, |
162 | 141 | &runtime.writer_to_reader, |
163 | 141 | &mut runtime.skip_decoders, |
164 | 0 | )?; |
165 | | } |
166 | | } |
167 | | None => { |
168 | 113 | for _ in 0..count { |
169 | 2.78k | for field2.30k in &mut self.fields { |
170 | 2.30k | field.decode(&mut cursor)?0 ; |
171 | | } |
172 | | } |
173 | | } |
174 | | } |
175 | 135 | Ok(cursor.position()) |
176 | 135 | } |
177 | | |
178 | | /// Flush the decoded records into a [`RecordBatch`] |
179 | 133 | pub(crate) fn flush(&mut self) -> Result<RecordBatch, ArrowError> { |
180 | 133 | let arrays = self |
181 | 133 | .fields |
182 | 133 | .iter_mut() |
183 | 679 | .map133 (|x| x.flush(None)) |
184 | 133 | .collect::<Result<Vec<_>, _>>()?0 ; |
185 | 133 | RecordBatch::try_new(self.schema.clone(), arrays) |
186 | 133 | } |
187 | | } |
188 | | |
189 | 147 | fn decode_with_resolution( |
190 | 147 | buf: &mut AvroCursor<'_>, |
191 | 147 | encodings: &mut [Decoder], |
192 | 147 | writer_to_reader: &[Option<usize>], |
193 | 147 | skippers: &mut [Option<Skipper>], |
194 | 147 | ) -> Result<(), ArrowError> { |
195 | 1.51k | for (w_idx, (target, skipper_opt)) in writer_to_reader147 .iter147 ().zip147 (skippers147 ).enumerate147 () { |
196 | 1.51k | match (*target, skipper_opt.as_mut()) { |
197 | 1.35k | (Some(r_idx), _) => encodings[r_idx].decode(buf)?0 , |
198 | 158 | (None, Some(sk)) => sk.skip(buf)?0 , |
199 | | (None, None) => { |
200 | 0 | return Err(ArrowError::SchemaError(format!( |
201 | 0 | "No skipper available for writer-only field at index {w_idx}", |
202 | 0 | ))); |
203 | | } |
204 | | } |
205 | | } |
206 | 147 | Ok(()) |
207 | 147 | } |
208 | | |
/// Accumulates decoded Avro values for a single field until they are flushed
/// into an Arrow array. Each variant holds the in-progress buffers for one
/// Avro/Arrow type combination.
#[derive(Debug)]
enum Decoder {
    /// Number of null values seen so far.
    Null(usize),
    Boolean(BooleanBufferBuilder),
    Int32(Vec<i32>),
    Int64(Vec<i64>),
    Float32(Vec<f32>),
    Float64(Vec<f64>),
    Date32(Vec<i32>),
    TimeMillis(Vec<i32>),
    TimeMicros(Vec<i64>),
    /// The flag is the `is_utc` value from the codec; when set, the flushed array
    /// gets the `"+00:00"` timezone.
    TimestampMillis(bool, Vec<i64>),
    /// The flag is the `is_utc` value from the codec; when set, the flushed array
    /// gets the `"+00:00"` timezone.
    TimestampMicros(bool, Vec<i64>),
    /// Promotion: values are read as Avro `int` and stored as `i64`.
    Int32ToInt64(Vec<i64>),
    /// Promotion: values are read as Avro `int` and stored as `f32`.
    Int32ToFloat32(Vec<f32>),
    /// Promotion: values are read as Avro `int` and stored as `f64`.
    Int32ToFloat64(Vec<f64>),
    /// Promotion: values are read as Avro `long` and stored as `f32`.
    Int64ToFloat32(Vec<f32>),
    /// Promotion: values are read as Avro `long` and stored as `f64`.
    Int64ToFloat64(Vec<f64>),
    /// Promotion: values are read as Avro `float` and stored as `f64`.
    Float32ToFloat64(Vec<f64>),
    /// Promotion: length-prefixed bytes flushed as a `StringArray`.
    BytesToString(OffsetBufferBuilder<i32>, Vec<u8>),
    /// Promotion: length-prefixed bytes flushed as a `BinaryArray`.
    StringToBytes(OffsetBufferBuilder<i32>, Vec<u8>),
    /// Length-prefixed bytes flushed as a `BinaryArray` (offsets, value bytes).
    Binary(OffsetBufferBuilder<i32>, Vec<u8>),
    /// String data encoded as UTF-8 bytes, mapped to Arrow's StringArray
    String(OffsetBufferBuilder<i32>, Vec<u8>),
    /// String data encoded as UTF-8 bytes, but mapped to Arrow's StringViewArray
    StringView(OffsetBufferBuilder<i32>, Vec<u8>),
    /// List: item field, per-row offsets, and the decoder for the items.
    Array(FieldRef, OffsetBufferBuilder<i32>, Box<Decoder>),
    /// Record without resolution metadata: Arrow fields and one decoder per field.
    Record(Fields, Vec<Decoder>),
    /// Map: entries field, key offsets, per-row map offsets, key bytes, value decoder.
    Map(
        FieldRef,
        OffsetBufferBuilder<i32>,
        OffsetBufferBuilder<i32>,
        Vec<u8>,
        Box<Decoder>,
    ),
    /// Fixed-size binary: byte width and the concatenated value bytes.
    Fixed(i32, Vec<u8>),
    /// Enum: decoded symbol indices and the symbol table; flushed as a dictionary array.
    Enum(Vec<i32>, Arc<[String]>),
    /// Avro duration, read as a 12-byte fixed value (months, days, milliseconds).
    Duration(IntervalMonthDayNanoBuilder),
    /// UUIDs parsed from their string form and stored as 16 raw bytes each.
    Uuid(Vec<u8>),
    /// Decimal: precision, optional scale, optional fixed byte size, value builder.
    Decimal128(usize, Option<usize>, Option<usize>, Decimal128Builder),
    /// Decimal: precision, optional scale, optional fixed byte size, value builder.
    Decimal256(usize, Option<usize>, Option<usize>, Decimal256Builder),
    /// Nullable wrapper: which union branch is null, the validity bits, and the inner decoder.
    Nullable(Nullability, NullBufferBuilder, Box<Decoder>),
    /// Resolved record that needs writer->reader projection and skipping writer-only fields
    RecordResolved {
        fields: Fields,
        encodings: Vec<Decoder>,
        writer_to_reader: Arc<[Option<usize>]>,
        skip_decoders: Vec<Option<Skipper>>,
    },
}
259 | | |
260 | | impl Decoder { |
261 | 678 | fn try_new(data_type: &AvroDataType) -> Result<Self, ArrowError> { |
262 | | // Extract just the Promotion (if any) to simplify pattern matching |
263 | 678 | let promotion = match data_type.resolution.as_ref() { |
264 | 69 | Some(ResolutionInfo::Promotion(p)) => Some(p), |
265 | 609 | _ => None, |
266 | | }; |
267 | 678 | let decoder = match (data_type.codec(), promotion) { |
268 | | (Codec::Int64, Some(Promotion::IntToLong)) => { |
269 | 11 | Self::Int32ToInt64(Vec::with_capacity(DEFAULT_CAPACITY)) |
270 | | } |
271 | | (Codec::Float32, Some(Promotion::IntToFloat)) => { |
272 | 6 | Self::Int32ToFloat32(Vec::with_capacity(DEFAULT_CAPACITY)) |
273 | | } |
274 | | (Codec::Float64, Some(Promotion::IntToDouble)) => { |
275 | 11 | Self::Int32ToFloat64(Vec::with_capacity(DEFAULT_CAPACITY)) |
276 | | } |
277 | | (Codec::Float32, Some(Promotion::LongToFloat)) => { |
278 | 6 | Self::Int64ToFloat32(Vec::with_capacity(DEFAULT_CAPACITY)) |
279 | | } |
280 | | (Codec::Float64, Some(Promotion::LongToDouble)) => { |
281 | 6 | Self::Int64ToFloat64(Vec::with_capacity(DEFAULT_CAPACITY)) |
282 | | } |
283 | | (Codec::Float64, Some(Promotion::FloatToDouble)) => { |
284 | 6 | Self::Float32ToFloat64(Vec::with_capacity(DEFAULT_CAPACITY)) |
285 | | } |
286 | | (Codec::Utf8, Some(Promotion::BytesToString)) |
287 | 22 | | (Codec::Utf8View, Some(Promotion::BytesToString)) => Self::BytesToString( |
288 | 22 | OffsetBufferBuilder::new(DEFAULT_CAPACITY), |
289 | 22 | Vec::with_capacity(DEFAULT_CAPACITY), |
290 | 22 | ), |
291 | 1 | (Codec::Binary, Some(Promotion::StringToBytes)) => Self::StringToBytes( |
292 | 1 | OffsetBufferBuilder::new(DEFAULT_CAPACITY), |
293 | 1 | Vec::with_capacity(DEFAULT_CAPACITY), |
294 | 1 | ), |
295 | 0 | (Codec::Null, _) => Self::Null(0), |
296 | 42 | (Codec::Boolean, _) => Self::Boolean(BooleanBufferBuilder::new(DEFAULT_CAPACITY)), |
297 | 179 | (Codec::Int32, _) => Self::Int32(Vec::with_capacity(DEFAULT_CAPACITY)), |
298 | 46 | (Codec::Int64, _) => Self::Int64(Vec::with_capacity(DEFAULT_CAPACITY)), |
299 | 36 | (Codec::Float32, _) => Self::Float32(Vec::with_capacity(DEFAULT_CAPACITY)), |
300 | 49 | (Codec::Float64, _) => Self::Float64(Vec::with_capacity(DEFAULT_CAPACITY)), |
301 | 58 | (Codec::Binary, _) => Self::Binary( |
302 | 58 | OffsetBufferBuilder::new(DEFAULT_CAPACITY), |
303 | 58 | Vec::with_capacity(DEFAULT_CAPACITY), |
304 | 58 | ), |
305 | 20 | (Codec::Utf8, _) => Self::String( |
306 | 20 | OffsetBufferBuilder::new(DEFAULT_CAPACITY), |
307 | 20 | Vec::with_capacity(DEFAULT_CAPACITY), |
308 | 20 | ), |
309 | 0 | (Codec::Utf8View, _) => Self::StringView( |
310 | 0 | OffsetBufferBuilder::new(DEFAULT_CAPACITY), |
311 | 0 | Vec::with_capacity(DEFAULT_CAPACITY), |
312 | 0 | ), |
313 | 0 | (Codec::Date32, _) => Self::Date32(Vec::with_capacity(DEFAULT_CAPACITY)), |
314 | 0 | (Codec::TimeMillis, _) => Self::TimeMillis(Vec::with_capacity(DEFAULT_CAPACITY)), |
315 | 0 | (Codec::TimeMicros, _) => Self::TimeMicros(Vec::with_capacity(DEFAULT_CAPACITY)), |
316 | 0 | (Codec::TimestampMillis(is_utc), _) => { |
317 | 0 | Self::TimestampMillis(*is_utc, Vec::with_capacity(DEFAULT_CAPACITY)) |
318 | | } |
319 | 38 | (Codec::TimestampMicros(is_utc), _) => { |
320 | 38 | Self::TimestampMicros(*is_utc, Vec::with_capacity(DEFAULT_CAPACITY)) |
321 | | } |
322 | 8 | (Codec::Fixed(sz), _) => Self::Fixed(*sz, Vec::with_capacity(DEFAULT_CAPACITY)), |
323 | 12 | (Codec::Decimal(precision, scale, size), _) => { |
324 | 12 | let p = *precision; |
325 | 12 | let s = *scale; |
326 | 12 | let sz = *size; |
327 | 12 | let prec = p as u8; |
328 | 12 | let scl = s.unwrap_or(0) as i8; |
329 | 12 | match (sz, p) { |
330 | 11 | (Some(fixed_size10 ), _) if fixed_size <= 1610 => { |
331 | 10 | let builder = |
332 | 10 | Decimal128Builder::new().with_precision_and_scale(prec, scl)?0 ; |
333 | 10 | Self::Decimal128(p, s, sz, builder) |
334 | | } |
335 | 1 | (Some(fixed_size), _) if fixed_size <= 32 => { |
336 | 1 | let builder = |
337 | 1 | Decimal256Builder::new().with_precision_and_scale(prec, scl)?0 ; |
338 | 1 | Self::Decimal256(p, s, sz, builder) |
339 | | } |
340 | 0 | (Some(fixed_size), _) => { |
341 | 0 | return Err(ArrowError::ParseError(format!( |
342 | 0 | "Unsupported decimal size: {fixed_size:?}" |
343 | 0 | ))); |
344 | | } |
345 | 1 | (None, p) if p <= DECIMAL128_MAX_PRECISION as usize => { |
346 | 1 | let builder = |
347 | 1 | Decimal128Builder::new().with_precision_and_scale(prec, scl)?0 ; |
348 | 1 | Self::Decimal128(p, s, sz, builder) |
349 | | } |
350 | 0 | (None, p) if p <= DECIMAL256_MAX_PRECISION as usize => { |
351 | 0 | let builder = |
352 | 0 | Decimal256Builder::new().with_precision_and_scale(prec, scl)?; |
353 | 0 | Self::Decimal256(p, s, sz, builder) |
354 | | } |
355 | | (None, _) => { |
356 | 0 | return Err(ArrowError::ParseError(format!( |
357 | 0 | "Decimal precision {p} exceeds maximum supported" |
358 | 0 | ))); |
359 | | } |
360 | | } |
361 | | } |
362 | 3 | (Codec::Interval, _) => Self::Duration(IntervalMonthDayNanoBuilder::new()), |
363 | 54 | (Codec::List(item), _) => { |
364 | 54 | let decoder = Self::try_new(item)?0 ; |
365 | 54 | Self::Array( |
366 | 54 | Arc::new(item.field_with_name("item")), |
367 | 54 | OffsetBufferBuilder::new(DEFAULT_CAPACITY), |
368 | 54 | Box::new(decoder), |
369 | 54 | ) |
370 | | } |
371 | 11 | (Codec::Enum(symbols), _) => { |
372 | 11 | Self::Enum(Vec::with_capacity(DEFAULT_CAPACITY), symbols.clone()) |
373 | | } |
374 | 37 | (Codec::Struct(fields), _) => { |
375 | 37 | let mut arrow_fields = Vec::with_capacity(fields.len()); |
376 | 37 | let mut encodings = Vec::with_capacity(fields.len()); |
377 | 61 | for avro_field in fields37 .iter37 () { |
378 | 61 | let encoding = Self::try_new(avro_field.data_type())?0 ; |
379 | 61 | arrow_fields.push(avro_field.field()); |
380 | 61 | encodings.push(encoding); |
381 | | } |
382 | 37 | if let Some(ResolutionInfo::Record(rec0 )) = data_type.resolution.as_ref() { |
383 | 0 | let skip_decoders = build_skip_decoders(&rec.skip_fields)?; |
384 | 0 | Self::RecordResolved { |
385 | 0 | fields: arrow_fields.into(), |
386 | 0 | encodings, |
387 | 0 | writer_to_reader: rec.writer_to_reader.clone(), |
388 | 0 | skip_decoders, |
389 | 0 | } |
390 | | } else { |
391 | 37 | Self::Record(arrow_fields.into(), encodings) |
392 | | } |
393 | | } |
394 | 14 | (Codec::Map(child), _) => { |
395 | 14 | let val_field = child.field_with_name("value").with_nullable(true); |
396 | 14 | let map_field = Arc::new(ArrowField::new( |
397 | | "entries", |
398 | 14 | DataType::Struct(Fields::from(vec![ |
399 | 14 | ArrowField::new("key", DataType::Utf8, false), |
400 | 14 | val_field, |
401 | 14 | ])), |
402 | | false, |
403 | | )); |
404 | 14 | let val_dec = Self::try_new(child)?0 ; |
405 | 14 | Self::Map( |
406 | 14 | map_field, |
407 | 14 | OffsetBufferBuilder::new(DEFAULT_CAPACITY), |
408 | 14 | OffsetBufferBuilder::new(DEFAULT_CAPACITY), |
409 | 14 | Vec::with_capacity(DEFAULT_CAPACITY), |
410 | 14 | Box::new(val_dec), |
411 | 14 | ) |
412 | | } |
413 | 2 | (Codec::Uuid, _) => Self::Uuid(Vec::with_capacity(DEFAULT_CAPACITY)), |
414 | | }; |
415 | 678 | Ok(match data_type.nullability() { |
416 | 584 | Some(nullability) => Self::Nullable( |
417 | 584 | nullability, |
418 | 584 | NullBufferBuilder::new(DEFAULT_CAPACITY), |
419 | 584 | Box::new(decoder), |
420 | 584 | ), |
421 | 94 | None => decoder, |
422 | | }) |
423 | 678 | } |
424 | | |
425 | | /// Append a null record |
426 | 226 | fn append_null(&mut self) { |
427 | 226 | match self { |
428 | 0 | Self::Null(count) => *count += 1, |
429 | 2 | Self::Boolean(b) => b.append(false), |
430 | 47 | Self::Int32(v) | Self::Date32(v0 ) | Self::TimeMillis(v0 ) => v.push(0), |
431 | 7 | Self::Int64(v) |
432 | 0 | | Self::Int32ToInt64(v) |
433 | 0 | | Self::TimeMicros(v) |
434 | 0 | | Self::TimestampMillis(_, v) |
435 | 7 | | Self::TimestampMicros(_, v0 ) => v.push(0), |
436 | 2 | Self::Float32(v) | Self::Int32ToFloat32(v0 ) | Self::Int64ToFloat32(v0 ) => v.push(0.), |
437 | 6 | Self::Float64(v) |
438 | 0 | | Self::Int32ToFloat64(v) |
439 | 0 | | Self::Int64ToFloat64(v) |
440 | 6 | | Self::Float32ToFloat64(v0 ) => v.push(0.), |
441 | 3 | Self::Binary(offsets, _) |
442 | 20 | | Self::String(offsets, _) |
443 | 0 | | Self::StringView(offsets, _) |
444 | 0 | | Self::BytesToString(offsets, _) |
445 | 23 | | Self::StringToBytes(offsets0 , _) => { |
446 | 23 | offsets.push_length(0); |
447 | 23 | } |
448 | 0 | Self::Uuid(v) => { |
449 | 0 | v.extend([0; 16]); |
450 | 0 | } |
451 | 61 | Self::Array(_, offsets, e) => { |
452 | 61 | offsets.push_length(0); |
453 | 61 | } |
454 | 36 | Self::Record(_, e26 ) => e.iter_mut()26 .for_each26 (|e| e.append_null()), |
455 | 14 | Self::Map(_, _koff, moff, _, _) => { |
456 | 14 | moff.push_length(0); |
457 | 14 | } |
458 | 2 | Self::Fixed(sz, accum) => { |
459 | 2 | accum.extend(std::iter::repeat_n(0u8, *sz as usize)); |
460 | 2 | } |
461 | 2 | Self::Decimal128(_, _, _, builder) => builder.append_value(0), |
462 | 0 | Self::Decimal256(_, _, _, builder) => builder.append_value(i256::ZERO), |
463 | 3 | Self::Enum(indices, _) => indices.push(0), |
464 | 1 | Self::Duration(builder) => builder.append_null(), |
465 | 30 | Self::Nullable(_, null_buffer, inner) => { |
466 | 30 | null_buffer.append(false); |
467 | 30 | inner.append_null(); |
468 | 30 | } |
469 | 0 | Self::RecordResolved { encodings, .. } => { |
470 | 0 | encodings.iter_mut().for_each(|e| e.append_null()); |
471 | | } |
472 | | } |
473 | 226 | } |
474 | | |
475 | | /// Decode a single record from `buf` |
476 | 8.15k | fn decode(&mut self, buf: &mut AvroCursor<'_>) -> Result<(), ArrowError> { |
477 | 8.15k | match self { |
478 | 0 | Self::Null(x) => *x += 1, |
479 | 295 | Self::Boolean(values) => values.append(buf.get_bool()?0 ), |
480 | 1.17k | Self::Int32(values) | Self::Date32(values0 ) | Self::TimeMillis(values0 ) => { |
481 | 1.17k | values.push1.16k (buf.get_int()?1 ) |
482 | | } |
483 | 242 | Self::Int64(values) |
484 | 0 | | Self::TimeMicros(values) |
485 | 0 | | Self::TimestampMillis(_, values) |
486 | 534 | | Self::TimestampMicros(_, values292 ) => values.push(buf.get_long()?0 ), |
487 | 250 | Self::Float32(values) => values.push(buf.get_float()?0 ), |
488 | 309 | Self::Float64(values) => values.push(buf.get_double()?0 ), |
489 | 44 | Self::Int32ToInt64(values) => values.push(buf.get_int()?0 as i64), |
490 | 43 | Self::Int32ToFloat32(values) => values.push(buf.get_int()?0 as f32), |
491 | 83 | Self::Int32ToFloat64(values) => values.push(buf.get_int()?0 as f64), |
492 | 43 | Self::Int64ToFloat32(values) => values.push(buf.get_long()?0 as f32), |
493 | 43 | Self::Int64ToFloat64(values) => values.push(buf.get_long()?0 as f64), |
494 | 43 | Self::Float32ToFloat64(values) => values.push(buf.get_float()?0 as f64), |
495 | 3 | Self::StringToBytes(offsets, values) |
496 | 164 | | Self::BytesToString(offsets, values) |
497 | 422 | | Self::Binary(offsets, values) |
498 | 68 | | Self::String(offsets, values) |
499 | 0 | | Self::StringView(offsets, values) => { |
500 | 657 | let data = buf.get_bytes()?0 ; |
501 | 657 | offsets.push_length(data.len()); |
502 | 657 | values.extend_from_slice(data); |
503 | | } |
504 | 5 | Self::Uuid(values) => { |
505 | 5 | let s_bytes = buf.get_bytes()?0 ; |
506 | 5 | let s = std::str::from_utf8(s_bytes).map_err(|e| {0 |
507 | 0 | ArrowError::ParseError(format!("UUID bytes are not valid UTF-8: {e}")) |
508 | 0 | })?; |
509 | 5 | let uuid = Uuid::try_parse(s) |
510 | 5 | .map_err(|e| ArrowError::ParseError(format!0 ("Failed to parse uuid: {e}"0 )))?0 ; |
511 | 5 | values.extend_from_slice(uuid.as_bytes()); |
512 | | } |
513 | 160 | Self::Array(_, off, encoding) => { |
514 | 289 | let total_items160 = read_blocks160 (buf160 , |cursor| encoding.decode(cursor))?0 ; |
515 | 160 | off.push_length(total_items); |
516 | | } |
517 | 115 | Self::Record(_, encodings) => { |
518 | 316 | for encoding201 in encodings { |
519 | 201 | encoding.decode(buf)?0 ; |
520 | | } |
521 | | } |
522 | 40 | Self::Map(_, koff, moff, kdata, valdec) => { |
523 | 40 | let newly_added = read_blocks(buf, |cur| {37 |
524 | 37 | let kb = cur.get_bytes()?0 ; |
525 | 37 | koff.push_length(kb.len()); |
526 | 37 | kdata.extend_from_slice(kb); |
527 | 37 | valdec.decode(cur) |
528 | 37 | })?0 ; |
529 | 40 | moff.push_length(newly_added); |
530 | | } |
531 | 12 | Self::Fixed(sz, accum) => { |
532 | 12 | let fx = buf.get_fixed(*sz as usize)?0 ; |
533 | 12 | accum.extend_from_slice(fx); |
534 | | } |
535 | 198 | Self::Decimal128(_, _, size, builder) => { |
536 | 198 | let raw = if let Some(s196 ) = size { |
537 | 196 | buf.get_fixed(*s)?0 |
538 | | } else { |
539 | 2 | buf.get_bytes()?0 |
540 | | }; |
541 | 198 | let ext = sign_extend_to::<16>(raw)?0 ; |
542 | 198 | let val = i128::from_be_bytes(ext); |
543 | 198 | builder.append_value(val); |
544 | | } |
545 | 2 | Self::Decimal256(_, _, size, builder) => { |
546 | 2 | let raw = if let Some(s) = size { |
547 | 2 | buf.get_fixed(*s)?0 |
548 | | } else { |
549 | 0 | buf.get_bytes()? |
550 | | }; |
551 | 2 | let ext = sign_extend_to::<32>(raw)?0 ; |
552 | 2 | let val = i256::from_be_bytes(ext); |
553 | 2 | builder.append_value(val); |
554 | | } |
555 | 36 | Self::Enum(indices, _) => { |
556 | 36 | indices.push(buf.get_int()?0 ); |
557 | | } |
558 | 6 | Self::Duration(builder) => { |
559 | 6 | let b = buf.get_fixed(12)?0 ; |
560 | 6 | let months = u32::from_le_bytes(b[0..4].try_into().unwrap()); |
561 | 6 | let days = u32::from_le_bytes(b[4..8].try_into().unwrap()); |
562 | 6 | let millis = u32::from_le_bytes(b[8..12].try_into().unwrap()); |
563 | 6 | let nanos = (millis as i64) * 1_000_000; |
564 | 6 | builder.append_value(IntervalMonthDayNano::new(months as i32, days as i32, nanos)); |
565 | | } |
566 | 4.06k | Self::Nullable(order, nb, encoding) => { |
567 | 4.06k | let branch = buf.read_vlq()?0 ; |
568 | 4.06k | let is_not_null = match *order { |
569 | 487 | Nullability::NullFirst => branch != 0, |
570 | 3.57k | Nullability::NullSecond => branch == 0, |
571 | | }; |
572 | 4.06k | if is_not_null { |
573 | | // It is important to decode before appending to null buffer in case of decode error |
574 | 3.90k | encoding.decode(buf)?1 ; |
575 | 3.90k | nb.append(true); |
576 | 160 | } else { |
577 | 160 | encoding.append_null(); |
578 | 160 | nb.append(false); |
579 | 160 | } |
580 | | } |
581 | | Self::RecordResolved { |
582 | 6 | encodings, |
583 | 6 | writer_to_reader, |
584 | 6 | skip_decoders, |
585 | | .. |
586 | | } => { |
587 | 6 | decode_with_resolution(buf, encodings, writer_to_reader, skip_decoders)?0 ; |
588 | | } |
589 | | } |
590 | 8.15k | Ok(()) |
591 | 8.15k | } |
592 | | |
593 | | /// Flush decoded records to an [`ArrayRef`] |
594 | 1.68k | fn flush(&mut self, nulls: Option<NullBuffer>) -> Result<ArrayRef, ArrowError> { |
595 | 1.68k | Ok(match self { |
596 | 794 | Self::Nullable(_, n, e) => e.flush(n.finish())?0 , |
597 | 0 | Self::Null(size) => Arc::new(NullArray::new(std::mem::replace(size, 0))), |
598 | 52 | Self::Boolean(b) => Arc::new(BooleanArray::new(b.finish(), nulls)), |
599 | 242 | Self::Int32(values) => Arc::new(flush_primitive::<Int32Type>(values, nulls)), |
600 | 0 | Self::Date32(values) => Arc::new(flush_primitive::<Date32Type>(values, nulls)), |
601 | 53 | Self::Int64(values) => Arc::new(flush_primitive::<Int64Type>(values, nulls)), |
602 | 0 | Self::TimeMillis(values) => { |
603 | 0 | Arc::new(flush_primitive::<Time32MillisecondType>(values, nulls)) |
604 | | } |
605 | 0 | Self::TimeMicros(values) => { |
606 | 0 | Arc::new(flush_primitive::<Time64MicrosecondType>(values, nulls)) |
607 | | } |
608 | 0 | Self::TimestampMillis(is_utc, values) => Arc::new( |
609 | 0 | flush_primitive::<TimestampMillisecondType>(values, nulls) |
610 | 0 | .with_timezone_opt(is_utc.then(|| "+00:00")), |
611 | | ), |
612 | 48 | Self::TimestampMicros(is_utc, values) => Arc::new( |
613 | 48 | flush_primitive::<TimestampMicrosecondType>(values, nulls) |
614 | 48 | .with_timezone_opt(is_utc.then(|| "+00:00")), |
615 | | ), |
616 | 46 | Self::Float32(values) => Arc::new(flush_primitive::<Float32Type>(values, nulls)), |
617 | 61 | Self::Float64(values) => Arc::new(flush_primitive::<Float64Type>(values, nulls)), |
618 | 6 | Self::Int32ToInt64(values) => Arc::new(flush_primitive::<Int64Type>(values, nulls)), |
619 | 6 | Self::Int32ToFloat32(values) | Self::Int64ToFloat32(values) => { |
620 | 12 | Arc::new(flush_primitive::<Float32Type>(values, nulls)) |
621 | | } |
622 | 11 | Self::Int32ToFloat64(values) |
623 | 6 | | Self::Int64ToFloat64(values) |
624 | 6 | | Self::Float32ToFloat64(values) => { |
625 | 23 | Arc::new(flush_primitive::<Float64Type>(values, nulls)) |
626 | | } |
627 | 79 | Self::StringToBytes(offsets1 , values1 ) | Self::Binary(offsets, values) => { |
628 | 80 | let offsets = flush_offsets(offsets); |
629 | 80 | let values = flush_values(values).into(); |
630 | 80 | Arc::new(BinaryArray::new(offsets, values, nulls)) |
631 | | } |
632 | 23 | Self::BytesToString(offsets22 , values22 ) | Self::String(offsets, values) => { |
633 | 45 | let offsets = flush_offsets(offsets); |
634 | 45 | let values = flush_values(values).into(); |
635 | 45 | Arc::new(StringArray::new(offsets, values, nulls)) |
636 | | } |
637 | 0 | Self::StringView(offsets, values) => { |
638 | 0 | let offsets = flush_offsets(offsets); |
639 | 0 | let values = flush_values(values); |
640 | 0 | let array = StringArray::new(offsets, values.into(), nulls.clone()); |
641 | 0 | let values: Vec<&str> = (0..array.len()) |
642 | 0 | .map(|i| { |
643 | 0 | if array.is_valid(i) { |
644 | 0 | array.value(i) |
645 | | } else { |
646 | 0 | "" |
647 | | } |
648 | 0 | }) |
649 | 0 | .collect(); |
650 | 0 | Arc::new(StringViewArray::from(values)) |
651 | | } |
652 | 71 | Self::Array(field, offsets, values) => { |
653 | 71 | let values = values.flush(None)?0 ; |
654 | 71 | let offsets = flush_offsets(offsets); |
655 | 71 | Arc::new(ListArray::new(field.clone(), offsets, values, nulls)) |
656 | | } |
657 | 49 | Self::Record(fields, encodings) => { |
658 | 49 | let arrays = encodings |
659 | 49 | .iter_mut() |
660 | 82 | .map49 (|x| x.flush(None)) |
661 | 49 | .collect::<Result<Vec<_>, _>>()?0 ; |
662 | 49 | Arc::new(StructArray::new(fields.clone(), arrays, nulls)) |
663 | | } |
664 | 20 | Self::Map(map_field, k_off, m_off, kdata, valdec) => { |
665 | 20 | let moff = flush_offsets(m_off); |
666 | 20 | let koff = flush_offsets(k_off); |
667 | 20 | let kd = flush_values(kdata).into(); |
668 | 20 | let val_arr = valdec.flush(None)?0 ; |
669 | 20 | let key_arr = StringArray::new(koff, kd, None); |
670 | 20 | if key_arr.len() != val_arr.len() { |
671 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
672 | 0 | "Map keys length ({}) != map values length ({})", |
673 | 0 | key_arr.len(), |
674 | 0 | val_arr.len() |
675 | 0 | ))); |
676 | 20 | } |
677 | 20 | let final_len = moff.len() - 1; |
678 | 20 | if let Some(n7 ) = &nulls { |
679 | 7 | if n.len() != final_len { |
680 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
681 | 0 | "Map array null buffer length {} != final map length {final_len}", |
682 | 0 | n.len() |
683 | 0 | ))); |
684 | 7 | } |
685 | 13 | } |
686 | 20 | let entries_struct = StructArray::new( |
687 | 20 | Fields::from(vec![ |
688 | 20 | Arc::new(ArrowField::new("key", DataType::Utf8, false)), |
689 | 20 | Arc::new(ArrowField::new("value", val_arr.data_type().clone(), true)), |
690 | | ]), |
691 | 20 | vec![Arc::new(key_arr), val_arr], |
692 | 20 | None, |
693 | | ); |
694 | 20 | let map_arr = MapArray::new(map_field.clone(), moff, entries_struct, nulls, false); |
695 | 20 | Arc::new(map_arr) |
696 | | } |
697 | 11 | Self::Fixed(sz, accum) => { |
698 | 11 | let b: Buffer = flush_values(accum).into(); |
699 | 11 | let arr = FixedSizeBinaryArray::try_new(*sz, b, nulls) |
700 | 11 | .map_err(|e| ArrowError::ParseError(e0 .to_string0 ()))?0 ; |
701 | 11 | Arc::new(arr) |
702 | | } |
703 | 2 | Self::Uuid(values) => { |
704 | 2 | let arr = FixedSizeBinaryArray::try_new(16, std::mem::take(values).into(), nulls) |
705 | 2 | .map_err(|e| ArrowError::ParseError(e0 .to_string0 ()))?0 ; |
706 | 2 | Arc::new(arr) |
707 | | } |
708 | 47 | Self::Decimal128(precision, scale, _, builder) => { |
709 | 47 | let (_, vals, _) = builder.finish().into_parts(); |
710 | 47 | let scl = scale.unwrap_or(0); |
711 | 47 | let dec = Decimal128Array::new(vals, nulls) |
712 | 47 | .with_precision_and_scale(*precision as u8, scl as i8) |
713 | 47 | .map_err(|e| ArrowError::ParseError(e0 .to_string0 ()))?0 ; |
714 | 47 | Arc::new(dec) |
715 | | } |
716 | 1 | Self::Decimal256(precision, scale, _, builder) => { |
717 | 1 | let (_, vals, _) = builder.finish().into_parts(); |
718 | 1 | let scl = scale.unwrap_or(0); |
719 | 1 | let dec = Decimal256Array::new(vals, nulls) |
720 | 1 | .with_precision_and_scale(*precision as u8, scl as i8) |
721 | 1 | .map_err(|e| ArrowError::ParseError(e0 .to_string0 ()))?0 ; |
722 | 1 | Arc::new(dec) |
723 | | } |
724 | 14 | Self::Enum(indices, symbols) => { |
725 | 14 | let keys = flush_primitive::<Int32Type>(indices, nulls); |
726 | 14 | let values = Arc::new(StringArray::from( |
727 | 47 | symbols.iter()14 .map14 (|s| s.as_str()).collect14 ::<Vec<_>>(), |
728 | | )); |
729 | 14 | Arc::new(DictionaryArray::try_new(keys, values)?0 ) |
730 | | } |
731 | 3 | Self::Duration(builder) => { |
732 | 3 | let (_, vals, _) = builder.finish().into_parts(); |
733 | 3 | let vals = IntervalMonthDayNanoArray::try_new(vals, nulls) |
734 | 3 | .map_err(|e| ArrowError::ParseError(e0 .to_string0 ()))?0 ; |
735 | 3 | Arc::new(vals) |
736 | | } |
737 | | Self::RecordResolved { |
738 | 5 | fields, encodings, .. |
739 | | } => { |
740 | 5 | let arrays = encodings |
741 | 5 | .iter_mut() |
742 | 6 | .map5 (|x| x.flush(None)) |
743 | 5 | .collect::<Result<Vec<_>, _>>()?0 ; |
744 | 5 | Arc::new(StructArray::new(fields.clone(), arrays, nulls)) |
745 | | } |
746 | | }) |
747 | 1.68k | } |
748 | | } |
749 | | |
/// Selects what `process_blockwise` does with a block whose item count is
/// negative, i.e. a block whose count is followed by its size in bytes.
#[derive(Clone, Copy, Debug)]
enum NegativeBlockBehavior {
    /// Read the byte size, then still invoke the per-item callback for every item.
    ProcessItems,
    /// Read the byte size and consume that many bytes in one step, without
    /// invoking the per-item callback.
    SkipBySize,
}
755 | | |
756 | | #[inline] |
757 | 2 | fn skip_blocks( |
758 | 2 | buf: &mut AvroCursor<'_>, |
759 | 2 | mut skip_item: impl FnMut(&mut AvroCursor<'_>) -> Result<(), ArrowError>, |
760 | 2 | _skip_negative_block_by_size: bool, |
761 | 2 | ) -> Result<usize, ArrowError> { |
762 | 2 | process_blockwise( |
763 | 2 | buf, |
764 | 0 | move |c| skip_item(c), |
765 | 2 | NegativeBlockBehavior::SkipBySize, |
766 | | ) |
767 | 2 | } |
768 | | |
769 | | #[inline] |
770 | 200 | fn read_blocks( |
771 | 200 | buf: &mut AvroCursor, |
772 | 200 | decode_entry: impl FnMut(&mut AvroCursor) -> Result<(), ArrowError>, |
773 | 200 | ) -> Result<usize, ArrowError> { |
774 | 200 | process_blockwise(buf, decode_entry, NegativeBlockBehavior::ProcessItems) |
775 | 200 | } |
776 | | |
777 | | #[inline] |
778 | 202 | fn process_blockwise( |
779 | 202 | buf: &mut AvroCursor<'_>, |
780 | 202 | mut on_item: impl FnMut(&mut AvroCursor<'_>) -> Result<(), ArrowError>, |
781 | 202 | negative_behavior: NegativeBlockBehavior, |
782 | 202 | ) -> Result<usize, ArrowError> { |
783 | 202 | let mut total = 0usize; |
784 | | loop { |
785 | 363 | let block_count = buf.get_long()?0 ; |
786 | 363 | match block_count.cmp(&0) { |
787 | 202 | Ordering::Equal => break, |
788 | | Ordering::Less => { |
789 | 3 | let count = (-block_count) as usize; |
790 | | // A negative count is followed by a long of the size in bytes |
791 | 3 | let size_in_bytes = buf.get_long()?0 as usize; |
792 | 3 | match negative_behavior { |
793 | | NegativeBlockBehavior::ProcessItems => { |
794 | | // Process items one-by-one after reading size |
795 | 1 | for _ in 0..count { |
796 | 3 | on_item(buf)?0 ; |
797 | | } |
798 | | } |
799 | | NegativeBlockBehavior::SkipBySize => { |
800 | | // Skip the entire block payload at once |
801 | 2 | let _ = buf.get_fixed(size_in_bytes)?0 ; |
802 | | } |
803 | | } |
804 | 3 | total += count; |
805 | | } |
806 | | Ordering::Greater => { |
807 | 158 | let count = block_count as usize; |
808 | 158 | for _ in 0..count { |
809 | 323 | on_item(buf)?0 ; |
810 | | } |
811 | 158 | total += count; |
812 | | } |
813 | | } |
814 | | } |
815 | 202 | Ok(total) |
816 | 202 | } |
817 | | |
818 | | #[inline] |
819 | 661 | fn flush_values<T>(values: &mut Vec<T>) -> Vec<T> { |
820 | 661 | std::mem::replace(values, Vec::with_capacity(DEFAULT_CAPACITY)) |
821 | 661 | } |
822 | | |
823 | | #[inline] |
824 | 236 | fn flush_offsets(offsets: &mut OffsetBufferBuilder<i32>) -> OffsetBuffer<i32> { |
825 | 236 | std::mem::replace(offsets, OffsetBufferBuilder::new(DEFAULT_CAPACITY)).finish() |
826 | 236 | } |
827 | | |
828 | | #[inline] |
829 | 505 | fn flush_primitive<T: ArrowPrimitiveType>( |
830 | 505 | values: &mut Vec<T::Native>, |
831 | 505 | nulls: Option<NullBuffer>, |
832 | 505 | ) -> PrimitiveArray<T> { |
833 | 505 | PrimitiveArray::new(flush_values(values).into(), nulls) |
834 | 505 | } |
835 | | |
836 | | /// Sign extends a byte slice to a fixed-size array of N bytes. |
837 | | /// This is done by filling the leading bytes with 0x00 for positive numbers |
838 | | /// or 0xFF for negative numbers. |
839 | | #[inline] |
840 | 200 | fn sign_extend_to<const N: usize>(raw: &[u8]) -> Result<[u8; N], ArrowError> { |
841 | 200 | if raw.len() > N { |
842 | 0 | return Err(ArrowError::ParseError(format!( |
843 | 0 | "Cannot extend a slice of length {} to {} bytes.", |
844 | 0 | raw.len(), |
845 | 0 | N |
846 | 0 | ))); |
847 | 200 | } |
848 | 200 | let mut arr = [0u8; N]; |
849 | 200 | let pad_len = N - raw.len(); |
850 | | // Determine the byte to use for padding based on the sign bit of the raw data. |
851 | 200 | let extension_byte = if raw.is_empty() || (raw[0] & 0x80 == 0) { |
852 | 196 | 0x00 |
853 | | } else { |
854 | 4 | 0xFF |
855 | | }; |
856 | 200 | arr[..pad_len].fill(extension_byte); |
857 | 200 | arr[pad_len..].copy_from_slice(raw); |
858 | 200 | Ok(arr) |
859 | 200 | } |
860 | | |
861 | | /// Lightweight skipping decoder for writer-only fields |
862 | | #[derive(Debug)] |
863 | | enum Skipper { |
864 | | Null, |
865 | | Boolean, |
866 | | Int32, |
867 | | Int64, |
868 | | Float32, |
869 | | Float64, |
870 | | Bytes, |
871 | | String, |
872 | | Date32, |
873 | | TimeMillis, |
874 | | TimeMicros, |
875 | | TimestampMillis, |
876 | | TimestampMicros, |
877 | | Fixed(usize), |
878 | | Decimal(Option<usize>), |
879 | | UuidString, |
880 | | Enum, |
881 | | DurationFixed12, |
882 | | List(Box<Skipper>), |
883 | | Map(Box<Skipper>), |
884 | | Struct(Vec<Skipper>), |
885 | | Nullable(Nullability, Box<Skipper>), |
886 | | } |
887 | | |
888 | | impl Skipper { |
889 | 19 | fn from_avro(dt: &AvroDataType) -> Result<Self, ArrowError> { |
890 | 19 | let mut base = match dt.codec() { |
891 | 0 | Codec::Null => Self::Null, |
892 | 2 | Codec::Boolean => Self::Boolean, |
893 | 7 | Codec::Int32 | Codec::Date32 | Codec::TimeMillis => Self::Int32, |
894 | 2 | Codec::Int64 => Self::Int64, |
895 | 0 | Codec::TimeMicros => Self::TimeMicros, |
896 | 0 | Codec::TimestampMillis(_) => Self::TimestampMillis, |
897 | 1 | Codec::TimestampMicros(_) => Self::TimestampMicros, |
898 | 2 | Codec::Float32 => Self::Float32, |
899 | 1 | Codec::Float64 => Self::Float64, |
900 | 4 | Codec::Binary => Self::Bytes, |
901 | 0 | Codec::Utf8 | Codec::Utf8View => Self::String, |
902 | 0 | Codec::Fixed(sz) => Self::Fixed(*sz as usize), |
903 | 0 | Codec::Decimal(_, _, size) => Self::Decimal(*size), |
904 | 0 | Codec::Uuid => Self::UuidString, // encoded as string |
905 | 0 | Codec::Enum(_) => Self::Enum, |
906 | 0 | Codec::List(item) => Self::List(Box::new(Skipper::from_avro(item)?)), |
907 | 0 | Codec::Struct(fields) => Self::Struct( |
908 | 0 | fields |
909 | 0 | .iter() |
910 | 0 | .map(|f| Skipper::from_avro(f.data_type())) |
911 | 0 | .collect::<Result<_, _>>()?, |
912 | | ), |
913 | 0 | Codec::Map(values) => Self::Map(Box::new(Skipper::from_avro(values)?)), |
914 | 0 | Codec::Interval => Self::DurationFixed12, |
915 | | _ => { |
916 | 0 | return Err(ArrowError::NotYetImplemented(format!( |
917 | 0 | "Skipper not implemented for codec {:?}", |
918 | 0 | dt.codec() |
919 | 0 | ))); |
920 | | } |
921 | | }; |
922 | 19 | if let Some(n) = dt.nullability() { |
923 | 19 | base = Self::Nullable(n, Box::new(base)); |
924 | 19 | }0 |
925 | 19 | Ok(base) |
926 | 19 | } |
927 | | |
928 | 311 | fn skip(&mut self, buf: &mut AvroCursor<'_>) -> Result<(), ArrowError> { |
929 | 311 | match self { |
930 | 0 | Self::Null => Ok(()), |
931 | | Self::Boolean => { |
932 | 16 | buf.get_bool()?0 ; |
933 | 16 | Ok(()) |
934 | | } |
935 | | Self::Int32 | Self::Date32 | Self::TimeMillis => { |
936 | 58 | buf.get_int()?0 ; |
937 | 58 | Ok(()) |
938 | | } |
939 | | Self::Int64 | Self::TimeMicros | Self::TimestampMillis | Self::TimestampMicros => { |
940 | 24 | buf.get_long()?0 ; |
941 | 24 | Ok(()) |
942 | | } |
943 | | Self::Float32 => { |
944 | 16 | buf.get_float()?0 ; |
945 | 16 | Ok(()) |
946 | | } |
947 | | Self::Float64 => { |
948 | 8 | buf.get_double()?0 ; |
949 | 8 | Ok(()) |
950 | | } |
951 | | Self::Bytes | Self::String | Self::UuidString => { |
952 | 33 | buf.get_bytes()?0 ; |
953 | 33 | Ok(()) |
954 | | } |
955 | 0 | Self::Fixed(sz) => { |
956 | 0 | buf.get_fixed(*sz)?; |
957 | 0 | Ok(()) |
958 | | } |
959 | 0 | Self::Decimal(size) => { |
960 | 0 | if let Some(s) = size { |
961 | 0 | buf.get_fixed(*s) |
962 | | } else { |
963 | 0 | buf.get_bytes() |
964 | 0 | }?; |
965 | 0 | Ok(()) |
966 | | } |
967 | | Self::Enum => { |
968 | 0 | buf.get_int()?; |
969 | 0 | Ok(()) |
970 | | } |
971 | | Self::DurationFixed12 => { |
972 | 0 | buf.get_fixed(12)?; |
973 | 0 | Ok(()) |
974 | | } |
975 | 1 | Self::List(item) => { |
976 | 1 | skip_blocks(buf, |c| item0 .skip0 (c0 ), true)?0 ; |
977 | 1 | Ok(()) |
978 | | } |
979 | 1 | Self::Map(value) => { |
980 | 1 | skip_blocks( |
981 | 1 | buf, |
982 | 0 | |c| { |
983 | 0 | c.get_bytes()?; // key |
984 | 0 | value.skip(c) |
985 | 0 | }, |
986 | | true, |
987 | 0 | )?; |
988 | 1 | Ok(()) |
989 | | } |
990 | 0 | Self::Struct(fields) => { |
991 | 0 | for f in fields.iter_mut() { |
992 | 0 | f.skip(buf)? |
993 | | } |
994 | 0 | Ok(()) |
995 | | } |
996 | 154 | Self::Nullable(order, inner) => { |
997 | 154 | let branch = buf.read_vlq()?0 ; |
998 | 154 | let is_not_null = match *order { |
999 | 2 | Nullability::NullFirst => branch != 0, |
1000 | 152 | Nullability::NullSecond => branch == 0, |
1001 | | }; |
1002 | 154 | if is_not_null { |
1003 | 153 | inner.skip(buf)?0 ; |
1004 | 1 | } |
1005 | 154 | Ok(()) |
1006 | | } |
1007 | | } |
1008 | 311 | } |
1009 | | } |
1010 | | |
1011 | | #[inline] |
1012 | 29 | fn build_skip_decoders( |
1013 | 29 | skip_fields: &[Option<AvroDataType>], |
1014 | 29 | ) -> Result<Vec<Option<Skipper>>, ArrowError> { |
1015 | 29 | skip_fields |
1016 | 29 | .iter() |
1017 | 199 | .map29 (|opt| opt.as_ref().map(Skipper::from_avro).transpose()) |
1018 | 29 | .collect() |
1019 | 29 | } |
1020 | | |
1021 | | #[cfg(test)] |
1022 | | mod tests { |
1023 | | use super::*; |
1024 | | use crate::codec::AvroField; |
1025 | | use arrow_array::{ |
1026 | | cast::AsArray, Array, Decimal128Array, DictionaryArray, FixedSizeBinaryArray, |
1027 | | IntervalMonthDayNanoArray, ListArray, MapArray, StringArray, StructArray, |
1028 | | }; |
1029 | | |
/// Encodes `value` as an Avro `int`: zig-zag mapping followed by a
/// little-endian base-128 varint.
///
/// The zig-zag result is handled as `u32` so the right shift is logical.
/// With a signed shift the loop never terminates once the zig-zag value has
/// its top bit set (any `value` with magnitude of at least 2^30).
fn encode_avro_int(value: i32) -> Vec<u8> {
    let mut buf = Vec::new();
    let mut v = ((value << 1) ^ (value >> 31)) as u32;
    while v & !0x7F != 0 {
        // Low seven bits plus the continuation flag.
        buf.push(((v & 0x7F) | 0x80) as u8);
        v >>= 7;
    }
    buf.push(v as u8);
    buf
}
1040 | | |
/// Encodes `value` as an Avro `long`: zig-zag mapping followed by a
/// little-endian base-128 varint.
///
/// The zig-zag result is handled as `u64` so the right shift is logical.
/// With a signed shift the loop never terminates once the zig-zag value has
/// its top bit set (any `value` with magnitude of at least 2^62).
fn encode_avro_long(value: i64) -> Vec<u8> {
    let mut buf = Vec::new();
    let mut v = ((value << 1) ^ (value >> 63)) as u64;
    while v & !0x7F != 0 {
        // Low seven bits plus the continuation flag.
        buf.push(((v & 0x7F) | 0x80) as u8);
        v >>= 7;
    }
    buf.push(v as u8);
    buf
}
1051 | | |
1052 | 15 | fn encode_avro_bytes(bytes: &[u8]) -> Vec<u8> { |
1053 | 15 | let mut buf = encode_avro_long(bytes.len() as i64); |
1054 | 15 | buf.extend_from_slice(bytes); |
1055 | 15 | buf |
1056 | 15 | } |
1057 | | |
1058 | 21 | fn avro_from_codec(codec: Codec) -> AvroDataType { |
1059 | 21 | AvroDataType::new(codec, Default::default(), None) |
1060 | 21 | } |
1061 | | |
1062 | 9 | fn decoder_for_promotion( |
1063 | 9 | writer: PrimitiveType, |
1064 | 9 | reader: PrimitiveType, |
1065 | 9 | use_utf8view: bool, |
1066 | 9 | ) -> Decoder { |
1067 | 9 | let ws = Schema::TypeName(TypeName::Primitive(writer)); |
1068 | 9 | let rs = Schema::TypeName(TypeName::Primitive(reader)); |
1069 | 9 | let field = |
1070 | 9 | AvroField::resolve_from_writer_and_reader(&ws, &rs, use_utf8view, false).unwrap(); |
1071 | 9 | Decoder::try_new(field.data_type()).unwrap() |
1072 | 9 | } |
1073 | | |
// Writer `int` read as `long`: every value must come back widened to i64.
#[test]
fn test_schema_resolution_promotion_int_to_long() {
    let inputs: [i32; 4] = [0, 1, -2, 123456];
    let mut decoder = decoder_for_promotion(PrimitiveType::Int, PrimitiveType::Long, false);
    assert!(matches!(decoder, Decoder::Int32ToInt64(_)));
    for value in inputs.iter() {
        let encoded = encode_avro_int(*value);
        decoder.decode(&mut AvroCursor::new(&encoded)).unwrap();
    }
    let array = decoder.flush(None).unwrap();
    let longs = array.as_any().downcast_ref::<Int64Array>().unwrap();
    for (idx, value) in inputs.iter().enumerate() {
        assert_eq!(longs.value(idx), i64::from(*value));
    }
}
1090 | | |
// Writer `int` read as `float`.
#[test]
fn test_schema_resolution_promotion_int_to_float() {
    let inputs: [i32; 3] = [0, 42, -7];
    let mut decoder = decoder_for_promotion(PrimitiveType::Int, PrimitiveType::Float, false);
    assert!(matches!(decoder, Decoder::Int32ToFloat32(_)));
    for value in inputs.iter() {
        let encoded = encode_avro_int(*value);
        decoder.decode(&mut AvroCursor::new(&encoded)).unwrap();
    }
    let array = decoder.flush(None).unwrap();
    let floats = array.as_any().downcast_ref::<Float32Array>().unwrap();
    for (idx, value) in inputs.iter().enumerate() {
        assert_eq!(floats.value(idx), *value as f32);
    }
}
1106 | | |
// Writer `int` read as `double`.
#[test]
fn test_schema_resolution_promotion_int_to_double() {
    let inputs: [i32; 3] = [1, -1, 10_000];
    let mut decoder = decoder_for_promotion(PrimitiveType::Int, PrimitiveType::Double, false);
    assert!(matches!(decoder, Decoder::Int32ToFloat64(_)));
    for value in inputs.iter() {
        let encoded = encode_avro_int(*value);
        decoder.decode(&mut AvroCursor::new(&encoded)).unwrap();
    }
    let array = decoder.flush(None).unwrap();
    let doubles = array.as_any().downcast_ref::<Float64Array>().unwrap();
    for (idx, value) in inputs.iter().enumerate() {
        assert_eq!(doubles.value(idx), f64::from(*value));
    }
}
1122 | | |
// Writer `long` read as `float`.
#[test]
fn test_schema_resolution_promotion_long_to_float() {
    let inputs: [i64; 3] = [0, 1_000_000, -123];
    let mut decoder = decoder_for_promotion(PrimitiveType::Long, PrimitiveType::Float, false);
    assert!(matches!(decoder, Decoder::Int64ToFloat32(_)));
    for value in inputs.iter() {
        let encoded = encode_avro_long(*value);
        decoder.decode(&mut AvroCursor::new(&encoded)).unwrap();
    }
    let array = decoder.flush(None).unwrap();
    let floats = array.as_any().downcast_ref::<Float32Array>().unwrap();
    for (idx, value) in inputs.iter().enumerate() {
        assert_eq!(floats.value(idx), *value as f32);
    }
}
1138 | | |
// Writer `long` read as `double`.
#[test]
fn test_schema_resolution_promotion_long_to_double() {
    let inputs: [i64; 3] = [2, -2, 9_223_372];
    let mut decoder = decoder_for_promotion(PrimitiveType::Long, PrimitiveType::Double, false);
    assert!(matches!(decoder, Decoder::Int64ToFloat64(_)));
    for value in inputs.iter() {
        let encoded = encode_avro_long(*value);
        decoder.decode(&mut AvroCursor::new(&encoded)).unwrap();
    }
    let array = decoder.flush(None).unwrap();
    let doubles = array.as_any().downcast_ref::<Float64Array>().unwrap();
    for (idx, value) in inputs.iter().enumerate() {
        assert_eq!(doubles.value(idx), *value as f64);
    }
}
1154 | | |
// Writer `float` read as `double`; the inputs are fed as little-endian f32 bytes.
#[test]
fn test_schema_resolution_promotion_float_to_double() {
    let inputs: [f32; 3] = [0.5, -3.25, 1.0e6];
    let mut decoder = decoder_for_promotion(PrimitiveType::Float, PrimitiveType::Double, false);
    assert!(matches!(decoder, Decoder::Float32ToFloat64(_)));
    for value in inputs.iter() {
        let encoded = value.to_le_bytes().to_vec();
        decoder.decode(&mut AvroCursor::new(&encoded)).unwrap();
    }
    let array = decoder.flush(None).unwrap();
    let doubles = array.as_any().downcast_ref::<Float64Array>().unwrap();
    for (idx, value) in inputs.iter().enumerate() {
        assert_eq!(doubles.value(idx), f64::from(*value));
    }
}
1170 | | |
// Writer `bytes` read as `string`, including a non-ASCII value.
#[test]
fn test_schema_resolution_promotion_bytes_to_string_utf8() {
    let inputs: [&str; 3] = ["hello", "world", "héllo"];
    let mut decoder = decoder_for_promotion(PrimitiveType::Bytes, PrimitiveType::String, false);
    assert!(matches!(decoder, Decoder::BytesToString(_, _)));
    for text in inputs.iter() {
        let encoded = encode_avro_bytes(text.as_bytes());
        decoder.decode(&mut AvroCursor::new(&encoded)).unwrap();
    }
    let array = decoder.flush(None).unwrap();
    let strings = array.as_any().downcast_ref::<StringArray>().unwrap();
    for (idx, text) in inputs.iter().enumerate() {
        assert_eq!(strings.value(idx), *text);
    }
}
1186 | | |
// Same promotion with the utf8view flag set; the test still expects a
// `BytesToString` decoder that flushes to a `StringArray`.
#[test]
fn test_schema_resolution_promotion_bytes_to_string_utf8view_enabled() {
    let mut decoder = decoder_for_promotion(PrimitiveType::Bytes, PrimitiveType::String, true);
    assert!(matches!(decoder, Decoder::BytesToString(_, _)));
    let encoded = encode_avro_bytes("abc".as_bytes());
    decoder.decode(&mut AvroCursor::new(&encoded)).unwrap();
    let array = decoder.flush(None).unwrap();
    let strings = array.as_any().downcast_ref::<StringArray>().unwrap();
    assert_eq!(strings.value(0), "abc");
}
1198 | | |
// Writer `string` read as `bytes`, including the empty string.
#[test]
fn test_schema_resolution_promotion_string_to_bytes() {
    let inputs: [&str; 3] = ["", "abc", "data"];
    let mut decoder = decoder_for_promotion(PrimitiveType::String, PrimitiveType::Bytes, false);
    assert!(matches!(decoder, Decoder::StringToBytes(_, _)));
    for text in inputs.iter() {
        let encoded = encode_avro_bytes(text.as_bytes());
        decoder.decode(&mut AvroCursor::new(&encoded)).unwrap();
    }
    let array = decoder.flush(None).unwrap();
    let binary = array.as_any().downcast_ref::<BinaryArray>().unwrap();
    for (idx, text) in inputs.iter().enumerate() {
        assert_eq!(binary.value(idx), text.as_bytes());
    }
}
1214 | | |
// Identical writer and reader schemas must yield the plain `Int32` decoder.
#[test]
fn test_schema_resolution_no_promotion_passthrough_int() {
    let writer_schema = Schema::TypeName(TypeName::Primitive(PrimitiveType::Int));
    let reader_schema = Schema::TypeName(TypeName::Primitive(PrimitiveType::Int));
    let resolved =
        AvroField::resolve_from_writer_and_reader(&writer_schema, &reader_schema, false, false)
            .unwrap();
    let mut decoder = Decoder::try_new(resolved.data_type()).unwrap();
    assert!(matches!(decoder, Decoder::Int32(_)));
    let inputs: [i32; 2] = [7, -9];
    for value in inputs.iter() {
        let encoded = encode_avro_int(*value);
        decoder.decode(&mut AvroCursor::new(&encoded)).unwrap();
    }
    let array = decoder.flush(None).unwrap();
    let ints = array.as_any().downcast_ref::<Int32Array>().unwrap();
    for (idx, value) in inputs.iter().enumerate() {
        assert_eq!(ints.value(idx), *value);
    }
}
1232 | | |
// Resolving writer `int` against reader `boolean` must be rejected.
#[test]
fn test_schema_resolution_illegal_promotion_int_to_boolean_errors() {
    let writer_schema = Schema::TypeName(TypeName::Primitive(PrimitiveType::Int));
    let reader_schema = Schema::TypeName(TypeName::Primitive(PrimitiveType::Boolean));
    let outcome =
        AvroField::resolve_from_writer_and_reader(&writer_schema, &reader_schema, false, false);
    assert!(outcome.is_err(), "expected error for illegal promotion");
}
1240 | | |
// Decodes a single map holding one entry: {"hello": "world"}.
#[test]
fn test_map_decoding_one_entry() {
    let map_type = avro_from_codec(Codec::Map(Arc::new(avro_from_codec(Codec::Utf8))));
    let mut decoder = Decoder::try_new(&map_type).unwrap();
    // One block with a single entry, then the terminating empty block.
    let data = [
        encode_avro_long(1),
        encode_avro_bytes(b"hello"), // key
        encode_avro_bytes(b"world"), // value
        encode_avro_long(0),
    ]
    .concat();
    decoder.decode(&mut AvroCursor::new(&data)).unwrap();
    let array = decoder.flush(None).unwrap();
    let maps = array.as_any().downcast_ref::<MapArray>().unwrap();
    assert_eq!(maps.len(), 1); // one map
    assert_eq!(maps.value_length(0), 1);
    let entries = maps.value(0);
    let entry_struct = entries.as_any().downcast_ref::<StructArray>().unwrap();
    assert_eq!(entry_struct.len(), 1);
    let keys = entry_struct.column_by_name("key").unwrap();
    let keys = keys.as_any().downcast_ref::<StringArray>().unwrap();
    let vals = entry_struct.column_by_name("value").unwrap();
    let vals = vals.as_any().downcast_ref::<StringArray>().unwrap();
    assert_eq!(keys.value(0), "hello");
    assert_eq!(vals.value(0), "world");
}
1276 | | |
// A lone zero block count decodes to one map with no entries.
#[test]
fn test_map_decoding_empty() {
    let map_type = avro_from_codec(Codec::Map(Arc::new(avro_from_codec(Codec::Utf8))));
    let mut decoder = Decoder::try_new(&map_type).unwrap();
    let encoded = encode_avro_long(0);
    let mut cursor = AvroCursor::new(&encoded);
    decoder.decode(&mut cursor).unwrap();
    let array = decoder.flush(None).unwrap();
    let maps = array.as_any().downcast_ref::<MapArray>().unwrap();
    assert_eq!(maps.len(), 1);
    assert_eq!(maps.value_length(0), 0);
}
1289 | | |
// Two 3-byte fixed values are decoded from separate cursors and flushed
// into one FixedSizeBinaryArray.
#[test]
fn test_fixed_decoding() {
    let avro_type = avro_from_codec(Codec::Fixed(3));
    let mut decoder = Decoder::try_new(&avro_type).expect("Failed to create decoder");

    let rows: [([u8; 3], &str); 2] = [
        ([1, 2, 3], "Failed to decode data1"),
        ([4, 5, 6], "Failed to decode data2"),
    ];
    for (bytes, failure) in rows.iter() {
        let mut cursor = AvroCursor::new(bytes);
        decoder.decode(&mut cursor).expect(failure);
        assert_eq!(cursor.position(), 3, "Cursor should advance by fixed size");
    }

    let array = decoder.flush(None).expect("Failed to flush decoder");
    assert_eq!(array.len(), 2, "Array should contain two items");
    let fixed = array
        .as_any()
        .downcast_ref::<FixedSizeBinaryArray>()
        .expect("Failed to downcast to FixedSizeBinaryArray");
    assert_eq!(
        fixed.value_length(),
        3,
        "Fixed size of binary values should be 3"
    );
    assert_eq!(fixed.value(0), &[1, 2, 3], "First item mismatch");
    assert_eq!(fixed.value(1), &[4, 5, 6], "Second item mismatch");
}
1329 | | |
// Flushing without decoding anything yields an empty array that still
// reports the declared fixed width.
#[test]
fn test_fixed_decoding_empty() {
    let avro_type = avro_from_codec(Codec::Fixed(5));
    let mut decoder = Decoder::try_new(&avro_type).expect("Failed to create decoder");

    let flushed = decoder.flush(None);
    let array = flushed.expect("Failed to flush decoder for empty input");
    assert_eq!(array.len(), 0, "Array should be empty");

    let fixed = array
        .as_any()
        .downcast_ref::<FixedSizeBinaryArray>()
        .expect("Failed to downcast to FixedSizeBinaryArray for empty array");
    let width = fixed.value_length();
    assert_eq!(
        width, 5,
        "Fixed size of binary values should be 5 as per type"
    );
}
1351 | | |
// A UUID supplied as a length-prefixed string is decoded into 16 raw bytes.
#[test]
fn test_uuid_decoding() {
    let uuid_str = "f81d4fae-7dec-11d0-a765-00a0c91e6bf6";
    let expected_bytes: [u8; 16] = [
        0xf8, 0x1d, 0x4f, 0xae, 0x7d, 0xec, 0x11, 0xd0, 0xa7, 0x65, 0x00, 0xa0, 0xc9, 0x1e, 0x6b,
        0xf6,
    ];

    let avro_type = avro_from_codec(Codec::Uuid);
    let mut decoder = Decoder::try_new(&avro_type).expect("Failed to create decoder");
    let data = encode_avro_bytes(uuid_str.as_bytes());
    let mut cursor = AvroCursor::new(&data);
    decoder.decode(&mut cursor).expect("Failed to decode data");
    let consumed = cursor.position();
    assert_eq!(
        consumed,
        data.len(),
        "Cursor should advance by varint size + data size"
    );

    let array = decoder.flush(None).expect("Failed to flush decoder");
    let fixed = array
        .as_any()
        .downcast_ref::<FixedSizeBinaryArray>()
        .expect("Array should be a FixedSizeBinaryArray");
    assert_eq!(fixed.len(), 1);
    assert_eq!(fixed.value_length(), 16);
    assert_eq!(fixed.value(0), &expected_bytes);
}
1378 | | |
// Two rows: the array [10, 20] followed by an empty array.
#[test]
fn test_array_decoding() {
    let list_dt = avro_from_codec(Codec::List(Arc::new(avro_from_codec(Codec::Int32))));
    let mut decoder = Decoder::try_new(&list_dt).unwrap();

    let first_row = [
        encode_avro_long(2),
        encode_avro_int(10),
        encode_avro_int(20),
        encode_avro_long(0),
    ]
    .concat();
    let second_row = encode_avro_long(0);
    for row in [&first_row, &second_row].iter() {
        let mut cursor = AvroCursor::new(row);
        decoder.decode(&mut cursor).unwrap();
    }

    let array = decoder.flush(None).unwrap();
    let lists = array.as_any().downcast_ref::<ListArray>().unwrap();
    assert_eq!(lists.len(), 2);
    assert_eq!(lists.value_offsets(), &[0, 2, 2]);
    let items = lists.values().as_primitive::<Int32Type>();
    assert_eq!(items.len(), 2);
    assert_eq!(items.value(0), 10);
    assert_eq!(items.value(1), 20);
}
1405 | | |
// A block announced with count -3 carries a byte-size long before its items;
// the three items must still be decoded.
#[test]
fn test_array_decoding_with_negative_block_count() {
    let list_dt = avro_from_codec(Codec::List(Arc::new(avro_from_codec(Codec::Int32))));
    let mut decoder = Decoder::try_new(&list_dt).unwrap();
    let data = [
        encode_avro_long(-3),
        encode_avro_long(12),
        encode_avro_int(1),
        encode_avro_int(2),
        encode_avro_int(3),
        encode_avro_long(0),
    ]
    .concat();
    decoder.decode(&mut AvroCursor::new(&data)).unwrap();
    let array = decoder.flush(None).unwrap();
    let lists = array.as_any().downcast_ref::<ListArray>().unwrap();
    assert_eq!(lists.len(), 1);
    assert_eq!(lists.value_length(0), 3);
    let items = lists.values().as_primitive::<Int32Type>();
    assert_eq!(items.len(), 3);
    for (idx, expected) in [1, 2, 3].iter().enumerate() {
        assert_eq!(items.value(idx), *expected);
    }
}
1429 | | |
// Decodes [[5, 6]]: an outer array holding one inner array of two ints.
#[test]
fn test_nested_array_decoding() {
    let inner_ty = avro_from_codec(Codec::List(Arc::new(avro_from_codec(Codec::Int32))));
    let nested_ty = avro_from_codec(Codec::List(Arc::new(inner_ty.clone())));
    let mut decoder = Decoder::try_new(&nested_ty).unwrap();
    let encoded = [
        encode_avro_long(1), // outer block: one item
        encode_avro_long(2), // inner block: two items
        encode_avro_int(5),
        encode_avro_int(6),
        encode_avro_long(0), // end of inner array
        encode_avro_long(0), // end of outer array
    ]
    .concat();
    decoder.decode(&mut AvroCursor::new(&encoded)).unwrap();
    let array = decoder.flush(None).unwrap();
    let outer = array.as_any().downcast_ref::<ListArray>().unwrap();
    assert_eq!(outer.len(), 1);
    assert_eq!(outer.value_length(0), 1);
    let inner = outer.values().as_any().downcast_ref::<ListArray>().unwrap();
    assert_eq!(inner.len(), 1);
    assert_eq!(inner.value_length(0), 2);
    let leaf = inner.values().as_any().downcast_ref::<Int32Array>();
    assert_eq!(leaf.unwrap().values(), &[5, 6]);
}
1458 | | |
// A lone zero block count decodes to one list with no items.
#[test]
fn test_array_decoding_empty_array() {
    let list_type = avro_from_codec(Codec::List(Arc::new(avro_from_codec(Codec::Utf8))));
    let mut decoder = Decoder::try_new(&list_type).unwrap();
    let encoded = encode_avro_long(0);
    let mut cursor = AvroCursor::new(&encoded);
    decoder.decode(&mut cursor).unwrap();
    let array = decoder.flush(None).unwrap();
    let lists = array.as_any().downcast_ref::<ListArray>().unwrap();
    assert_eq!(lists.len(), 1);
    assert_eq!(lists.value_length(0), 0);
}
1471 | | |
// Decimal(5, 2) stored as 32-byte fixed values: 0x3039 (12345) and a
// negative value ending in 0x85 (-123).
#[test]
fn test_decimal_decoding_fixed256() {
    let dt = avro_from_codec(Codec::Decimal(5, Some(2), Some(32)));
    let mut decoder = Decoder::try_new(&dt).unwrap();

    let mut positive = [0x00u8; 32];
    positive[30] = 0x30;
    positive[31] = 0x39;
    let mut negative = [0xFFu8; 32];
    negative[31] = 0x85;

    let data = [positive, negative].concat();
    let mut cursor = AvroCursor::new(&data);
    for _ in 0..2 {
        decoder.decode(&mut cursor).unwrap();
    }
    let array = decoder.flush(None).unwrap();
    let decimals = array.as_any().downcast_ref::<Decimal256Array>().unwrap();
    assert_eq!(decimals.len(), 2);
    assert_eq!(decimals.value_as_string(0), "123.45");
    assert_eq!(decimals.value_as_string(1), "-1.23");
}
1498 | | |
// Decimal(5, 2) stored as 16-byte fixed values: 0x3039 (12345) and a
// negative value ending in 0x85 (-123).
#[test]
fn test_decimal_decoding_fixed128() {
    let dt = avro_from_codec(Codec::Decimal(5, Some(2), Some(16)));
    let mut decoder = Decoder::try_new(&dt).unwrap();

    let mut positive = [0x00u8; 16];
    positive[14] = 0x30;
    positive[15] = 0x39;
    let mut negative = [0xFFu8; 16];
    negative[15] = 0x85;

    let data = [positive, negative].concat();
    let mut cursor = AvroCursor::new(&data);
    for _ in 0..2 {
        decoder.decode(&mut cursor).unwrap();
    }
    let array = decoder.flush(None).unwrap();
    let decimals = array.as_any().downcast_ref::<Decimal128Array>().unwrap();
    assert_eq!(decimals.len(), 2);
    assert_eq!(decimals.value_as_string(0), "123.45");
    assert_eq!(decimals.value_as_string(1), "-1.23");
}
1523 | | |
#[test]
fn test_decimal_decoding_bytes_with_nulls() {
    // Variable-length (bytes) decimal, precision 4 / scale 1, wrapped by hand
    // in a nullable decoder using the NullSecond layout.
    let value_type = avro_from_codec(Codec::Decimal(4, Some(1), None));
    let value_decoder = Decoder::try_new(&value_type).unwrap();
    let mut decoder = Decoder::Nullable(
        Nullability::NullSecond,
        NullBufferBuilder::new(DEFAULT_CAPACITY),
        Box::new(value_decoder),
    );

    let mut payload = Vec::new();
    // Row 1: branch 0 followed by the bytes 0x04 0xD2.
    payload.extend_from_slice(&encode_avro_int(0));
    payload.extend_from_slice(&encode_avro_bytes(&[0x04, 0xD2]));
    // Row 2: branch 1, no payload.
    payload.extend_from_slice(&encode_avro_int(1));
    // Row 3: branch 0 followed by the bytes 0xFB 0x2E.
    payload.extend_from_slice(&encode_avro_int(0));
    payload.extend_from_slice(&encode_avro_bytes(&[0xFB, 0x2E]));

    let mut cursor = AvroCursor::new(&payload);
    for _row in 0..3 {
        decoder.decode(&mut cursor).unwrap();
    }

    let flushed = decoder.flush(None).unwrap();
    let decimals = flushed
        .as_any()
        .downcast_ref::<Decimal128Array>()
        .unwrap();
    assert_eq!(decimals.len(), 3);
    assert!(decimals.is_valid(0));
    assert!(!decimals.is_valid(1));
    assert!(decimals.is_valid(2));
    assert_eq!(decimals.value_as_string(0), "123.4");
    assert_eq!(decimals.value_as_string(2), "-123.4");
}
1552 | | |
#[test]
fn test_decimal_decoding_bytes_with_nulls_fixed_size() {
    // 16-byte fixed decimal, precision 6 / scale 2, wrapped by hand in a
    // nullable decoder using the NullSecond layout.
    let value_type = avro_from_codec(Codec::Decimal(6, Some(2), Some(16)));
    let value_decoder = Decoder::try_new(&value_type).unwrap();
    let mut decoder = Decoder::Nullable(
        Nullability::NullSecond,
        NullBufferBuilder::new(DEFAULT_CAPACITY),
        Box::new(value_decoder),
    );

    // 16 bytes ending in 0x01 0xE2 0x40, every other byte 0x00.
    let mut positive = [0x00u8; 16];
    positive[13] = 0x01;
    positive[14] = 0xE2;
    positive[15] = 0x40;
    // 16 bytes ending in 0xFE 0x1D 0xC0, every other byte 0xFF.
    let mut negative = [0xFFu8; 16];
    negative[13] = 0xFE;
    negative[14] = 0x1D;
    negative[15] = 0xC0;

    let mut payload = Vec::new();
    // Row 1: branch 0 followed by the positive fixed value.
    payload.extend_from_slice(&encode_avro_int(0));
    payload.extend_from_slice(&positive);
    // Row 2: branch 1, no payload.
    payload.extend_from_slice(&encode_avro_int(1));
    // Row 3: branch 0 followed by the negative fixed value.
    payload.extend_from_slice(&encode_avro_int(0));
    payload.extend_from_slice(&negative);

    let mut cursor = AvroCursor::new(&payload);
    for _row in 0..3 {
        decoder.decode(&mut cursor).unwrap();
    }

    let flushed = decoder.flush(None).unwrap();
    let decimals = flushed
        .as_any()
        .downcast_ref::<Decimal128Array>()
        .unwrap();
    assert_eq!(decimals.len(), 3);
    assert!(decimals.is_valid(0));
    assert!(!decimals.is_valid(1));
    assert!(decimals.is_valid(2));
    assert_eq!(decimals.value_as_string(0), "1234.56");
    assert_eq!(decimals.value_as_string(2), "-1234.56");
}
1589 | | |
#[test]
fn test_enum_decoding() {
    // Three-symbol enum; the decoded output is expected to be an Int32-keyed
    // dictionary whose values are the symbols in declaration order.
    let symbols: Arc<[String]> = ["A", "B", "C"]
        .iter()
        .map(|symbol| String::from(*symbol))
        .collect();
    let avro_type = avro_from_codec(Codec::Enum(symbols.clone()));
    let mut decoder = Decoder::try_new(&avro_type).unwrap();

    // Encoded symbol indices, in row order: 2, 0, 1.
    let mut payload = Vec::new();
    for index in [2, 0, 1] {
        payload.extend_from_slice(&encode_avro_int(index));
    }

    let mut cursor = AvroCursor::new(&payload);
    for _row in 0..3 {
        decoder.decode(&mut cursor).unwrap();
    }

    let flushed = decoder.flush(None).unwrap();
    let dictionary = flushed
        .as_any()
        .downcast_ref::<DictionaryArray<Int32Type>>()
        .unwrap();
    assert_eq!(dictionary.len(), 3);

    let dictionary_values = dictionary
        .values()
        .as_any()
        .downcast_ref::<StringArray>()
        .unwrap();
    for (slot, expected) in ["A", "B", "C"].iter().enumerate() {
        assert_eq!(dictionary_values.value(slot), *expected);
    }
    assert_eq!(dictionary.keys().values(), &[2, 0, 1]);
}
1620 | | |
#[test]
fn test_enum_decoding_with_nulls() {
    // Two-symbol enum declared nullable with the NullFirst layout.
    let symbols: Arc<[String]> = ["X", "Y"]
        .iter()
        .map(|symbol| String::from(*symbol))
        .collect();
    let avro_type = AvroDataType::new(
        Codec::Enum(symbols.clone()),
        Default::default(),
        Some(Nullability::NullFirst),
    );
    let mut decoder = Decoder::try_new(&avro_type).unwrap();

    let mut payload = Vec::new();
    // Row 1: branch 1, then enum index 1.
    payload.extend_from_slice(&encode_avro_long(1));
    payload.extend_from_slice(&encode_avro_int(1));
    // Row 2: branch 0, no payload.
    payload.extend_from_slice(&encode_avro_long(0));
    // Row 3: branch 1, then enum index 0.
    payload.extend_from_slice(&encode_avro_long(1));
    payload.extend_from_slice(&encode_avro_int(0));

    let mut cursor = AvroCursor::new(&payload);
    for _row in 0..3 {
        decoder.decode(&mut cursor).unwrap();
    }

    let flushed = decoder.flush(None).unwrap();
    let dictionary = flushed
        .as_any()
        .downcast_ref::<DictionaryArray<Int32Type>>()
        .unwrap();
    assert_eq!(dictionary.len(), 3);
    assert!(dictionary.is_valid(0));
    assert!(dictionary.is_null(1));
    assert!(dictionary.is_valid(2));

    let expected_keys = Int32Array::from(vec![Some(1), None, Some(0)]);
    assert_eq!(dictionary.keys(), &expected_keys);

    let dictionary_values = dictionary
        .values()
        .as_any()
        .downcast_ref::<StringArray>()
        .unwrap();
    assert_eq!(dictionary_values.value(0), "X");
    assert_eq!(dictionary_values.value(1), "Y");
}
1657 | | |
#[test]
fn test_duration_decoding_with_nulls() {
    // Nullable interval (NullFirst layout). Each non-null value is written as
    // three little-endian u32s: months, days, milliseconds.
    let avro_type = AvroDataType::new(
        Codec::Interval,
        Default::default(),
        Some(Nullability::NullFirst),
    );
    let mut decoder = Decoder::try_new(&avro_type).unwrap();

    // Serialises one duration as 12 bytes (3 x u32, little-endian).
    let duration_bytes = |months: u32, days: u32, millis: u32| -> Vec<u8> {
        let mut bytes = Vec::new();
        for part in [months, days, millis] {
            bytes.extend_from_slice(&part.to_le_bytes());
        }
        bytes
    };

    let mut payload = Vec::new();
    // Row 1: not null -> 1 month, 2 days, 3 millis.
    payload.extend_from_slice(&encode_avro_long(1));
    payload.extend_from_slice(&duration_bytes(1, 2, 3));
    // Row 2: null.
    payload.extend_from_slice(&encode_avro_long(0));
    // Row 3: not null -> 4 months, 5 days, 6 millis.
    payload.extend_from_slice(&encode_avro_long(1));
    payload.extend_from_slice(&duration_bytes(4, 5, 6));

    let mut cursor = AvroCursor::new(&payload);
    for _row in 0..3 {
        decoder.decode(&mut cursor).unwrap();
    }

    let flushed = decoder.flush(None).unwrap();
    let intervals = flushed
        .as_any()
        .downcast_ref::<IntervalMonthDayNanoArray>()
        .unwrap();
    assert_eq!(intervals.len(), 3);
    assert!(intervals.is_valid(0));
    assert!(intervals.is_null(1));
    assert!(intervals.is_valid(2));

    // The expected values carry the millisecond component as nanoseconds.
    let first = IntervalMonthDayNano {
        months: 1,
        days: 2,
        nanoseconds: 3_000_000,
    };
    let third = IntervalMonthDayNano {
        months: 4,
        days: 5,
        nanoseconds: 6_000_000,
    };
    let expected = IntervalMonthDayNanoArray::from(vec![Some(first), None, Some(third)]);
    assert_eq!(intervals, &expected);
}
1711 | | |
#[test]
fn test_duration_decoding_empty() {
    // Flushing an interval decoder that never decoded a row yields an empty array.
    let avro_type = AvroDataType::new(Codec::Interval, Default::default(), None);
    let mut decoder = Decoder::try_new(&avro_type).unwrap();
    let flushed = decoder.flush(None).unwrap();
    assert_eq!(flushed.len(), 0);
}
1720 | | |
#[test]
fn test_nullable_decode_error_bitmap_corruption() {
    // Nullable Int32 using the ['T','null'] encoding (NullSecond). A failed
    // decode in the middle must not leave a stray entry behind: the flushed
    // array is expected to contain only the two rows that decoded successfully.
    let avro_type = AvroDataType::new(
        Codec::Int32,
        Default::default(),
        Some(Nullability::NullSecond),
    );
    let mut decoder = Decoder::try_new(&avro_type).unwrap();

    // Union branch 1 (null).
    let mut null_row = Vec::new();
    null_row.extend_from_slice(&encode_avro_int(1));

    // Union branch 0 (non-null) with the int payload missing -> decode error.
    let mut truncated_row = Vec::new();
    truncated_row.extend_from_slice(&encode_avro_int(0));

    // Union branch 0 (non-null) followed by the value 42 -> should succeed.
    let mut valid_row = Vec::new();
    valid_row.extend_from_slice(&encode_avro_int(0));
    valid_row.extend_from_slice(&encode_avro_int(42));

    decoder.decode(&mut AvroCursor::new(&null_row)).unwrap();
    let truncated_result = decoder.decode(&mut AvroCursor::new(&truncated_row));
    assert!(truncated_result.is_err());
    decoder.decode(&mut AvroCursor::new(&valid_row)).unwrap();

    let flushed = decoder.flush(None).unwrap();

    // Exactly two elements: the null row and the row holding 42.
    assert_eq!(flushed.len(), 2);
    let ints = flushed.as_any().downcast_ref::<Int32Array>().unwrap();
    assert!(ints.is_null(0));
    assert_eq!(ints.value(1), 42);
}
1756 | | |
1757 | 5 | fn make_record_resolved_decoder( |
1758 | 5 | reader_fields: &[(&str, DataType, bool)], |
1759 | 5 | writer_to_reader: Vec<Option<usize>>, |
1760 | 5 | mut skip_decoders: Vec<Option<super::Skipper>>, |
1761 | 5 | ) -> Decoder { |
1762 | 5 | let mut field_refs: Vec<FieldRef> = Vec::with_capacity(reader_fields.len()); |
1763 | 5 | let mut encodings: Vec<Decoder> = Vec::with_capacity(reader_fields.len()); |
1764 | 11 | for (name6 , dt6 , nullable6 ) in reader_fields { |
1765 | 6 | field_refs.push(Arc::new(ArrowField::new(*name, dt.clone(), *nullable))); |
1766 | 6 | let enc = match dt { |
1767 | 5 | DataType::Int32 => Decoder::Int32(Vec::new()), |
1768 | 1 | DataType::Int64 => Decoder::Int64(Vec::new()), |
1769 | | DataType::Utf8 => { |
1770 | 0 | Decoder::String(OffsetBufferBuilder::new(DEFAULT_CAPACITY), Vec::new()) |
1771 | | } |
1772 | 0 | other => panic!("Unsupported test reader field type: {other:?}"), |
1773 | | }; |
1774 | 6 | encodings.push(enc); |
1775 | | } |
1776 | 5 | let fields: Fields = field_refs.into(); |
1777 | 5 | Decoder::RecordResolved { |
1778 | 5 | fields, |
1779 | 5 | encodings, |
1780 | 5 | writer_to_reader: Arc::from(writer_to_reader), |
1781 | 5 | skip_decoders, |
1782 | 5 | } |
1783 | 5 | } |
1784 | | |
#[test]
fn test_skip_writer_trailing_field_int32() {
    // Reader has a single `id` column. The writer row carries two ints; the
    // second one has an Int32 skipper instead of a reader mapping.
    let mut decoder = make_record_resolved_decoder(
        &[("id", arrow_schema::DataType::Int32, false)],
        vec![Some(0), None],
        vec![None, Some(super::Skipper::Int32)],
    );

    let mut payload = Vec::new();
    for value in [7, 999] {
        payload.extend_from_slice(&encode_avro_int(value));
    }

    let mut cursor = AvroCursor::new(&payload);
    decoder.decode(&mut cursor).unwrap();
    // The whole row, including the skipped trailing field, must be consumed.
    assert_eq!(cursor.position(), payload.len());

    let flushed = decoder.flush(None).unwrap();
    let record = flushed.as_any().downcast_ref::<StructArray>().unwrap();
    assert_eq!(record.len(), 1);

    let id_column = record.column_by_name("id").unwrap();
    let id = id_column.as_any().downcast_ref::<Int32Array>().unwrap();
    assert_eq!(id.value(0), 7);
}
1809 | | |
#[test]
fn test_skip_writer_middle_field_string() {
    // Reader keeps `id` and `score`; the writer's middle field has a String
    // skipper instead of a reader mapping.
    let reader_fields = [
        ("id", DataType::Int32, false),
        ("score", DataType::Int64, false),
    ];
    let mut decoder = make_record_resolved_decoder(
        &reader_fields,
        vec![Some(0), None, Some(1)],
        vec![None, Some(Skipper::String), None],
    );

    let mut payload = Vec::new();
    payload.extend_from_slice(&encode_avro_int(42));
    payload.extend_from_slice(&encode_avro_bytes(b"abcdef"));
    payload.extend_from_slice(&encode_avro_long(1000));

    let mut cursor = AvroCursor::new(&payload);
    decoder.decode(&mut cursor).unwrap();
    // The whole row, including the skipped middle field, must be consumed.
    assert_eq!(cursor.position(), payload.len());

    let flushed = decoder.flush(None).unwrap();
    let record = flushed.as_any().downcast_ref::<StructArray>().unwrap();

    let id_column = record.column_by_name("id").unwrap();
    let id = id_column.as_any().downcast_ref::<Int32Array>().unwrap();
    let score_column = record.column_by_name("score").unwrap();
    let score = score_column.as_any().downcast_ref::<Int64Array>().unwrap();

    assert_eq!(id.value(0), 42);
    assert_eq!(score.value(0), 1000);
}
1844 | | |
#[test]
fn test_skip_writer_array_with_negative_block_count_fast() {
    // The writer's first field is an int array with a List skipper; only the
    // trailing `id` int is mapped to the reader.
    let mut decoder = make_record_resolved_decoder(
        &[("id", DataType::Int32, false)],
        vec![None, Some(0)],
        vec![Some(super::Skipper::List(Box::new(Skipper::Int32))), None],
    );

    // Three encoded array items: 1, 2, 3.
    let mut items = Vec::new();
    for item in [1, 2, 3] {
        items.extend_from_slice(&encode_avro_int(item));
    }

    let mut payload = Vec::new();
    // Block header: count -3, then the byte length of the encoded items.
    payload.extend_from_slice(&encode_avro_long(-3));
    payload.extend_from_slice(&encode_avro_long(items.len() as i64));
    payload.extend_from_slice(&items);
    // A zero count closes the array.
    payload.extend_from_slice(&encode_avro_long(0));
    // The `id` value that follows the skipped array.
    payload.extend_from_slice(&encode_avro_int(5));

    let mut cursor = AvroCursor::new(&payload);
    decoder.decode(&mut cursor).unwrap();
    assert_eq!(cursor.position(), payload.len());

    let flushed = decoder.flush(None).unwrap();
    let record = flushed.as_any().downcast_ref::<StructArray>().unwrap();
    let id_column = record.column_by_name("id").unwrap();
    let id = id_column.as_any().downcast_ref::<Int32Array>().unwrap();
    assert_eq!(id.len(), 1);
    assert_eq!(id.value(0), 5);
}
1876 | | |
#[test]
fn test_skip_writer_map_with_negative_block_count_fast() {
    // The writer's first field is a map of ints with a Map skipper; only the
    // trailing `id` int is mapped to the reader.
    let mut decoder = make_record_resolved_decoder(
        &[("id", DataType::Int32, false)],
        vec![None, Some(0)],
        vec![Some(Skipper::Map(Box::new(Skipper::Int32))), None],
    );

    // Two encoded map entries: "k1" -> 10 and "k2" -> 20.
    let mut encoded_entries = Vec::new();
    for (key, value) in [(b"k1", 10), (b"k2", 20)] {
        encoded_entries.extend_from_slice(&encode_avro_bytes(key));
        encoded_entries.extend_from_slice(&encode_avro_int(value));
    }

    let mut payload = Vec::new();
    // Block header: count -2, then the byte length of the encoded entries.
    payload.extend_from_slice(&encode_avro_long(-2));
    payload.extend_from_slice(&encode_avro_long(encoded_entries.len() as i64));
    payload.extend_from_slice(&encoded_entries);
    // A zero count closes the map.
    payload.extend_from_slice(&encode_avro_long(0));
    // The `id` value that follows the skipped map.
    payload.extend_from_slice(&encode_avro_int(123));

    let mut cursor = AvroCursor::new(&payload);
    decoder.decode(&mut cursor).unwrap();
    assert_eq!(cursor.position(), payload.len());

    let flushed = decoder.flush(None).unwrap();
    let record = flushed.as_any().downcast_ref::<StructArray>().unwrap();
    let id_column = record.column_by_name("id").unwrap();
    let id = id_column.as_any().downcast_ref::<Int32Array>().unwrap();
    assert_eq!(id.len(), 1);
    assert_eq!(id.value(0), 123);
}
1909 | | |
#[test]
fn test_skip_writer_nullable_field_union_nullfirst() {
    // The writer's first field is a nullable int (NullFirst layout) with a
    // Nullable skipper; only the trailing `id` int is mapped to the reader.
    let nullable_int_skipper = super::Skipper::Nullable(
        Nullability::NullFirst,
        Box::new(super::Skipper::Int32),
    );
    let mut decoder = make_record_resolved_decoder(
        &[("id", DataType::Int32, false)],
        vec![None, Some(0)],
        vec![Some(nullable_int_skipper), None],
    );

    // Row 1: branch 0 (no payload for the skipped field), then id = 5.
    let mut null_branch_row = Vec::new();
    null_branch_row.extend_from_slice(&encode_avro_long(0));
    null_branch_row.extend_from_slice(&encode_avro_int(5));

    // Row 2: branch 1 with the value 123 to skip, then id = 7.
    let mut value_branch_row = Vec::new();
    value_branch_row.extend_from_slice(&encode_avro_long(1));
    value_branch_row.extend_from_slice(&encode_avro_int(123));
    value_branch_row.extend_from_slice(&encode_avro_int(7));

    let mut first_cursor = AvroCursor::new(&null_branch_row);
    let mut second_cursor = AvroCursor::new(&value_branch_row);
    decoder.decode(&mut first_cursor).unwrap();
    decoder.decode(&mut second_cursor).unwrap();
    // Each row must be consumed in full, whichever branch was skipped.
    assert_eq!(first_cursor.position(), null_branch_row.len());
    assert_eq!(second_cursor.position(), value_branch_row.len());

    let flushed = decoder.flush(None).unwrap();
    let record = flushed.as_any().downcast_ref::<StructArray>().unwrap();
    let id_column = record.column_by_name("id").unwrap();
    let id = id_column.as_any().downcast_ref::<Int32Array>().unwrap();
    assert_eq!(id.len(), 2);
    assert_eq!(id.value(0), 5);
    assert_eq!(id.value(1), 7);
}
1948 | | } |