Coverage Report

Created: 2025-08-26 07:03

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/Users/andrewlamb/Software/arrow-rs/arrow-avro/src/codec.rs
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
use crate::schema::{
19
    Attributes, AvroSchema, ComplexType, PrimitiveType, Record, Schema, Type, TypeName,
20
    AVRO_ENUM_SYMBOLS_METADATA_KEY,
21
};
22
use arrow_schema::{
23
    ArrowError, DataType, Field, Fields, IntervalUnit, TimeUnit, DECIMAL128_MAX_PRECISION,
24
    DECIMAL128_MAX_SCALE,
25
};
26
use serde_json::Value;
27
use std::borrow::Cow;
28
use std::collections::HashMap;
29
use std::sync::Arc;
30
31
/// Avro types are not nullable, with nullability instead encoded as a union
32
/// where one of the variants is the null type.
33
///
34
/// To accommodate this we special case two-variant unions where one of the
35
/// variants is the null type, and use this to derive arrow's notion of nullability
36
#[derive(Debug, Copy, Clone, PartialEq)]
37
pub enum Nullability {
38
    /// The nulls are encoded as the first union variant
39
    NullFirst,
40
    /// The nulls are encoded as the second union variant
41
    NullSecond,
42
}
43
44
/// Contains information about how to resolve differences between a writer's and a reader's schema.
45
#[derive(Debug, Clone, PartialEq)]
46
pub(crate) enum ResolutionInfo {
47
    /// Indicates that the writer's type should be promoted to the reader's type.
48
    Promotion(Promotion),
49
    /// Indicates that a default value should be used for a field. (Implemented in a Follow-up PR)
50
    DefaultValue(AvroLiteral),
51
    /// Provides mapping information for resolving enums. (Implemented in a Follow-up PR)
52
    EnumMapping(EnumMapping),
53
    /// Provides resolution information for record fields. (Implemented in a Follow-up PR)
54
    Record(ResolvedRecord),
55
}
56
57
/// Represents a literal Avro value.
58
///
59
/// This is used to represent default values in an Avro schema.
60
#[derive(Debug, Clone, PartialEq)]
61
pub(crate) enum AvroLiteral {
62
    /// Represents a null value.
63
    Null,
64
    /// Represents a boolean value.
65
    Boolean(bool),
66
    /// Represents an integer value.
67
    Int(i32),
68
    /// Represents a long value.
69
    Long(i64),
70
    /// Represents a float value.
71
    Float(f32),
72
    /// Represents a double value.
73
    Double(f64),
74
    /// Represents a bytes value.
75
    Bytes(Vec<u8>),
76
    /// Represents a string value.
77
    String(String),
78
    /// Represents an enum symbol.
79
    Enum(String),
80
    /// Represents an unsupported literal type.
81
    Unsupported,
82
}
83
84
/// Contains the necessary information to resolve a writer's record against a reader's record schema.
85
#[derive(Debug, Clone, PartialEq)]
86
pub struct ResolvedRecord {
87
    /// Maps a writer's field index to the corresponding reader's field index.
88
    /// `None` if the writer's field is not present in the reader's schema.
89
    pub(crate) writer_to_reader: Arc<[Option<usize>]>,
90
    /// A list of indices in the reader's schema for fields that have a default value.
91
    pub(crate) default_fields: Arc<[usize]>,
92
    /// For fields present in the writer's schema but not the reader's, this stores their data type.
93
    /// This is needed to correctly skip over these fields during deserialization.
94
    pub(crate) skip_fields: Arc<[Option<AvroDataType>]>,
95
}
96
97
/// Defines the type of promotion to be applied during schema resolution.
98
///
99
/// Schema resolution may require promoting a writer's data type to a reader's data type.
100
/// For example, an `int` can be promoted to a `long`, `float`, or `double`.
101
#[derive(Debug, Clone, PartialEq, Eq)]
102
pub(crate) enum Promotion {
103
    /// Promotes an `int` to a `long`.
104
    IntToLong,
105
    /// Promotes an `int` to a `float`.
106
    IntToFloat,
107
    /// Promotes an `int` to a `double`.
108
    IntToDouble,
109
    /// Promotes a `long` to a `float`.
110
    LongToFloat,
111
    /// Promotes a `long` to a `double`.
112
    LongToDouble,
113
    /// Promotes a `float` to a `double`.
114
    FloatToDouble,
115
    /// Promotes a `string` to `bytes`.
116
    StringToBytes,
117
    /// Promotes `bytes` to a `string`.
118
    BytesToString,
119
}
120
121
/// Holds the mapping information for resolving Avro enums.
122
///
123
/// When resolving schemas, the writer's enum symbols must be mapped to the reader's symbols.
124
#[derive(Debug, Clone, PartialEq, Eq)]
125
pub struct EnumMapping {
126
    /// A mapping from the writer's symbol index to the reader's symbol index.
127
    pub(crate) mapping: Arc<[i32]>,
128
    /// The index to use for a writer's symbol that is not present in the reader's enum
129
    /// and a default value is specified in the reader's schema.
130
    pub(crate) default_index: i32,
131
}
132
133
#[cfg(feature = "canonical_extension_types")]
134
fn with_extension_type(codec: &Codec, field: Field) -> Field {
135
    match codec {
136
        Codec::Uuid => field.with_extension_type(arrow_schema::extension::Uuid),
137
        _ => field,
138
    }
139
}
140
141
/// An Avro datatype mapped to the arrow data model
142
#[derive(Debug, Clone, PartialEq)]
143
pub struct AvroDataType {
144
    nullability: Option<Nullability>,
145
    metadata: HashMap<String, String>,
146
    codec: Codec,
147
    pub(crate) resolution: Option<ResolutionInfo>,
148
}
149
150
impl AvroDataType {
151
    /// Create a new [`AvroDataType`] with the given parts.
152
610
    pub fn new(
153
610
        codec: Codec,
154
610
        metadata: HashMap<String, String>,
155
610
        nullability: Option<Nullability>,
156
610
    ) -> Self {
157
610
        AvroDataType {
158
610
            codec,
159
610
            metadata,
160
610
            nullability,
161
610
            resolution: None,
162
610
        }
163
610
    }
164
165
    #[inline]
166
29
    fn new_with_resolution(
167
29
        codec: Codec,
168
29
        metadata: HashMap<String, String>,
169
29
        nullability: Option<Nullability>,
170
29
        resolution: Option<ResolutionInfo>,
171
29
    ) -> Self {
172
29
        Self {
173
29
            codec,
174
29
            metadata,
175
29
            nullability,
176
29
            resolution,
177
29
        }
178
29
    }
179
180
    /// Returns an arrow [`Field`] with the given name
181
907
    pub fn field_with_name(&self, name: &str) -> Field {
182
907
        let nullable = self.nullability.is_some();
183
907
        let data_type = self.codec.data_type();
184
907
        let field = Field::new(name, data_type, nullable).with_metadata(self.metadata.clone());
185
        #[cfg(feature = "canonical_extension_types")]
186
        return with_extension_type(&self.codec, field);
187
        #[cfg(not(feature = "canonical_extension_types"))]
188
907
        field
189
907
    }
190
191
    /// Returns a reference to the codec used by this data type
192
    ///
193
    /// The codec determines how Avro data is encoded and mapped to Arrow data types.
194
    /// This is useful when we need to inspect or use the specific encoding of a field.
195
808
    pub fn codec(&self) -> &Codec {
196
808
        &self.codec
197
808
    }
198
199
    /// Returns the nullability status of this data type
200
    ///
201
    /// In Avro, nullability is represented through unions with null types.
202
    /// The returned value indicates how nulls are encoded in the Avro format:
203
    /// - `Some(Nullability::NullFirst)` - Nulls are encoded as the first union variant
204
    /// - `Some(Nullability::NullSecond)` - Nulls are encoded as the second union variant
205
    /// - `None` - The type is not nullable
206
697
    pub fn nullability(&self) -> Option<Nullability> {
207
697
        self.nullability
208
697
    }
209
}
210
211
/// A named [`AvroDataType`]
212
#[derive(Debug, Clone, PartialEq)]
213
pub struct AvroField {
214
    name: String,
215
    data_type: AvroDataType,
216
}
217
218
impl AvroField {
219
    /// Returns the arrow [`Field`]
220
737
    pub fn field(&self) -> Field {
221
737
        self.data_type.field_with_name(&self.name)
222
737
    }
223
224
    /// Returns the [`AvroDataType`]
225
693
    pub fn data_type(&self) -> &AvroDataType {
226
693
        &self.data_type
227
693
    }
228
229
    /// Returns a new [`AvroField`] with Utf8View support enabled
230
    ///
231
    /// This will convert any Utf8 codecs to Utf8View codecs. This method is used to
232
    /// enable potential performance optimizations in string-heavy workloads by using
233
    /// Arrow's StringViewArray data structure.
234
    ///
235
    /// Returns a new `AvroField` with the same structure, but with string types
236
    /// converted to use `Utf8View` instead of `Utf8`.
237
0
    pub fn with_utf8view(&self) -> Self {
238
0
        let mut field = self.clone();
239
0
        if let Codec::Utf8 = field.data_type.codec {
240
0
            field.data_type.codec = Codec::Utf8View;
241
0
        }
242
0
        field
243
0
    }
244
245
    /// Returns the name of this Avro field
246
    ///
247
    /// This is the field name as defined in the Avro schema.
248
    /// It's used to identify fields within a record structure.
249
9
    pub fn name(&self) -> &str {
250
9
        &self.name
251
9
    }
252
253
    /// Performs schema resolution between a writer and reader schema.
254
    ///
255
    /// This is the primary entry point for handling schema evolution. It produces an
256
    /// `AvroField` that contains all the necessary information to read data written
257
    /// with the `writer` schema as if it were written with the `reader` schema.
258
11
    pub(crate) fn resolve_from_writer_and_reader<'a>(
259
11
        writer_schema: &'a Schema<'a>,
260
11
        reader_schema: &'a Schema<'a>,
261
11
        use_utf8view: bool,
262
11
        strict_mode: bool,
263
11
    ) -> Result<Self, ArrowError> {
264
11
        let top_name = match 
reader_schema0
{
265
0
            Schema::Complex(ComplexType::Record(r)) => r.name.to_string(),
266
11
            _ => "root".to_string(),
267
        };
268
11
        let mut resolver = Maker::new(use_utf8view, strict_mode);
269
11
        let 
data_type10
= resolver.make_data_type(writer_schema, Some(reader_schema), None)
?1
;
270
10
        Ok(Self {
271
10
            name: top_name,
272
10
            data_type,
273
10
        })
274
11
    }
275
}
276
277
impl<'a> TryFrom<&Schema<'a>> for AvroField {
278
    type Error = ArrowError;
279
280
4
    fn try_from(schema: &Schema<'a>) -> Result<Self, Self::Error> {
281
4
        match schema {
282
4
            Schema::Complex(ComplexType::Record(r)) => {
283
4
                let mut resolver = Maker::new(false, false);
284
4
                let 
data_type3
= resolver.make_data_type(schema, None, None)
?1
;
285
3
                Ok(AvroField {
286
3
                    data_type,
287
3
                    name: r.name.to_string(),
288
3
                })
289
            }
290
0
            _ => Err(ArrowError::ParseError(format!(
291
0
                "Expected record got {schema:?}"
292
0
            ))),
293
        }
294
4
    }
295
}
296
297
/// Builder for an [`AvroField`]
298
#[derive(Debug)]
299
pub struct AvroFieldBuilder<'a> {
300
    writer_schema: &'a Schema<'a>,
301
    reader_schema: Option<&'a Schema<'a>>,
302
    use_utf8view: bool,
303
    strict_mode: bool,
304
}
305
306
impl<'a> AvroFieldBuilder<'a> {
307
    /// Creates a new [`AvroFieldBuilder`] for a given writer schema.
308
93
    pub fn new(writer_schema: &'a Schema<'a>) -> Self {
309
93
        Self {
310
93
            writer_schema,
311
93
            reader_schema: None,
312
93
            use_utf8view: false,
313
93
            strict_mode: false,
314
93
        }
315
93
    }
316
317
    /// Sets the reader schema for schema resolution.
318
    ///
319
    /// If a reader schema is provided, the builder will produce a resolved `AvroField`
320
    /// that can handle differences between the writer's and reader's schemas.
321
    #[inline]
322
30
    pub fn with_reader_schema(mut self, reader_schema: &'a Schema<'a>) -> Self {
323
30
        self.reader_schema = Some(reader_schema);
324
30
        self
325
30
    }
326
327
    /// Enable or disable Utf8View support
328
92
    pub fn with_utf8view(mut self, use_utf8view: bool) -> Self {
329
92
        self.use_utf8view = use_utf8view;
330
92
        self
331
92
    }
332
333
    /// Enable or disable strict mode.
334
92
    pub fn with_strict_mode(mut self, strict_mode: bool) -> Self {
335
92
        self.strict_mode = strict_mode;
336
92
        self
337
92
    }
338
339
    /// Build an [`AvroField`] from the builder
340
93
    pub fn build(self) -> Result<AvroField, ArrowError> {
341
93
        match self.writer_schema {
342
93
            Schema::Complex(ComplexType::Record(r)) => {
343
93
                let mut resolver = Maker::new(self.use_utf8view, self.strict_mode);
344
90
                let data_type =
345
93
                    resolver.make_data_type(self.writer_schema, self.reader_schema, None)
?3
;
346
90
                Ok(AvroField {
347
90
                    name: r.name.to_string(),
348
90
                    data_type,
349
90
                })
350
            }
351
0
            _ => Err(ArrowError::ParseError(format!(
352
0
                "Expected a Record schema to build an AvroField, but got {:?}",
353
0
                self.writer_schema
354
0
            ))),
355
        }
356
93
    }
357
}
358
359
/// An Avro encoding
360
///
361
/// <https://avro.apache.org/docs/1.11.1/specification/#encodings>
362
#[derive(Debug, Clone, PartialEq)]
363
pub enum Codec {
364
    /// Represents Avro null type, maps to Arrow's Null data type
365
    Null,
366
    /// Represents Avro boolean type, maps to Arrow's Boolean data type
367
    Boolean,
368
    /// Represents Avro int type, maps to Arrow's Int32 data type
369
    Int32,
370
    /// Represents Avro long type, maps to Arrow's Int64 data type
371
    Int64,
372
    /// Represents Avro float type, maps to Arrow's Float32 data type
373
    Float32,
374
    /// Represents Avro double type, maps to Arrow's Float64 data type
375
    Float64,
376
    /// Represents Avro bytes type, maps to Arrow's Binary data type
377
    Binary,
378
    /// String data represented as UTF-8 encoded bytes, corresponding to Arrow's StringArray
379
    Utf8,
380
    /// String data represented as UTF-8 encoded bytes with an optimized view representation,
381
    /// corresponding to Arrow's StringViewArray which provides better performance for string operations
382
    ///
383
    /// The Utf8View option can be enabled via `ReadOptions::use_utf8view`.
384
    Utf8View,
385
    /// Represents Avro date logical type, maps to Arrow's Date32 data type
386
    Date32,
387
    /// Represents Avro time-millis logical type, maps to Arrow's Time32(TimeUnit::Millisecond) data type
388
    TimeMillis,
389
    /// Represents Avro time-micros logical type, maps to Arrow's Time64(TimeUnit::Microsecond) data type
390
    TimeMicros,
391
    /// Represents Avro timestamp-millis or local-timestamp-millis logical type
392
    ///
393
    /// Maps to Arrow's Timestamp(TimeUnit::Millisecond) data type
394
    /// The boolean parameter indicates whether the timestamp has a UTC timezone (true) or is local time (false)
395
    TimestampMillis(bool),
396
    /// Represents Avro timestamp-micros or local-timestamp-micros logical type
397
    ///
398
    /// Maps to Arrow's Timestamp(TimeUnit::Microsecond) data type
399
    /// The boolean parameter indicates whether the timestamp has a UTC timezone (true) or is local time (false)
400
    TimestampMicros(bool),
401
    /// Represents Avro fixed type, maps to Arrow's FixedSizeBinary data type
402
    /// The i32 parameter indicates the fixed binary size
403
    Fixed(i32),
404
    /// Represents Avro decimal type, maps to Arrow's Decimal128 or Decimal256 data types
405
    ///
406
    /// The fields are `(precision, scale, fixed_size)`.
407
    /// - `precision` (`usize`): Total number of digits.
408
    /// - `scale` (`Option<usize>`): Number of fractional digits.
409
    /// - `fixed_size` (`Option<usize>`): Size in bytes if backed by a `fixed` type, otherwise `None`.
410
    Decimal(usize, Option<usize>, Option<usize>),
411
    /// Represents Avro Uuid type, a FixedSizeBinary with a length of 16.
412
    Uuid,
413
    /// Represents an Avro enum, maps to Arrow's Dictionary(Int32, Utf8) type.
414
    ///
415
    /// The enclosed value contains the enum's symbols.
416
    Enum(Arc<[String]>),
417
    /// Represents Avro array type, maps to Arrow's List data type
418
    List(Arc<AvroDataType>),
419
    /// Represents Avro record type, maps to Arrow's Struct data type
420
    Struct(Arc<[AvroField]>),
421
    /// Represents Avro map type, maps to Arrow's Map data type
422
    Map(Arc<AvroDataType>),
423
    /// Represents Avro duration logical type, maps to Arrow's Interval(IntervalUnit::MonthDayNano) data type
424
    Interval,
425
}
426
427
impl Codec {
428
939
    fn data_type(&self) -> DataType {
429
939
        match self {
430
0
            Self::Null => DataType::Null,
431
47
            Self::Boolean => DataType::Boolean,
432
252
            Self::Int32 => DataType::Int32,
433
68
            Self::Int64 => DataType::Int64,
434
51
            Self::Float32 => DataType::Float32,
435
94
            Self::Float64 => DataType::Float64,
436
60
            Self::Binary => DataType::Binary,
437
77
            Self::Utf8 => DataType::Utf8,
438
0
            Self::Utf8View => DataType::Utf8View,
439
0
            Self::Date32 => DataType::Date32,
440
0
            Self::TimeMillis => DataType::Time32(TimeUnit::Millisecond),
441
0
            Self::TimeMicros => DataType::Time64(TimeUnit::Microsecond),
442
0
            Self::TimestampMillis(is_utc) => {
443
0
                DataType::Timestamp(TimeUnit::Millisecond, is_utc.then(|| "+00:00".into()))
444
            }
445
40
            Self::TimestampMicros(is_utc) => {
446
40
                DataType::Timestamp(TimeUnit::Microsecond, is_utc.then(|| "+00:00".into()))
447
            }
448
1
            Self::Interval => DataType::Interval(IntervalUnit::MonthDayNano),
449
6
            Self::Fixed(size) => DataType::FixedSizeBinary(*size),
450
8
            Self::Decimal(precision, scale, size) => {
451
8
                let p = *precision as u8;
452
8
                let s = scale.unwrap_or(0) as i8;
453
8
                let too_large_for_128 = match *size {
454
8
                    Some(sz) => sz > 16,
455
                    None => {
456
0
                        (p as usize) > DECIMAL128_MAX_PRECISION as usize
457
0
                            || (s as usize) > DECIMAL128_MAX_SCALE as usize
458
                    }
459
                };
460
8
                if too_large_for_128 {
461
0
                    DataType::Decimal256(p, s)
462
                } else {
463
8
                    DataType::Decimal128(p, s)
464
                }
465
            }
466
1
            Self::Uuid => DataType::FixedSizeBinary(16),
467
            Self::Enum(_) => {
468
16
                DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8))
469
            }
470
102
            Self::List(f) => {
471
102
                DataType::List(Arc::new(f.field_with_name(Field::LIST_FIELD_DEFAULT_NAME)))
472
            }
473
153
            Self::Struct(
f96
) => DataType::Struct(
f.iter()96
.
map96
(|x| x.field()).
collect96
()),
474
20
            Self::Map(value_type) => {
475
20
                let val_dt = value_type.codec.data_type();
476
20
                let val_field = Field::new("value", val_dt, value_type.nullability.is_some())
477
20
                    .with_metadata(value_type.metadata.clone());
478
20
                DataType::Map(
479
20
                    Arc::new(Field::new(
480
20
                        "entries",
481
20
                        DataType::Struct(Fields::from(vec![
482
20
                            Field::new("key", DataType::Utf8, false),
483
20
                            val_field,
484
20
                        ])),
485
20
                        false,
486
20
                    )),
487
20
                    false,
488
20
                )
489
            }
490
        }
491
939
    }
492
}
493
494
impl From<PrimitiveType> for Codec {
495
585
    fn from(value: PrimitiveType) -> Self {
496
585
        match value {
497
0
            PrimitiveType::Null => Self::Null,
498
45
            PrimitiveType::Boolean => Self::Boolean,
499
190
            PrimitiveType::Int => Self::Int32,
500
108
            PrimitiveType::Long => Self::Int64,
501
53
            PrimitiveType::Float => Self::Float32,
502
78
            PrimitiveType::Double => Self::Float64,
503
66
            PrimitiveType::Bytes => Self::Binary,
504
45
            PrimitiveType::String => Self::Utf8,
505
        }
506
585
    }
507
}
508
509
8
fn parse_decimal_attributes(
510
8
    attributes: &Attributes,
511
8
    fallback_size: Option<usize>,
512
8
    precision_required: bool,
513
8
) -> Result<(usize, usize, Option<usize>), ArrowError> {
514
8
    let precision = attributes
515
8
        .additional
516
8
        .get("precision")
517
8
        .and_then(|v| v.as_u64())
518
8
        .or(if precision_required { None } else { 
Some(10)0
})
519
8
        .ok_or_else(|| ArrowError::ParseError(
"Decimal requires precision"0
.
to_string0
()))
?0
520
        as usize;
521
8
    let scale = attributes
522
8
        .additional
523
8
        .get("scale")
524
8
        .and_then(|v| v.as_u64())
525
8
        .unwrap_or(0) as usize;
526
8
    let size = attributes
527
8
        .additional
528
8
        .get("size")
529
8
        .and_then(|v| 
v0
.
as_u640
())
530
8
        .map(|s| 
s0
as usize)
531
8
        .or(fallback_size);
532
8
    Ok((precision, scale, size))
533
8
}
534
535
impl Codec {
536
    /// Converts a string codec to use Utf8View if requested
537
    ///
538
    /// The conversion only happens if both:
539
    /// 1. `use_utf8view` is true
540
    /// 2. The codec is currently `Utf8`
541
    ///
542
    /// # Example
543
    /// ```
544
    /// # use arrow_avro::codec::Codec;
545
    /// let utf8_codec1 = Codec::Utf8;
546
    /// let utf8_codec2 = Codec::Utf8;
547
    ///
548
    /// // Convert to Utf8View
549
    /// let view_codec = utf8_codec1.with_utf8view(true);
550
    /// assert!(matches!(view_codec, Codec::Utf8View));
551
    ///
552
    /// // Don't convert if use_utf8view is false
553
    /// let unchanged_codec = utf8_codec2.with_utf8view(false);
554
    /// assert!(matches!(unchanged_codec, Codec::Utf8));
555
    /// ```
556
585
    pub fn with_utf8view(self, use_utf8view: bool) -> Self {
557
585
        if use_utf8view && 
matches!0
(
self3
, Self::Utf8) {
558
3
            Self::Utf8View
559
        } else {
560
582
            self
561
        }
562
585
    }
563
}
564
565
/// Resolves Avro type names to [`AvroDataType`]
566
///
567
/// See <https://avro.apache.org/docs/1.11.1/specification/#names>
568
#[derive(Debug, Default)]
569
struct Resolver<'a> {
570
    map: HashMap<(&'a str, &'a str), AvroDataType>,
571
}
572
573
impl<'a> Resolver<'a> {
574
155
    fn register(&mut self, name: &'a str, namespace: Option<&'a str>, schema: AvroDataType) {
575
155
        self.map.insert((namespace.unwrap_or(""), name), schema);
576
155
    }
577
578
11
    fn resolve(&self, name: &str, namespace: Option<&'a str>) -> Result<AvroDataType, ArrowError> {
579
11
        let (namespace, name) = name
580
11
            .rsplit_once('.')
581
11
            .unwrap_or_else(|| (
namespace8
.unwrap_or(""),
name8
));
582
583
11
        self.map
584
11
            .get(&(namespace, name))
585
11
            .ok_or_else(|| ArrowError::ParseError(
format!1
(
"Failed to resolve {namespace}.{name}"1
)))
586
11
            .cloned()
587
11
    }
588
}
589
590
/// Resolves Avro type names to [`AvroDataType`]
591
///
592
/// See <https://avro.apache.org/docs/1.11.1/specification/#names>
593
struct Maker<'a> {
594
    resolver: Resolver<'a>,
595
    use_utf8view: bool,
596
    strict_mode: bool,
597
}
598
599
impl<'a> Maker<'a> {
600
132
    fn new(use_utf8view: bool, strict_mode: bool) -> Self {
601
132
        Self {
602
132
            resolver: Default::default(),
603
132
            use_utf8view,
604
132
            strict_mode,
605
132
        }
606
132
    }
607
485
    fn make_data_type<'s>(
608
485
        &mut self,
609
485
        writer_schema: &'s Schema<'a>,
610
485
        reader_schema: Option<&'s Schema<'a>>,
611
485
        namespace: Option<&'a str>,
612
485
    ) -> Result<AvroDataType, ArrowError> {
613
485
        match reader_schema {
614
404
            Some(reader_schema) => self.resolve_type(writer_schema, reader_schema, namespace),
615
81
            None => self.parse_type(writer_schema, namespace),
616
        }
617
485
    }
618
619
    /// Parses a [`AvroDataType`] from the provided [`Schema`] and the given `name` and `namespace`
620
    ///
621
    /// `name`: is the name used to refer to `schema` in its parent
622
    /// `namespace`: an optional qualifier used as part of a type hierarchy
623
    /// If the data type is a string, convert to use Utf8View if requested
624
    ///
625
    /// This function is used during the schema conversion process to determine whether
626
    /// string data should be represented as StringArray (default) or StringViewArray.
627
    ///
628
    /// `use_utf8view`: if true, use Utf8View instead of Utf8 for string types
629
    ///
630
    /// See [`Resolver`] for more information
631
1.28k
    fn parse_type<'s>(
632
1.28k
        &mut self,
633
1.28k
        schema: &'s Schema<'a>,
634
1.28k
        namespace: Option<&'a str>,
635
1.28k
    ) -> Result<AvroDataType, ArrowError> {
636
596
        match schema {
637
585
            Schema::TypeName(TypeName::Primitive(p)) => Ok(AvroDataType::new(
638
585
                Codec::from(*p).with_utf8view(self.use_utf8view),
639
585
                Default::default(),
640
585
                None,
641
585
            )),
642
11
            Schema::TypeName(TypeName::Ref(name)) => self.resolver.resolve(name, namespace),
643
449
            Schema::Union(f) => {
644
                // Special case the common case of nullable primitives
645
449
                let null = f
646
449
                    .iter()
647
819
                    .
position449
(|x| x == &Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)));
648
449
                match (f.len() == 2, null) {
649
                    (true, Some(0)) => {
650
79
                        let 
mut field78
= self.parse_type(&f[1], namespace)
?1
;
651
78
                        field.nullability = Some(Nullability::NullFirst);
652
78
                        Ok(field)
653
                    }
654
                    (true, Some(1)) => {
655
370
                        if self.strict_mode {
656
3
                            return Err(ArrowError::SchemaError(
657
3
                                "Found Avro union of the form ['T','null'], which is disallowed in strict_mode"
658
3
                                    .to_string(),
659
3
                            ));
660
367
                        }
661
367
                        let mut field = self.parse_type(&f[0], namespace)
?0
;
662
367
                        field.nullability = Some(Nullability::NullSecond);
663
367
                        Ok(field)
664
                    }
665
0
                    _ => Err(ArrowError::NotYetImplemented(format!(
666
0
                        "Union of {f:?} not currently supported"
667
0
                    ))),
668
                }
669
            }
670
194
            Schema::Complex(c) => match c {
671
106
                ComplexType::Record(r) => {
672
106
                    let namespace = r.namespace.or(namespace);
673
106
                    let 
fields103
= r
674
106
                        .fields
675
106
                        .iter()
676
428
                        .
map106
(|field| {
677
                            Ok(AvroField {
678
428
                                name: field.name.to_string(),
679
428
                                data_type: self.parse_type(&field.r#type, namespace)
?3
,
680
                            })
681
428
                        })
682
106
                        .collect::<Result<_, ArrowError>>()
?3
;
683
103
                    let field = AvroDataType {
684
103
                        nullability: None,
685
103
                        codec: Codec::Struct(fields),
686
103
                        metadata: r.attributes.field_metadata(),
687
103
                        resolution: None,
688
103
                    };
689
103
                    self.resolver.register(r.name, namespace, field.clone());
690
103
                    Ok(field)
691
                }
692
51
                ComplexType::Array(a) => {
693
51
                    let mut field = self.parse_type(a.items.as_ref(), namespace)
?0
;
694
51
                    Ok(AvroDataType {
695
51
                        nullability: None,
696
51
                        metadata: a.attributes.field_metadata(),
697
51
                        codec: Codec::List(Arc::new(field)),
698
51
                        resolution: None,
699
51
                    })
700
                }
701
15
                ComplexType::Fixed(f) => {
702
15
                    let size = f.size.try_into().map_err(|e| 
{0
703
0
                        ArrowError::ParseError(format!("Overflow converting size to i32: {e}"))
704
0
                    })?;
705
15
                    let md = f.attributes.field_metadata();
706
15
                    let field = match f.attributes.logical_type {
707
9
                        Some("decimal") => {
708
8
                            let (precision, scale, _) =
709
8
                                parse_decimal_attributes(&f.attributes, Some(size as usize), true)
?0
;
710
8
                            AvroDataType {
711
8
                                nullability: None,
712
8
                                metadata: md,
713
8
                                codec: Codec::Decimal(precision, Some(scale), Some(size as usize)),
714
8
                                resolution: None,
715
8
                            }
716
                        }
717
1
                        Some("duration") => {
718
1
                            if size != 12 {
719
0
                                return Err(ArrowError::ParseError(format!(
720
0
                                    "Invalid fixed size for Duration: {size}, must be 12"
721
0
                                )));
722
1
                            };
723
1
                            AvroDataType {
724
1
                                nullability: None,
725
1
                                metadata: md,
726
1
                                codec: Codec::Interval,
727
1
                                resolution: None,
728
1
                            }
729
                        }
730
6
                        _ => AvroDataType {
731
6
                            nullability: None,
732
6
                            metadata: md,
733
6
                            codec: Codec::Fixed(size),
734
6
                            resolution: None,
735
6
                        },
736
                    };
737
15
                    self.resolver.register(f.name, namespace, field.clone());
738
15
                    Ok(field)
739
                }
740
8
                ComplexType::Enum(e) => {
741
8
                    let namespace = e.namespace.or(namespace);
742
8
                    let symbols = e
743
8
                        .symbols
744
8
                        .iter()
745
28
                        .
map8
(|s| s.to_string())
746
8
                        .collect::<Arc<[String]>>();
747
748
8
                    let mut metadata = e.attributes.field_metadata();
749
8
                    let symbols_json = serde_json::to_string(&e.symbols).map_err(|e| 
{0
750
0
                        ArrowError::ParseError(format!("Failed to serialize enum symbols: {e}"))
751
0
                    })?;
752
8
                    metadata.insert(AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(), symbols_json);
753
8
                    let field = AvroDataType {
754
8
                        nullability: None,
755
8
                        metadata,
756
8
                        codec: Codec::Enum(symbols),
757
8
                        resolution: None,
758
8
                    };
759
8
                    self.resolver.register(e.name, namespace, field.clone());
760
8
                    Ok(field)
761
                }
762
14
                ComplexType::Map(m) => {
763
14
                    let val = self.parse_type(&m.values, namespace)
?0
;
764
14
                    Ok(AvroDataType {
765
14
                        nullability: None,
766
14
                        metadata: m.attributes.field_metadata(),
767
14
                        codec: Codec::Map(Arc::new(val)),
768
14
                        resolution: None,
769
14
                    })
770
                }
771
            },
772
50
            Schema::Type(t) => {
773
50
                let mut field = self.parse_type(&Schema::TypeName(t.r#type.clone()), namespace)
?0
;
774
                // https://avro.apache.org/docs/1.11.1/specification/#logical-types
775
50
                match (t.attributes.logical_type, &mut field.codec) {
776
0
                    (Some("decimal"), c @ Codec::Binary) => {
777
0
                        let (prec, sc, _) = parse_decimal_attributes(&t.attributes, None, false)?;
778
0
                        *c = Codec::Decimal(prec, Some(sc), None);
779
                    }
780
3
                    (Some("date"), 
c1
@ Codec::Int32) =>
*c = Codec::Date321
,
781
2
                    (Some("time-millis"), 
c1
@ Codec::Int32) =>
*c = Codec::TimeMillis1
,
782
46
                    (Some("time-micros"), 
c1
@ Codec::Int64) =>
*c = Codec::TimeMicros1
,
783
45
                    (Some("timestamp-millis"), 
c1
@ Codec::Int64) => {
784
1
                        *c = Codec::TimestampMillis(true)
785
                    }
786
44
                    (Some("timestamp-micros"), 
c42
@ Codec::Int64) => {
787
42
                        *c = Codec::TimestampMicros(true)
788
                    }
789
2
                    (Some("local-timestamp-millis"), 
c1
@ Codec::Int64) => {
790
1
                        *c = Codec::TimestampMillis(false)
791
                    }
792
1
                    (Some("local-timestamp-micros"), c @ Codec::Int64) => {
793
1
                        *c = Codec::TimestampMicros(false)
794
                    }
795
1
                    (Some("uuid"), c @ Codec::Utf8) => *c = Codec::Uuid,
796
1
                    (Some(logical), _) => {
797
1
                        // Insert unrecognized logical type into metadata map
798
1
                        field.metadata.insert("logicalType".into(), logical.into());
799
1
                    }
800
0
                    (None, _) => {}
801
                }
802
50
                if !t.attributes.additional.is_empty() {
803
0
                    for (k, v) in &t.attributes.additional {
804
0
                        field.metadata.insert(k.to_string(), v.to_string());
805
0
                    }
806
50
                }
807
50
                Ok(field)
808
            }
809
        }
810
1.28k
    }
811
812
404
    fn resolve_type<'s>(
813
404
        &mut self,
814
404
        writer_schema: &'s Schema<'a>,
815
404
        reader_schema: &'s Schema<'a>,
816
404
        namespace: Option<&'a str>,
817
404
    ) -> Result<AvroDataType, ArrowError> {
818
404
        match (writer_schema, reader_schema) {
819
            (
820
187
                Schema::TypeName(TypeName::Primitive(writer_primitive)),
821
187
                Schema::TypeName(TypeName::Primitive(reader_primitive)),
822
187
            ) => self.resolve_primitives(*writer_primitive, *reader_primitive, reader_schema),
823
            (
824
                Schema::Type(Type {
825
16
                    r#type: TypeName::Primitive(writer_primitive),
826
                    ..
827
                }),
828
                Schema::Type(Type {
829
16
                    r#type: TypeName::Primitive(reader_primitive),
830
                    ..
831
                }),
832
16
            ) => self.resolve_primitives(*writer_primitive, *reader_primitive, reader_schema),
833
            (
834
0
                Schema::TypeName(TypeName::Primitive(writer_primitive)),
835
                Schema::Type(Type {
836
0
                    r#type: TypeName::Primitive(reader_primitive),
837
                    ..
838
                }),
839
0
            ) => self.resolve_primitives(*writer_primitive, *reader_primitive, reader_schema),
840
            (
841
                Schema::Type(Type {
842
0
                    r#type: TypeName::Primitive(writer_primitive),
843
                    ..
844
                }),
845
0
                Schema::TypeName(TypeName::Primitive(reader_primitive)),
846
0
            ) => self.resolve_primitives(*writer_primitive, *reader_primitive, reader_schema),
847
            (
848
30
                Schema::Complex(ComplexType::Record(writer_record)),
849
30
                Schema::Complex(ComplexType::Record(reader_record)),
850
30
            ) => self.resolve_records(writer_record, reader_record, namespace),
851
171
            (Schema::Union(writer_variants), Schema::Union(reader_variants)) => {
852
171
                self.resolve_nullable_union(writer_variants, reader_variants, namespace)
853
            }
854
0
            _ => Err(ArrowError::NotYetImplemented(
855
0
                "Other resolutions not yet implemented".to_string(),
856
0
            )),
857
        }
858
404
    }
859
860
203
    fn resolve_primitives(
861
203
        &mut self,
862
203
        write_primitive: PrimitiveType,
863
203
        read_primitive: PrimitiveType,
864
203
        reader_schema: &Schema<'a>,
865
203
    ) -> Result<AvroDataType, ArrowError> {
866
203
        if write_primitive == read_primitive {
867
122
            return self.parse_type(reader_schema, None);
868
81
        }
869
81
        let 
promotion78
= match (write_primitive, read_primitive) {
870
12
            (PrimitiveType::Int, PrimitiveType::Long) => Promotion::IntToLong,
871
7
            (PrimitiveType::Int, PrimitiveType::Float) => Promotion::IntToFloat,
872
13
            (PrimitiveType::Int, PrimitiveType::Double) => Promotion::IntToDouble,
873
7
            (PrimitiveType::Long, PrimitiveType::Float) => Promotion::LongToFloat,
874
7
            (PrimitiveType::Long, PrimitiveType::Double) => Promotion::LongToDouble,
875
7
            (PrimitiveType::Float, PrimitiveType::Double) => Promotion::FloatToDouble,
876
2
            (PrimitiveType::String, PrimitiveType::Bytes) => Promotion::StringToBytes,
877
23
            (PrimitiveType::Bytes, PrimitiveType::String) => Promotion::BytesToString,
878
            _ => {
879
3
                return Err(ArrowError::ParseError(format!(
880
3
                    "Illegal promotion {write_primitive:?} to {read_primitive:?}"
881
3
                )))
882
            }
883
        };
884
78
        let mut datatype = self.parse_type(reader_schema, None)
?0
;
885
78
        datatype.resolution = Some(ResolutionInfo::Promotion(promotion));
886
78
        Ok(datatype)
887
203
    }
888
889
171
    fn resolve_nullable_union(
890
171
        &mut self,
891
171
        writer_variants: &[Schema<'a>],
892
171
        reader_variants: &[Schema<'a>],
893
171
        namespace: Option<&'a str>,
894
171
    ) -> Result<AvroDataType, ArrowError> {
895
        // Only support unions with exactly two branches, one of which is `null` on both sides
896
171
        if writer_variants.len() != 2 || reader_variants.len() != 2 {
897
0
            return Err(ArrowError::NotYetImplemented(
898
0
                "Only 2-branch unions are supported for schema resolution".to_string(),
899
0
            ));
900
171
        }
901
683
        let 
is_null171
= |s: &Schema<'a>| {
902
341
            matches!(
903
651
                s,
904
                Schema::TypeName(TypeName::Primitive(PrimitiveType::Null))
905
            )
906
683
        };
907
171
        let w_null_pos = writer_variants.iter().position(is_null);
908
171
        let r_null_pos = reader_variants.iter().position(is_null);
909
171
        match (w_null_pos, r_null_pos) {
910
171
            (Some(wp), Some(rp)) => {
911
                // Extract a non-null branch on each side
912
171
                let w_nonnull = &writer_variants[1 - wp];
913
171
                let r_nonnull = &reader_variants[1 - rp];
914
                // Resolve the non-null branch
915
171
                let 
mut dt170
= self.make_data_type(w_nonnull, Some(r_nonnull), namespace)
?1
;
916
                // Adopt reader union null ordering
917
170
                dt.nullability = Some(match rp {
918
0
                    0 => Nullability::NullFirst,
919
170
                    1 => Nullability::NullSecond,
920
0
                    _ => unreachable!(),
921
                });
922
170
                Ok(dt)
923
            }
924
0
            _ => Err(ArrowError::NotYetImplemented(
925
0
                "Union resolution requires both writer and reader to be nullable unions"
926
0
                    .to_string(),
927
0
            )),
928
        }
929
171
    }
930
931
30
    fn resolve_records(
932
30
        &mut self,
933
30
        writer_record: &Record<'a>,
934
30
        reader_record: &Record<'a>,
935
30
        namespace: Option<&'a str>,
936
30
    ) -> Result<AvroDataType, ArrowError> {
937
        // Names must match or be aliased
938
30
        let names_match = writer_record.name == reader_record.name
939
0
            || reader_record.aliases.contains(&writer_record.name)
940
0
            || writer_record.aliases.contains(&reader_record.name);
941
30
        if !names_match {
942
0
            return Err(ArrowError::ParseError(format!(
943
0
                "Record name mismatch writer={}, reader={}",
944
0
                writer_record.name, reader_record.name
945
0
            )));
946
30
        }
947
30
        let writer_ns = writer_record.namespace.or(namespace);
948
30
        let reader_ns = reader_record.namespace.or(namespace);
949
        // Map writer field name -> index
950
30
        let mut writer_index_map =
951
30
            HashMap::<&str, usize>::with_capacity(writer_record.fields.len());
952
210
        for (idx, write_field) in 
writer_record.fields.iter()30
.
enumerate30
() {
953
210
            writer_index_map.insert(write_field.name, idx);
954
210
        }
955
        // Prepare outputs
956
30
        let mut reader_fields: Vec<AvroField> = Vec::with_capacity(reader_record.fields.len());
957
30
        let mut writer_to_reader: Vec<Option<usize>> = vec![None; writer_record.fields.len()];
958
30
        let mut skip_fields: Vec<Option<AvroDataType>> = vec![None; writer_record.fields.len()];
959
        //let mut default_fields: Vec<usize> = Vec::new();
960
        // Build reader fields and mapping
961
182
        for (reader_idx, r_field) in 
reader_record.fields.iter()30
.
enumerate30
() {
962
182
            if let Some(&writer_idx) = writer_index_map.get(r_field.name) {
963
                // Field exists in writer: resolve types (including promotions and union-of-null)
964
182
                let w_schema = &writer_record.fields[writer_idx].r#type;
965
181
                let resolved_dt =
966
182
                    self.make_data_type(w_schema, Some(&r_field.r#type), reader_ns)
?1
;
967
181
                reader_fields.push(AvroField {
968
181
                    name: r_field.name.to_string(),
969
181
                    data_type: resolved_dt,
970
181
                });
971
181
                writer_to_reader[writer_idx] = Some(reader_idx);
972
            } else {
973
0
                return Err(ArrowError::NotYetImplemented(
974
0
                    "New fields from reader with default values not yet implemented".to_string(),
975
0
                ));
976
            }
977
        }
978
        // Any writer fields not mapped should be skipped
979
199
        for (writer_idx, writer_field) in 
writer_record.fields.iter()29
.
enumerate29
() {
980
199
            if writer_to_reader[writer_idx].is_none() {
981
                // Parse writer field type to know how to skip data
982
19
                let writer_dt = self.parse_type(&writer_field.r#type, writer_ns)
?0
;
983
19
                skip_fields[writer_idx] = Some(writer_dt);
984
180
            }
985
        }
986
        // Implement writer-only fields to skip in Follow-up PR here
987
        // Build resolved record AvroDataType
988
29
        let resolved = AvroDataType::new_with_resolution(
989
29
            Codec::Struct(Arc::from(reader_fields)),
990
29
            reader_record.attributes.field_metadata(),
991
29
            None,
992
29
            Some(ResolutionInfo::Record(ResolvedRecord {
993
29
                writer_to_reader: Arc::from(writer_to_reader),
994
29
                default_fields: Arc::default(),
995
29
                skip_fields: Arc::from(skip_fields),
996
29
            })),
997
        );
998
        // Register a resolved record by reader name+namespace for potential named type refs
999
29
        self.resolver
1000
29
            .register(reader_record.name, reader_ns, resolved.clone());
1001
29
        Ok(resolved)
1002
30
    }
1003
}
1004
1005
#[cfg(test)]
1006
mod tests {
1007
    use super::*;
1008
    use crate::schema::{Attributes, Fixed, PrimitiveType, Schema, Type, TypeName};
1009
    use serde_json;
1010
1011
8
    fn create_schema_with_logical_type(
1012
8
        primitive_type: PrimitiveType,
1013
8
        logical_type: &'static str,
1014
8
    ) -> Schema<'static> {
1015
8
        let attributes = Attributes {
1016
8
            logical_type: Some(logical_type),
1017
8
            additional: Default::default(),
1018
8
        };
1019
1020
8
        Schema::Type(Type {
1021
8
            r#type: TypeName::Primitive(primitive_type),
1022
8
            attributes,
1023
8
        })
1024
8
    }
1025
1026
0
    fn create_fixed_schema(size: usize, logical_type: &'static str) -> Schema<'static> {
1027
0
        let attributes = Attributes {
1028
0
            logical_type: Some(logical_type),
1029
0
            additional: Default::default(),
1030
0
        };
1031
1032
0
        Schema::Complex(ComplexType::Fixed(Fixed {
1033
0
            name: "fixed_type",
1034
0
            namespace: None,
1035
0
            aliases: Vec::new(),
1036
0
            size,
1037
0
            attributes,
1038
0
        }))
1039
0
    }
1040
1041
7
    fn resolve_promotion(writer: PrimitiveType, reader: PrimitiveType) -> AvroDataType {
1042
7
        let writer_schema = Schema::TypeName(TypeName::Primitive(writer));
1043
7
        let reader_schema = Schema::TypeName(TypeName::Primitive(reader));
1044
7
        let mut maker = Maker::new(false, false);
1045
7
        maker
1046
7
            .make_data_type(&writer_schema, Some(&reader_schema), None)
1047
7
            .expect("promotion should resolve")
1048
7
    }
1049
1050
    #[test]
1051
1
    fn test_date_logical_type() {
1052
1
        let schema = create_schema_with_logical_type(PrimitiveType::Int, "date");
1053
1054
1
        let mut maker = Maker::new(false, false);
1055
1
        let result = maker.make_data_type(&schema, None, None).unwrap();
1056
1057
1
        assert!(
matches!0
(result.codec, Codec::Date32));
1058
1
    }
1059
1060
    #[test]
1061
1
    fn test_time_millis_logical_type() {
1062
1
        let schema = create_schema_with_logical_type(PrimitiveType::Int, "time-millis");
1063
1064
1
        let mut maker = Maker::new(false, false);
1065
1
        let result = maker.make_data_type(&schema, None, None).unwrap();
1066
1067
1
        assert!(
matches!0
(result.codec, Codec::TimeMillis));
1068
1
    }
1069
1070
    #[test]
1071
1
    fn test_time_micros_logical_type() {
1072
1
        let schema = create_schema_with_logical_type(PrimitiveType::Long, "time-micros");
1073
1074
1
        let mut maker = Maker::new(false, false);
1075
1
        let result = maker.make_data_type(&schema, None, None).unwrap();
1076
1077
1
        assert!(
matches!0
(result.codec, Codec::TimeMicros));
1078
1
    }
1079
1080
    #[test]
1081
1
    fn test_timestamp_millis_logical_type() {
1082
1
        let schema = create_schema_with_logical_type(PrimitiveType::Long, "timestamp-millis");
1083
1084
1
        let mut maker = Maker::new(false, false);
1085
1
        let result = maker.make_data_type(&schema, None, None).unwrap();
1086
1087
1
        assert!(matches!(result.codec, Codec::TimestampMillis(true)));
1088
1
    }
1089
1090
    #[test]
1091
1
    fn test_timestamp_micros_logical_type() {
1092
1
        let schema = create_schema_with_logical_type(PrimitiveType::Long, "timestamp-micros");
1093
1094
1
        let mut maker = Maker::new(false, false);
1095
1
        let result = maker.make_data_type(&schema, None, None).unwrap();
1096
1097
1
        assert!(matches!(result.codec, Codec::TimestampMicros(true)));
1098
1
    }
1099
1100
    #[test]
1101
1
    fn test_local_timestamp_millis_logical_type() {
1102
1
        let schema = create_schema_with_logical_type(PrimitiveType::Long, "local-timestamp-millis");
1103
1104
1
        let mut maker = Maker::new(false, false);
1105
1
        let result = maker.make_data_type(&schema, None, None).unwrap();
1106
1107
1
        assert!(
matches!0
(result.codec, Codec::TimestampMillis(false)));
1108
1
    }
1109
1110
    #[test]
1111
1
    fn test_local_timestamp_micros_logical_type() {
1112
1
        let schema = create_schema_with_logical_type(PrimitiveType::Long, "local-timestamp-micros");
1113
1114
1
        let mut maker = Maker::new(false, false);
1115
1
        let result = maker.make_data_type(&schema, None, None).unwrap();
1116
1117
1
        assert!(
matches!0
(result.codec, Codec::TimestampMicros(false)));
1118
1
    }
1119
1120
    #[test]
1121
1
    fn test_uuid_type() {
1122
1
        let mut codec = Codec::Fixed(16);
1123
1
        if let c @ Codec::Fixed(16) = &mut codec {
1124
1
            *c = Codec::Uuid;
1125
1
        
}0
1126
1
        assert!(
matches!0
(codec, Codec::Uuid));
1127
1
    }
1128
1129
    #[test]
1130
1
    fn test_duration_logical_type() {
1131
1
        let mut codec = Codec::Fixed(12);
1132
1133
1
        if let c @ Codec::Fixed(12) = &mut codec {
1134
1
            *c = Codec::Interval;
1135
1
        
}0
1136
1137
1
        assert!(
matches!0
(codec, Codec::Interval));
1138
1
    }
1139
1140
    #[test]
1141
1
    fn test_decimal_logical_type_not_implemented() {
1142
1
        let mut codec = Codec::Fixed(16);
1143
1144
1
        let process_decimal = || -> Result<(), ArrowError> {
1145
1
            if let Codec::Fixed(_) = codec {
1146
1
                return Err(ArrowError::NotYetImplemented(
1147
1
                    "Decimals are not currently supported".to_string(),
1148
1
                ));
1149
0
            }
1150
0
            Ok(())
1151
1
        };
1152
1153
1
        let result = process_decimal();
1154
1155
1
        assert!(result.is_err());
1156
1
        if let Err(ArrowError::NotYetImplemented(msg)) = result {
1157
1
            assert!(msg.contains("Decimals are not currently supported"));
1158
        } else {
1159
0
            panic!("Expected NotYetImplemented error");
1160
        }
1161
1
    }
1162
    #[test]
1163
1
    fn test_unknown_logical_type_added_to_metadata() {
1164
1
        let schema = create_schema_with_logical_type(PrimitiveType::Int, "custom-type");
1165
1166
1
        let mut maker = Maker::new(false, false);
1167
1
        let result = maker.make_data_type(&schema, None, None).unwrap();
1168
1169
1
        assert_eq!(
1170
1
            result.metadata.get("logicalType"),
1171
1
            Some(&"custom-type".to_string())
1172
        );
1173
1
    }
1174
1175
    #[test]
1176
1
    fn test_string_with_utf8view_enabled() {
1177
1
        let schema = Schema::TypeName(TypeName::Primitive(PrimitiveType::String));
1178
1179
1
        let mut maker = Maker::new(true, false);
1180
1
        let result = maker.make_data_type(&schema, None, None).unwrap();
1181
1182
1
        assert!(
matches!0
(result.codec, Codec::Utf8View));
1183
1
    }
1184
1185
    #[test]
1186
1
    fn test_string_without_utf8view_enabled() {
1187
1
        let schema = Schema::TypeName(TypeName::Primitive(PrimitiveType::String));
1188
1189
1
        let mut maker = Maker::new(false, false);
1190
1
        let result = maker.make_data_type(&schema, None, None).unwrap();
1191
1192
1
        assert!(
matches!0
(result.codec, Codec::Utf8));
1193
1
    }
1194
1195
    #[test]
1196
1
    fn test_record_with_string_and_utf8view_enabled() {
1197
1
        let field_schema = Schema::TypeName(TypeName::Primitive(PrimitiveType::String));
1198
1199
1
        let avro_field = crate::schema::Field {
1200
1
            name: "string_field",
1201
1
            r#type: field_schema,
1202
1
            default: None,
1203
1
            doc: None,
1204
1
        };
1205
1206
1
        let record = Record {
1207
1
            name: "test_record",
1208
1
            namespace: None,
1209
1
            aliases: vec![],
1210
1
            doc: None,
1211
1
            fields: vec![avro_field],
1212
1
            attributes: Attributes::default(),
1213
1
        };
1214
1215
1
        let schema = Schema::Complex(ComplexType::Record(record));
1216
1217
1
        let mut maker = Maker::new(true, false);
1218
1
        let result = maker.make_data_type(&schema, None, None).unwrap();
1219
1220
1
        if let Codec::Struct(fields) = &result.codec {
1221
1
            let first_field_codec = &fields[0].data_type().codec;
1222
1
            assert!(
matches!0
(first_field_codec, Codec::Utf8View));
1223
        } else {
1224
0
            panic!("Expected Struct codec");
1225
        }
1226
1
    }
1227
1228
    #[test]
1229
1
    fn test_union_with_strict_mode() {
1230
1
        let schema = Schema::Union(vec![
1231
1
            Schema::TypeName(TypeName::Primitive(PrimitiveType::String)),
1232
1
            Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)),
1233
1
        ]);
1234
1235
1
        let mut maker = Maker::new(false, true);
1236
1
        let result = maker.make_data_type(&schema, None, None);
1237
1238
1
        assert!(result.is_err());
1239
1
        match result {
1240
1
            Err(ArrowError::SchemaError(msg)) => {
1241
1
                assert!(msg.contains(
1242
1
                    "Found Avro union of the form ['T','null'], which is disallowed in strict_mode"
1243
1
                ));
1244
            }
1245
0
            _ => panic!("Expected SchemaError"),
1246
        }
1247
1
    }
1248
1249
    #[test]
1250
1
    fn test_resolve_int_to_float_promotion() {
1251
1
        let result = resolve_promotion(PrimitiveType::Int, PrimitiveType::Float);
1252
1
        assert!(
matches!0
(result.codec, Codec::Float32));
1253
1
        assert_eq!(
1254
            result.resolution,
1255
            Some(ResolutionInfo::Promotion(Promotion::IntToFloat))
1256
        );
1257
1
    }
1258
1259
    #[test]
1260
1
    fn test_resolve_int_to_double_promotion() {
1261
1
        let result = resolve_promotion(PrimitiveType::Int, PrimitiveType::Double);
1262
1
        assert!(
matches!0
(result.codec, Codec::Float64));
1263
1
        assert_eq!(
1264
            result.resolution,
1265
            Some(ResolutionInfo::Promotion(Promotion::IntToDouble))
1266
        );
1267
1
    }
1268
1269
    #[test]
1270
1
    fn test_resolve_long_to_float_promotion() {
1271
1
        let result = resolve_promotion(PrimitiveType::Long, PrimitiveType::Float);
1272
1
        assert!(
matches!0
(result.codec, Codec::Float32));
1273
1
        assert_eq!(
1274
            result.resolution,
1275
            Some(ResolutionInfo::Promotion(Promotion::LongToFloat))
1276
        );
1277
1
    }
1278
1279
    #[test]
1280
1
    fn test_resolve_long_to_double_promotion() {
1281
1
        let result = resolve_promotion(PrimitiveType::Long, PrimitiveType::Double);
1282
1
        assert!(
matches!0
(result.codec, Codec::Float64));
1283
1
        assert_eq!(
1284
            result.resolution,
1285
            Some(ResolutionInfo::Promotion(Promotion::LongToDouble))
1286
        );
1287
1
    }
1288
1289
    #[test]
1290
1
    fn test_resolve_float_to_double_promotion() {
1291
1
        let result = resolve_promotion(PrimitiveType::Float, PrimitiveType::Double);
1292
1
        assert!(
matches!0
(result.codec, Codec::Float64));
1293
1
        assert_eq!(
1294
            result.resolution,
1295
            Some(ResolutionInfo::Promotion(Promotion::FloatToDouble))
1296
        );
1297
1
    }
1298
1299
    #[test]
1300
1
    fn test_resolve_string_to_bytes_promotion() {
1301
1
        let result = resolve_promotion(PrimitiveType::String, PrimitiveType::Bytes);
1302
1
        assert!(
matches!0
(result.codec, Codec::Binary));
1303
1
        assert_eq!(
1304
            result.resolution,
1305
            Some(ResolutionInfo::Promotion(Promotion::StringToBytes))
1306
        );
1307
1
    }
1308
1309
    #[test]
1310
1
    fn test_resolve_bytes_to_string_promotion() {
1311
1
        let result = resolve_promotion(PrimitiveType::Bytes, PrimitiveType::String);
1312
1
        assert!(
matches!0
(result.codec, Codec::Utf8));
1313
1
        assert_eq!(
1314
            result.resolution,
1315
            Some(ResolutionInfo::Promotion(Promotion::BytesToString))
1316
        );
1317
1
    }
1318
1319
    #[test]
1320
1
    fn test_resolve_illegal_promotion_double_to_float_errors() {
1321
1
        let writer_schema = Schema::TypeName(TypeName::Primitive(PrimitiveType::Double));
1322
1
        let reader_schema = Schema::TypeName(TypeName::Primitive(PrimitiveType::Float));
1323
1
        let mut maker = Maker::new(false, false);
1324
1
        let result = maker.make_data_type(&writer_schema, Some(&reader_schema), None);
1325
1
        assert!(result.is_err());
1326
1
        match result {
1327
1
            Err(ArrowError::ParseError(msg)) => {
1328
1
                assert!(msg.contains("Illegal promotion"));
1329
            }
1330
0
            _ => panic!("Expected ParseError for illegal promotion Double -> Float"),
1331
        }
1332
1
    }
1333
1334
    #[test]
1335
1
    fn test_promotion_within_nullable_union_keeps_reader_null_ordering() {
1336
1
        let writer = Schema::Union(vec![
1337
1
            Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)),
1338
1
            Schema::TypeName(TypeName::Primitive(PrimitiveType::Int)),
1339
1
        ]);
1340
1
        let reader = Schema::Union(vec![
1341
1
            Schema::TypeName(TypeName::Primitive(PrimitiveType::Double)),
1342
1
            Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)),
1343
1
        ]);
1344
1
        let mut maker = Maker::new(false, false);
1345
1
        let result = maker.make_data_type(&writer, Some(&reader), None).unwrap();
1346
1
        assert!(
matches!0
(result.codec, Codec::Float64));
1347
1
        assert_eq!(
1348
            result.resolution,
1349
            Some(ResolutionInfo::Promotion(Promotion::IntToDouble))
1350
        );
1351
1
        assert_eq!(result.nullability, Some(Nullability::NullSecond));
1352
1
    }
1353
1354
    #[test]
1355
1
    fn test_resolve_type_promotion() {
1356
1
        let writer_schema = Schema::TypeName(TypeName::Primitive(PrimitiveType::Int));
1357
1
        let reader_schema = Schema::TypeName(TypeName::Primitive(PrimitiveType::Long));
1358
1
        let mut maker = Maker::new(false, false);
1359
1
        let result = maker
1360
1
            .make_data_type(&writer_schema, Some(&reader_schema), None)
1361
1
            .unwrap();
1362
1
        assert!(
matches!0
(result.codec, Codec::Int64));
1363
1
        assert_eq!(
1364
            result.resolution,
1365
            Some(ResolutionInfo::Promotion(Promotion::IntToLong))
1366
        );
1367
1
    }
1368
1369
    #[test]
1370
1
    fn test_nested_record_type_reuse_without_namespace() {
1371
1
        let schema_str = r#"
1372
1
        {
1373
1
          "type": "record",
1374
1
          "name": "Record",
1375
1
          "fields": [
1376
1
            {
1377
1
              "name": "nested",
1378
1
              "type": {
1379
1
                "type": "record",
1380
1
                "name": "Nested",
1381
1
                "fields": [
1382
1
                  { "name": "nested_int", "type": "int" }
1383
1
                ]
1384
1
              }
1385
1
            },
1386
1
            { "name": "nestedRecord", "type": "Nested" },
1387
1
            { "name": "nestedArray", "type": { "type": "array", "items": "Nested" } },
1388
1
            { "name": "nestedMap", "type": { "type": "map", "values": "Nested" } }
1389
1
          ]
1390
1
        }
1391
1
        "#;
1392
1393
1
        let schema: Schema = serde_json::from_str(schema_str).unwrap();
1394
1395
1
        let mut maker = Maker::new(false, false);
1396
1
        let avro_data_type = maker.make_data_type(&schema, None, None).unwrap();
1397
1398
1
        if let Codec::Struct(fields) = avro_data_type.codec() {
1399
1
            assert_eq!(fields.len(), 4);
1400
1401
            // nested
1402
1
            assert_eq!(fields[0].name(), "nested");
1403
1
            let nested_data_type = fields[0].data_type();
1404
1
            if let Codec::Struct(nested_fields) = nested_data_type.codec() {
1405
1
                assert_eq!(nested_fields.len(), 1);
1406
1
                assert_eq!(nested_fields[0].name(), "nested_int");
1407
1
                assert!(
matches!0
(nested_fields[0].data_type().codec(), Codec::Int32));
1408
            } else {
1409
0
                panic!(
1410
0
                    "'nested' field is not a struct but {:?}",
1411
0
                    nested_data_type.codec()
1412
                );
1413
            }
1414
1415
            // nestedRecord
1416
1
            assert_eq!(fields[1].name(), "nestedRecord");
1417
1
            let nested_record_data_type = fields[1].data_type();
1418
1
            assert_eq!(
1419
1
                nested_record_data_type.codec().data_type(),
1420
1
                nested_data_type.codec().data_type()
1421
            );
1422
1423
            // nestedArray
1424
1
            assert_eq!(fields[2].name(), "nestedArray");
1425
1
            if let Codec::List(item_type) = fields[2].data_type().codec() {
1426
1
                assert_eq!(
1427
1
                    item_type.codec().data_type(),
1428
1
                    nested_data_type.codec().data_type()
1429
                );
1430
            } else {
1431
0
                panic!("'nestedArray' field is not a list");
1432
            }
1433
1434
            // nestedMap
1435
1
            assert_eq!(fields[3].name(), "nestedMap");
1436
1
            if let Codec::Map(value_type) = fields[3].data_type().codec() {
1437
1
                assert_eq!(
1438
1
                    value_type.codec().data_type(),
1439
1
                    nested_data_type.codec().data_type()
1440
                );
1441
            } else {
1442
0
                panic!("'nestedMap' field is not a map");
1443
            }
1444
        } else {
1445
0
            panic!("Top-level schema is not a struct");
1446
        }
1447
1
    }
1448
1449
    #[test]
1450
1
    fn test_nested_enum_type_reuse_with_namespace() {
1451
1
        let schema_str = r#"
1452
1
        {
1453
1
          "type": "record",
1454
1
          "name": "Record",
1455
1
          "namespace": "record_ns",
1456
1
          "fields": [
1457
1
            {
1458
1
              "name": "status",
1459
1
              "type": {
1460
1
                "type": "enum",
1461
1
                "name": "Status",
1462
1
                "namespace": "enum_ns",
1463
1
                "symbols": ["ACTIVE", "INACTIVE", "PENDING"]
1464
1
              }
1465
1
            },
1466
1
            { "name": "backupStatus", "type": "enum_ns.Status" },
1467
1
            { "name": "statusHistory", "type": { "type": "array", "items": "enum_ns.Status" } },
1468
1
            { "name": "statusMap", "type": { "type": "map", "values": "enum_ns.Status" } }
1469
1
          ]
1470
1
        }
1471
1
        "#;
1472
1473
1
        let schema: Schema = serde_json::from_str(schema_str).unwrap();
1474
1475
1
        let mut maker = Maker::new(false, false);
1476
1
        let avro_data_type = maker.make_data_type(&schema, None, None).unwrap();
1477
1478
1
        if let Codec::Struct(fields) = avro_data_type.codec() {
1479
1
            assert_eq!(fields.len(), 4);
1480
1481
            // status
1482
1
            assert_eq!(fields[0].name(), "status");
1483
1
            let status_data_type = fields[0].data_type();
1484
1
            if let Codec::Enum(symbols) = status_data_type.codec() {
1485
1
                assert_eq!(symbols.as_ref(), &["ACTIVE", "INACTIVE", "PENDING"]);
1486
            } else {
1487
0
                panic!(
1488
0
                    "'status' field is not an enum but {:?}",
1489
0
                    status_data_type.codec()
1490
                );
1491
            }
1492
1493
            // backupStatus
1494
1
            assert_eq!(fields[1].name(), "backupStatus");
1495
1
            let backup_status_data_type = fields[1].data_type();
1496
1
            assert_eq!(
1497
1
                backup_status_data_type.codec().data_type(),
1498
1
                status_data_type.codec().data_type()
1499
            );
1500
1501
            // statusHistory
1502
1
            assert_eq!(fields[2].name(), "statusHistory");
1503
1
            if let Codec::List(item_type) = fields[2].data_type().codec() {
1504
1
                assert_eq!(
1505
1
                    item_type.codec().data_type(),
1506
1
                    status_data_type.codec().data_type()
1507
                );
1508
            } else {
1509
0
                panic!("'statusHistory' field is not a list");
1510
            }
1511
1512
            // statusMap
1513
1
            assert_eq!(fields[3].name(), "statusMap");
1514
1
            if let Codec::Map(value_type) = fields[3].data_type().codec() {
1515
1
                assert_eq!(
1516
1
                    value_type.codec().data_type(),
1517
1
                    status_data_type.codec().data_type()
1518
                );
1519
            } else {
1520
0
                panic!("'statusMap' field is not a map");
1521
            }
1522
        } else {
1523
0
            panic!("Top-level schema is not a struct");
1524
        }
1525
1
    }
1526
}