Coverage Report

Created: 2025-08-26 07:03

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/Users/andrewlamb/Software/arrow-rs/arrow-avro/src/schema.rs
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
use arrow_schema::{
19
    ArrowError, DataType, Field as ArrowField, IntervalUnit, Schema as ArrowSchema, TimeUnit,
20
};
21
use serde::{Deserialize, Serialize};
22
use serde_json::{json, Map as JsonMap, Value};
23
use std::cmp::PartialEq;
24
use std::collections::hash_map::Entry;
25
use std::collections::{HashMap, HashSet};
26
use strum_macros::AsRefStr;
27
28
/// The metadata key used for storing the JSON encoded [`Schema`]
29
pub const SCHEMA_METADATA_KEY: &str = "avro.schema";
30
31
/// The Avro single‑object encoding “magic” bytes (`0xC3 0x01`)
32
pub const SINGLE_OBJECT_MAGIC: [u8; 2] = [0xC3, 0x01];
33
34
/// Metadata key used to represent Avro enum symbols in an Arrow schema.
35
pub const AVRO_ENUM_SYMBOLS_METADATA_KEY: &str = "avro.enum.symbols";
36
37
/// Metadata key used to store the default value of a field in an Avro schema.
38
pub const AVRO_FIELD_DEFAULT_METADATA_KEY: &str = "avro.field.default";
39
40
/// Metadata key used to store the name of a type in an Avro schema.
41
pub const AVRO_NAME_METADATA_KEY: &str = "avro.name";
42
43
/// Metadata key used to store the namespace of a type in an Avro schema.
44
pub const AVRO_NAMESPACE_METADATA_KEY: &str = "avro.namespace";
45
46
/// Metadata key used to store the documentation for a type in an Avro schema.
47
pub const AVRO_DOC_METADATA_KEY: &str = "avro.doc";
48
49
/// Compare two Avro schemas for equality (identical schemas).
50
/// Returns true if the schemas have the same parsing canonical form (i.e., logically identical).
51
0
pub fn compare_schemas(writer: &Schema, reader: &Schema) -> Result<bool, ArrowError> {
52
0
    let canon_writer = generate_canonical_form(writer)?;
53
0
    let canon_reader = generate_canonical_form(reader)?;
54
0
    Ok(canon_writer == canon_reader)
55
0
}
56
57
/// Either a [`PrimitiveType`] or a reference to a previously defined named type
58
///
59
/// <https://avro.apache.org/docs/1.11.1/specification/#names>
60
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
61
#[serde(untagged)]
62
/// A type name in an Avro schema
63
///
64
/// This represents the different ways a type can be referenced in an Avro schema.
65
pub enum TypeName<'a> {
66
    /// A primitive type like null, boolean, int, etc.
67
    Primitive(PrimitiveType),
68
    /// A reference to another named type
69
    Ref(&'a str),
70
}
71
72
/// A primitive type
73
///
74
/// <https://avro.apache.org/docs/1.11.1/specification/#primitive-types>
75
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, AsRefStr)]
76
#[serde(rename_all = "camelCase")]
77
#[strum(serialize_all = "lowercase")]
78
pub enum PrimitiveType {
79
    /// null: no value
80
    Null,
81
    /// boolean: a binary value
82
    Boolean,
83
    /// int: 32-bit signed integer
84
    Int,
85
    /// long: 64-bit signed integer
86
    Long,
87
    /// float: single precision (32-bit) IEEE 754 floating-point number
88
    Float,
89
    /// double: double precision (64-bit) IEEE 754 floating-point number
90
    Double,
91
    /// bytes: sequence of 8-bit unsigned bytes
92
    Bytes,
93
    /// string: Unicode character sequence
94
    String,
95
}
96
97
/// Additional attributes within a [`Schema`]
98
///
99
/// <https://avro.apache.org/docs/1.11.1/specification/#schema-declaration>
100
#[derive(Debug, Clone, PartialEq, Eq, Default, Deserialize, Serialize)]
101
#[serde(rename_all = "camelCase")]
102
pub struct Attributes<'a> {
103
    /// A logical type name
104
    ///
105
    /// <https://avro.apache.org/docs/1.11.1/specification/#logical-types>
106
    #[serde(default)]
107
    pub logical_type: Option<&'a str>,
108
109
    /// Additional JSON attributes
110
    #[serde(flatten)]
111
    pub additional: HashMap<&'a str, serde_json::Value>,
112
}
113
114
impl Attributes<'_> {
115
    /// Returns the field metadata for this [`Attributes`]
116
220
    pub(crate) fn field_metadata(&self) -> HashMap<String, String> {
117
220
        self.additional
118
220
            .iter()
119
220
            .map(|(k, v)| (
k16
.
to_string16
(),
v16
.
to_string16
()))
120
220
            .collect()
121
220
    }
122
}
123
124
/// A type definition that is not a variant of [`ComplexType`]
125
#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)]
126
#[serde(rename_all = "camelCase")]
127
pub struct Type<'a> {
128
    /// The type of this Avro data structure
129
    #[serde(borrow)]
130
    pub r#type: TypeName<'a>,
131
    /// Additional attributes associated with this type
132
    #[serde(flatten)]
133
    pub attributes: Attributes<'a>,
134
}
135
136
/// An Avro schema
137
///
138
/// This represents the different shapes of Avro schemas as defined in the specification.
139
/// See <https://avro.apache.org/docs/1.11.1/specification/#schemas> for more details.
140
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
141
#[serde(untagged)]
142
pub enum Schema<'a> {
143
    /// A direct type name (primitive or reference)
144
    #[serde(borrow)]
145
    TypeName(TypeName<'a>),
146
    /// A union of multiple schemas (e.g., ["null", "string"])
147
    #[serde(borrow)]
148
    Union(Vec<Schema<'a>>),
149
    /// A complex type such as record, array, map, etc.
150
    #[serde(borrow)]
151
    Complex(ComplexType<'a>),
152
    /// A type with attributes
153
    #[serde(borrow)]
154
    Type(Type<'a>),
155
}
156
157
/// A complex type
158
///
159
/// <https://avro.apache.org/docs/1.11.1/specification/#complex-types>
160
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
161
#[serde(tag = "type", rename_all = "camelCase")]
162
pub enum ComplexType<'a> {
163
    /// Record type: a sequence of fields with names and types
164
    #[serde(borrow)]
165
    Record(Record<'a>),
166
    /// Enum type: a set of named values
167
    #[serde(borrow)]
168
    Enum(Enum<'a>),
169
    /// Array type: a sequence of values of the same type
170
    #[serde(borrow)]
171
    Array(Array<'a>),
172
    /// Map type: a mapping from strings to values of the same type
173
    #[serde(borrow)]
174
    Map(Map<'a>),
175
    /// Fixed type: a fixed-size byte array
176
    #[serde(borrow)]
177
    Fixed(Fixed<'a>),
178
}
179
180
/// A record
181
///
182
/// <https://avro.apache.org/docs/1.11.1/specification/#schema-record>
183
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
184
pub struct Record<'a> {
185
    /// Name of the record
186
    #[serde(borrow)]
187
    pub name: &'a str,
188
    /// Optional namespace for the record, provides a way to organize names
189
    #[serde(borrow, default)]
190
    pub namespace: Option<&'a str>,
191
    /// Optional documentation string for the record
192
    #[serde(borrow, default)]
193
    pub doc: Option<&'a str>,
194
    /// Alternative names for this record
195
    #[serde(borrow, default)]
196
    pub aliases: Vec<&'a str>,
197
    /// The fields contained in this record
198
    #[serde(borrow)]
199
    pub fields: Vec<Field<'a>>,
200
    /// Additional attributes for this record
201
    #[serde(flatten)]
202
    pub attributes: Attributes<'a>,
203
}
204
205
/// A field within a [`Record`]
206
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
207
pub struct Field<'a> {
208
    /// Name of the field within the record
209
    #[serde(borrow)]
210
    pub name: &'a str,
211
    /// Optional documentation for this field
212
    #[serde(borrow, default)]
213
    pub doc: Option<&'a str>,
214
    /// The field's type definition
215
    #[serde(borrow)]
216
    pub r#type: Schema<'a>,
217
    /// Optional default value for this field
218
    #[serde(borrow, default)]
219
    pub default: Option<&'a str>,
220
}
221
222
/// An enumeration
223
///
224
/// <https://avro.apache.org/docs/1.11.1/specification/#enums>
225
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
226
pub struct Enum<'a> {
227
    /// Name of the enum
228
    #[serde(borrow)]
229
    pub name: &'a str,
230
    /// Optional namespace for the enum, provides organizational structure
231
    #[serde(borrow, default)]
232
    pub namespace: Option<&'a str>,
233
    /// Optional documentation string describing the enum
234
    #[serde(borrow, default)]
235
    pub doc: Option<&'a str>,
236
    /// Alternative names for this enum
237
    #[serde(borrow, default)]
238
    pub aliases: Vec<&'a str>,
239
    /// The symbols (values) that this enum can have
240
    #[serde(borrow)]
241
    pub symbols: Vec<&'a str>,
242
    /// Optional default value for this enum
243
    #[serde(borrow, default)]
244
    pub default: Option<&'a str>,
245
    /// Additional attributes for this enum
246
    #[serde(flatten)]
247
    pub attributes: Attributes<'a>,
248
}
249
250
/// An array
251
///
252
/// <https://avro.apache.org/docs/1.11.1/specification/#arrays>
253
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
254
pub struct Array<'a> {
255
    /// The schema for items in this array
256
    #[serde(borrow)]
257
    pub items: Box<Schema<'a>>,
258
    /// Additional attributes for this array
259
    #[serde(flatten)]
260
    pub attributes: Attributes<'a>,
261
}
262
263
/// A map
264
///
265
/// <https://avro.apache.org/docs/1.11.1/specification/#maps>
266
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
267
pub struct Map<'a> {
268
    /// The schema for values in this map
269
    #[serde(borrow)]
270
    pub values: Box<Schema<'a>>,
271
    /// Additional attributes for this map
272
    #[serde(flatten)]
273
    pub attributes: Attributes<'a>,
274
}
275
276
/// A fixed length binary array
277
///
278
/// <https://avro.apache.org/docs/1.11.1/specification/#fixed>
279
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
280
pub struct Fixed<'a> {
281
    /// Name of the fixed type
282
    #[serde(borrow)]
283
    pub name: &'a str,
284
    /// Optional namespace for the fixed type
285
    #[serde(borrow, default)]
286
    pub namespace: Option<&'a str>,
287
    /// Alternative names for this fixed type
288
    #[serde(borrow, default)]
289
    pub aliases: Vec<&'a str>,
290
    /// The number of bytes in this fixed type
291
    pub size: usize,
292
    /// Additional attributes for this fixed type
293
    #[serde(flatten)]
294
    pub attributes: Attributes<'a>,
295
}
296
297
/// A wrapper for an Avro schema in its JSON string representation.
298
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
299
pub struct AvroSchema {
300
    /// The Avro schema as a JSON string.
301
    pub json_string: String,
302
}
303
304
impl TryFrom<&ArrowSchema> for AvroSchema {
305
    type Error = ArrowError;
306
307
46
    fn try_from(schema: &ArrowSchema) -> Result<Self, Self::Error> {
308
        // Fast‑path: schema already contains Avro JSON
309
46
        if let Some(
json0
) = schema.metadata.get(SCHEMA_METADATA_KEY) {
310
0
            return Ok(AvroSchema::new(json.clone()));
311
46
        }
312
46
        let mut name_gen = NameGenerator::default();
313
46
        let 
fields_json45
= schema
314
46
            .fields()
315
46
            .iter()
316
101
            .
map46
(|f| arrow_field_to_avro(f, &mut name_gen))
317
46
            .collect::<Result<Vec<_>, _>>()
?1
;
318
        // Assemble top‑level record
319
45
        let record_name = schema
320
45
            .metadata
321
45
            .get(AVRO_NAME_METADATA_KEY)
322
45
            .map_or("topLevelRecord", |s| 
s0
.
as_str0
());
323
45
        let mut record = JsonMap::with_capacity(schema.metadata.len() + 4);
324
45
        record.insert("type".into(), Value::String("record".into()));
325
45
        record.insert(
326
45
            "name".into(),
327
45
            Value::String(sanitise_avro_name(record_name)),
328
        );
329
45
        if let Some(
ns0
) = schema.metadata.get(AVRO_NAMESPACE_METADATA_KEY) {
330
0
            record.insert("namespace".into(), Value::String(ns.clone()));
331
45
        }
332
45
        if let Some(
doc0
) = schema.metadata.get(AVRO_DOC_METADATA_KEY) {
333
0
            record.insert("doc".into(), Value::String(doc.clone()));
334
45
        }
335
45
        record.insert("fields".into(), Value::Array(fields_json));
336
45
        let schema_prefix = format!("{SCHEMA_METADATA_KEY}.");
337
45
        for (
meta_key0
,
meta_val0
) in &schema.metadata {
338
            // Skip keys already handled or internal
339
0
            if meta_key.starts_with("avro.")
340
0
                || meta_key.starts_with(schema_prefix.as_str())
341
0
                || is_internal_arrow_key(meta_key)
342
            {
343
0
                continue;
344
0
            }
345
0
            let json_val =
346
0
                serde_json::from_str(meta_val).unwrap_or_else(|_| Value::String(meta_val.clone()));
347
0
            record.insert(meta_key.clone(), json_val);
348
        }
349
45
        let json_string = serde_json::to_string(&Value::Object(record))
350
45
            .map_err(|e| ArrowError::SchemaError(
format!0
(
"Serialising Avro JSON failed: {e}"0
)))
?0
;
351
45
        Ok(AvroSchema::new(json_string))
352
46
    }
353
}
354
355
impl AvroSchema {
356
    /// Creates a new `AvroSchema` from a JSON string.
357
91
    pub fn new(json_string: String) -> Self {
358
91
        Self { json_string }
359
91
    }
360
361
    /// Deserializes and returns the `AvroSchema`.
362
    ///
363
    /// The returned schema borrows from `self`.
364
80
    pub fn schema(&self) -> Result<Schema<'_>, ArrowError> {
365
80
        serde_json::from_str(self.json_string.as_str())
366
80
            .map_err(|e| ArrowError::ParseError(
format!0
(
"Invalid Avro schema JSON: {e}"0
)))
367
80
    }
368
369
    /// Returns the Rabin fingerprint of the schema.
370
3
    pub fn fingerprint(&self) -> Result<Fingerprint, ArrowError> {
371
3
        generate_fingerprint_rabin(&self.schema()
?0
)
372
3
    }
373
}
374
375
/// Supported fingerprint algorithms for Avro schema identification.
376
/// Currently only `Rabin` is supported; `SHA256` and `MD5` support will come in a future update.
377
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Default)]
378
pub enum FingerprintAlgorithm {
379
    /// 64‑bit CRC‑64‑AVRO Rabin fingerprint.
380
    #[default]
381
    Rabin,
382
}
383
384
/// A schema fingerprint in one of the supported formats.
385
///
386
/// This is used as the key inside `SchemaStore` `HashMap`. Each `SchemaStore`
387
/// instance always stores only one variant, matching its configured
388
/// `FingerprintAlgorithm`, but the enum makes the API uniform.
389
/// Currently only `Rabin` is supported
390
///
391
/// <https://avro.apache.org/docs/1.11.1/specification/#schema-fingerprints>
392
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
393
pub enum Fingerprint {
394
    /// A 64-bit Rabin fingerprint.
395
    Rabin(u64),
396
}
397
398
/// Allow easy extraction of the algorithm used to create a fingerprint.
399
impl From<&Fingerprint> for FingerprintAlgorithm {
400
0
    fn from(fp: &Fingerprint) -> Self {
401
0
        match fp {
402
0
            Fingerprint::Rabin(_) => FingerprintAlgorithm::Rabin,
403
        }
404
0
    }
405
}
406
407
/// Generates a fingerprint for the given `Schema` using the specified `FingerprintAlgorithm`.
408
32
pub(crate) fn generate_fingerprint(
409
32
    schema: &Schema,
410
32
    hash_type: FingerprintAlgorithm,
411
32
) -> Result<Fingerprint, ArrowError> {
412
32
    let canonical = generate_canonical_form(schema).map_err(|e| 
{0
413
0
        ArrowError::ComputeError(format!("Failed to generate canonical form for schema: {e}"))
414
0
    })?;
415
32
    match hash_type {
416
        FingerprintAlgorithm::Rabin => {
417
32
            Ok(Fingerprint::Rabin(compute_fingerprint_rabin(&canonical)))
418
        }
419
    }
420
32
}
421
422
/// Generates the 64-bit Rabin fingerprint for the given `Schema`.
423
///
424
/// The fingerprint is computed from the canonical form of the schema.
425
/// This is also known as `CRC-64-AVRO`.
426
///
427
/// # Returns
428
/// A `Fingerprint::Rabin` variant containing the 64-bit fingerprint.
429
3
pub fn generate_fingerprint_rabin(schema: &Schema) -> Result<Fingerprint, ArrowError> {
430
3
    generate_fingerprint(schema, FingerprintAlgorithm::Rabin)
431
3
}
432
433
/// Generates the Parsing Canonical Form for the given [`Schema`].
434
///
435
/// The canonical form is a standardized JSON representation of the schema,
436
/// primarily used for generating a schema fingerprint for equality checking.
437
///
438
/// This form strips attributes that do not affect the schema's identity,
439
/// such as `doc` fields, `aliases`, and any properties not defined in the
440
/// Avro specification.
441
///
442
/// <https://avro.apache.org/docs/1.11.1/specification/#parsing-canonical-form-for-schemas>
443
35
pub fn generate_canonical_form(schema: &Schema) -> Result<String, ArrowError> {
444
35
    build_canonical(schema, None)
445
35
}
446
447
/// An in-memory cache of Avro schemas, indexed by their fingerprint.
448
///
449
/// `SchemaStore` provides a mechanism to store and retrieve Avro schemas efficiently.
450
/// Each schema is associated with a unique [`Fingerprint`], which is generated based
451
/// on the schema's canonical form and a specific hashing algorithm.
452
///
453
/// A `SchemaStore` instance is configured to use a single [`FingerprintAlgorithm`] such as Rabin,
454
/// MD5 (not yet supported), or SHA256 (not yet supported) for all its operations.
455
/// This ensures consistency when generating fingerprints and looking up schemas.
456
/// All schemas registered will have their fingerprint computed with this algorithm, and
457
/// lookups must use a matching fingerprint.
458
///
459
/// # Examples
460
///
461
/// ```no_run
462
/// // Create a new store with the default Rabin fingerprinting.
463
/// use arrow_avro::schema::{AvroSchema, SchemaStore};
464
///
465
/// let mut store = SchemaStore::new();
466
/// let schema = AvroSchema::new("\"string\"".to_string());
467
/// // Register the schema to get its fingerprint.
468
/// let fingerprint = store.register(schema.clone()).unwrap();
469
/// // Use the fingerprint to look up the schema.
470
/// let retrieved_schema = store.lookup(&fingerprint).cloned();
471
/// assert_eq!(retrieved_schema, Some(schema));
472
/// ```
473
#[derive(Debug, Clone, Default)]
474
pub struct SchemaStore {
475
    /// The hashing algorithm used for generating fingerprints.
476
    fingerprint_algorithm: FingerprintAlgorithm,
477
    /// A map from a schema's fingerprint to the schema itself.
478
    schemas: HashMap<Fingerprint, AvroSchema>,
479
}
480
481
impl TryFrom<&[AvroSchema]> for SchemaStore {
482
    type Error = ArrowError;
483
484
    /// Creates a `SchemaStore` from a slice of schemas.
485
    /// Each schema in the slice is registered with the new store.
486
2
    fn try_from(schemas: &[AvroSchema]) -> Result<Self, Self::Error> {
487
2
        let mut store = SchemaStore::new();
488
7
        for 
schema5
in schemas {
489
5
            store.register(schema.clone())
?0
;
490
        }
491
2
        Ok(store)
492
2
    }
493
}
494
495
impl SchemaStore {
496
    /// Creates an empty `SchemaStore` using the default fingerprinting algorithm (64-bit Rabin).
497
18
    pub fn new() -> Self {
498
18
        Self::default()
499
18
    }
500
501
    /// Registers a schema with the store and returns its fingerprint.
502
    ///
503
    /// A fingerprint is calculated for the given schema using the store's configured
504
    /// hash type. If a schema with the same fingerprint does not already exist in the
505
    /// store, the new schema is inserted. If the fingerprint already exists, the
506
    /// existing schema is not overwritten, and an error is returned if it differs from `schema`.
507
    ///
508
    /// # Arguments
509
    ///
510
    /// * `schema` - The `AvroSchema` to register.
511
    ///
512
    /// # Returns
513
    ///
514
    /// A `Result` containing the `Fingerprint` of the schema if successful,
515
    /// or an `ArrowError` on failure.
516
29
    pub fn register(&mut self, schema: AvroSchema) -> Result<Fingerprint, ArrowError> {
517
29
        let fingerprint = generate_fingerprint(&schema.schema()
?0
, self.fingerprint_algorithm)
?0
;
518
29
        match self.schemas.entry(fingerprint) {
519
2
            Entry::Occupied(entry) => {
520
2
                if entry.get() != &schema {
521
0
                    return Err(ArrowError::ComputeError(format!(
522
0
                        "Schema fingerprint collision detected for fingerprint {fingerprint:?}"
523
0
                    )));
524
2
                }
525
            }
526
27
            Entry::Vacant(entry) => {
527
27
                entry.insert(schema);
528
27
            }
529
        }
530
29
        Ok(fingerprint)
531
29
    }
532
533
    /// Looks up a schema by its `Fingerprint`.
534
    ///
535
    /// # Arguments
536
    ///
537
    /// * `fingerprint` - A reference to the `Fingerprint` of the schema to look up.
538
    ///
539
    /// # Returns
540
    ///
541
    /// An `Option` containing a reference to the `AvroSchema` if found, otherwise `None`.
542
24
    pub fn lookup(&self, fingerprint: &Fingerprint) -> Option<&AvroSchema> {
543
24
        self.schemas.get(fingerprint)
544
24
    }
545
546
    /// Returns a `Vec` containing **all unique [`Fingerprint`]s** currently
547
    /// held by this [`SchemaStore`].
548
    ///
549
    /// The order of the returned fingerprints is unspecified and should not be
550
    /// relied upon.
551
21
    pub fn fingerprints(&self) -> Vec<Fingerprint> {
552
21
        self.schemas.keys().copied().collect()
553
21
    }
554
555
    /// Returns the `FingerprintAlgorithm` used by the `SchemaStore` for fingerprinting.
556
11
    pub(crate) fn fingerprint_algorithm(&self) -> FingerprintAlgorithm {
557
11
        self.fingerprint_algorithm
558
11
    }
559
}
560
561
97
fn quote(s: &str) -> Result<String, ArrowError> {
562
97
    serde_json::to_string(s)
563
97
        .map_err(|e| ArrowError::ComputeError(
format!0
(
"Failed to quote string: {e}"0
)))
564
97
}
565
566
// Avro names are defined by a `name` and an optional `namespace`.
567
// The full name is composed of the namespace and the name, separated by a dot.
568
//
569
// Avro specification defines two ways to specify a full name:
570
// 1. The `name` attribute contains the full name (e.g., "a.b.c.d").
571
//    In this case, the `namespace` attribute is ignored.
572
// 2. The `name` attribute contains the simple name (e.g., "d") and the
573
//    `namespace` attribute contains the namespace (e.g., "a.b.c").
574
//
575
// Each part of the name must match the regex `^[A-Za-z_][A-Za-z0-9_]*$`.
576
// Complex paths with quotes or backticks like `a."hi".b` are not supported.
577
//
578
// This function constructs the full name and extracts the namespace,
579
// handling both ways of specifying the name. It prioritizes a namespace
580
// defined within the `name` attribute itself, then the explicit `namespace_attr`,
581
// and finally the `enclosing_ns`.
582
25
fn make_full_name(
583
25
    name: &str,
584
25
    namespace_attr: Option<&str>,
585
25
    enclosing_ns: Option<&str>,
586
25
) -> (String, Option<String>) {
587
    // If `name` already contains a dot, treat it as a full name and ignore the namespace.
588
25
    if let Some((
ns0
, _)) = name.rsplit_once('.') {
589
0
        return (name.to_string(), Some(ns.to_string()));
590
25
    }
591
25
    match namespace_attr.or(enclosing_ns) {
592
6
        Some(ns) => (format!("{ns}.{name}"), Some(ns.to_string())),
593
19
        None => (name.to_string(), None),
594
    }
595
25
}
596
597
66
fn build_canonical(schema: &Schema, enclosing_ns: Option<&str>) -> Result<String, ArrowError> {
598
66
    Ok(match schema {
599
41
        Schema::TypeName(
tn40
) | Schema::Type(Type { r#type:
tn1
, .. }) => match tn {
600
41
            TypeName::Primitive(pt) => quote(pt.as_ref())
?0
,
601
0
            TypeName::Ref(name) => {
602
0
                let (full_name, _) = make_full_name(name, None, enclosing_ns);
603
0
                quote(&full_name)?
604
            }
605
        },
606
0
        Schema::Union(branches) => format!(
607
0
            "[{}]",
608
0
            branches
609
0
                .iter()
610
0
                .map(|b| build_canonical(b, enclosing_ns))
611
0
                .collect::<Result<Vec<_>, _>>()?
612
0
                .join(",")
613
        ),
614
25
        Schema::Complex(ct) => match ct {
615
25
            ComplexType::Record(r) => {
616
25
                let (full_name, child_ns) = make_full_name(r.name, r.namespace, enclosing_ns);
617
25
                let fields = r
618
25
                    .fields
619
25
                    .iter()
620
31
                    .
map25
(|f| {
621
31
                        let field_type =
622
31
                            build_canonical(&f.r#type, child_ns.as_deref().or(enclosing_ns))
?0
;
623
31
                        Ok(format!(
624
31
                            r#"{{"name":{},"type":{}}}"#,
625
31
                            quote(f.name)
?0
,
626
                            field_type
627
                        ))
628
31
                    })
629
25
                    .collect::<Result<Vec<_>, ArrowError>>()
?0
630
25
                    .join(",");
631
25
                format!(
632
25
                    r#"{{"name":{},"type":"record","fields":[{fields}]}}"#,
633
25
                    quote(&full_name)
?0
,
634
                )
635
            }
636
0
            ComplexType::Enum(e) => {
637
0
                let (full_name, _) = make_full_name(e.name, e.namespace, enclosing_ns);
638
0
                let symbols = e
639
0
                    .symbols
640
0
                    .iter()
641
0
                    .map(|s| quote(s))
642
0
                    .collect::<Result<Vec<_>, _>>()?
643
0
                    .join(",");
644
0
                format!(
645
0
                    r#"{{"name":{},"type":"enum","symbols":[{symbols}]}}"#,
646
0
                    quote(&full_name)?
647
                )
648
            }
649
0
            ComplexType::Array(arr) => format!(
650
0
                r#"{{"type":"array","items":{}}}"#,
651
0
                build_canonical(&arr.items, enclosing_ns)?
652
            ),
653
0
            ComplexType::Map(map) => format!(
654
0
                r#"{{"type":"map","values":{}}}"#,
655
0
                build_canonical(&map.values, enclosing_ns)?
656
            ),
657
0
            ComplexType::Fixed(f) => {
658
0
                let (full_name, _) = make_full_name(f.name, f.namespace, enclosing_ns);
659
0
                format!(
660
0
                    r#"{{"name":{},"type":"fixed","size":{}}}"#,
661
0
                    quote(&full_name)?,
662
                    f.size
663
                )
664
            }
665
        },
666
    })
667
66
}
668
669
/// 64‑bit Rabin fingerprint as described in the Avro spec.
670
const EMPTY: u64 = 0xc15d_213a_a4d7_a795;
671
672
/// Build one entry of the polynomial‑division table.
673
///
674
/// We cannot yet write `for _ in 0..8` here: `for` loops rely on
675
/// `Iterator::next`, which is not `const` on stable Rust.  Until the
676
/// `const_for` feature (tracking issue #87575) is stabilized, a `while`
677
/// loop is the only option in a `const fn`
678
0
const fn one_entry(i: usize) -> u64 {
679
0
    let mut fp = i as u64;
680
0
    let mut j = 0;
681
0
    while j < 8 {
682
0
        fp = (fp >> 1) ^ (EMPTY & (0u64.wrapping_sub(fp & 1)));
683
0
        j += 1;
684
0
    }
685
0
    fp
686
0
}
687
688
/// Build the full 256‑entry table at compile time.
689
///
690
/// We cannot yet write `for _ in 0..256` here: `for` loops rely on
691
/// `Iterator::next`, which is not `const` on stable Rust.  Until the
692
/// `const_for` feature (tracking issue #87575) is stabilized, a `while`
693
/// loop is the only option in a `const fn`
694
0
const fn build_table() -> [u64; 256] {
695
0
    let mut table = [0u64; 256];
696
0
    let mut i = 0;
697
0
    while i < 256 {
698
0
        table[i] = one_entry(i);
699
0
        i += 1;
700
0
    }
701
0
    table
702
0
}
703
704
/// The pre‑computed table.
705
static FINGERPRINT_TABLE: [u64; 256] = build_table();
706
707
/// Computes the 64-bit Rabin fingerprint for a given canonical schema string.
708
/// This implementation is based on the Avro specification for schema fingerprinting.
709
34
pub(crate) fn compute_fingerprint_rabin(canonical_form: &str) -> u64 {
710
34
    let mut fp = EMPTY;
711
2.19k
    for &byte in 
canonical_form34
.
as_bytes34
() {
712
2.19k
        let idx = ((fp as u8) ^ byte) as usize;
713
2.19k
        fp = (fp >> 8) ^ FINGERPRINT_TABLE[idx];
714
2.19k
    }
715
34
    fp
716
34
}
717
718
#[inline]
719
2
fn is_internal_arrow_key(key: &str) -> bool {
720
2
    key.starts_with("ARROW:") || key == SCHEMA_METADATA_KEY
721
2
}
722
723
// Sanitize an arbitrary string so it is a valid Avro field or type name
724
154
fn sanitise_avro_name(base_name: &str) -> String {
725
154
    if base_name.is_empty() {
726
0
        return "_".to_owned();
727
154
    }
728
154
    let mut out: String = base_name
729
154
        .chars()
730
1.39k
        .
map154
(|char| {
731
1.39k
            if char.is_ascii_alphanumeric() || 
char == '_'62
{
732
1.39k
                char
733
            } else {
734
2
                '_'
735
            }
736
1.39k
        })
737
154
        .collect();
738
154
    if out.as_bytes()[0].is_ascii_digit() {
739
1
        out.insert(0, '_');
740
153
    }
741
154
    out
742
154
}
743
744
#[derive(Default)]
745
struct NameGenerator {
746
    used: HashSet<String>,
747
    counters: HashMap<String, usize>,
748
}
749
750
impl NameGenerator {
751
6
    fn make_unique(&mut self, field_name: &str) -> String {
752
6
        let field_name = sanitise_avro_name(field_name);
753
6
        if self.used.insert(field_name.clone()) {
754
5
            self.counters.insert(field_name.clone(), 1);
755
5
            return field_name;
756
1
        }
757
1
        let counter = self.counters.entry(field_name.clone()).or_insert(1);
758
        loop {
759
1
            let candidate = format!("{field_name}_{}", *counter);
760
1
            if self.used.insert(candidate.clone()) {
761
1
                return candidate;
762
0
            }
763
0
            *counter += 1;
764
        }
765
6
    }
766
}
767
768
108
fn merge_extras(schema: Value, mut extras: JsonMap<String, Value>) -> Value {
769
108
    if extras.is_empty() {
770
102
        return schema;
771
6
    }
772
6
    match schema {
773
1
        Value::Object(mut map) => {
774
1
            map.extend(extras);
775
1
            Value::Object(map)
776
        }
777
0
        Value::Array(mut union) => {
778
0
            if let Some(non_null) = union.iter_mut().find(|val| val.as_str() != Some("null")) {
779
0
                let original = std::mem::take(non_null);
780
0
                *non_null = merge_extras(original, extras);
781
0
            }
782
0
            Value::Array(union)
783
        }
784
5
        primitive => {
785
5
            let mut map = JsonMap::with_capacity(extras.len() + 1);
786
5
            map.insert("type".into(), primitive);
787
5
            map.extend(extras);
788
5
            Value::Object(map)
789
        }
790
    }
791
108
}
792
793
// Convert an Arrow `DataType` into an Avro schema `Value`.
794
109
fn datatype_to_avro(
795
109
    dt: &DataType,
796
109
    field_name: &str,
797
109
    metadata: &HashMap<String, String>,
798
109
    name_gen: &mut NameGenerator,
799
109
) -> Result<(Value, JsonMap<String, Value>), ArrowError> {
800
109
    let mut extras = JsonMap::new();
801
108
    let val = match 
dt3
{
802
0
        DataType::Null => Value::String("null".into()),
803
7
        DataType::Boolean => Value::String("boolean".into()),
804
        DataType::Int8 | DataType::Int16 | DataType::UInt8 | DataType::UInt16 | DataType::Int32 => {
805
32
            Value::String("int".into())
806
        }
807
9
        DataType::UInt32 | DataType::Int64 | DataType::UInt64 => Value::String("long".into()),
808
7
        DataType::Float16 | DataType::Float32 => Value::String("float".into()),
809
6
        DataType::Float64 => Value::String("double".into()),
810
3
        DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => Value::String("string".into()),
811
14
        DataType::Binary | DataType::LargeBinary => Value::String("bytes".into()),
812
4
        DataType::FixedSizeBinary(len) => {
813
4
            let is_uuid = metadata
814
4
                .get("logicalType")
815
4
                .is_some_and(|value| 
value1
==
"uuid"1
)
816
3
                || (*len == 16
817
0
                    && metadata
818
0
                        .get("ARROW:extension:name")
819
0
                        .is_some_and(|value| value == "uuid"));
820
4
            if is_uuid {
821
1
                json!({ "type": "string", "logicalType": "uuid" })
822
            } else {
823
3
                json!({
824
3
                    "type": "fixed",
825
3
                    "name": name_gen.make_unique(field_name),
826
3
                    "size": len
827
                })
828
            }
829
        }
830
1
        DataType::Decimal128(precision, scale) | DataType::Decimal256(
precision0
,
scale0
) => {
831
            // Prefer fixed if original size info present
832
1
            let mut meta = JsonMap::from_iter([
833
1
                ("logicalType".into(), json!("decimal")),
834
1
                ("precision".into(), json!(*precision)),
835
1
                ("scale".into(), json!(*scale)),
836
1
            ]);
837
1
            if let Some(
size0
) = metadata
838
1
                .get("size")
839
1
                .and_then(|val| 
val.parse::<usize>()0
.
ok0
())
840
0
            {
841
0
                meta.insert("type".into(), json!("fixed"));
842
0
                meta.insert("size".into(), json!(size));
843
0
                meta.insert("name".into(), json!(name_gen.make_unique(field_name)));
844
1
            } else {
845
1
                meta.insert("type".into(), json!("bytes"));
846
1
            }
847
1
            Value::Object(meta)
848
        }
849
1
        DataType::Date32 => json!({ "type": "int", "logicalType": "date" }),
850
1
        DataType::Date64 => json!({ "type": "long", "logicalType": "local-timestamp-millis" }),
851
1
        DataType::Time32(unit) => match unit {
852
1
            TimeUnit::Millisecond => json!({ "type": "int", "logicalType": "time-millis" }),
853
            TimeUnit::Second => {
854
0
                extras.insert("arrowTimeUnit".into(), Value::String("second".into()));
855
0
                Value::String("int".into())
856
            }
857
0
            _ => Value::String("int".into()),
858
        },
859
1
        DataType::Time64(unit) => match unit {
860
1
            TimeUnit::Microsecond => json!({ "type": "long", "logicalType": "time-micros" }),
861
            TimeUnit::Nanosecond => {
862
0
                extras.insert("arrowTimeUnit".into(), Value::String("nanosecond".into()));
863
0
                Value::String("long".into())
864
            }
865
0
            _ => Value::String("long".into()),
866
        },
867
7
        DataType::Timestamp(unit, tz) => {
868
7
            let logical_type = match (unit, tz.is_some()) {
869
0
                (TimeUnit::Millisecond, true) => "timestamp-millis",
870
1
                (TimeUnit::Millisecond, false) => "local-timestamp-millis",
871
6
                (TimeUnit::Microsecond, true) => "timestamp-micros",
872
0
                (TimeUnit::Microsecond, false) => "local-timestamp-micros",
873
                (TimeUnit::Second, _) => {
874
0
                    extras.insert("arrowTimeUnit".into(), Value::String("second".into()));
875
0
                    return Ok((Value::String("long".into()), extras));
876
                }
877
                (TimeUnit::Nanosecond, _) => {
878
0
                    extras.insert("arrowTimeUnit".into(), Value::String("nanosecond".into()));
879
0
                    return Ok((Value::String("long".into()), extras));
880
                }
881
            };
882
7
            json!({ "type": "long", "logicalType": logical_type })
883
        }
884
3
        DataType::Duration(unit) => {
885
3
            extras.insert(
886
3
                "arrowDurationUnit".into(),
887
3
                Value::String(format!("{unit:?}").to_lowercase()),
888
            );
889
3
            Value::String("long".into())
890
        }
891
1
        DataType::Interval(IntervalUnit::MonthDayNano) => json!({
892
1
            "type": "fixed",
893
1
            "name": name_gen.make_unique(&format!("{field_name}_duration")),
894
1
            "size": 12,
895
1
            "logicalType": "duration"
896
        }),
897
        DataType::Interval(IntervalUnit::YearMonth) => {
898
1
            extras.insert(
899
1
                "arrowIntervalUnit".into(),
900
1
                Value::String("yearmonth".into()),
901
            );
902
1
            Value::String("long".into())
903
        }
904
        DataType::Interval(IntervalUnit::DayTime) => {
905
1
            extras.insert("arrowIntervalUnit".into(), Value::String("daytime".into()));
906
1
            Value::String("long".into())
907
        }
908
2
        DataType::List(child) | DataType::LargeList(
child0
) => {
909
2
            if matches!(dt, DataType::LargeList(_)) {
910
0
                extras.insert("arrowLargeList".into(), Value::Bool(true));
911
2
            }
912
2
            let (items, ie) =
913
2
                datatype_to_avro(child.data_type(), child.name(), child.metadata(), name_gen)
?0
;
914
2
            json!({
915
2
                "type": "array",
916
2
                "items": merge_extras(items, ie)
917
            })
918
        }
919
1
        DataType::FixedSizeList(child, len) => {
920
1
            extras.insert("arrowFixedSize".into(), json!(len));
921
1
            let (items, ie) =
922
1
                datatype_to_avro(child.data_type(), child.name(), child.metadata(), name_gen)
?0
;
923
1
            json!({
924
1
                "type": "array",
925
1
                "items": merge_extras(items, ie)
926
            })
927
        }
928
2
        DataType::Map(entries, _) => {
929
2
            let value_field = match entries.data_type() {
930
2
                DataType::Struct(fs) => &fs[1],
931
                _ => {
932
0
                    return Err(ArrowError::SchemaError(
933
0
                        "Map 'entries' field must be Struct(key,value)".into(),
934
0
                    ))
935
                }
936
            };
937
2
            let (val_schema, value_entry) = datatype_to_avro(
938
2
                value_field.data_type(),
939
2
                value_field.name(),
940
2
                value_field.metadata(),
941
2
                name_gen,
942
0
            )?;
943
2
            json!({
944
2
                "type": "map",
945
2
                "values": merge_extras(val_schema, value_entry)
946
            })
947
        }
948
1
        DataType::Struct(fields) => {
949
1
            let avro_fields = fields
950
1
                .iter()
951
2
                .
map1
(|field| arrow_field_to_avro(field, name_gen))
952
1
                .collect::<Result<Vec<_>, _>>()
?0
;
953
1
            json!({
954
1
                "type": "record",
955
1
                "name": name_gen.make_unique(field_name),
956
1
                "fields": avro_fields
957
            })
958
        }
959
1
        DataType::Dictionary(_, value) => {
960
1
            if let Some(j) = metadata.get(AVRO_ENUM_SYMBOLS_METADATA_KEY) {
961
1
                let symbols: Vec<&str> =
962
1
                    serde_json::from_str(j).map_err(|e| ArrowError::ParseError(
e0
.
to_string0
()))
?0
;
963
1
                json!({
964
1
                    "type": "enum",
965
1
                    "name": name_gen.make_unique(field_name),
966
1
                    "symbols": symbols
967
                })
968
            } else {
969
0
                let (inner, ie) = datatype_to_avro(value.as_ref(), field_name, metadata, name_gen)?;
970
0
                merge_extras(inner, ie)
971
            }
972
        }
973
1
        DataType::RunEndEncoded(_, values) => {
974
1
            let (inner, ie) = datatype_to_avro(
975
1
                values.data_type(),
976
1
                values.name(),
977
1
                values.metadata(),
978
1
                name_gen,
979
0
            )?;
980
1
            merge_extras(inner, ie)
981
        }
982
        DataType::Union(_, _) => {
983
1
            return Err(ArrowError::NotYetImplemented(
984
1
                "Arrow Union to Avro Union not yet supported".into(),
985
1
            ))
986
        }
987
0
        other => {
988
0
            return Err(ArrowError::NotYetImplemented(format!(
989
0
                "Arrow type {other:?} has no Avro representation"
990
0
            )))
991
        }
992
    };
993
108
    Ok((val, extras))
994
109
}
995
996
103
fn arrow_field_to_avro(
997
103
    field: &ArrowField,
998
103
    name_gen: &mut NameGenerator,
999
103
) -> Result<Value, ArrowError> {
1000
    // Sanitize field name to ensure Avro validity but store the original in metadata
1001
103
    let avro_name = sanitise_avro_name(field.name());
1002
102
    let (schema, extras) =
1003
103
        datatype_to_avro(field.data_type(), &avro_name, field.metadata(), name_gen)
?1
;
1004
    // If nullable, wrap `[ "null", <type> ]`, NOTE: second order nullability to be added in a follow-up
1005
102
    let mut schema = if field.is_nullable() {
1006
63
        Value::Array(vec![
1007
63
            Value::String("null".into()),
1008
63
            merge_extras(schema, extras),
1009
63
        ])
1010
    } else {
1011
39
        merge_extras(schema, extras)
1012
    };
1013
    // Build the field map
1014
102
    let mut map = JsonMap::with_capacity(field.metadata().len() + 3);
1015
102
    map.insert("name".into(), Value::String(avro_name));
1016
102
    map.insert("type".into(), schema);
1017
    // Transfer selected metadata
1018
102
    for (
meta_key2
,
meta_val2
) in field.metadata() {
1019
2
        if is_internal_arrow_key(meta_key) {
1020
0
            continue;
1021
2
        }
1022
2
        match meta_key.as_str() {
1023
2
            AVRO_DOC_METADATA_KEY => {
1024
0
                map.insert("doc".into(), Value::String(meta_val.clone()));
1025
0
            }
1026
2
            AVRO_FIELD_DEFAULT_METADATA_KEY => {
1027
0
                let default_value = serde_json::from_str(meta_val)
1028
0
                    .unwrap_or_else(|_| Value::String(meta_val.clone()));
1029
0
                map.insert("default".into(), default_value);
1030
            }
1031
            _ => {
1032
2
                let json_val = serde_json::from_str(meta_val)
1033
2
                    .unwrap_or_else(|_| Value::String(
meta_val1
.
clone1
()));
1034
2
                map.insert(meta_key.clone(), json_val);
1035
            }
1036
        }
1037
    }
1038
102
    Ok(Value::Object(map))
1039
103
}
1040
1041
#[cfg(test)]
1042
mod tests {
1043
    use super::*;
1044
    use crate::codec::{AvroDataType, AvroField};
1045
    use arrow_schema::{DataType, Fields, SchemaBuilder, TimeUnit};
1046
    use serde_json::json;
1047
    use std::sync::Arc;
1048
1049
12
    fn int_schema() -> Schema<'static> {
1050
12
        Schema::TypeName(TypeName::Primitive(PrimitiveType::Int))
1051
12
    }
1052
1053
5
    fn record_schema() -> Schema<'static> {
1054
5
        Schema::Complex(ComplexType::Record(Record {
1055
5
            name: "record1",
1056
5
            namespace: Some("test.namespace"),
1057
5
            doc: Some("A test record"),
1058
5
            aliases: vec![],
1059
5
            fields: vec![
1060
5
                Field {
1061
5
                    name: "field1",
1062
5
                    doc: Some("An integer field"),
1063
5
                    r#type: int_schema(),
1064
5
                    default: None,
1065
5
                },
1066
5
                Field {
1067
5
                    name: "field2",
1068
5
                    doc: None,
1069
5
                    r#type: Schema::TypeName(TypeName::Primitive(PrimitiveType::String)),
1070
5
                    default: None,
1071
5
                },
1072
5
            ],
1073
5
            attributes: Attributes::default(),
1074
5
        }))
1075
5
    }
1076
1077
35
    fn single_field_schema(field: ArrowField) -> arrow_schema::Schema {
1078
35
        let mut sb = SchemaBuilder::new();
1079
35
        sb.push(field);
1080
35
        sb.finish()
1081
35
    }
1082
1083
44
    fn assert_json_contains(avro_json: &str, needle: &str) {
1084
44
        assert!(
1085
44
            avro_json.contains(needle),
1086
0
            "JSON did not contain `{needle}` : {avro_json}"
1087
        )
1088
44
    }
1089
1090
    #[test]
1091
1
    fn test_deserialize() {
1092
1
        let t: Schema = serde_json::from_str("\"string\"").unwrap();
1093
1
        assert_eq!(
1094
            t,
1095
            Schema::TypeName(TypeName::Primitive(PrimitiveType::String))
1096
        );
1097
1098
1
        let t: Schema = serde_json::from_str("[\"int\", \"null\"]").unwrap();
1099
1
        assert_eq!(
1100
            t,
1101
1
            Schema::Union(vec![
1102
1
                Schema::TypeName(TypeName::Primitive(PrimitiveType::Int)),
1103
1
                Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)),
1104
1
            ])
1105
        );
1106
1107
1
        let t: Type = serde_json::from_str(
1108
1
            r#"{
1109
1
                   "type":"long",
1110
1
                   "logicalType":"timestamp-micros"
1111
1
                }"#,
1112
        )
1113
1
        .unwrap();
1114
1115
1
        let timestamp = Type {
1116
1
            r#type: TypeName::Primitive(PrimitiveType::Long),
1117
1
            attributes: Attributes {
1118
1
                logical_type: Some("timestamp-micros"),
1119
1
                additional: Default::default(),
1120
1
            },
1121
1
        };
1122
1123
1
        assert_eq!(t, timestamp);
1124
1125
1
        let t: ComplexType = serde_json::from_str(
1126
1
            r#"{
1127
1
                   "type":"fixed",
1128
1
                   "name":"fixed",
1129
1
                   "namespace":"topLevelRecord.value",
1130
1
                   "size":11,
1131
1
                   "logicalType":"decimal",
1132
1
                   "precision":25,
1133
1
                   "scale":2
1134
1
                }"#,
1135
        )
1136
1
        .unwrap();
1137
1138
1
        let decimal = ComplexType::Fixed(Fixed {
1139
1
            name: "fixed",
1140
1
            namespace: Some("topLevelRecord.value"),
1141
1
            aliases: vec![],
1142
1
            size: 11,
1143
1
            attributes: Attributes {
1144
1
                logical_type: Some("decimal"),
1145
1
                additional: vec![("precision", json!(25)), ("scale", json!(2))]
1146
1
                    .into_iter()
1147
1
                    .collect(),
1148
1
            },
1149
1
        });
1150
1151
1
        assert_eq!(t, decimal);
1152
1153
1
        let schema: Schema = serde_json::from_str(
1154
1
            r#"{
1155
1
               "type":"record",
1156
1
               "name":"topLevelRecord",
1157
1
               "fields":[
1158
1
                  {
1159
1
                     "name":"value",
1160
1
                     "type":[
1161
1
                        {
1162
1
                           "type":"fixed",
1163
1
                           "name":"fixed",
1164
1
                           "namespace":"topLevelRecord.value",
1165
1
                           "size":11,
1166
1
                           "logicalType":"decimal",
1167
1
                           "precision":25,
1168
1
                           "scale":2
1169
1
                        },
1170
1
                        "null"
1171
1
                     ]
1172
1
                  }
1173
1
               ]
1174
1
            }"#,
1175
        )
1176
1
        .unwrap();
1177
1178
1
        assert_eq!(
1179
            schema,
1180
1
            Schema::Complex(ComplexType::Record(Record {
1181
1
                name: "topLevelRecord",
1182
1
                namespace: None,
1183
1
                doc: None,
1184
1
                aliases: vec![],
1185
1
                fields: vec![Field {
1186
1
                    name: "value",
1187
1
                    doc: None,
1188
1
                    r#type: Schema::Union(vec![
1189
1
                        Schema::Complex(decimal),
1190
1
                        Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)),
1191
1
                    ]),
1192
1
                    default: None,
1193
1
                },],
1194
1
                attributes: Default::default(),
1195
1
            }))
1196
        );
1197
1198
1
        let schema: Schema = serde_json::from_str(
1199
1
            r#"{
1200
1
                  "type": "record",
1201
1
                  "name": "LongList",
1202
1
                  "aliases": ["LinkedLongs"],
1203
1
                  "fields" : [
1204
1
                    {"name": "value", "type": "long"},
1205
1
                    {"name": "next", "type": ["null", "LongList"]}
1206
1
                  ]
1207
1
                }"#,
1208
        )
1209
1
        .unwrap();
1210
1211
1
        assert_eq!(
1212
            schema,
1213
1
            Schema::Complex(ComplexType::Record(Record {
1214
1
                name: "LongList",
1215
1
                namespace: None,
1216
1
                doc: None,
1217
1
                aliases: vec!["LinkedLongs"],
1218
1
                fields: vec![
1219
1
                    Field {
1220
1
                        name: "value",
1221
1
                        doc: None,
1222
1
                        r#type: Schema::TypeName(TypeName::Primitive(PrimitiveType::Long)),
1223
1
                        default: None,
1224
1
                    },
1225
1
                    Field {
1226
1
                        name: "next",
1227
1
                        doc: None,
1228
1
                        r#type: Schema::Union(vec![
1229
1
                            Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)),
1230
1
                            Schema::TypeName(TypeName::Ref("LongList")),
1231
1
                        ]),
1232
1
                        default: None,
1233
1
                    }
1234
1
                ],
1235
1
                attributes: Attributes::default(),
1236
1
            }))
1237
        );
1238
1239
        // Recursive schema are not supported
1240
1
        let err = AvroField::try_from(&schema).unwrap_err().to_string();
1241
1
        assert_eq!(err, "Parser error: Failed to resolve .LongList");
1242
1243
1
        let schema: Schema = serde_json::from_str(
1244
1
            r#"{
1245
1
               "type":"record",
1246
1
               "name":"topLevelRecord",
1247
1
               "fields":[
1248
1
                  {
1249
1
                     "name":"id",
1250
1
                     "type":[
1251
1
                        "int",
1252
1
                        "null"
1253
1
                     ]
1254
1
                  },
1255
1
                  {
1256
1
                     "name":"timestamp_col",
1257
1
                     "type":[
1258
1
                        {
1259
1
                           "type":"long",
1260
1
                           "logicalType":"timestamp-micros"
1261
1
                        },
1262
1
                        "null"
1263
1
                     ]
1264
1
                  }
1265
1
               ]
1266
1
            }"#,
1267
        )
1268
1
        .unwrap();
1269
1270
1
        assert_eq!(
1271
            schema,
1272
1
            Schema::Complex(ComplexType::Record(Record {
1273
1
                name: "topLevelRecord",
1274
1
                namespace: None,
1275
1
                doc: None,
1276
1
                aliases: vec![],
1277
1
                fields: vec![
1278
1
                    Field {
1279
1
                        name: "id",
1280
1
                        doc: None,
1281
1
                        r#type: Schema::Union(vec![
1282
1
                            Schema::TypeName(TypeName::Primitive(PrimitiveType::Int)),
1283
1
                            Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)),
1284
1
                        ]),
1285
1
                        default: None,
1286
1
                    },
1287
1
                    Field {
1288
1
                        name: "timestamp_col",
1289
1
                        doc: None,
1290
1
                        r#type: Schema::Union(vec![
1291
1
                            Schema::Type(timestamp),
1292
1
                            Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)),
1293
1
                        ]),
1294
1
                        default: None,
1295
1
                    }
1296
1
                ],
1297
1
                attributes: Default::default(),
1298
1
            }))
1299
        );
1300
1
        let codec = AvroField::try_from(&schema).unwrap();
1301
1
        assert_eq!(
1302
1
            codec.field(),
1303
1
            arrow_schema::Field::new(
1304
                "topLevelRecord",
1305
1
                DataType::Struct(Fields::from(vec![
1306
1
                    arrow_schema::Field::new("id", DataType::Int32, true),
1307
1
                    arrow_schema::Field::new(
1308
1
                        "timestamp_col",
1309
1
                        DataType::Timestamp(TimeUnit::Microsecond, Some("+00:00".into())),
1310
1
                        true
1311
1
                    ),
1312
1
                ])),
1313
                false
1314
            )
1315
        );
1316
1317
1
        let schema: Schema = serde_json::from_str(
1318
1
            r#"{
1319
1
                  "type": "record",
1320
1
                  "name": "HandshakeRequest", "namespace":"org.apache.avro.ipc",
1321
1
                  "fields": [
1322
1
                    {"name": "clientHash", "type": {"type": "fixed", "name": "MD5", "size": 16}},
1323
1
                    {"name": "clientProtocol", "type": ["null", "string"]},
1324
1
                    {"name": "serverHash", "type": "MD5"},
1325
1
                    {"name": "meta", "type": ["null", {"type": "map", "values": "bytes"}]}
1326
1
                  ]
1327
1
            }"#,
1328
        )
1329
1
        .unwrap();
1330
1331
1
        assert_eq!(
1332
            schema,
1333
1
            Schema::Complex(ComplexType::Record(Record {
1334
1
                name: "HandshakeRequest",
1335
1
                namespace: Some("org.apache.avro.ipc"),
1336
1
                doc: None,
1337
1
                aliases: vec![],
1338
1
                fields: vec![
1339
1
                    Field {
1340
1
                        name: "clientHash",
1341
1
                        doc: None,
1342
1
                        r#type: Schema::Complex(ComplexType::Fixed(Fixed {
1343
1
                            name: "MD5",
1344
1
                            namespace: None,
1345
1
                            aliases: vec![],
1346
1
                            size: 16,
1347
1
                            attributes: Default::default(),
1348
1
                        })),
1349
1
                        default: None,
1350
1
                    },
1351
1
                    Field {
1352
1
                        name: "clientProtocol",
1353
1
                        doc: None,
1354
1
                        r#type: Schema::Union(vec![
1355
1
                            Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)),
1356
1
                            Schema::TypeName(TypeName::Primitive(PrimitiveType::String)),
1357
1
                        ]),
1358
1
                        default: None,
1359
1
                    },
1360
1
                    Field {
1361
1
                        name: "serverHash",
1362
1
                        doc: None,
1363
1
                        r#type: Schema::TypeName(TypeName::Ref("MD5")),
1364
1
                        default: None,
1365
1
                    },
1366
1
                    Field {
1367
1
                        name: "meta",
1368
1
                        doc: None,
1369
1
                        r#type: Schema::Union(vec![
1370
1
                            Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)),
1371
1
                            Schema::Complex(ComplexType::Map(Map {
1372
1
                                values: Box::new(Schema::TypeName(TypeName::Primitive(
1373
1
                                    PrimitiveType::Bytes
1374
1
                                ))),
1375
1
                                attributes: Default::default(),
1376
1
                            })),
1377
1
                        ]),
1378
1
                        default: None,
1379
1
                    }
1380
1
                ],
1381
1
                attributes: Default::default(),
1382
1
            }))
1383
        );
1384
1
    }
1385
1386
    #[test]
1387
1
    fn test_new_schema_store() {
1388
1
        let store = SchemaStore::new();
1389
1
        assert!(store.schemas.is_empty());
1390
1
    }
1391
1392
    #[test]
1393
1
    fn test_try_from_schemas_rabin() {
1394
1
        let int_avro_schema = AvroSchema::new(serde_json::to_string(&int_schema()).unwrap());
1395
1
        let record_avro_schema = AvroSchema::new(serde_json::to_string(&record_schema()).unwrap());
1396
1
        let schemas = vec![int_avro_schema.clone(), record_avro_schema.clone()];
1397
1
        let store = SchemaStore::try_from(schemas.as_slice()).unwrap();
1398
1
        let int_fp = int_avro_schema.fingerprint().unwrap();
1399
1
        assert_eq!(store.lookup(&int_fp).cloned(), Some(int_avro_schema));
1400
1
        let rec_fp = record_avro_schema.fingerprint().unwrap();
1401
1
        assert_eq!(store.lookup(&rec_fp).cloned(), Some(record_avro_schema));
1402
1
    }
1403
1404
    #[test]
1405
1
    fn test_try_from_with_duplicates() {
1406
1
        let int_avro_schema = AvroSchema::new(serde_json::to_string(&int_schema()).unwrap());
1407
1
        let record_avro_schema = AvroSchema::new(serde_json::to_string(&record_schema()).unwrap());
1408
1
        let schemas = vec![
1409
1
            int_avro_schema.clone(),
1410
1
            record_avro_schema,
1411
1
            int_avro_schema.clone(),
1412
        ];
1413
1
        let store = SchemaStore::try_from(schemas.as_slice()).unwrap();
1414
1
        assert_eq!(store.schemas.len(), 2);
1415
1
        let int_fp = int_avro_schema.fingerprint().unwrap();
1416
1
        assert_eq!(store.lookup(&int_fp).cloned(), Some(int_avro_schema));
1417
1
    }
1418
1419
    #[test]
1420
1
    fn test_register_and_lookup_rabin() {
1421
1
        let mut store = SchemaStore::new();
1422
1
        let schema = AvroSchema::new(serde_json::to_string(&int_schema()).unwrap());
1423
1
        let fp_enum = store.register(schema.clone()).unwrap();
1424
1
        let Fingerprint::Rabin(fp_val) = fp_enum;
1425
1
        assert_eq!(
1426
1
            store.lookup(&Fingerprint::Rabin(fp_val)).cloned(),
1427
1
            Some(schema.clone())
1428
        );
1429
1
        assert!(store
1430
1
            .lookup(&Fingerprint::Rabin(fp_val.wrapping_add(1)))
1431
1
            .is_none());
1432
1
    }
1433
1434
    #[test]
1435
1
    fn test_register_duplicate_schema() {
1436
1
        let mut store = SchemaStore::new();
1437
1
        let schema1 = AvroSchema::new(serde_json::to_string(&int_schema()).unwrap());
1438
1
        let schema2 = AvroSchema::new(serde_json::to_string(&int_schema()).unwrap());
1439
1
        let fingerprint1 = store.register(schema1).unwrap();
1440
1
        let fingerprint2 = store.register(schema2).unwrap();
1441
1
        assert_eq!(fingerprint1, fingerprint2);
1442
1
        assert_eq!(store.schemas.len(), 1);
1443
1
    }
1444
1445
    #[test]
1446
1
    fn test_canonical_form_generation_primitive() {
1447
1
        let schema = int_schema();
1448
1
        let canonical_form = generate_canonical_form(&schema).unwrap();
1449
1
        assert_eq!(canonical_form, r#""int""#);
1450
1
    }
1451
1452
    #[test]
1453
1
    fn test_canonical_form_generation_record() {
1454
1
        let schema = record_schema();
1455
1
        let expected_canonical_form = r#"{"name":"test.namespace.record1","type":"record","fields":[{"name":"field1","type":"int"},{"name":"field2","type":"string"}]}"#;
1456
1
        let canonical_form = generate_canonical_form(&schema).unwrap();
1457
1
        assert_eq!(canonical_form, expected_canonical_form);
1458
1
    }
1459
1460
    #[test]
1461
1
    fn test_fingerprint_calculation() {
1462
1
        let canonical_form = r#"{"fields":[{"name":"a","type":"long"},{"name":"b","type":"string"}],"name":"test","type":"record"}"#;
1463
1
        let expected_fingerprint = 10505236152925314060;
1464
1
        let fingerprint = compute_fingerprint_rabin(canonical_form);
1465
1
        assert_eq!(fingerprint, expected_fingerprint);
1466
1
    }
1467
1468
    #[test]
1469
1
    fn test_register_and_lookup_complex_schema() {
1470
1
        let mut store = SchemaStore::new();
1471
1
        let schema = AvroSchema::new(serde_json::to_string(&record_schema()).unwrap());
1472
1
        let canonical_form = r#"{"name":"test.namespace.record1","type":"record","fields":[{"name":"field1","type":"int"},{"name":"field2","type":"string"}]}"#;
1473
1
        let expected_fingerprint =
1474
1
            Fingerprint::Rabin(super::compute_fingerprint_rabin(canonical_form));
1475
1
        let fingerprint = store.register(schema.clone()).unwrap();
1476
1
        assert_eq!(fingerprint, expected_fingerprint);
1477
1
        let looked_up = store.lookup(&fingerprint).cloned();
1478
1
        assert_eq!(looked_up, Some(schema));
1479
1
    }
1480
1481
    #[test]
1482
1
    fn test_fingerprints_returns_all_keys() {
1483
1
        let mut store = SchemaStore::new();
1484
1
        let fp_int = store
1485
1
            .register(AvroSchema::new(
1486
1
                serde_json::to_string(&int_schema()).unwrap(),
1487
            ))
1488
1
            .unwrap();
1489
1
        let fp_record = store
1490
1
            .register(AvroSchema::new(
1491
1
                serde_json::to_string(&record_schema()).unwrap(),
1492
            ))
1493
1
            .unwrap();
1494
1
        let fps = store.fingerprints();
1495
1
        assert_eq!(fps.len(), 2);
1496
1
        assert!(fps.contains(&fp_int));
1497
1
        assert!(fps.contains(&fp_record));
1498
1
    }
1499
1500
    /// The canonical form of a record must keep only `name`, `type` and `fields`:
    /// docs, aliases, the `decimal` logical type and any additional attributes
    /// set below are all expected to be absent from the output.
    #[test]
    fn test_canonical_form_strips_attributes() {
        // Field type: plain `bytes` decorated with a logical type and an extra attribute.
        let decorated_bytes = Schema::Type(Type {
            r#type: TypeName::Primitive(PrimitiveType::Bytes),
            attributes: Attributes {
                logical_type: Some("decimal"),
                additional: HashMap::from([("precision", json!(4))]),
            },
        });
        let documented_field = Field {
            name: "f1",
            doc: Some("field doc"),
            r#type: decorated_bytes,
            default: None,
        };
        let record = Record {
            name: "record_with_attrs",
            namespace: None,
            doc: Some("This doc should be stripped"),
            aliases: vec!["alias1", "alias2"],
            fields: vec![documented_field],
            attributes: Attributes {
                logical_type: None,
                additional: HashMap::from([("custom_attr", json!("value"))]),
            },
        };
        let schema_with_attrs = Schema::Complex(ComplexType::Record(record));

        let canonical_form = generate_canonical_form(&schema_with_attrs).unwrap();
        let expected_canonical_form = r#"{"name":"record_with_attrs","type":"record","fields":[{"name":"f1","type":"bytes"}]}"#;
        assert_eq!(canonical_form, expected_canonical_form);
    }
1528
1529
    /// Each Arrow primitive type, wrapped in a single non-nullable field, must
    /// produce Avro JSON containing the expected primitive type token.
    ///
    /// Per the table below, the narrow integer types (signed and unsigned up to
    /// 16 bits, plus `Int32`) are expected as `int`, `UInt32`/`UInt64`/`Int64` as
    /// `long`, and `Float16` as `float`.
    #[test]
    fn test_primitive_mappings() {
        // A fixed-size array is enough here: the cases are only iterated by value.
        let cases = [
            (DataType::Boolean, "\"boolean\""),
            (DataType::Int8, "\"int\""),
            (DataType::Int16, "\"int\""),
            (DataType::Int32, "\"int\""),
            (DataType::Int64, "\"long\""),
            (DataType::UInt8, "\"int\""),
            (DataType::UInt16, "\"int\""),
            (DataType::UInt32, "\"long\""),
            (DataType::UInt64, "\"long\""),
            (DataType::Float16, "\"float\""),
            (DataType::Float32, "\"float\""),
            (DataType::Float64, "\"double\""),
            (DataType::Utf8, "\"string\""),
            (DataType::Binary, "\"bytes\""),
        ];
        for (dt, avro_token) in cases {
            // `dt` is owned by the loop, so it is moved into the field without cloning.
            let field = ArrowField::new("col", dt, false);
            let arrow_schema = single_field_schema(field);
            let avro = AvroSchema::try_from(&arrow_schema).unwrap();
            assert_json_contains(&avro.json_string, avro_token);
        }
    }
1554
1555
    /// Each Arrow temporal type, wrapped in a single nullable field, must produce
    /// Avro JSON carrying the expected `logicalType` annotation.
    ///
    /// Note the timestamp cases: a timestamp without a timezone is expected as
    /// `local-timestamp-*`, while one with a `"+00:00"` timezone is expected as
    /// `timestamp-*`.
    #[test]
    fn test_temporal_mappings() {
        // A fixed-size array is enough here: the cases are only iterated by value.
        let cases = [
            (DataType::Date32, "\"logicalType\":\"date\""),
            (
                DataType::Time32(TimeUnit::Millisecond),
                "\"logicalType\":\"time-millis\"",
            ),
            (
                DataType::Time64(TimeUnit::Microsecond),
                "\"logicalType\":\"time-micros\"",
            ),
            (
                DataType::Timestamp(TimeUnit::Millisecond, None),
                "\"logicalType\":\"local-timestamp-millis\"",
            ),
            (
                DataType::Timestamp(TimeUnit::Microsecond, Some("+00:00".into())),
                "\"logicalType\":\"timestamp-micros\"",
            ),
        ];
        for (dt, needle) in cases {
            // `dt` is owned by the loop, so it is moved into the field without cloning.
            let field = ArrowField::new("ts", dt, true);
            let arrow_schema = single_field_schema(field);
            let avro = AvroSchema::try_from(&arrow_schema).unwrap();
            assert_json_contains(&avro.json_string, needle);
        }
    }
1583
1584
    /// Checks two logical-type conversions:
    /// * `Decimal128(25, 2)` must emit `logicalType: decimal` with matching
    ///   `precision` and `scale`;
    /// * `FixedSizeBinary(16)` tagged with `logicalType = uuid` in its field
    ///   metadata must emit `logicalType: uuid`.
    #[test]
    fn test_decimal_and_uuid() {
        // Decimal
        let dec_schema = single_field_schema(ArrowField::new(
            "amount",
            DataType::Decimal128(25, 2),
            false,
        ));
        let avro_dec = AvroSchema::try_from(&dec_schema).unwrap();
        for needle in [
            "\"logicalType\":\"decimal\"",
            "\"precision\":25",
            "\"scale\":2",
        ] {
            assert_json_contains(&avro_dec.json_string, needle);
        }

        // UUID, signalled through Arrow field metadata
        let md = HashMap::from([("logicalType".to_string(), "uuid".to_string())]);
        let uuid_schema = single_field_schema(
            ArrowField::new("id", DataType::FixedSizeBinary(16), false).with_metadata(md),
        );
        let avro_uuid = AvroSchema::try_from(&uuid_schema).unwrap();
        assert_json_contains(&avro_uuid.json_string, "\"logicalType\":\"uuid\"");
    }
1600
1601
    /// `Interval(MonthDayNano)` must be emitted with `logicalType: duration` and
    /// `size: 12`; `Duration(Nanosecond)` must carry an `arrowDurationUnit`
    /// attribute in the generated JSON.
    #[test]
    fn test_interval_duration() {
        // Interval
        let interval_schema = single_field_schema(ArrowField::new(
            "span",
            DataType::Interval(IntervalUnit::MonthDayNano),
            false,
        ));
        let interval_avro = AvroSchema::try_from(&interval_schema).unwrap();
        for needle in ["\"logicalType\":\"duration\"", "\"size\":12"] {
            assert_json_contains(&interval_avro.json_string, needle);
        }

        // Duration
        let duration_schema = single_field_schema(ArrowField::new(
            "latency",
            DataType::Duration(TimeUnit::Nanosecond),
            false,
        ));
        let duration_avro = AvroSchema::try_from(&duration_schema).unwrap();
        assert_json_contains(&duration_avro.json_string, "\"arrowDurationUnit\"");
    }
1617
1618
    /// Nested Arrow types must map onto the corresponding Avro complex types:
    /// `List` to `array` (with `items`), `Map` to `map` (with `values`), and a
    /// nullable `Struct` to a `record` whose JSON also mentions `"null"`.
    #[test]
    fn test_complex_types() {
        // List
        let item = Arc::new(ArrowField::new("item", DataType::Int32, true));
        let list_schema =
            single_field_schema(ArrowField::new("numbers", DataType::List(item), false));
        let avro_list = AvroSchema::try_from(&list_schema).unwrap();
        assert_json_contains(&avro_list.json_string, "\"type\":\"array\"");
        assert_json_contains(&avro_list.json_string, "\"items\"");

        // Map: string keys, nullable boolean values
        let entry_fields = Fields::from(vec![
            ArrowField::new("key", DataType::Utf8, false),
            ArrowField::new("value", DataType::Boolean, true),
        ]);
        let entries = Arc::new(ArrowField::new(
            "entries",
            DataType::Struct(entry_fields),
            false,
        ));
        let map_schema =
            single_field_schema(ArrowField::new("props", DataType::Map(entries, false), false));
        let avro_map = AvroSchema::try_from(&map_schema).unwrap();
        assert_json_contains(&avro_map.json_string, "\"type\":\"map\"");
        assert_json_contains(&avro_map.json_string, "\"values\"");

        // Struct (nullable at the top level)
        let person_fields = Fields::from(vec![
            ArrowField::new("f1", DataType::Int64, false),
            ArrowField::new("f2", DataType::Utf8, true),
        ]);
        let struct_schema = single_field_schema(ArrowField::new(
            "person",
            DataType::Struct(person_fields),
            true,
        ));
        let avro_struct = AvroSchema::try_from(&struct_schema).unwrap();
        assert_json_contains(&avro_struct.json_string, "\"type\":\"record\"");
        assert_json_contains(&avro_struct.json_string, "\"null\"");
    }
1648
1649
    /// A `Dictionary(Int32, Utf8)` field carrying a JSON symbol list under
    /// `AVRO_ENUM_SYMBOLS_METADATA_KEY` must be emitted as an Avro `enum` with
    /// those symbols in the same order.
    #[test]
    fn test_enum_dictionary() {
        let md = HashMap::from([(
            AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
            "[\"OPEN\",\"CLOSED\"]".to_string(),
        )]);
        let enum_dt = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));
        let schema = single_field_schema(ArrowField::new("status", enum_dt, false).with_metadata(md));

        let avro = AvroSchema::try_from(&schema).unwrap();
        for needle in ["\"type\":\"enum\"", "\"symbols\":[\"OPEN\",\"CLOSED\"]"] {
            assert_json_contains(&avro.json_string, needle);
        }
    }
1663
1664
    /// A run-end-encoded field whose values are `Utf8` must produce Avro JSON
    /// containing `"string"`.
    // NOTE(review): presumably the REE wrapper is unwrapped to its value type;
    // this test only checks that the token appears somewhere in the JSON.
    #[test]
    fn test_run_end_encoded() {
        let run_ends = Arc::new(ArrowField::new("run_ends", DataType::Int32, false));
        let values = Arc::new(ArrowField::new("values", DataType::Utf8, false));
        let ree_field = ArrowField::new("text", DataType::RunEndEncoded(run_ends, values), false);

        let arrow_schema = single_field_schema(ree_field);
        let avro = AvroSchema::try_from(&arrow_schema).unwrap();
        assert_json_contains(&avro.json_string, "\"string\"");
    }
1674
1675
    /// Converting a schema that contains a dense Arrow `Union` must fail, and the
    /// error text must state that Arrow-union to Avro-union is not yet supported.
    #[test]
    fn test_dense_union_error() {
        use arrow_schema::UnionFields;

        // A single-member union (type id 0) is enough to reach the unsupported path.
        let member = Arc::new(ArrowField::new("a", DataType::Int32, false));
        let uf: UnionFields = std::iter::once((0i8, member)).collect();
        let union_field = ArrowField::new(
            "u",
            DataType::Union(uf, arrow_schema::UnionMode::Dense),
            false,
        );

        let arrow_schema = single_field_schema(union_field);
        let err = AvroSchema::try_from(&arrow_schema).unwrap_err();
        assert!(err
            .to_string()
            .contains("Arrow Union to Avro Union not yet supported"));
    }
1688
1689
    /// An Arrow schema converted to Avro JSON must parse back through
    /// `AvroSchema::schema`, and the decoded top-level schema must be a
    /// `Schema::Complex` variant.
    #[test]
    fn round_trip_primitive() {
        let int_field = ArrowField::new("f1", DataType::Int32, false);
        let arrow_schema = ArrowSchema::new(vec![int_field]);

        let avro_schema = AvroSchema::try_from(&arrow_schema).unwrap();
        // `decoded` is obtained from `avro_schema`, so that binding stays alive.
        let decoded = avro_schema.schema().unwrap();
        assert!(matches!(decoded, Schema::Complex(_)));
    }
1696
1697
    /// Arrow field names that are not valid Avro names must be sanitised, and
    /// names that collide after sanitising must be made unique.
    ///
    /// Expected here: `weird-name` becomes `weird_name`, `weird name` becomes
    /// `weird_name_1`, and `123bad` becomes `_123bad`.
    #[test]
    fn test_name_generator_sanitization_and_uniqueness() {
        let fields: Vec<ArrowField> = ["weird-name", "weird name", "123bad"]
            .iter()
            .map(|name| ArrowField::new(*name, DataType::FixedSizeBinary(8), false))
            .collect();
        let arrow_schema = ArrowSchema::new(fields);

        let avro = AvroSchema::try_from(&arrow_schema).unwrap();
        for needle in [
            "\"name\":\"weird_name\"",
            "\"name\":\"weird_name_1\"",
            "\"name\":\"_123bad\"",
        ] {
            assert_json_contains(&avro.json_string, needle);
        }
    }
1708
1709
    /// A nullable `Date64` field must be emitted with
    /// `logicalType: local-timestamp-millis`.
    #[test]
    fn test_date64_logical_type_mapping() {
        let arrow_schema = single_field_schema(ArrowField::new("d", DataType::Date64, true));
        let avro = AvroSchema::try_from(&arrow_schema).unwrap();

        let needle = "\"logicalType\":\"local-timestamp-millis\"";
        assert_json_contains(&avro.json_string, needle);
    }
1719
1720
    /// The `arrowDurationUnit` attribute of a `Duration(Microsecond)` element
    /// must still appear in the JSON when the duration is nested inside a `List`.
    #[test]
    fn test_duration_list_extras_propagated() {
        let element = Arc::new(ArrowField::new(
            "lat",
            DataType::Duration(TimeUnit::Microsecond),
            false,
        ));
        let list_field = ArrowField::new("durations", DataType::List(element), false);

        let arrow_schema = single_field_schema(list_field);
        let avro = AvroSchema::try_from(&arrow_schema).unwrap();
        assert_json_contains(&avro.json_string, "\"arrowDurationUnit\":\"microsecond\"");
    }
1728
1729
    /// `Interval(YearMonth)` must record its unit as
    /// `arrowIntervalUnit: yearmonth` in the generated JSON.
    #[test]
    fn test_interval_yearmonth_extra() {
        let interval_dt = DataType::Interval(IntervalUnit::YearMonth);
        let arrow_schema = single_field_schema(ArrowField::new("iv", interval_dt, false));

        let avro = AvroSchema::try_from(&arrow_schema).unwrap();
        assert_json_contains(&avro.json_string, "\"arrowIntervalUnit\":\"yearmonth\"");
    }
1736
1737
    /// `Interval(DayTime)` must record its unit as
    /// `arrowIntervalUnit: daytime` in the generated JSON.
    #[test]
    fn test_interval_daytime_extra() {
        let interval_dt = DataType::Interval(IntervalUnit::DayTime);
        let arrow_schema = single_field_schema(ArrowField::new("iv_dt", interval_dt, false));

        let avro = AvroSchema::try_from(&arrow_schema).unwrap();
        assert_json_contains(&avro.json_string, "\"arrowIntervalUnit\":\"daytime\"");
    }
1744
1745
    /// A `FixedSizeList` of length 3 must record that length as
    /// `arrowFixedSize: 3` in the generated JSON.
    #[test]
    fn test_fixed_size_list_extra() {
        let element = Arc::new(ArrowField::new("item", DataType::Int32, false));
        let list_field = ArrowField::new("triples", DataType::FixedSizeList(element, 3), false);

        let arrow_schema = single_field_schema(list_field);
        let avro = AvroSchema::try_from(&arrow_schema).unwrap();
        assert_json_contains(&avro.json_string, "\"arrowFixedSize\":3");
    }
1753
1754
    /// The `arrowDurationUnit` attribute of a `Duration(Second)` value must still
    /// appear in the JSON when the duration is the value type of a `Map`.
    #[test]
    fn test_map_duration_value_extra() {
        // Map entries: non-nullable string key, nullable duration value.
        let entry_fields = Fields::from(vec![
            ArrowField::new("key", DataType::Utf8, false),
            ArrowField::new("value", DataType::Duration(TimeUnit::Second), true),
        ]);
        let entries = Arc::new(ArrowField::new(
            "entries",
            DataType::Struct(entry_fields),
            false,
        ));
        let map_field = ArrowField::new("metrics", DataType::Map(entries, false), false);

        let arrow_schema = single_field_schema(map_field);
        let avro = AvroSchema::try_from(&arrow_schema).unwrap();
        assert_json_contains(&avro.json_string, "\"arrowDurationUnit\":\"second\"");
    }
1770
}