/Users/andrewlamb/Software/arrow-rs/arrow-avro/src/schema.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use arrow_schema::{ |
19 | | ArrowError, DataType, Field as ArrowField, IntervalUnit, Schema as ArrowSchema, TimeUnit, |
20 | | }; |
21 | | use serde::{Deserialize, Serialize}; |
22 | | use serde_json::{json, Map as JsonMap, Value}; |
23 | | use std::cmp::PartialEq; |
24 | | use std::collections::hash_map::Entry; |
25 | | use std::collections::{HashMap, HashSet}; |
26 | | use strum_macros::AsRefStr; |
27 | | |
/// Metadata key under which the JSON encoded [`Schema`] is stored.
pub const SCHEMA_METADATA_KEY: &str = "avro.schema";

/// The Avro single-object encoding "magic" bytes (`0xC3 0x01`).
pub const SINGLE_OBJECT_MAGIC: [u8; 2] = [0xC3, 0x01];

/// Metadata key used to represent Avro enum symbols in an Arrow schema.
pub const AVRO_ENUM_SYMBOLS_METADATA_KEY: &str = "avro.enum.symbols";

/// Metadata key used to store the default value of a field in an Avro schema.
pub const AVRO_FIELD_DEFAULT_METADATA_KEY: &str = "avro.field.default";

/// Metadata key used to store the name of a type in an Avro schema.
pub const AVRO_NAME_METADATA_KEY: &str = "avro.name";

/// Metadata key used to store the namespace of a type in an Avro schema.
pub const AVRO_NAMESPACE_METADATA_KEY: &str = "avro.namespace";

/// Metadata key used to store the documentation for a type in an Avro schema.
pub const AVRO_DOC_METADATA_KEY: &str = "avro.doc";
48 | | |
49 | | /// Compare two Avro schemas for equality (identical schemas). |
50 | | /// Returns true if the schemas have the same parsing canonical form (i.e., logically identical). |
51 | 0 | pub fn compare_schemas(writer: &Schema, reader: &Schema) -> Result<bool, ArrowError> { |
52 | 0 | let canon_writer = generate_canonical_form(writer)?; |
53 | 0 | let canon_reader = generate_canonical_form(reader)?; |
54 | 0 | Ok(canon_writer == canon_reader) |
55 | 0 | } |
56 | | |
57 | | /// Either a [`PrimitiveType`] or a reference to a previously defined named type |
58 | | /// |
59 | | /// <https://avro.apache.org/docs/1.11.1/specification/#names> |
60 | | #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] |
61 | | #[serde(untagged)] |
62 | | /// A type name in an Avro schema |
63 | | /// |
64 | | /// This represents the different ways a type can be referenced in an Avro schema. |
65 | | pub enum TypeName<'a> { |
66 | | /// A primitive type like null, boolean, int, etc. |
67 | | Primitive(PrimitiveType), |
68 | | /// A reference to another named type |
69 | | Ref(&'a str), |
70 | | } |
71 | | |
72 | | /// A primitive type |
73 | | /// |
74 | | /// <https://avro.apache.org/docs/1.11.1/specification/#primitive-types> |
75 | | #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, AsRefStr)] |
76 | | #[serde(rename_all = "camelCase")] |
77 | | #[strum(serialize_all = "lowercase")] |
78 | | pub enum PrimitiveType { |
79 | | /// null: no value |
80 | | Null, |
81 | | /// boolean: a binary value |
82 | | Boolean, |
83 | | /// int: 32-bit signed integer |
84 | | Int, |
85 | | /// long: 64-bit signed integer |
86 | | Long, |
87 | | /// float: single precision (32-bit) IEEE 754 floating-point number |
88 | | Float, |
89 | | /// double: double precision (64-bit) IEEE 754 floating-point number |
90 | | Double, |
91 | | /// bytes: sequence of 8-bit unsigned bytes |
92 | | Bytes, |
93 | | /// string: Unicode character sequence |
94 | | String, |
95 | | } |
96 | | |
97 | | /// Additional attributes within a [`Schema`] |
98 | | /// |
99 | | /// <https://avro.apache.org/docs/1.11.1/specification/#schema-declaration> |
100 | | #[derive(Debug, Clone, PartialEq, Eq, Default, Deserialize, Serialize)] |
101 | | #[serde(rename_all = "camelCase")] |
102 | | pub struct Attributes<'a> { |
103 | | /// A logical type name |
104 | | /// |
105 | | /// <https://avro.apache.org/docs/1.11.1/specification/#logical-types> |
106 | | #[serde(default)] |
107 | | pub logical_type: Option<&'a str>, |
108 | | |
109 | | /// Additional JSON attributes |
110 | | #[serde(flatten)] |
111 | | pub additional: HashMap<&'a str, serde_json::Value>, |
112 | | } |
113 | | |
114 | | impl Attributes<'_> { |
115 | | /// Returns the field metadata for this [`Attributes`] |
116 | 220 | pub(crate) fn field_metadata(&self) -> HashMap<String, String> { |
117 | 220 | self.additional |
118 | 220 | .iter() |
119 | 220 | .map(|(k, v)| (k16 .to_string16 (), v16 .to_string16 ())) |
120 | 220 | .collect() |
121 | 220 | } |
122 | | } |
123 | | |
124 | | /// A type definition that is not a variant of [`ComplexType`] |
125 | | #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)] |
126 | | #[serde(rename_all = "camelCase")] |
127 | | pub struct Type<'a> { |
128 | | /// The type of this Avro data structure |
129 | | #[serde(borrow)] |
130 | | pub r#type: TypeName<'a>, |
131 | | /// Additional attributes associated with this type |
132 | | #[serde(flatten)] |
133 | | pub attributes: Attributes<'a>, |
134 | | } |
135 | | |
136 | | /// An Avro schema |
137 | | /// |
138 | | /// This represents the different shapes of Avro schemas as defined in the specification. |
139 | | /// See <https://avro.apache.org/docs/1.11.1/specification/#schemas> for more details. |
140 | | #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] |
141 | | #[serde(untagged)] |
142 | | pub enum Schema<'a> { |
143 | | /// A direct type name (primitive or reference) |
144 | | #[serde(borrow)] |
145 | | TypeName(TypeName<'a>), |
146 | | /// A union of multiple schemas (e.g., ["null", "string"]) |
147 | | #[serde(borrow)] |
148 | | Union(Vec<Schema<'a>>), |
149 | | /// A complex type such as record, array, map, etc. |
150 | | #[serde(borrow)] |
151 | | Complex(ComplexType<'a>), |
152 | | /// A type with attributes |
153 | | #[serde(borrow)] |
154 | | Type(Type<'a>), |
155 | | } |
156 | | |
157 | | /// A complex type |
158 | | /// |
159 | | /// <https://avro.apache.org/docs/1.11.1/specification/#complex-types> |
160 | | #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] |
161 | | #[serde(tag = "type", rename_all = "camelCase")] |
162 | | pub enum ComplexType<'a> { |
163 | | /// Record type: a sequence of fields with names and types |
164 | | #[serde(borrow)] |
165 | | Record(Record<'a>), |
166 | | /// Enum type: a set of named values |
167 | | #[serde(borrow)] |
168 | | Enum(Enum<'a>), |
169 | | /// Array type: a sequence of values of the same type |
170 | | #[serde(borrow)] |
171 | | Array(Array<'a>), |
172 | | /// Map type: a mapping from strings to values of the same type |
173 | | #[serde(borrow)] |
174 | | Map(Map<'a>), |
175 | | /// Fixed type: a fixed-size byte array |
176 | | #[serde(borrow)] |
177 | | Fixed(Fixed<'a>), |
178 | | } |
179 | | |
180 | | /// A record |
181 | | /// |
182 | | /// <https://avro.apache.org/docs/1.11.1/specification/#schema-record> |
183 | | #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] |
184 | | pub struct Record<'a> { |
185 | | /// Name of the record |
186 | | #[serde(borrow)] |
187 | | pub name: &'a str, |
188 | | /// Optional namespace for the record, provides a way to organize names |
189 | | #[serde(borrow, default)] |
190 | | pub namespace: Option<&'a str>, |
191 | | /// Optional documentation string for the record |
192 | | #[serde(borrow, default)] |
193 | | pub doc: Option<&'a str>, |
194 | | /// Alternative names for this record |
195 | | #[serde(borrow, default)] |
196 | | pub aliases: Vec<&'a str>, |
197 | | /// The fields contained in this record |
198 | | #[serde(borrow)] |
199 | | pub fields: Vec<Field<'a>>, |
200 | | /// Additional attributes for this record |
201 | | #[serde(flatten)] |
202 | | pub attributes: Attributes<'a>, |
203 | | } |
204 | | |
205 | | /// A field within a [`Record`] |
206 | | #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] |
207 | | pub struct Field<'a> { |
208 | | /// Name of the field within the record |
209 | | #[serde(borrow)] |
210 | | pub name: &'a str, |
211 | | /// Optional documentation for this field |
212 | | #[serde(borrow, default)] |
213 | | pub doc: Option<&'a str>, |
214 | | /// The field's type definition |
215 | | #[serde(borrow)] |
216 | | pub r#type: Schema<'a>, |
217 | | /// Optional default value for this field |
218 | | #[serde(borrow, default)] |
219 | | pub default: Option<&'a str>, |
220 | | } |
221 | | |
222 | | /// An enumeration |
223 | | /// |
224 | | /// <https://avro.apache.org/docs/1.11.1/specification/#enums> |
225 | | #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] |
226 | | pub struct Enum<'a> { |
227 | | /// Name of the enum |
228 | | #[serde(borrow)] |
229 | | pub name: &'a str, |
230 | | /// Optional namespace for the enum, provides organizational structure |
231 | | #[serde(borrow, default)] |
232 | | pub namespace: Option<&'a str>, |
233 | | /// Optional documentation string describing the enum |
234 | | #[serde(borrow, default)] |
235 | | pub doc: Option<&'a str>, |
236 | | /// Alternative names for this enum |
237 | | #[serde(borrow, default)] |
238 | | pub aliases: Vec<&'a str>, |
239 | | /// The symbols (values) that this enum can have |
240 | | #[serde(borrow)] |
241 | | pub symbols: Vec<&'a str>, |
242 | | /// Optional default value for this enum |
243 | | #[serde(borrow, default)] |
244 | | pub default: Option<&'a str>, |
245 | | /// Additional attributes for this enum |
246 | | #[serde(flatten)] |
247 | | pub attributes: Attributes<'a>, |
248 | | } |
249 | | |
250 | | /// An array |
251 | | /// |
252 | | /// <https://avro.apache.org/docs/1.11.1/specification/#arrays> |
253 | | #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] |
254 | | pub struct Array<'a> { |
255 | | /// The schema for items in this array |
256 | | #[serde(borrow)] |
257 | | pub items: Box<Schema<'a>>, |
258 | | /// Additional attributes for this array |
259 | | #[serde(flatten)] |
260 | | pub attributes: Attributes<'a>, |
261 | | } |
262 | | |
263 | | /// A map |
264 | | /// |
265 | | /// <https://avro.apache.org/docs/1.11.1/specification/#maps> |
266 | | #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] |
267 | | pub struct Map<'a> { |
268 | | /// The schema for values in this map |
269 | | #[serde(borrow)] |
270 | | pub values: Box<Schema<'a>>, |
271 | | /// Additional attributes for this map |
272 | | #[serde(flatten)] |
273 | | pub attributes: Attributes<'a>, |
274 | | } |
275 | | |
276 | | /// A fixed length binary array |
277 | | /// |
278 | | /// <https://avro.apache.org/docs/1.11.1/specification/#fixed> |
279 | | #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] |
280 | | pub struct Fixed<'a> { |
281 | | /// Name of the fixed type |
282 | | #[serde(borrow)] |
283 | | pub name: &'a str, |
284 | | /// Optional namespace for the fixed type |
285 | | #[serde(borrow, default)] |
286 | | pub namespace: Option<&'a str>, |
287 | | /// Alternative names for this fixed type |
288 | | #[serde(borrow, default)] |
289 | | pub aliases: Vec<&'a str>, |
290 | | /// The number of bytes in this fixed type |
291 | | pub size: usize, |
292 | | /// Additional attributes for this fixed type |
293 | | #[serde(flatten)] |
294 | | pub attributes: Attributes<'a>, |
295 | | } |
296 | | |
297 | | /// A wrapper for an Avro schema in its JSON string representation. |
298 | | #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] |
299 | | pub struct AvroSchema { |
300 | | /// The Avro schema as a JSON string. |
301 | | pub json_string: String, |
302 | | } |
303 | | |
304 | | impl TryFrom<&ArrowSchema> for AvroSchema { |
305 | | type Error = ArrowError; |
306 | | |
307 | 46 | fn try_from(schema: &ArrowSchema) -> Result<Self, Self::Error> { |
308 | | // Fast‑path: schema already contains Avro JSON |
309 | 46 | if let Some(json0 ) = schema.metadata.get(SCHEMA_METADATA_KEY) { |
310 | 0 | return Ok(AvroSchema::new(json.clone())); |
311 | 46 | } |
312 | 46 | let mut name_gen = NameGenerator::default(); |
313 | 46 | let fields_json45 = schema |
314 | 46 | .fields() |
315 | 46 | .iter() |
316 | 101 | .map46 (|f| arrow_field_to_avro(f, &mut name_gen)) |
317 | 46 | .collect::<Result<Vec<_>, _>>()?1 ; |
318 | | // Assemble top‑level record |
319 | 45 | let record_name = schema |
320 | 45 | .metadata |
321 | 45 | .get(AVRO_NAME_METADATA_KEY) |
322 | 45 | .map_or("topLevelRecord", |s| s0 .as_str0 ()); |
323 | 45 | let mut record = JsonMap::with_capacity(schema.metadata.len() + 4); |
324 | 45 | record.insert("type".into(), Value::String("record".into())); |
325 | 45 | record.insert( |
326 | 45 | "name".into(), |
327 | 45 | Value::String(sanitise_avro_name(record_name)), |
328 | | ); |
329 | 45 | if let Some(ns0 ) = schema.metadata.get(AVRO_NAMESPACE_METADATA_KEY) { |
330 | 0 | record.insert("namespace".into(), Value::String(ns.clone())); |
331 | 45 | } |
332 | 45 | if let Some(doc0 ) = schema.metadata.get(AVRO_DOC_METADATA_KEY) { |
333 | 0 | record.insert("doc".into(), Value::String(doc.clone())); |
334 | 45 | } |
335 | 45 | record.insert("fields".into(), Value::Array(fields_json)); |
336 | 45 | let schema_prefix = format!("{SCHEMA_METADATA_KEY}."); |
337 | 45 | for (meta_key0 , meta_val0 ) in &schema.metadata { |
338 | | // Skip keys already handled or internal |
339 | 0 | if meta_key.starts_with("avro.") |
340 | 0 | || meta_key.starts_with(schema_prefix.as_str()) |
341 | 0 | || is_internal_arrow_key(meta_key) |
342 | | { |
343 | 0 | continue; |
344 | 0 | } |
345 | 0 | let json_val = |
346 | 0 | serde_json::from_str(meta_val).unwrap_or_else(|_| Value::String(meta_val.clone())); |
347 | 0 | record.insert(meta_key.clone(), json_val); |
348 | | } |
349 | 45 | let json_string = serde_json::to_string(&Value::Object(record)) |
350 | 45 | .map_err(|e| ArrowError::SchemaError(format!0 ("Serialising Avro JSON failed: {e}"0 )))?0 ; |
351 | 45 | Ok(AvroSchema::new(json_string)) |
352 | 46 | } |
353 | | } |
354 | | |
355 | | impl AvroSchema { |
356 | | /// Creates a new `AvroSchema` from a JSON string. |
357 | 91 | pub fn new(json_string: String) -> Self { |
358 | 91 | Self { json_string } |
359 | 91 | } |
360 | | |
361 | | /// Deserializes and returns the `AvroSchema`. |
362 | | /// |
363 | | /// The returned schema borrows from `self`. |
364 | 80 | pub fn schema(&self) -> Result<Schema<'_>, ArrowError> { |
365 | 80 | serde_json::from_str(self.json_string.as_str()) |
366 | 80 | .map_err(|e| ArrowError::ParseError(format!0 ("Invalid Avro schema JSON: {e}"0 ))) |
367 | 80 | } |
368 | | |
369 | | /// Returns the Rabin fingerprint of the schema. |
370 | 3 | pub fn fingerprint(&self) -> Result<Fingerprint, ArrowError> { |
371 | 3 | generate_fingerprint_rabin(&self.schema()?0 ) |
372 | 3 | } |
373 | | } |
374 | | |
/// The fingerprint algorithms available for identifying Avro schemas.
///
/// Only `Rabin` exists today; `SHA256` and `MD5` are planned for a later update.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Default)]
pub enum FingerprintAlgorithm {
    /// 64-bit CRC-64-AVRO Rabin fingerprint (the default).
    #[default]
    Rabin,
}
383 | | |
/// A schema fingerprint, tagged by the algorithm that produced it.
///
/// Values of this type are the keys of the `HashMap` inside `SchemaStore`.
/// A given `SchemaStore` only ever stores the variant that matches its
/// configured `FingerprintAlgorithm`; the enum exists to keep the API uniform.
/// Only `Rabin` is available at present.
///
/// <https://avro.apache.org/docs/1.11.1/specification/#schema-fingerprints>
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum Fingerprint {
    /// A 64-bit Rabin fingerprint.
    Rabin(u64),
}
397 | | |
398 | | /// Allow easy extraction of the algorithm used to create a fingerprint. |
399 | | impl From<&Fingerprint> for FingerprintAlgorithm { |
400 | 0 | fn from(fp: &Fingerprint) -> Self { |
401 | 0 | match fp { |
402 | 0 | Fingerprint::Rabin(_) => FingerprintAlgorithm::Rabin, |
403 | | } |
404 | 0 | } |
405 | | } |
406 | | |
407 | | /// Generates a fingerprint for the given `Schema` using the specified `FingerprintAlgorithm`. |
408 | 32 | pub(crate) fn generate_fingerprint( |
409 | 32 | schema: &Schema, |
410 | 32 | hash_type: FingerprintAlgorithm, |
411 | 32 | ) -> Result<Fingerprint, ArrowError> { |
412 | 32 | let canonical = generate_canonical_form(schema).map_err(|e| {0 |
413 | 0 | ArrowError::ComputeError(format!("Failed to generate canonical form for schema: {e}")) |
414 | 0 | })?; |
415 | 32 | match hash_type { |
416 | | FingerprintAlgorithm::Rabin => { |
417 | 32 | Ok(Fingerprint::Rabin(compute_fingerprint_rabin(&canonical))) |
418 | | } |
419 | | } |
420 | 32 | } |
421 | | |
422 | | /// Generates the 64-bit Rabin fingerprint for the given `Schema`. |
423 | | /// |
424 | | /// The fingerprint is computed from the canonical form of the schema. |
425 | | /// This is also known as `CRC-64-AVRO`. |
426 | | /// |
427 | | /// # Returns |
428 | | /// A `Fingerprint::Rabin` variant containing the 64-bit fingerprint. |
429 | 3 | pub fn generate_fingerprint_rabin(schema: &Schema) -> Result<Fingerprint, ArrowError> { |
430 | 3 | generate_fingerprint(schema, FingerprintAlgorithm::Rabin) |
431 | 3 | } |
432 | | |
433 | | /// Generates the Parsed Canonical Form for the given [`Schema`]. |
434 | | /// |
435 | | /// The canonical form is a standardized JSON representation of the schema, |
436 | | /// primarily used for generating a schema fingerprint for equality checking. |
437 | | /// |
438 | | /// This form strips attributes that do not affect the schema's identity, |
439 | | /// such as `doc` fields, `aliases`, and any properties not defined in the |
440 | | /// Avro specification. |
441 | | /// |
442 | | /// <https://avro.apache.org/docs/1.11.1/specification/#parsing-canonical-form-for-schemas> |
443 | 35 | pub fn generate_canonical_form(schema: &Schema) -> Result<String, ArrowError> { |
444 | 35 | build_canonical(schema, None) |
445 | 35 | } |
446 | | |
447 | | /// An in-memory cache of Avro schemas, indexed by their fingerprint. |
448 | | /// |
449 | | /// `SchemaStore` provides a mechanism to store and retrieve Avro schemas efficiently. |
450 | | /// Each schema is associated with a unique [`Fingerprint`], which is generated based |
451 | | /// on the schema's canonical form and a specific hashing algorithm. |
452 | | /// |
453 | | /// A `SchemaStore` instance is configured to use a single [`FingerprintAlgorithm`] such as Rabin, |
454 | | /// MD5 (not yet supported), or SHA256 (not yet supported) for all its operations. |
455 | | /// This ensures consistency when generating fingerprints and looking up schemas. |
456 | | /// All schemas registered will have their fingerprint computed with this algorithm, and |
457 | | /// lookups must use a matching fingerprint. |
458 | | /// |
459 | | /// # Examples |
460 | | /// |
461 | | /// ```no_run |
462 | | /// // Create a new store with the default Rabin fingerprinting. |
463 | | /// use arrow_avro::schema::{AvroSchema, SchemaStore}; |
464 | | /// |
465 | | /// let mut store = SchemaStore::new(); |
466 | | /// let schema = AvroSchema::new("\"string\"".to_string()); |
467 | | /// // Register the schema to get its fingerprint. |
468 | | /// let fingerprint = store.register(schema.clone()).unwrap(); |
469 | | /// // Use the fingerprint to look up the schema. |
470 | | /// let retrieved_schema = store.lookup(&fingerprint).cloned(); |
471 | | /// assert_eq!(retrieved_schema, Some(schema)); |
472 | | /// ``` |
473 | | #[derive(Debug, Clone, Default)] |
474 | | pub struct SchemaStore { |
475 | | /// The hashing algorithm used for generating fingerprints. |
476 | | fingerprint_algorithm: FingerprintAlgorithm, |
477 | | /// A map from a schema's fingerprint to the schema itself. |
478 | | schemas: HashMap<Fingerprint, AvroSchema>, |
479 | | } |
480 | | |
481 | | impl TryFrom<&[AvroSchema]> for SchemaStore { |
482 | | type Error = ArrowError; |
483 | | |
484 | | /// Creates a `SchemaStore` from a slice of schemas. |
485 | | /// Each schema in the slice is registered with the new store. |
486 | 2 | fn try_from(schemas: &[AvroSchema]) -> Result<Self, Self::Error> { |
487 | 2 | let mut store = SchemaStore::new(); |
488 | 7 | for schema5 in schemas { |
489 | 5 | store.register(schema.clone())?0 ; |
490 | | } |
491 | 2 | Ok(store) |
492 | 2 | } |
493 | | } |
494 | | |
495 | | impl SchemaStore { |
496 | | /// Creates an empty `SchemaStore` using the default fingerprinting algorithm (64-bit Rabin). |
497 | 18 | pub fn new() -> Self { |
498 | 18 | Self::default() |
499 | 18 | } |
500 | | |
501 | | /// Registers a schema with the store and returns its fingerprint. |
502 | | /// |
503 | | /// A fingerprint is calculated for the given schema using the store's configured |
504 | | /// hash type. If a schema with the same fingerprint does not already exist in the |
505 | | /// store, the new schema is inserted. If the fingerprint already exists, the |
506 | | /// existing schema is not overwritten. |
507 | | /// |
508 | | /// # Arguments |
509 | | /// |
510 | | /// * `schema` - The `AvroSchema` to register. |
511 | | /// |
512 | | /// # Returns |
513 | | /// |
514 | | /// A `Result` containing the `Fingerprint` of the schema if successful, |
515 | | /// or an `ArrowError` on failure. |
516 | 29 | pub fn register(&mut self, schema: AvroSchema) -> Result<Fingerprint, ArrowError> { |
517 | 29 | let fingerprint = generate_fingerprint(&schema.schema()?0 , self.fingerprint_algorithm)?0 ; |
518 | 29 | match self.schemas.entry(fingerprint) { |
519 | 2 | Entry::Occupied(entry) => { |
520 | 2 | if entry.get() != &schema { |
521 | 0 | return Err(ArrowError::ComputeError(format!( |
522 | 0 | "Schema fingerprint collision detected for fingerprint {fingerprint:?}" |
523 | 0 | ))); |
524 | 2 | } |
525 | | } |
526 | 27 | Entry::Vacant(entry) => { |
527 | 27 | entry.insert(schema); |
528 | 27 | } |
529 | | } |
530 | 29 | Ok(fingerprint) |
531 | 29 | } |
532 | | |
533 | | /// Looks up a schema by its `Fingerprint`. |
534 | | /// |
535 | | /// # Arguments |
536 | | /// |
537 | | /// * `fingerprint` - A reference to the `Fingerprint` of the schema to look up. |
538 | | /// |
539 | | /// # Returns |
540 | | /// |
541 | | /// An `Option` containing a clone of the `AvroSchema` if found, otherwise `None`. |
542 | 24 | pub fn lookup(&self, fingerprint: &Fingerprint) -> Option<&AvroSchema> { |
543 | 24 | self.schemas.get(fingerprint) |
544 | 24 | } |
545 | | |
546 | | /// Returns a `Vec` containing **all unique [`Fingerprint`]s** currently |
547 | | /// held by this [`SchemaStore`]. |
548 | | /// |
549 | | /// The order of the returned fingerprints is unspecified and should not be |
550 | | /// relied upon. |
551 | 21 | pub fn fingerprints(&self) -> Vec<Fingerprint> { |
552 | 21 | self.schemas.keys().copied().collect() |
553 | 21 | } |
554 | | |
555 | | /// Returns the `FingerprintAlgorithm` used by the `SchemaStore` for fingerprinting. |
556 | 11 | pub(crate) fn fingerprint_algorithm(&self) -> FingerprintAlgorithm { |
557 | 11 | self.fingerprint_algorithm |
558 | 11 | } |
559 | | } |
560 | | |
561 | 97 | fn quote(s: &str) -> Result<String, ArrowError> { |
562 | 97 | serde_json::to_string(s) |
563 | 97 | .map_err(|e| ArrowError::ComputeError(format!0 ("Failed to quote string: {e}"0 ))) |
564 | 97 | } |
565 | | |
/// Resolves an Avro name to `(full_name, namespace)`.
///
/// Avro names consist of a `name` and an optional `namespace`; the full name
/// is the namespace and the name joined by a dot. The specification allows
/// two spellings:
///
/// 1. `name` already holds the full name (e.g. `"a.b.c.d"`). The namespace is
///    then everything before the last dot and `namespace_attr` is ignored.
/// 2. `name` holds the simple name (e.g. `"d"`) and the namespace comes from
///    `namespace_attr`, falling back to `enclosing_ns`.
///
/// An empty namespace denotes the null namespace. An explicit
/// `namespace_attr` of `""`, an empty `enclosing_ns`, or a name such as
/// `".d"` therefore yields the bare name and `None`. An explicit empty
/// `namespace_attr` does not fall back to `enclosing_ns`.
///
/// Each part of a name is expected to match `^[A-Za-z_][A-Za-z0-9_]*$`; this
/// function does not validate that. Quoted or backticked paths such as
/// `a."hi".b` are not supported.
///
/// # Returns
///
/// The full name and the namespace in effect for nested definitions
/// (`None` for the null namespace).
fn make_full_name(
    name: &str,
    namespace_attr: Option<&str>,
    enclosing_ns: Option<&str>,
) -> (String, Option<String>) {
    // A dotted `name` is a full name on its own; the namespace attribute is ignored.
    if let Some((ns, simple_name)) = name.rsplit_once('.') {
        if ns.is_empty() {
            // ".d": the namespace part is empty, i.e. the null namespace.
            return (simple_name.to_string(), None);
        }
        return (name.to_string(), Some(ns.to_string()));
    }
    // `or` runs before `filter` so that an explicit empty attribute selects
    // the null namespace instead of inheriting the enclosing one.
    match namespace_attr.or(enclosing_ns).filter(|ns| !ns.is_empty()) {
        Some(ns) => (format!("{ns}.{name}"), Some(ns.to_string())),
        None => (name.to_string(), None),
    }
}
596 | | |
597 | 66 | fn build_canonical(schema: &Schema, enclosing_ns: Option<&str>) -> Result<String, ArrowError> { |
598 | 66 | Ok(match schema { |
599 | 41 | Schema::TypeName(tn40 ) | Schema::Type(Type { r#type: tn1 , .. }) => match tn { |
600 | 41 | TypeName::Primitive(pt) => quote(pt.as_ref())?0 , |
601 | 0 | TypeName::Ref(name) => { |
602 | 0 | let (full_name, _) = make_full_name(name, None, enclosing_ns); |
603 | 0 | quote(&full_name)? |
604 | | } |
605 | | }, |
606 | 0 | Schema::Union(branches) => format!( |
607 | 0 | "[{}]", |
608 | 0 | branches |
609 | 0 | .iter() |
610 | 0 | .map(|b| build_canonical(b, enclosing_ns)) |
611 | 0 | .collect::<Result<Vec<_>, _>>()? |
612 | 0 | .join(",") |
613 | | ), |
614 | 25 | Schema::Complex(ct) => match ct { |
615 | 25 | ComplexType::Record(r) => { |
616 | 25 | let (full_name, child_ns) = make_full_name(r.name, r.namespace, enclosing_ns); |
617 | 25 | let fields = r |
618 | 25 | .fields |
619 | 25 | .iter() |
620 | 31 | .map25 (|f| { |
621 | 31 | let field_type = |
622 | 31 | build_canonical(&f.r#type, child_ns.as_deref().or(enclosing_ns))?0 ; |
623 | 31 | Ok(format!( |
624 | 31 | r#"{{"name":{},"type":{}}}"#, |
625 | 31 | quote(f.name)?0 , |
626 | | field_type |
627 | | )) |
628 | 31 | }) |
629 | 25 | .collect::<Result<Vec<_>, ArrowError>>()?0 |
630 | 25 | .join(","); |
631 | 25 | format!( |
632 | 25 | r#"{{"name":{},"type":"record","fields":[{fields}]}}"#, |
633 | 25 | quote(&full_name)?0 , |
634 | | ) |
635 | | } |
636 | 0 | ComplexType::Enum(e) => { |
637 | 0 | let (full_name, _) = make_full_name(e.name, e.namespace, enclosing_ns); |
638 | 0 | let symbols = e |
639 | 0 | .symbols |
640 | 0 | .iter() |
641 | 0 | .map(|s| quote(s)) |
642 | 0 | .collect::<Result<Vec<_>, _>>()? |
643 | 0 | .join(","); |
644 | 0 | format!( |
645 | 0 | r#"{{"name":{},"type":"enum","symbols":[{symbols}]}}"#, |
646 | 0 | quote(&full_name)? |
647 | | ) |
648 | | } |
649 | 0 | ComplexType::Array(arr) => format!( |
650 | 0 | r#"{{"type":"array","items":{}}}"#, |
651 | 0 | build_canonical(&arr.items, enclosing_ns)? |
652 | | ), |
653 | 0 | ComplexType::Map(map) => format!( |
654 | 0 | r#"{{"type":"map","values":{}}}"#, |
655 | 0 | build_canonical(&map.values, enclosing_ns)? |
656 | | ), |
657 | 0 | ComplexType::Fixed(f) => { |
658 | 0 | let (full_name, _) = make_full_name(f.name, f.namespace, enclosing_ns); |
659 | 0 | format!( |
660 | 0 | r#"{{"name":{},"type":"fixed","size":{}}}"#, |
661 | 0 | quote(&full_name)?, |
662 | | f.size |
663 | | ) |
664 | | } |
665 | | }, |
666 | | }) |
667 | 66 | } |
668 | | |
/// Seed of the 64-bit Rabin fingerprint. It is both the starting value of
/// the fingerprint and the constant XORed in by the division table entries.
const EMPTY: u64 = 0xc15d_213a_a4d7_a795;

/// Computes entry `i` of the polynomial-division table.
///
/// Starting from `i`, eight rounds are applied; each round shifts the value
/// right by one bit and XORs in `EMPTY` when the bit shifted out was set.
///
/// A `while` loop is used because `for` relies on `Iterator::next`, which is
/// not `const` on stable Rust (`const_for`, tracking issue #87575).
const fn one_entry(i: usize) -> u64 {
    let mut acc = i as u64;
    let mut rounds_left = 8;
    while rounds_left > 0 {
        let shifted = acc >> 1;
        acc = if acc & 1 == 1 { shifted ^ EMPTY } else { shifted };
        rounds_left -= 1;
    }
    acc
}

/// Builds all 256 table entries at compile time.
///
/// As in `one_entry`, the loop is a `while` because `for` is not available
/// in a `const fn` on stable Rust.
const fn build_table() -> [u64; 256] {
    let mut entries = [0u64; 256];
    let mut index = 256;
    while index > 0 {
        index -= 1;
        entries[index] = one_entry(index);
    }
    entries
}

/// The table, evaluated once at compile time.
static FINGERPRINT_TABLE: [u64; 256] = build_table();

/// Computes the 64-bit Rabin fingerprint of a canonical schema string.
///
/// Starting from `EMPTY`, each byte of the input is combined with the low
/// byte of the running value to pick a table entry, which is XORed with the
/// running value shifted right by eight bits. An empty input yields `EMPTY`.
pub(crate) fn compute_fingerprint_rabin(canonical_form: &str) -> u64 {
    canonical_form.bytes().fold(EMPTY, |fp, byte| {
        let idx = usize::from((fp as u8) ^ byte);
        (fp >> 8) ^ FINGERPRINT_TABLE[idx]
    })
}
717 | | |
718 | | #[inline] |
719 | 2 | fn is_internal_arrow_key(key: &str) -> bool { |
720 | 2 | key.starts_with("ARROW:") || key == SCHEMA_METADATA_KEY |
721 | 2 | } |
722 | | |
// Turn an arbitrary string into a valid Avro field or type name:
// - an empty input becomes "_"
// - every character other than an ASCII letter, ASCII digit or '_' becomes '_'
// - a '_' is prepended when the input starts with an ASCII digit
fn sanitise_avro_name(base_name: &str) -> String {
    let mut out = String::with_capacity(base_name.len() + 1);
    for ch in base_name.chars() {
        // Only the first character can trigger the leading underscore.
        if out.is_empty() && ch.is_ascii_digit() {
            out.push('_');
        }
        out.push(if ch == '_' || ch.is_ascii_alphanumeric() {
            ch
        } else {
            '_'
        });
    }
    if out.is_empty() {
        out.push('_');
    }
    out
}
743 | | |
744 | | #[derive(Default)] |
745 | | struct NameGenerator { |
746 | | used: HashSet<String>, |
747 | | counters: HashMap<String, usize>, |
748 | | } |
749 | | |
750 | | impl NameGenerator { |
751 | 6 | fn make_unique(&mut self, field_name: &str) -> String { |
752 | 6 | let field_name = sanitise_avro_name(field_name); |
753 | 6 | if self.used.insert(field_name.clone()) { |
754 | 5 | self.counters.insert(field_name.clone(), 1); |
755 | 5 | return field_name; |
756 | 1 | } |
757 | 1 | let counter = self.counters.entry(field_name.clone()).or_insert(1); |
758 | | loop { |
759 | 1 | let candidate = format!("{field_name}_{}", *counter); |
760 | 1 | if self.used.insert(candidate.clone()) { |
761 | 1 | return candidate; |
762 | 0 | } |
763 | 0 | *counter += 1; |
764 | | } |
765 | 6 | } |
766 | | } |
767 | | |
768 | 108 | fn merge_extras(schema: Value, mut extras: JsonMap<String, Value>) -> Value { |
769 | 108 | if extras.is_empty() { |
770 | 102 | return schema; |
771 | 6 | } |
772 | 6 | match schema { |
773 | 1 | Value::Object(mut map) => { |
774 | 1 | map.extend(extras); |
775 | 1 | Value::Object(map) |
776 | | } |
777 | 0 | Value::Array(mut union) => { |
778 | 0 | if let Some(non_null) = union.iter_mut().find(|val| val.as_str() != Some("null")) { |
779 | 0 | let original = std::mem::take(non_null); |
780 | 0 | *non_null = merge_extras(original, extras); |
781 | 0 | } |
782 | 0 | Value::Array(union) |
783 | | } |
784 | 5 | primitive => { |
785 | 5 | let mut map = JsonMap::with_capacity(extras.len() + 1); |
786 | 5 | map.insert("type".into(), primitive); |
787 | 5 | map.extend(extras); |
788 | 5 | Value::Object(map) |
789 | | } |
790 | | } |
791 | 108 | } |
792 | | |
793 | | // Convert an Arrow `DataType` into an Avro schema `Value`. |
794 | 109 | fn datatype_to_avro( |
795 | 109 | dt: &DataType, |
796 | 109 | field_name: &str, |
797 | 109 | metadata: &HashMap<String, String>, |
798 | 109 | name_gen: &mut NameGenerator, |
799 | 109 | ) -> Result<(Value, JsonMap<String, Value>), ArrowError> { |
800 | 109 | let mut extras = JsonMap::new(); |
801 | 108 | let val = match dt3 { |
802 | 0 | DataType::Null => Value::String("null".into()), |
803 | 7 | DataType::Boolean => Value::String("boolean".into()), |
804 | | DataType::Int8 | DataType::Int16 | DataType::UInt8 | DataType::UInt16 | DataType::Int32 => { |
805 | 32 | Value::String("int".into()) |
806 | | } |
807 | 9 | DataType::UInt32 | DataType::Int64 | DataType::UInt64 => Value::String("long".into()), |
808 | 7 | DataType::Float16 | DataType::Float32 => Value::String("float".into()), |
809 | 6 | DataType::Float64 => Value::String("double".into()), |
810 | 3 | DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => Value::String("string".into()), |
811 | 14 | DataType::Binary | DataType::LargeBinary => Value::String("bytes".into()), |
812 | 4 | DataType::FixedSizeBinary(len) => { |
813 | 4 | let is_uuid = metadata |
814 | 4 | .get("logicalType") |
815 | 4 | .is_some_and(|value| value1 == "uuid"1 ) |
816 | 3 | || (*len == 16 |
817 | 0 | && metadata |
818 | 0 | .get("ARROW:extension:name") |
819 | 0 | .is_some_and(|value| value == "uuid")); |
820 | 4 | if is_uuid { |
821 | 1 | json!({ "type": "string", "logicalType": "uuid" }) |
822 | | } else { |
823 | 3 | json!({ |
824 | 3 | "type": "fixed", |
825 | 3 | "name": name_gen.make_unique(field_name), |
826 | 3 | "size": len |
827 | | }) |
828 | | } |
829 | | } |
830 | 1 | DataType::Decimal128(precision, scale) | DataType::Decimal256(precision0 , scale0 ) => { |
831 | | // Prefer fixed if original size info present |
832 | 1 | let mut meta = JsonMap::from_iter([ |
833 | 1 | ("logicalType".into(), json!("decimal")), |
834 | 1 | ("precision".into(), json!(*precision)), |
835 | 1 | ("scale".into(), json!(*scale)), |
836 | 1 | ]); |
837 | 1 | if let Some(size0 ) = metadata |
838 | 1 | .get("size") |
839 | 1 | .and_then(|val| val.parse::<usize>()0 .ok0 ()) |
840 | 0 | { |
841 | 0 | meta.insert("type".into(), json!("fixed")); |
842 | 0 | meta.insert("size".into(), json!(size)); |
843 | 0 | meta.insert("name".into(), json!(name_gen.make_unique(field_name))); |
844 | 1 | } else { |
845 | 1 | meta.insert("type".into(), json!("bytes")); |
846 | 1 | } |
847 | 1 | Value::Object(meta) |
848 | | } |
849 | 1 | DataType::Date32 => json!({ "type": "int", "logicalType": "date" }), |
850 | 1 | DataType::Date64 => json!({ "type": "long", "logicalType": "local-timestamp-millis" }), |
851 | 1 | DataType::Time32(unit) => match unit { |
852 | 1 | TimeUnit::Millisecond => json!({ "type": "int", "logicalType": "time-millis" }), |
853 | | TimeUnit::Second => { |
854 | 0 | extras.insert("arrowTimeUnit".into(), Value::String("second".into())); |
855 | 0 | Value::String("int".into()) |
856 | | } |
857 | 0 | _ => Value::String("int".into()), |
858 | | }, |
859 | 1 | DataType::Time64(unit) => match unit { |
860 | 1 | TimeUnit::Microsecond => json!({ "type": "long", "logicalType": "time-micros" }), |
861 | | TimeUnit::Nanosecond => { |
862 | 0 | extras.insert("arrowTimeUnit".into(), Value::String("nanosecond".into())); |
863 | 0 | Value::String("long".into()) |
864 | | } |
865 | 0 | _ => Value::String("long".into()), |
866 | | }, |
867 | 7 | DataType::Timestamp(unit, tz) => { |
868 | 7 | let logical_type = match (unit, tz.is_some()) { |
869 | 0 | (TimeUnit::Millisecond, true) => "timestamp-millis", |
870 | 1 | (TimeUnit::Millisecond, false) => "local-timestamp-millis", |
871 | 6 | (TimeUnit::Microsecond, true) => "timestamp-micros", |
872 | 0 | (TimeUnit::Microsecond, false) => "local-timestamp-micros", |
873 | | (TimeUnit::Second, _) => { |
874 | 0 | extras.insert("arrowTimeUnit".into(), Value::String("second".into())); |
875 | 0 | return Ok((Value::String("long".into()), extras)); |
876 | | } |
877 | | (TimeUnit::Nanosecond, _) => { |
878 | 0 | extras.insert("arrowTimeUnit".into(), Value::String("nanosecond".into())); |
879 | 0 | return Ok((Value::String("long".into()), extras)); |
880 | | } |
881 | | }; |
882 | 7 | json!({ "type": "long", "logicalType": logical_type }) |
883 | | } |
884 | 3 | DataType::Duration(unit) => { |
885 | 3 | extras.insert( |
886 | 3 | "arrowDurationUnit".into(), |
887 | 3 | Value::String(format!("{unit:?}").to_lowercase()), |
888 | | ); |
889 | 3 | Value::String("long".into()) |
890 | | } |
891 | 1 | DataType::Interval(IntervalUnit::MonthDayNano) => json!({ |
892 | 1 | "type": "fixed", |
893 | 1 | "name": name_gen.make_unique(&format!("{field_name}_duration")), |
894 | 1 | "size": 12, |
895 | 1 | "logicalType": "duration" |
896 | | }), |
897 | | DataType::Interval(IntervalUnit::YearMonth) => { |
898 | 1 | extras.insert( |
899 | 1 | "arrowIntervalUnit".into(), |
900 | 1 | Value::String("yearmonth".into()), |
901 | | ); |
902 | 1 | Value::String("long".into()) |
903 | | } |
904 | | DataType::Interval(IntervalUnit::DayTime) => { |
905 | 1 | extras.insert("arrowIntervalUnit".into(), Value::String("daytime".into())); |
906 | 1 | Value::String("long".into()) |
907 | | } |
908 | 2 | DataType::List(child) | DataType::LargeList(child0 ) => { |
909 | 2 | if matches!(dt, DataType::LargeList(_)) { |
910 | 0 | extras.insert("arrowLargeList".into(), Value::Bool(true)); |
911 | 2 | } |
912 | 2 | let (items, ie) = |
913 | 2 | datatype_to_avro(child.data_type(), child.name(), child.metadata(), name_gen)?0 ; |
914 | 2 | json!({ |
915 | 2 | "type": "array", |
916 | 2 | "items": merge_extras(items, ie) |
917 | | }) |
918 | | } |
919 | 1 | DataType::FixedSizeList(child, len) => { |
920 | 1 | extras.insert("arrowFixedSize".into(), json!(len)); |
921 | 1 | let (items, ie) = |
922 | 1 | datatype_to_avro(child.data_type(), child.name(), child.metadata(), name_gen)?0 ; |
923 | 1 | json!({ |
924 | 1 | "type": "array", |
925 | 1 | "items": merge_extras(items, ie) |
926 | | }) |
927 | | } |
928 | 2 | DataType::Map(entries, _) => { |
929 | 2 | let value_field = match entries.data_type() { |
930 | 2 | DataType::Struct(fs) => &fs[1], |
931 | | _ => { |
932 | 0 | return Err(ArrowError::SchemaError( |
933 | 0 | "Map 'entries' field must be Struct(key,value)".into(), |
934 | 0 | )) |
935 | | } |
936 | | }; |
937 | 2 | let (val_schema, value_entry) = datatype_to_avro( |
938 | 2 | value_field.data_type(), |
939 | 2 | value_field.name(), |
940 | 2 | value_field.metadata(), |
941 | 2 | name_gen, |
942 | 0 | )?; |
943 | 2 | json!({ |
944 | 2 | "type": "map", |
945 | 2 | "values": merge_extras(val_schema, value_entry) |
946 | | }) |
947 | | } |
948 | 1 | DataType::Struct(fields) => { |
949 | 1 | let avro_fields = fields |
950 | 1 | .iter() |
951 | 2 | .map1 (|field| arrow_field_to_avro(field, name_gen)) |
952 | 1 | .collect::<Result<Vec<_>, _>>()?0 ; |
953 | 1 | json!({ |
954 | 1 | "type": "record", |
955 | 1 | "name": name_gen.make_unique(field_name), |
956 | 1 | "fields": avro_fields |
957 | | }) |
958 | | } |
959 | 1 | DataType::Dictionary(_, value) => { |
960 | 1 | if let Some(j) = metadata.get(AVRO_ENUM_SYMBOLS_METADATA_KEY) { |
961 | 1 | let symbols: Vec<&str> = |
962 | 1 | serde_json::from_str(j).map_err(|e| ArrowError::ParseError(e0 .to_string0 ()))?0 ; |
963 | 1 | json!({ |
964 | 1 | "type": "enum", |
965 | 1 | "name": name_gen.make_unique(field_name), |
966 | 1 | "symbols": symbols |
967 | | }) |
968 | | } else { |
969 | 0 | let (inner, ie) = datatype_to_avro(value.as_ref(), field_name, metadata, name_gen)?; |
970 | 0 | merge_extras(inner, ie) |
971 | | } |
972 | | } |
973 | 1 | DataType::RunEndEncoded(_, values) => { |
974 | 1 | let (inner, ie) = datatype_to_avro( |
975 | 1 | values.data_type(), |
976 | 1 | values.name(), |
977 | 1 | values.metadata(), |
978 | 1 | name_gen, |
979 | 0 | )?; |
980 | 1 | merge_extras(inner, ie) |
981 | | } |
982 | | DataType::Union(_, _) => { |
983 | 1 | return Err(ArrowError::NotYetImplemented( |
984 | 1 | "Arrow Union to Avro Union not yet supported".into(), |
985 | 1 | )) |
986 | | } |
987 | 0 | other => { |
988 | 0 | return Err(ArrowError::NotYetImplemented(format!( |
989 | 0 | "Arrow type {other:?} has no Avro representation" |
990 | 0 | ))) |
991 | | } |
992 | | }; |
993 | 108 | Ok((val, extras)) |
994 | 109 | } |
995 | | |
996 | 103 | fn arrow_field_to_avro( |
997 | 103 | field: &ArrowField, |
998 | 103 | name_gen: &mut NameGenerator, |
999 | 103 | ) -> Result<Value, ArrowError> { |
1000 | | // Sanitize field name to ensure Avro validity but store the original in metadata |
1001 | 103 | let avro_name = sanitise_avro_name(field.name()); |
1002 | 102 | let (schema, extras) = |
1003 | 103 | datatype_to_avro(field.data_type(), &avro_name, field.metadata(), name_gen)?1 ; |
1004 | | // If nullable, wrap `[ "null", <type> ]`, NOTE: second order nullability to be added in a follow-up |
1005 | 102 | let mut schema = if field.is_nullable() { |
1006 | 63 | Value::Array(vec![ |
1007 | 63 | Value::String("null".into()), |
1008 | 63 | merge_extras(schema, extras), |
1009 | 63 | ]) |
1010 | | } else { |
1011 | 39 | merge_extras(schema, extras) |
1012 | | }; |
1013 | | // Build the field map |
1014 | 102 | let mut map = JsonMap::with_capacity(field.metadata().len() + 3); |
1015 | 102 | map.insert("name".into(), Value::String(avro_name)); |
1016 | 102 | map.insert("type".into(), schema); |
1017 | | // Transfer selected metadata |
1018 | 102 | for (meta_key2 , meta_val2 ) in field.metadata() { |
1019 | 2 | if is_internal_arrow_key(meta_key) { |
1020 | 0 | continue; |
1021 | 2 | } |
1022 | 2 | match meta_key.as_str() { |
1023 | 2 | AVRO_DOC_METADATA_KEY => { |
1024 | 0 | map.insert("doc".into(), Value::String(meta_val.clone())); |
1025 | 0 | } |
1026 | 2 | AVRO_FIELD_DEFAULT_METADATA_KEY => { |
1027 | 0 | let default_value = serde_json::from_str(meta_val) |
1028 | 0 | .unwrap_or_else(|_| Value::String(meta_val.clone())); |
1029 | 0 | map.insert("default".into(), default_value); |
1030 | | } |
1031 | | _ => { |
1032 | 2 | let json_val = serde_json::from_str(meta_val) |
1033 | 2 | .unwrap_or_else(|_| Value::String(meta_val1 .clone1 ())); |
1034 | 2 | map.insert(meta_key.clone(), json_val); |
1035 | | } |
1036 | | } |
1037 | | } |
1038 | 102 | Ok(Value::Object(map)) |
1039 | 103 | } |
1040 | | |
1041 | | #[cfg(test)] |
1042 | | mod tests { |
1043 | | use super::*; |
1044 | | use crate::codec::{AvroDataType, AvroField}; |
1045 | | use arrow_schema::{DataType, Fields, SchemaBuilder, TimeUnit}; |
1046 | | use serde_json::json; |
1047 | | use std::sync::Arc; |
1048 | | |
1049 | 12 | fn int_schema() -> Schema<'static> { |
1050 | 12 | Schema::TypeName(TypeName::Primitive(PrimitiveType::Int)) |
1051 | 12 | } |
1052 | | |
1053 | 5 | fn record_schema() -> Schema<'static> { |
1054 | 5 | Schema::Complex(ComplexType::Record(Record { |
1055 | 5 | name: "record1", |
1056 | 5 | namespace: Some("test.namespace"), |
1057 | 5 | doc: Some("A test record"), |
1058 | 5 | aliases: vec![], |
1059 | 5 | fields: vec![ |
1060 | 5 | Field { |
1061 | 5 | name: "field1", |
1062 | 5 | doc: Some("An integer field"), |
1063 | 5 | r#type: int_schema(), |
1064 | 5 | default: None, |
1065 | 5 | }, |
1066 | 5 | Field { |
1067 | 5 | name: "field2", |
1068 | 5 | doc: None, |
1069 | 5 | r#type: Schema::TypeName(TypeName::Primitive(PrimitiveType::String)), |
1070 | 5 | default: None, |
1071 | 5 | }, |
1072 | 5 | ], |
1073 | 5 | attributes: Attributes::default(), |
1074 | 5 | })) |
1075 | 5 | } |
1076 | | |
1077 | 35 | fn single_field_schema(field: ArrowField) -> arrow_schema::Schema { |
1078 | 35 | let mut sb = SchemaBuilder::new(); |
1079 | 35 | sb.push(field); |
1080 | 35 | sb.finish() |
1081 | 35 | } |
1082 | | |
1083 | 44 | fn assert_json_contains(avro_json: &str, needle: &str) { |
1084 | 44 | assert!( |
1085 | 44 | avro_json.contains(needle), |
1086 | 0 | "JSON did not contain `{needle}` : {avro_json}" |
1087 | | ) |
1088 | 44 | } |
1089 | | |
1090 | | #[test] |
1091 | 1 | fn test_deserialize() { |
1092 | 1 | let t: Schema = serde_json::from_str("\"string\"").unwrap(); |
1093 | 1 | assert_eq!( |
1094 | | t, |
1095 | | Schema::TypeName(TypeName::Primitive(PrimitiveType::String)) |
1096 | | ); |
1097 | | |
1098 | 1 | let t: Schema = serde_json::from_str("[\"int\", \"null\"]").unwrap(); |
1099 | 1 | assert_eq!( |
1100 | | t, |
1101 | 1 | Schema::Union(vec![ |
1102 | 1 | Schema::TypeName(TypeName::Primitive(PrimitiveType::Int)), |
1103 | 1 | Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)), |
1104 | 1 | ]) |
1105 | | ); |
1106 | | |
1107 | 1 | let t: Type = serde_json::from_str( |
1108 | 1 | r#"{ |
1109 | 1 | "type":"long", |
1110 | 1 | "logicalType":"timestamp-micros" |
1111 | 1 | }"#, |
1112 | | ) |
1113 | 1 | .unwrap(); |
1114 | | |
1115 | 1 | let timestamp = Type { |
1116 | 1 | r#type: TypeName::Primitive(PrimitiveType::Long), |
1117 | 1 | attributes: Attributes { |
1118 | 1 | logical_type: Some("timestamp-micros"), |
1119 | 1 | additional: Default::default(), |
1120 | 1 | }, |
1121 | 1 | }; |
1122 | | |
1123 | 1 | assert_eq!(t, timestamp); |
1124 | | |
1125 | 1 | let t: ComplexType = serde_json::from_str( |
1126 | 1 | r#"{ |
1127 | 1 | "type":"fixed", |
1128 | 1 | "name":"fixed", |
1129 | 1 | "namespace":"topLevelRecord.value", |
1130 | 1 | "size":11, |
1131 | 1 | "logicalType":"decimal", |
1132 | 1 | "precision":25, |
1133 | 1 | "scale":2 |
1134 | 1 | }"#, |
1135 | | ) |
1136 | 1 | .unwrap(); |
1137 | | |
1138 | 1 | let decimal = ComplexType::Fixed(Fixed { |
1139 | 1 | name: "fixed", |
1140 | 1 | namespace: Some("topLevelRecord.value"), |
1141 | 1 | aliases: vec![], |
1142 | 1 | size: 11, |
1143 | 1 | attributes: Attributes { |
1144 | 1 | logical_type: Some("decimal"), |
1145 | 1 | additional: vec![("precision", json!(25)), ("scale", json!(2))] |
1146 | 1 | .into_iter() |
1147 | 1 | .collect(), |
1148 | 1 | }, |
1149 | 1 | }); |
1150 | | |
1151 | 1 | assert_eq!(t, decimal); |
1152 | | |
1153 | 1 | let schema: Schema = serde_json::from_str( |
1154 | 1 | r#"{ |
1155 | 1 | "type":"record", |
1156 | 1 | "name":"topLevelRecord", |
1157 | 1 | "fields":[ |
1158 | 1 | { |
1159 | 1 | "name":"value", |
1160 | 1 | "type":[ |
1161 | 1 | { |
1162 | 1 | "type":"fixed", |
1163 | 1 | "name":"fixed", |
1164 | 1 | "namespace":"topLevelRecord.value", |
1165 | 1 | "size":11, |
1166 | 1 | "logicalType":"decimal", |
1167 | 1 | "precision":25, |
1168 | 1 | "scale":2 |
1169 | 1 | }, |
1170 | 1 | "null" |
1171 | 1 | ] |
1172 | 1 | } |
1173 | 1 | ] |
1174 | 1 | }"#, |
1175 | | ) |
1176 | 1 | .unwrap(); |
1177 | | |
1178 | 1 | assert_eq!( |
1179 | | schema, |
1180 | 1 | Schema::Complex(ComplexType::Record(Record { |
1181 | 1 | name: "topLevelRecord", |
1182 | 1 | namespace: None, |
1183 | 1 | doc: None, |
1184 | 1 | aliases: vec![], |
1185 | 1 | fields: vec![Field { |
1186 | 1 | name: "value", |
1187 | 1 | doc: None, |
1188 | 1 | r#type: Schema::Union(vec![ |
1189 | 1 | Schema::Complex(decimal), |
1190 | 1 | Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)), |
1191 | 1 | ]), |
1192 | 1 | default: None, |
1193 | 1 | },], |
1194 | 1 | attributes: Default::default(), |
1195 | 1 | })) |
1196 | | ); |
1197 | | |
1198 | 1 | let schema: Schema = serde_json::from_str( |
1199 | 1 | r#"{ |
1200 | 1 | "type": "record", |
1201 | 1 | "name": "LongList", |
1202 | 1 | "aliases": ["LinkedLongs"], |
1203 | 1 | "fields" : [ |
1204 | 1 | {"name": "value", "type": "long"}, |
1205 | 1 | {"name": "next", "type": ["null", "LongList"]} |
1206 | 1 | ] |
1207 | 1 | }"#, |
1208 | | ) |
1209 | 1 | .unwrap(); |
1210 | | |
1211 | 1 | assert_eq!( |
1212 | | schema, |
1213 | 1 | Schema::Complex(ComplexType::Record(Record { |
1214 | 1 | name: "LongList", |
1215 | 1 | namespace: None, |
1216 | 1 | doc: None, |
1217 | 1 | aliases: vec!["LinkedLongs"], |
1218 | 1 | fields: vec![ |
1219 | 1 | Field { |
1220 | 1 | name: "value", |
1221 | 1 | doc: None, |
1222 | 1 | r#type: Schema::TypeName(TypeName::Primitive(PrimitiveType::Long)), |
1223 | 1 | default: None, |
1224 | 1 | }, |
1225 | 1 | Field { |
1226 | 1 | name: "next", |
1227 | 1 | doc: None, |
1228 | 1 | r#type: Schema::Union(vec![ |
1229 | 1 | Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)), |
1230 | 1 | Schema::TypeName(TypeName::Ref("LongList")), |
1231 | 1 | ]), |
1232 | 1 | default: None, |
1233 | 1 | } |
1234 | 1 | ], |
1235 | 1 | attributes: Attributes::default(), |
1236 | 1 | })) |
1237 | | ); |
1238 | | |
1239 | | // Recursive schema are not supported |
1240 | 1 | let err = AvroField::try_from(&schema).unwrap_err().to_string(); |
1241 | 1 | assert_eq!(err, "Parser error: Failed to resolve .LongList"); |
1242 | | |
1243 | 1 | let schema: Schema = serde_json::from_str( |
1244 | 1 | r#"{ |
1245 | 1 | "type":"record", |
1246 | 1 | "name":"topLevelRecord", |
1247 | 1 | "fields":[ |
1248 | 1 | { |
1249 | 1 | "name":"id", |
1250 | 1 | "type":[ |
1251 | 1 | "int", |
1252 | 1 | "null" |
1253 | 1 | ] |
1254 | 1 | }, |
1255 | 1 | { |
1256 | 1 | "name":"timestamp_col", |
1257 | 1 | "type":[ |
1258 | 1 | { |
1259 | 1 | "type":"long", |
1260 | 1 | "logicalType":"timestamp-micros" |
1261 | 1 | }, |
1262 | 1 | "null" |
1263 | 1 | ] |
1264 | 1 | } |
1265 | 1 | ] |
1266 | 1 | }"#, |
1267 | | ) |
1268 | 1 | .unwrap(); |
1269 | | |
1270 | 1 | assert_eq!( |
1271 | | schema, |
1272 | 1 | Schema::Complex(ComplexType::Record(Record { |
1273 | 1 | name: "topLevelRecord", |
1274 | 1 | namespace: None, |
1275 | 1 | doc: None, |
1276 | 1 | aliases: vec![], |
1277 | 1 | fields: vec![ |
1278 | 1 | Field { |
1279 | 1 | name: "id", |
1280 | 1 | doc: None, |
1281 | 1 | r#type: Schema::Union(vec![ |
1282 | 1 | Schema::TypeName(TypeName::Primitive(PrimitiveType::Int)), |
1283 | 1 | Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)), |
1284 | 1 | ]), |
1285 | 1 | default: None, |
1286 | 1 | }, |
1287 | 1 | Field { |
1288 | 1 | name: "timestamp_col", |
1289 | 1 | doc: None, |
1290 | 1 | r#type: Schema::Union(vec![ |
1291 | 1 | Schema::Type(timestamp), |
1292 | 1 | Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)), |
1293 | 1 | ]), |
1294 | 1 | default: None, |
1295 | 1 | } |
1296 | 1 | ], |
1297 | 1 | attributes: Default::default(), |
1298 | 1 | })) |
1299 | | ); |
1300 | 1 | let codec = AvroField::try_from(&schema).unwrap(); |
1301 | 1 | assert_eq!( |
1302 | 1 | codec.field(), |
1303 | 1 | arrow_schema::Field::new( |
1304 | | "topLevelRecord", |
1305 | 1 | DataType::Struct(Fields::from(vec![ |
1306 | 1 | arrow_schema::Field::new("id", DataType::Int32, true), |
1307 | 1 | arrow_schema::Field::new( |
1308 | 1 | "timestamp_col", |
1309 | 1 | DataType::Timestamp(TimeUnit::Microsecond, Some("+00:00".into())), |
1310 | 1 | true |
1311 | 1 | ), |
1312 | 1 | ])), |
1313 | | false |
1314 | | ) |
1315 | | ); |
1316 | | |
1317 | 1 | let schema: Schema = serde_json::from_str( |
1318 | 1 | r#"{ |
1319 | 1 | "type": "record", |
1320 | 1 | "name": "HandshakeRequest", "namespace":"org.apache.avro.ipc", |
1321 | 1 | "fields": [ |
1322 | 1 | {"name": "clientHash", "type": {"type": "fixed", "name": "MD5", "size": 16}}, |
1323 | 1 | {"name": "clientProtocol", "type": ["null", "string"]}, |
1324 | 1 | {"name": "serverHash", "type": "MD5"}, |
1325 | 1 | {"name": "meta", "type": ["null", {"type": "map", "values": "bytes"}]} |
1326 | 1 | ] |
1327 | 1 | }"#, |
1328 | | ) |
1329 | 1 | .unwrap(); |
1330 | | |
1331 | 1 | assert_eq!( |
1332 | | schema, |
1333 | 1 | Schema::Complex(ComplexType::Record(Record { |
1334 | 1 | name: "HandshakeRequest", |
1335 | 1 | namespace: Some("org.apache.avro.ipc"), |
1336 | 1 | doc: None, |
1337 | 1 | aliases: vec![], |
1338 | 1 | fields: vec![ |
1339 | 1 | Field { |
1340 | 1 | name: "clientHash", |
1341 | 1 | doc: None, |
1342 | 1 | r#type: Schema::Complex(ComplexType::Fixed(Fixed { |
1343 | 1 | name: "MD5", |
1344 | 1 | namespace: None, |
1345 | 1 | aliases: vec![], |
1346 | 1 | size: 16, |
1347 | 1 | attributes: Default::default(), |
1348 | 1 | })), |
1349 | 1 | default: None, |
1350 | 1 | }, |
1351 | 1 | Field { |
1352 | 1 | name: "clientProtocol", |
1353 | 1 | doc: None, |
1354 | 1 | r#type: Schema::Union(vec![ |
1355 | 1 | Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)), |
1356 | 1 | Schema::TypeName(TypeName::Primitive(PrimitiveType::String)), |
1357 | 1 | ]), |
1358 | 1 | default: None, |
1359 | 1 | }, |
1360 | 1 | Field { |
1361 | 1 | name: "serverHash", |
1362 | 1 | doc: None, |
1363 | 1 | r#type: Schema::TypeName(TypeName::Ref("MD5")), |
1364 | 1 | default: None, |
1365 | 1 | }, |
1366 | 1 | Field { |
1367 | 1 | name: "meta", |
1368 | 1 | doc: None, |
1369 | 1 | r#type: Schema::Union(vec![ |
1370 | 1 | Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)), |
1371 | 1 | Schema::Complex(ComplexType::Map(Map { |
1372 | 1 | values: Box::new(Schema::TypeName(TypeName::Primitive( |
1373 | 1 | PrimitiveType::Bytes |
1374 | 1 | ))), |
1375 | 1 | attributes: Default::default(), |
1376 | 1 | })), |
1377 | 1 | ]), |
1378 | 1 | default: None, |
1379 | 1 | } |
1380 | 1 | ], |
1381 | 1 | attributes: Default::default(), |
1382 | 1 | })) |
1383 | | ); |
1384 | 1 | } |
1385 | | |
1386 | | #[test] |
1387 | 1 | fn test_new_schema_store() { |
1388 | 1 | let store = SchemaStore::new(); |
1389 | 1 | assert!(store.schemas.is_empty()); |
1390 | 1 | } |
1391 | | |
1392 | | #[test] |
1393 | 1 | fn test_try_from_schemas_rabin() { |
1394 | 1 | let int_avro_schema = AvroSchema::new(serde_json::to_string(&int_schema()).unwrap()); |
1395 | 1 | let record_avro_schema = AvroSchema::new(serde_json::to_string(&record_schema()).unwrap()); |
1396 | 1 | let schemas = vec![int_avro_schema.clone(), record_avro_schema.clone()]; |
1397 | 1 | let store = SchemaStore::try_from(schemas.as_slice()).unwrap(); |
1398 | 1 | let int_fp = int_avro_schema.fingerprint().unwrap(); |
1399 | 1 | assert_eq!(store.lookup(&int_fp).cloned(), Some(int_avro_schema)); |
1400 | 1 | let rec_fp = record_avro_schema.fingerprint().unwrap(); |
1401 | 1 | assert_eq!(store.lookup(&rec_fp).cloned(), Some(record_avro_schema)); |
1402 | 1 | } |
1403 | | |
1404 | | #[test] |
1405 | 1 | fn test_try_from_with_duplicates() { |
1406 | 1 | let int_avro_schema = AvroSchema::new(serde_json::to_string(&int_schema()).unwrap()); |
1407 | 1 | let record_avro_schema = AvroSchema::new(serde_json::to_string(&record_schema()).unwrap()); |
1408 | 1 | let schemas = vec![ |
1409 | 1 | int_avro_schema.clone(), |
1410 | 1 | record_avro_schema, |
1411 | 1 | int_avro_schema.clone(), |
1412 | | ]; |
1413 | 1 | let store = SchemaStore::try_from(schemas.as_slice()).unwrap(); |
1414 | 1 | assert_eq!(store.schemas.len(), 2); |
1415 | 1 | let int_fp = int_avro_schema.fingerprint().unwrap(); |
1416 | 1 | assert_eq!(store.lookup(&int_fp).cloned(), Some(int_avro_schema)); |
1417 | 1 | } |
1418 | | |
1419 | | #[test] |
1420 | 1 | fn test_register_and_lookup_rabin() { |
1421 | 1 | let mut store = SchemaStore::new(); |
1422 | 1 | let schema = AvroSchema::new(serde_json::to_string(&int_schema()).unwrap()); |
1423 | 1 | let fp_enum = store.register(schema.clone()).unwrap(); |
1424 | 1 | let Fingerprint::Rabin(fp_val) = fp_enum; |
1425 | 1 | assert_eq!( |
1426 | 1 | store.lookup(&Fingerprint::Rabin(fp_val)).cloned(), |
1427 | 1 | Some(schema.clone()) |
1428 | | ); |
1429 | 1 | assert!(store |
1430 | 1 | .lookup(&Fingerprint::Rabin(fp_val.wrapping_add(1))) |
1431 | 1 | .is_none()); |
1432 | 1 | } |
1433 | | |
1434 | | #[test] |
1435 | 1 | fn test_register_duplicate_schema() { |
1436 | 1 | let mut store = SchemaStore::new(); |
1437 | 1 | let schema1 = AvroSchema::new(serde_json::to_string(&int_schema()).unwrap()); |
1438 | 1 | let schema2 = AvroSchema::new(serde_json::to_string(&int_schema()).unwrap()); |
1439 | 1 | let fingerprint1 = store.register(schema1).unwrap(); |
1440 | 1 | let fingerprint2 = store.register(schema2).unwrap(); |
1441 | 1 | assert_eq!(fingerprint1, fingerprint2); |
1442 | 1 | assert_eq!(store.schemas.len(), 1); |
1443 | 1 | } |
1444 | | |
1445 | | #[test] |
1446 | 1 | fn test_canonical_form_generation_primitive() { |
1447 | 1 | let schema = int_schema(); |
1448 | 1 | let canonical_form = generate_canonical_form(&schema).unwrap(); |
1449 | 1 | assert_eq!(canonical_form, r#""int""#); |
1450 | 1 | } |
1451 | | |
1452 | | #[test] |
1453 | 1 | fn test_canonical_form_generation_record() { |
1454 | 1 | let schema = record_schema(); |
1455 | 1 | let expected_canonical_form = r#"{"name":"test.namespace.record1","type":"record","fields":[{"name":"field1","type":"int"},{"name":"field2","type":"string"}]}"#; |
1456 | 1 | let canonical_form = generate_canonical_form(&schema).unwrap(); |
1457 | 1 | assert_eq!(canonical_form, expected_canonical_form); |
1458 | 1 | } |
1459 | | |
1460 | | #[test] |
1461 | 1 | fn test_fingerprint_calculation() { |
1462 | 1 | let canonical_form = r#"{"fields":[{"name":"a","type":"long"},{"name":"b","type":"string"}],"name":"test","type":"record"}"#; |
1463 | 1 | let expected_fingerprint = 10505236152925314060; |
1464 | 1 | let fingerprint = compute_fingerprint_rabin(canonical_form); |
1465 | 1 | assert_eq!(fingerprint, expected_fingerprint); |
1466 | 1 | } |
1467 | | |
1468 | | #[test] |
1469 | 1 | fn test_register_and_lookup_complex_schema() { |
1470 | 1 | let mut store = SchemaStore::new(); |
1471 | 1 | let schema = AvroSchema::new(serde_json::to_string(&record_schema()).unwrap()); |
1472 | 1 | let canonical_form = r#"{"name":"test.namespace.record1","type":"record","fields":[{"name":"field1","type":"int"},{"name":"field2","type":"string"}]}"#; |
1473 | 1 | let expected_fingerprint = |
1474 | 1 | Fingerprint::Rabin(super::compute_fingerprint_rabin(canonical_form)); |
1475 | 1 | let fingerprint = store.register(schema.clone()).unwrap(); |
1476 | 1 | assert_eq!(fingerprint, expected_fingerprint); |
1477 | 1 | let looked_up = store.lookup(&fingerprint).cloned(); |
1478 | 1 | assert_eq!(looked_up, Some(schema)); |
1479 | 1 | } |
1480 | | |
1481 | | #[test] |
1482 | 1 | fn test_fingerprints_returns_all_keys() { |
1483 | 1 | let mut store = SchemaStore::new(); |
1484 | 1 | let fp_int = store |
1485 | 1 | .register(AvroSchema::new( |
1486 | 1 | serde_json::to_string(&int_schema()).unwrap(), |
1487 | | )) |
1488 | 1 | .unwrap(); |
1489 | 1 | let fp_record = store |
1490 | 1 | .register(AvroSchema::new( |
1491 | 1 | serde_json::to_string(&record_schema()).unwrap(), |
1492 | | )) |
1493 | 1 | .unwrap(); |
1494 | 1 | let fps = store.fingerprints(); |
1495 | 1 | assert_eq!(fps.len(), 2); |
1496 | 1 | assert!(fps.contains(&fp_int)); |
1497 | 1 | assert!(fps.contains(&fp_record)); |
1498 | 1 | } |
1499 | | |
1500 | | #[test] |
1501 | 1 | fn test_canonical_form_strips_attributes() { |
1502 | 1 | let schema_with_attrs = Schema::Complex(ComplexType::Record(Record { |
1503 | 1 | name: "record_with_attrs", |
1504 | 1 | namespace: None, |
1505 | 1 | doc: Some("This doc should be stripped"), |
1506 | 1 | aliases: vec!["alias1", "alias2"], |
1507 | 1 | fields: vec![Field { |
1508 | 1 | name: "f1", |
1509 | 1 | doc: Some("field doc"), |
1510 | 1 | r#type: Schema::Type(Type { |
1511 | 1 | r#type: TypeName::Primitive(PrimitiveType::Bytes), |
1512 | 1 | attributes: Attributes { |
1513 | 1 | logical_type: Some("decimal"), |
1514 | 1 | additional: HashMap::from([("precision", json!(4))]), |
1515 | 1 | }, |
1516 | 1 | }), |
1517 | 1 | default: None, |
1518 | 1 | }], |
1519 | 1 | attributes: Attributes { |
1520 | 1 | logical_type: None, |
1521 | 1 | additional: HashMap::from([("custom_attr", json!("value"))]), |
1522 | 1 | }, |
1523 | 1 | })); |
1524 | 1 | let expected_canonical_form = r#"{"name":"record_with_attrs","type":"record","fields":[{"name":"f1","type":"bytes"}]}"#; |
1525 | 1 | let canonical_form = generate_canonical_form(&schema_with_attrs).unwrap(); |
1526 | 1 | assert_eq!(canonical_form, expected_canonical_form); |
1527 | 1 | } |
1528 | | |
1529 | | #[test] |
1530 | 1 | fn test_primitive_mappings() { |
1531 | 1 | let cases = vec![ |
1532 | 1 | (DataType::Boolean, "\"boolean\""), |
1533 | 1 | (DataType::Int8, "\"int\""), |
1534 | 1 | (DataType::Int16, "\"int\""), |
1535 | 1 | (DataType::Int32, "\"int\""), |
1536 | 1 | (DataType::Int64, "\"long\""), |
1537 | 1 | (DataType::UInt8, "\"int\""), |
1538 | 1 | (DataType::UInt16, "\"int\""), |
1539 | 1 | (DataType::UInt32, "\"long\""), |
1540 | 1 | (DataType::UInt64, "\"long\""), |
1541 | 1 | (DataType::Float16, "\"float\""), |
1542 | 1 | (DataType::Float32, "\"float\""), |
1543 | 1 | (DataType::Float64, "\"double\""), |
1544 | 1 | (DataType::Utf8, "\"string\""), |
1545 | 1 | (DataType::Binary, "\"bytes\""), |
1546 | | ]; |
1547 | 15 | for (dt14 , avro_token14 ) in cases { |
1548 | 14 | let field = ArrowField::new("col", dt.clone(), false); |
1549 | 14 | let arrow_schema = single_field_schema(field); |
1550 | 14 | let avro = AvroSchema::try_from(&arrow_schema).unwrap(); |
1551 | 14 | assert_json_contains(&avro.json_string, avro_token); |
1552 | 14 | } |
1553 | 1 | } |
1554 | | |
1555 | | #[test] |
1556 | 1 | fn test_temporal_mappings() { |
1557 | 1 | let cases = vec![ |
1558 | 1 | (DataType::Date32, "\"logicalType\":\"date\""), |
1559 | 1 | ( |
1560 | 1 | DataType::Time32(TimeUnit::Millisecond), |
1561 | 1 | "\"logicalType\":\"time-millis\"", |
1562 | 1 | ), |
1563 | 1 | ( |
1564 | 1 | DataType::Time64(TimeUnit::Microsecond), |
1565 | 1 | "\"logicalType\":\"time-micros\"", |
1566 | 1 | ), |
1567 | 1 | ( |
1568 | 1 | DataType::Timestamp(TimeUnit::Millisecond, None), |
1569 | 1 | "\"logicalType\":\"local-timestamp-millis\"", |
1570 | 1 | ), |
1571 | 1 | ( |
1572 | 1 | DataType::Timestamp(TimeUnit::Microsecond, Some("+00:00".into())), |
1573 | 1 | "\"logicalType\":\"timestamp-micros\"", |
1574 | 1 | ), |
1575 | | ]; |
1576 | 6 | for (dt5 , needle5 ) in cases { |
1577 | 5 | let field = ArrowField::new("ts", dt.clone(), true); |
1578 | 5 | let arrow_schema = single_field_schema(field); |
1579 | 5 | let avro = AvroSchema::try_from(&arrow_schema).unwrap(); |
1580 | 5 | assert_json_contains(&avro.json_string, needle); |
1581 | 5 | } |
1582 | 1 | } |
1583 | | |
/// Verifies the Avro output for a `Decimal128` field and for a 16-byte
/// fixed-size binary field tagged with `logicalType = uuid` metadata.
#[test]
fn test_decimal_and_uuid() {
    // Decimal: logical type, precision and scale must all be present.
    let amount = ArrowField::new("amount", DataType::Decimal128(25, 2), false);
    let dec_schema = single_field_schema(amount);
    let avro_dec = AvroSchema::try_from(&dec_schema).unwrap();
    for needle in [
        "\"logicalType\":\"decimal\"",
        "\"precision\":25",
        "\"scale\":2",
    ] {
        assert_json_contains(&avro_dec.json_string, needle);
    }

    // UUID: the logical type is requested through field metadata.
    let uuid_metadata: HashMap<String, String> =
        HashMap::from([("logicalType".to_string(), "uuid".to_string())]);
    let id = ArrowField::new("id", DataType::FixedSizeBinary(16), false)
        .with_metadata(uuid_metadata);
    let uuid_schema = single_field_schema(id);
    let avro_uuid = AvroSchema::try_from(&uuid_schema).unwrap();
    assert_json_contains(&avro_uuid.json_string, "\"logicalType\":\"uuid\"");
}
1600 | | |
/// Verifies the Avro output for an Arrow `Interval(MonthDayNano)` field and
/// for an Arrow `Duration(Nanosecond)` field.
#[test]
fn test_interval_duration() {
    // Interval: expected as a `duration` logical type with a size of 12.
    let interval_schema = single_field_schema(ArrowField::new(
        "span",
        DataType::Interval(IntervalUnit::MonthDayNano),
        false,
    ));
    let interval_avro = AvroSchema::try_from(&interval_schema).unwrap();
    for needle in ["\"logicalType\":\"duration\"", "\"size\":12"] {
        assert_json_contains(&interval_avro.json_string, needle);
    }

    // Duration: the schema must carry an `arrowDurationUnit` attribute.
    let duration_schema = single_field_schema(ArrowField::new(
        "latency",
        DataType::Duration(TimeUnit::Nanosecond),
        false,
    ));
    let duration_avro = AvroSchema::try_from(&duration_schema).unwrap();
    assert_json_contains(&duration_avro.json_string, "\"arrowDurationUnit\"");
}
1617 | | |
/// Verifies the Avro output for the nested Arrow types `List`, `Map` and
/// `Struct`.
#[test]
fn test_complex_types() {
    // List -> expects an Avro array with an `items` entry.
    let item = Arc::new(ArrowField::new("item", DataType::Int32, true));
    let numbers = ArrowField::new("numbers", DataType::List(item), false);
    let list_schema = single_field_schema(numbers);
    let avro_list = AvroSchema::try_from(&list_schema).unwrap();
    for needle in ["\"type\":\"array\"", "\"items\""] {
        assert_json_contains(&avro_list.json_string, needle);
    }

    // Map -> expects an Avro map with a `values` entry.
    let entry_fields = Fields::from(vec![
        ArrowField::new("key", DataType::Utf8, false),
        ArrowField::new("value", DataType::Boolean, true),
    ]);
    let entries = Arc::new(ArrowField::new(
        "entries",
        DataType::Struct(entry_fields),
        false,
    ));
    let props = ArrowField::new("props", DataType::Map(entries, false), false);
    let map_schema = single_field_schema(props);
    let avro_map = AvroSchema::try_from(&map_schema).unwrap();
    for needle in ["\"type\":\"map\"", "\"values\""] {
        assert_json_contains(&avro_map.json_string, needle);
    }

    // Nullable struct -> expects an Avro record, and "null" must appear.
    let person_fields = Fields::from(vec![
        ArrowField::new("f1", DataType::Int64, false),
        ArrowField::new("f2", DataType::Utf8, true),
    ]);
    let person = ArrowField::new("person", DataType::Struct(person_fields), true);
    let struct_schema = single_field_schema(person);
    let avro_struct = AvroSchema::try_from(&struct_schema).unwrap();
    for needle in ["\"type\":\"record\"", "\"null\""] {
        assert_json_contains(&avro_struct.json_string, needle);
    }
}
1648 | | |
/// Verifies that a dictionary-encoded string field carrying enum symbols in
/// its metadata is written as an Avro enum with those symbols.
#[test]
fn test_enum_dictionary() {
    // The symbols are supplied as a JSON array string under the enum-symbols key.
    let symbols_metadata: HashMap<String, String> = HashMap::from([(
        AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
        "[\"OPEN\",\"CLOSED\"]".to_string(),
    )]);
    let dictionary = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));
    let status = ArrowField::new("status", dictionary, false).with_metadata(symbols_metadata);
    let schema = single_field_schema(status);
    let avro = AvroSchema::try_from(&schema).unwrap();
    for needle in [
        "\"type\":\"enum\"",
        "\"symbols\":[\"OPEN\",\"CLOSED\"]",
    ] {
        assert_json_contains(&avro.json_string, needle);
    }
}
1663 | | |
/// Verifies that a run-end-encoded field with `Utf8` values produces an Avro
/// schema that mentions `"string"`.
#[test]
fn test_run_end_encoded() {
    let run_ends = Arc::new(ArrowField::new("run_ends", DataType::Int32, false));
    let values = Arc::new(ArrowField::new("values", DataType::Utf8, false));
    let text = ArrowField::new("text", DataType::RunEndEncoded(run_ends, values), false);
    let schema = single_field_schema(text);
    let avro = AvroSchema::try_from(&schema).unwrap();
    assert_json_contains(&avro.json_string, "\"string\"");
}
1674 | | |
/// Verifies that converting a schema containing a dense Arrow union fails
/// with the "not yet supported" error message.
#[test]
fn test_dense_union_error() {
    use arrow_schema::{UnionFields, UnionMode};

    // A single-member union with type id 0.
    let member = Arc::new(ArrowField::new("a", DataType::Int32, false));
    let union_fields: UnionFields = std::iter::once((0i8, member)).collect();
    let union_field = ArrowField::new(
        "u",
        DataType::Union(union_fields, UnionMode::Dense),
        false,
    );
    let schema = single_field_schema(union_field);

    let message = AvroSchema::try_from(&schema).unwrap_err().to_string();
    assert!(message.contains("Arrow Union to Avro Union not yet supported"));
}
1688 | | |
/// Converts a one-column Arrow schema to an [`AvroSchema`] and parses the
/// result back, checking that the decoded top-level schema is the
/// `Schema::Complex` variant.
#[test]
fn round_trip_primitive() {
    let arrow_schema = ArrowSchema::new(vec![ArrowField::new("f1", DataType::Int32, false)]);
    let avro_schema = AvroSchema::try_from(&arrow_schema).unwrap();
    // `schema()` parses the stored JSON back into the typed representation.
    let decoded = avro_schema.schema().unwrap();
    // NOTE(review): presumably the top level is a record wrapping `f1`, hence
    // `Complex` rather than a primitive type name; confirm against the writer.
    assert!(matches!(decoded, Schema::Complex(_)));
}
1696 | | |
/// Verifies the names emitted for fields whose Arrow names are not plain
/// identifiers: two names that differ only in a non-alphanumeric character,
/// and one that starts with a digit.
#[test]
fn test_name_generator_sanitization_and_uniqueness() {
    let fields: Vec<ArrowField> = ["weird-name", "weird name", "123bad"]
        .iter()
        .map(|name| ArrowField::new(*name, DataType::FixedSizeBinary(8), false))
        .collect();
    let arrow_schema = ArrowSchema::new(fields);
    let avro = AvroSchema::try_from(&arrow_schema).unwrap();
    // Expected: `-` and space both become `_`, the collision gets a `_1`
    // suffix, and the leading digit is prefixed with `_`.
    for needle in [
        "\"name\":\"weird_name\"",
        "\"name\":\"weird_name_1\"",
        "\"name\":\"_123bad\"",
    ] {
        assert_json_contains(&avro.json_string, needle);
    }
}
1708 | | |
/// Verifies that a nullable `Date64` field is written with the
/// `local-timestamp-millis` logical type.
#[test]
fn test_date64_logical_type_mapping() {
    let expected = "\"logicalType\":\"local-timestamp-millis\"";
    let arrow_schema = single_field_schema(ArrowField::new("d", DataType::Date64, true));
    let avro = AvroSchema::try_from(&arrow_schema).unwrap();
    assert_json_contains(&avro.json_string, expected);
}
1719 | | |
/// Verifies that the `arrowDurationUnit` attribute of a `Duration` element
/// still appears when the duration is nested inside a `List`.
#[test]
fn test_duration_list_extras_propagated() {
    let element = Arc::new(ArrowField::new(
        "lat",
        DataType::Duration(TimeUnit::Microsecond),
        false,
    ));
    let durations = ArrowField::new("durations", DataType::List(element), false);
    let arrow_schema = single_field_schema(durations);
    let avro = AvroSchema::try_from(&arrow_schema).unwrap();
    assert_json_contains(&avro.json_string, "\"arrowDurationUnit\":\"microsecond\"");
}
1728 | | |
/// Verifies that an `Interval(YearMonth)` field is annotated with
/// `arrowIntervalUnit = yearmonth`.
#[test]
fn test_interval_yearmonth_extra() {
    let interval = DataType::Interval(IntervalUnit::YearMonth);
    let arrow_schema = single_field_schema(ArrowField::new("iv", interval, false));
    let avro = AvroSchema::try_from(&arrow_schema).unwrap();
    assert_json_contains(&avro.json_string, "\"arrowIntervalUnit\":\"yearmonth\"");
}
1736 | | |
/// Verifies that an `Interval(DayTime)` field is annotated with
/// `arrowIntervalUnit = daytime`.
#[test]
fn test_interval_daytime_extra() {
    let interval = DataType::Interval(IntervalUnit::DayTime);
    let arrow_schema = single_field_schema(ArrowField::new("iv_dt", interval, false));
    let avro = AvroSchema::try_from(&arrow_schema).unwrap();
    assert_json_contains(&avro.json_string, "\"arrowIntervalUnit\":\"daytime\"");
}
1744 | | |
/// Verifies that the length of a `FixedSizeList` is recorded in the Avro
/// schema as `arrowFixedSize`.
#[test]
fn test_fixed_size_list_extra() {
    let element = Arc::new(ArrowField::new("item", DataType::Int32, false));
    let triples = ArrowField::new("triples", DataType::FixedSizeList(element, 3), false);
    let arrow_schema = single_field_schema(triples);
    let avro = AvroSchema::try_from(&arrow_schema).unwrap();
    assert_json_contains(&avro.json_string, "\"arrowFixedSize\":3");
}
1753 | | |
/// Verifies that the `arrowDurationUnit` attribute of a `Duration` value
/// still appears when the duration is the value type of a `Map`.
#[test]
fn test_map_duration_value_extra() {
    // Map entries: a non-null string key and a nullable duration value.
    let entry_fields = Fields::from(vec![
        ArrowField::new("key", DataType::Utf8, false),
        ArrowField::new("value", DataType::Duration(TimeUnit::Second), true),
    ]);
    let entries = Arc::new(ArrowField::new(
        "entries",
        DataType::Struct(entry_fields),
        false,
    ));
    let metrics = ArrowField::new("metrics", DataType::Map(entries, false), false);
    let arrow_schema = single_field_schema(metrics);
    let avro = AvroSchema::try_from(&arrow_schema).unwrap();
    assert_json_contains(&avro.json_string, "\"arrowDurationUnit\":\"second\"");
}
1770 | | } |