/Users/andrewlamb/Software/arrow-rs/arrow-schema/src/field.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use crate::error::ArrowError; |
19 | | use std::cmp::Ordering; |
20 | | use std::collections::HashMap; |
21 | | use std::hash::{Hash, Hasher}; |
22 | | use std::sync::Arc; |
23 | | |
24 | | use crate::datatype::DataType; |
25 | | #[cfg(feature = "canonical_extension_types")] |
26 | | use crate::extension::CanonicalExtensionType; |
27 | | use crate::schema::SchemaBuilder; |
28 | | use crate::{ |
29 | | Fields, UnionFields, UnionMode, |
30 | | extension::{EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_NAME_KEY, ExtensionType}, |
31 | | }; |
32 | | |
33 | | /// A reference counted [`Field`] |
34 | | pub type FieldRef = Arc<Field>; |
35 | | |
36 | | /// Describes a single column in a [`Schema`](super::Schema). |
37 | | /// |
38 | | /// A [`Schema`](super::Schema) is an ordered collection of |
39 | | /// [`Field`] objects. Fields contain: |
40 | | /// * `name`: the name of the field |
41 | | /// * `data_type`: the type of the field |
42 | | /// * `nullable`: if the field is nullable |
43 | | /// * `metadata`: a map of key-value pairs containing additional custom metadata |
44 | | /// |
45 | | /// Arrow Extension types are encoded in a `Field`'s metadata. See |
46 | | /// [`Self::try_extension_type`] to retrieve the [`ExtensionType`], if any. |
47 | | #[derive(Clone)] |
48 | | #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] |
49 | | pub struct Field { |
50 | | name: String, |
51 | | data_type: DataType, |
52 | | nullable: bool, |
53 | | #[deprecated( |
54 | | since = "54.0.0", |
55 | | note = "The ability to preserve dictionary IDs will be removed. With it, all fields related to it." |
56 | | )] |
57 | | dict_id: i64, |
58 | | dict_is_ordered: bool, |
59 | | /// A map of key-value pairs containing additional custom metadata. |
60 | | metadata: HashMap<String, String>, |
61 | | } |
62 | | |
63 | | impl std::fmt::Debug for Field { |
64 | 3 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { |
65 | | #![expect(deprecated)] // Must still print dict_id, if set |
66 | | let Self { |
67 | 3 | name, |
68 | 3 | data_type, |
69 | 3 | nullable, |
70 | 3 | dict_id, |
71 | 3 | dict_is_ordered, |
72 | 3 | metadata, |
73 | 3 | } = self; |
74 | | |
75 | 3 | let mut s = f.debug_struct("Field"); |
76 | | |
77 | 3 | if name != "item" { |
78 | 3 | // Keep it short when debug-formatting `DataType::List` |
79 | 3 | s.field("name", name); |
80 | 3 | }0 |
81 | | |
82 | 3 | s.field("data_type", data_type); |
83 | | |
84 | 3 | if *nullable { |
85 | 3 | s.field("nullable", nullable); |
86 | 3 | }0 |
87 | | |
88 | 3 | if *dict_id != 0 { |
89 | 0 | s.field("dict_id", dict_id); |
90 | 3 | } |
91 | | |
92 | 3 | if *dict_is_ordered { |
93 | 0 | s.field("dict_is_ordered", dict_is_ordered); |
94 | 3 | } |
95 | | |
96 | 3 | if !metadata.is_empty() { |
97 | 0 | s.field("metadata", metadata); |
98 | 3 | } |
99 | 3 | s.finish() |
100 | 3 | } |
101 | | } |
102 | | |
103 | | // Auto-deriving `PartialEq` would pull `dict_id` and `dict_is_ordered` |
104 | | // into the comparison. However, these properties are only used in the IPC |
105 | | // context for matching dictionary encoded data. They do not need to be the |
106 | | // same for schemas to be considered equal. For example, the C++ `Field` |
107 | | // implementation does not contain these dictionary properties either. |
108 | | impl PartialEq for Field { |
109 | 80.0k | fn eq(&self, other: &Self) -> bool { |
110 | 80.0k | self.name == other.name |
111 | 80.0k | && self.data_type == other.data_type |
112 | 80.0k | && self.nullable == other.nullable |
113 | 80.0k | && self.metadata == other.metadata |
114 | 80.0k | } |
115 | | } |
116 | | |
117 | | impl Eq for Field {} |
118 | | |
119 | | impl PartialOrd for Field { |
120 | 0 | fn partial_cmp(&self, other: &Self) -> Option<Ordering> { |
121 | 0 | Some(self.cmp(other)) |
122 | 0 | } |
123 | | } |
124 | | |
125 | | impl Ord for Field { |
126 | 0 | fn cmp(&self, other: &Self) -> Ordering { |
127 | 0 | self.name |
128 | 0 | .cmp(other.name()) |
129 | 0 | .then_with(|| self.data_type.cmp(other.data_type())) |
130 | 0 | .then_with(|| self.nullable.cmp(&other.nullable)) |
131 | 0 | .then_with(|| { |
132 | | // ensure deterministic key order |
133 | 0 | let mut keys: Vec<&String> = |
134 | 0 | self.metadata.keys().chain(other.metadata.keys()).collect(); |
135 | 0 | keys.sort(); |
136 | 0 | for k in keys { |
137 | 0 | match (self.metadata.get(k), other.metadata.get(k)) { |
138 | 0 | (None, None) => {} |
139 | | (Some(_), None) => { |
140 | 0 | return Ordering::Less; |
141 | | } |
142 | | (None, Some(_)) => { |
143 | 0 | return Ordering::Greater; |
144 | | } |
145 | 0 | (Some(v1), Some(v2)) => match v1.cmp(v2) { |
146 | 0 | Ordering::Equal => {} |
147 | 0 | other => { |
148 | 0 | return other; |
149 | | } |
150 | | }, |
151 | | } |
152 | | } |
153 | | |
154 | 0 | Ordering::Equal |
155 | 0 | }) |
156 | 0 | } |
157 | | } |
158 | | |
159 | | impl Hash for Field { |
160 | 0 | fn hash<H: Hasher>(&self, state: &mut H) { |
161 | 0 | self.name.hash(state); |
162 | 0 | self.data_type.hash(state); |
163 | 0 | self.nullable.hash(state); |
164 | | |
165 | | // ensure deterministic key order |
166 | 0 | let mut keys: Vec<&String> = self.metadata.keys().collect(); |
167 | 0 | keys.sort(); |
168 | 0 | for k in keys { |
169 | 0 | k.hash(state); |
170 | 0 | self.metadata.get(k).expect("key valid").hash(state); |
171 | 0 | } |
172 | 0 | } |
173 | | } |
174 | | |
175 | | impl AsRef<Field> for Field { |
176 | 0 | fn as_ref(&self) -> &Field { |
177 | 0 | self |
178 | 0 | } |
179 | | } |
180 | | |
181 | | impl Field { |
182 | | /// Default list member field name |
183 | | pub const LIST_FIELD_DEFAULT_NAME: &'static str = "item"; |
184 | | |
185 | | /// Creates a new field with the given name, data type, and nullability |
186 | | /// |
187 | | /// # Example |
188 | | /// ``` |
189 | | /// # use arrow_schema::{Field, DataType}; |
190 | | /// Field::new("field_name", DataType::Int32, true); |
191 | | /// ``` |
192 | 80.5k | pub fn new(name: impl Into<String>, data_type: DataType, nullable: bool) -> Self { |
193 | | #[allow(deprecated)] |
194 | 80.5k | Field { |
195 | 80.5k | name: name.into(), |
196 | 80.5k | data_type, |
197 | 80.5k | nullable, |
198 | 80.5k | dict_id: 0, |
199 | 80.5k | dict_is_ordered: false, |
200 | 80.5k | metadata: HashMap::default(), |
201 | 80.5k | } |
202 | 80.5k | } |
203 | | |
204 | | /// Creates a new `Field` suitable for [`DataType::List`] and |
205 | | /// [`DataType::LargeList`] |
206 | | /// |
207 | | /// While not required, this method follows the convention of naming the |
208 | | /// `Field` `"item"`. |
209 | | /// |
210 | | /// # Example |
211 | | /// ``` |
212 | | /// # use arrow_schema::{Field, DataType}; |
213 | | /// assert_eq!( |
214 | | /// Field::new("item", DataType::Int32, true), |
215 | | /// Field::new_list_field(DataType::Int32, true) |
216 | | /// ); |
217 | | /// ``` |
218 | 80.0k | pub fn new_list_field(data_type: DataType, nullable: bool) -> Self { |
219 | 80.0k | Self::new(Self::LIST_FIELD_DEFAULT_NAME, data_type, nullable) |
220 | 80.0k | } |
221 | | |
222 | | /// Creates a new field that has additional dictionary information |
223 | | #[deprecated( |
224 | | since = "54.0.0", |
225 | | note = "The ability to preserve dictionary IDs will be removed. With the dict_id field disappearing this function signature will change by removing the dict_id parameter." |
226 | | )] |
227 | 0 | pub fn new_dict( |
228 | 0 | name: impl Into<String>, |
229 | 0 | data_type: DataType, |
230 | 0 | nullable: bool, |
231 | 0 | dict_id: i64, |
232 | 0 | dict_is_ordered: bool, |
233 | 0 | ) -> Self { |
234 | | #[allow(deprecated)] |
235 | 0 | Field { |
236 | 0 | name: name.into(), |
237 | 0 | data_type, |
238 | 0 | nullable, |
239 | 0 | dict_id, |
240 | 0 | dict_is_ordered, |
241 | 0 | metadata: HashMap::default(), |
242 | 0 | } |
243 | 0 | } |
244 | | |
245 | | /// Create a new [`Field`] with [`DataType::Dictionary`] |
246 | | /// |
247 | | /// Use [`Self::new_dict`] for more advanced dictionary options |
248 | | /// |
249 | | /// # Panics |
250 | | /// |
251 | | /// Panics if [`!key.is_dictionary_key_type`][DataType::is_dictionary_key_type] |
252 | 0 | pub fn new_dictionary( |
253 | 0 | name: impl Into<String>, |
254 | 0 | key: DataType, |
255 | 0 | value: DataType, |
256 | 0 | nullable: bool, |
257 | 0 | ) -> Self { |
258 | 0 | assert!( |
259 | 0 | key.is_dictionary_key_type(), |
260 | 0 | "{key} is not a valid dictionary key" |
261 | | ); |
262 | 0 | let data_type = DataType::Dictionary(Box::new(key), Box::new(value)); |
263 | 0 | Self::new(name, data_type, nullable) |
264 | 0 | } |
265 | | |
266 | | /// Create a new [`Field`] with [`DataType::Struct`] |
267 | | /// |
268 | | /// - `name`: the name of the [`DataType::Struct`] field |
269 | | /// - `fields`: the description of each struct element |
270 | | /// - `nullable`: if the [`DataType::Struct`] array is nullable |
271 | 0 | pub fn new_struct(name: impl Into<String>, fields: impl Into<Fields>, nullable: bool) -> Self { |
272 | 0 | Self::new(name, DataType::Struct(fields.into()), nullable) |
273 | 0 | } |
274 | | |
275 | | /// Create a new [`Field`] with [`DataType::List`] |
276 | | /// |
277 | | /// - `name`: the name of the [`DataType::List`] field |
278 | | /// - `value`: the description of each list element |
279 | | /// - `nullable`: if the [`DataType::List`] array is nullable |
280 | 0 | pub fn new_list(name: impl Into<String>, value: impl Into<FieldRef>, nullable: bool) -> Self { |
281 | 0 | Self::new(name, DataType::List(value.into()), nullable) |
282 | 0 | } |
283 | | |
284 | | /// Create a new [`Field`] with [`DataType::LargeList`] |
285 | | /// |
286 | | /// - `name`: the name of the [`DataType::LargeList`] field |
287 | | /// - `value`: the description of each list element |
288 | | /// - `nullable`: if the [`DataType::LargeList`] array is nullable |
289 | 0 | pub fn new_large_list( |
290 | 0 | name: impl Into<String>, |
291 | 0 | value: impl Into<FieldRef>, |
292 | 0 | nullable: bool, |
293 | 0 | ) -> Self { |
294 | 0 | Self::new(name, DataType::LargeList(value.into()), nullable) |
295 | 0 | } |
296 | | |
297 | | /// Create a new [`Field`] with [`DataType::FixedSizeList`] |
298 | | /// |
299 | | /// - `name`: the name of the [`DataType::FixedSizeList`] field |
300 | | /// - `value`: the description of each list element |
301 | | /// - `size`: the size of the fixed size list |
302 | | /// - `nullable`: if the [`DataType::FixedSizeList`] array is nullable |
303 | 0 | pub fn new_fixed_size_list( |
304 | 0 | name: impl Into<String>, |
305 | 0 | value: impl Into<FieldRef>, |
306 | 0 | size: i32, |
307 | 0 | nullable: bool, |
308 | 0 | ) -> Self { |
309 | 0 | Self::new(name, DataType::FixedSizeList(value.into(), size), nullable) |
310 | 0 | } |
311 | | |
312 | | /// Create a new [`Field`] with [`DataType::Map`] |
313 | | /// |
314 | | /// - `name`: the name of the [`DataType::Map`] field |
315 | | /// - `entries`: the name of the inner [`DataType::Struct`] field |
316 | | /// - `keys`: the map keys |
317 | | /// - `values`: the map values |
318 | | /// - `sorted`: if the [`DataType::Map`] array is sorted |
319 | | /// - `nullable`: if the [`DataType::Map`] array is nullable |
320 | 0 | pub fn new_map( |
321 | 0 | name: impl Into<String>, |
322 | 0 | entries: impl Into<String>, |
323 | 0 | keys: impl Into<FieldRef>, |
324 | 0 | values: impl Into<FieldRef>, |
325 | 0 | sorted: bool, |
326 | 0 | nullable: bool, |
327 | 0 | ) -> Self { |
328 | 0 | let data_type = DataType::Map( |
329 | 0 | Arc::new(Field::new( |
330 | 0 | entries.into(), |
331 | 0 | DataType::Struct(Fields::from([keys.into(), values.into()])), |
332 | 0 | false, // The inner map field is always non-nullable (#1697), |
333 | 0 | )), |
334 | 0 | sorted, |
335 | 0 | ); |
336 | 0 | Self::new(name, data_type, nullable) |
337 | 0 | } |
338 | | |
339 | | /// Create a new [`Field`] with [`DataType::Union`] |
340 | | /// |
341 | | /// - `name`: the name of the [`DataType::Union`] field |
342 | | /// - `type_ids`: the union type ids |
343 | | /// - `fields`: the union fields |
344 | | /// - `mode`: the union mode |
345 | 0 | pub fn new_union<S, F, T>(name: S, type_ids: T, fields: F, mode: UnionMode) -> Self |
346 | 0 | where |
347 | 0 | S: Into<String>, |
348 | 0 | F: IntoIterator, |
349 | 0 | F::Item: Into<FieldRef>, |
350 | 0 | T: IntoIterator<Item = i8>, |
351 | | { |
352 | 0 | Self::new( |
353 | 0 | name, |
354 | 0 | DataType::Union(UnionFields::new(type_ids, fields), mode), |
355 | | false, // Unions cannot be nullable |
356 | | ) |
357 | 0 | } |
358 | | |
359 | | /// Sets the `Field`'s optional custom metadata. |
360 | | #[inline] |
361 | 0 | pub fn set_metadata(&mut self, metadata: HashMap<String, String>) { |
362 | 0 | self.metadata = metadata; |
363 | 0 | } |
364 | | |
365 | | /// Sets the metadata of this `Field` to be `metadata` and returns self |
366 | 0 | pub fn with_metadata(mut self, metadata: HashMap<String, String>) -> Self { |
367 | 0 | self.set_metadata(metadata); |
368 | 0 | self |
369 | 0 | } |
370 | | |
371 | | /// Returns the immutable reference to the `Field`'s optional custom metadata. |
372 | | #[inline] |
373 | 8 | pub const fn metadata(&self) -> &HashMap<String, String> { |
374 | 8 | &self.metadata |
375 | 8 | } |
376 | | |
377 | | /// Returns a mutable reference to the `Field`'s optional custom metadata. |
378 | | #[inline] |
379 | 0 | pub fn metadata_mut(&mut self) -> &mut HashMap<String, String> { |
380 | 0 | &mut self.metadata |
381 | 0 | } |
382 | | |
383 | | /// Returns an immutable reference to the `Field`'s name. |
384 | | #[inline] |
385 | 54 | pub const fn name(&self) -> &String { |
386 | 54 | &self.name |
387 | 54 | } |
388 | | |
389 | | /// Set the name of this [`Field`] |
390 | | #[inline] |
391 | 0 | pub fn set_name(&mut self, name: impl Into<String>) { |
392 | 0 | self.name = name.into(); |
393 | 0 | } |
394 | | |
395 | | /// Set the name of the [`Field`] and returns self. |
396 | | /// |
397 | | /// ``` |
398 | | /// # use arrow_schema::*; |
399 | | /// let field = Field::new("c1", DataType::Int64, false) |
400 | | /// .with_name("c2"); |
401 | | /// |
402 | | /// assert_eq!(field.name(), "c2"); |
403 | | /// ``` |
404 | 0 | pub fn with_name(mut self, name: impl Into<String>) -> Self { |
405 | 0 | self.set_name(name); |
406 | 0 | self |
407 | 0 | } |
408 | | |
409 | | /// Returns an immutable reference to the [`Field`]'s [`DataType`]. |
410 | | #[inline] |
411 | 81.2k | pub const fn data_type(&self) -> &DataType { |
412 | 81.2k | &self.data_type |
413 | 81.2k | } |
414 | | |
415 | | /// Set [`DataType`] of the [`Field`] |
416 | | /// |
417 | | /// ``` |
418 | | /// # use arrow_schema::*; |
419 | | /// let mut field = Field::new("c1", DataType::Int64, false); |
420 | | /// field.set_data_type(DataType::Utf8); |
421 | | /// |
422 | | /// assert_eq!(field.data_type(), &DataType::Utf8); |
423 | | /// ``` |
424 | | #[inline] |
425 | 0 | pub fn set_data_type(&mut self, data_type: DataType) { |
426 | 0 | self.data_type = data_type; |
427 | 0 | } |
428 | | |
429 | | /// Set [`DataType`] of the [`Field`] and returns self. |
430 | | /// |
431 | | /// ``` |
432 | | /// # use arrow_schema::*; |
433 | | /// let field = Field::new("c1", DataType::Int64, false) |
434 | | /// .with_data_type(DataType::Utf8); |
435 | | /// |
436 | | /// assert_eq!(field.data_type(), &DataType::Utf8); |
437 | | /// ``` |
438 | 0 | pub fn with_data_type(mut self, data_type: DataType) -> Self { |
439 | 0 | self.set_data_type(data_type); |
440 | 0 | self |
441 | 0 | } |
442 | | |
443 | | /// Returns the extension type name of this [`Field`], if set. |
444 | | /// |
445 | | /// This returns the value of [`EXTENSION_TYPE_NAME_KEY`], if set in |
446 | | /// [`Field::metadata`]. If the key is missing, there is no extension type |
447 | | /// name and this returns `None`. |
448 | | /// |
449 | | /// # Example |
450 | | /// |
451 | | /// ``` |
452 | | /// # use arrow_schema::{DataType, extension::EXTENSION_TYPE_NAME_KEY, Field}; |
453 | | /// |
454 | | /// let field = Field::new("", DataType::Null, false); |
455 | | /// assert_eq!(field.extension_type_name(), None); |
456 | | /// |
457 | | /// let field = Field::new("", DataType::Null, false).with_metadata( |
458 | | /// [(EXTENSION_TYPE_NAME_KEY.to_owned(), "example".to_owned())] |
459 | | /// .into_iter() |
460 | | /// .collect(), |
461 | | /// ); |
462 | | /// assert_eq!(field.extension_type_name(), Some("example")); |
463 | | /// ``` |
464 | 0 | pub fn extension_type_name(&self) -> Option<&str> { |
465 | 0 | self.metadata() |
466 | 0 | .get(EXTENSION_TYPE_NAME_KEY) |
467 | 0 | .map(String::as_ref) |
468 | 0 | } |
469 | | |
470 | | /// Returns the extension type metadata of this [`Field`], if set. |
471 | | /// |
472 | | /// This returns the value of [`EXTENSION_TYPE_METADATA_KEY`], if set in |
473 | | /// [`Field::metadata`]. If the key is missing, there is no extension type |
474 | | /// metadata and this returns `None`. |
475 | | /// |
476 | | /// # Example |
477 | | /// |
478 | | /// ``` |
479 | | /// # use arrow_schema::{DataType, extension::EXTENSION_TYPE_METADATA_KEY, Field}; |
480 | | /// |
481 | | /// let field = Field::new("", DataType::Null, false); |
482 | | /// assert_eq!(field.extension_type_metadata(), None); |
483 | | /// |
484 | | /// let field = Field::new("", DataType::Null, false).with_metadata( |
485 | | /// [(EXTENSION_TYPE_METADATA_KEY.to_owned(), "example".to_owned())] |
486 | | /// .into_iter() |
487 | | /// .collect(), |
488 | | /// ); |
489 | | /// assert_eq!(field.extension_type_metadata(), Some("example")); |
490 | | /// ``` |
491 | 0 | pub fn extension_type_metadata(&self) -> Option<&str> { |
492 | 0 | self.metadata() |
493 | 0 | .get(EXTENSION_TYPE_METADATA_KEY) |
494 | 0 | .map(String::as_ref) |
495 | 0 | } |
496 | | |
497 | | /// Returns an instance of the given [`ExtensionType`] of this [`Field`], |
498 | | /// if set in the [`Field::metadata`]. |
499 | | /// |
500 | | /// Note that using `try_extension_type` with an extension type that does |
501 | | /// not match the name in the metadata will return an `ArrowError` which can |
502 | | /// be slow due to string allocations. If you only want to check if a |
503 | | /// [`Field`] has a specific [`ExtensionType`], see the example below. |
504 | | /// |
505 | | /// # Errors |
506 | | /// |
507 | | /// Returns an error if |
508 | | /// - this field does not have the name of this extension type |
509 | | /// ([`ExtensionType::NAME`]) in the [`Field::metadata`] (mismatch or |
510 | | /// missing) |
511 | | /// - the deserialization of the metadata |
512 | | /// ([`ExtensionType::deserialize_metadata`]) fails |
513 | | /// - the construction of the extension type ([`ExtensionType::try_new`]) |
514 | | /// fails (for example when the [`Field::data_type`] is not supported by |
515 | | /// the extension type ([`ExtensionType::supports_data_type`])) |
516 | | /// |
517 | | /// # Examples: Check and retrieve an extension type |
518 | | /// You can use this to check if a [`Field`] has a specific |
519 | | /// [`ExtensionType`] and retrieve it: |
520 | | /// ``` |
521 | | /// # use arrow_schema::{DataType, Field, ArrowError}; |
522 | | /// # use arrow_schema::extension::ExtensionType; |
523 | | /// # struct MyExtensionType; |
524 | | /// # impl ExtensionType for MyExtensionType { |
525 | | /// # const NAME: &'static str = "my_extension"; |
526 | | /// # type Metadata = String; |
527 | | /// # fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> { Ok(()) } |
528 | | /// # fn try_new(data_type: &DataType, metadata: Self::Metadata) -> Result<Self, ArrowError> { Ok(Self) } |
529 | | /// # fn serialize_metadata(&self) -> Option<String> { unimplemented!() } |
530 | | /// # fn deserialize_metadata(s: Option<&str>) -> Result<Self::Metadata, ArrowError> { unimplemented!() } |
531 | | /// # fn metadata(&self) -> &<Self as ExtensionType>::Metadata { todo!() } |
532 | | /// # } |
533 | | /// # fn get_field() -> Field { Field::new("field", DataType::Null, false) } |
534 | | /// let field = get_field(); |
535 | | /// if let Ok(extension_type) = field.try_extension_type::<MyExtensionType>() { |
536 | | /// // do something with extension_type |
537 | | /// } |
538 | | /// ``` |
539 | | /// |
540 | | /// # Example: Checking if a field has a specific extension type first |
541 | | /// |
542 | | /// Since `try_extension_type` returns an error on a name mismatch, it is more |
543 | | /// efficient to first check if the name matches before calling |
544 | | /// `try_extension_type`: |
545 | | /// ``` |
546 | | /// # use arrow_schema::{DataType, Field, ArrowError}; |
547 | | /// # use arrow_schema::extension::ExtensionType; |
548 | | /// # struct MyExtensionType; |
549 | | /// # impl ExtensionType for MyExtensionType { |
550 | | /// # const NAME: &'static str = "my_extension"; |
551 | | /// # type Metadata = String; |
552 | | /// # fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> { Ok(()) } |
553 | | /// # fn try_new(data_type: &DataType, metadata: Self::Metadata) -> Result<Self, ArrowError> { Ok(Self) } |
554 | | /// # fn serialize_metadata(&self) -> Option<String> { unimplemented!() } |
555 | | /// # fn deserialize_metadata(s: Option<&str>) -> Result<Self::Metadata, ArrowError> { unimplemented!() } |
556 | | /// # fn metadata(&self) -> &<Self as ExtensionType>::Metadata { todo!() } |
557 | | /// # } |
558 | | /// # fn get_field() -> Field { Field::new("field", DataType::Null, false) } |
559 | | /// let field = get_field(); |
560 | | /// // First check if the name matches before calling the potentially expensive `try_extension_type` |
561 | | /// if field.extension_type_name() == Some(MyExtensionType::NAME) { |
562 | | /// if let Ok(extension_type) = field.try_extension_type::<MyExtensionType>() { |
563 | | /// // do something with extension_type |
564 | | /// } |
565 | | /// } |
566 | | /// ``` |
567 | 0 | pub fn try_extension_type<E: ExtensionType>(&self) -> Result<E, ArrowError> { |
568 | | // Check the extension name in the metadata |
569 | 0 | match self.extension_type_name() { |
570 | | // It should match the name of the given extension type |
571 | 0 | Some(name) if name == E::NAME => { |
572 | | // Deserialize the metadata and try to construct the extension |
573 | | // type |
574 | 0 | E::deserialize_metadata(self.extension_type_metadata()) |
575 | 0 | .and_then(|metadata| E::try_new(self.data_type(), metadata)) |
576 | | } |
577 | | // Name mismatch |
578 | 0 | Some(name) => Err(ArrowError::InvalidArgumentError(format!( |
579 | 0 | "Field extension type name mismatch, expected {}, found {name}", |
580 | 0 | E::NAME |
581 | 0 | ))), |
582 | | // Name missing |
583 | 0 | None => Err(ArrowError::InvalidArgumentError( |
584 | 0 | "Field extension type name missing".to_owned(), |
585 | 0 | )), |
586 | | } |
587 | 0 | } |
588 | | |
589 | | /// Returns an instance of the given [`ExtensionType`] of this [`Field`], |
590 | | /// panics if this [`Field`] does not have this extension type. |
591 | | /// |
592 | | /// # Panics |
593 | | /// |
594 | | /// This calls [`Field::try_extension_type`] and panics when it returns an |
595 | | /// error. |
596 | 0 | pub fn extension_type<E: ExtensionType>(&self) -> E { |
597 | 0 | self.try_extension_type::<E>() |
598 | 0 | .unwrap_or_else(|e| panic!("{e}")) |
599 | 0 | } |
600 | | |
601 | | /// Updates the metadata of this [`Field`] with the [`ExtensionType::NAME`] |
602 | | /// and [`ExtensionType::metadata`] of the given [`ExtensionType`], if the |
603 | | /// given extension type supports the [`Field::data_type`] of this field |
604 | | /// ([`ExtensionType::supports_data_type`]). |
605 | | /// |
606 | | /// If the given extension type defines no metadata, a previously set |
607 | | /// value of [`EXTENSION_TYPE_METADATA_KEY`] is cleared. |
608 | | /// |
609 | | /// # Errors |
610 | | /// |
611 | | /// This function returns an error if the data type of this field does not |
612 | | /// match any of the supported storage types of the given extension type. |
613 | 0 | pub fn try_with_extension_type<E: ExtensionType>( |
614 | 0 | &mut self, |
615 | 0 | extension_type: E, |
616 | 0 | ) -> Result<(), ArrowError> { |
617 | | // Make sure the data type of this field is supported |
618 | 0 | extension_type.supports_data_type(&self.data_type)?; |
619 | | |
620 | 0 | self.metadata |
621 | 0 | .insert(EXTENSION_TYPE_NAME_KEY.to_owned(), E::NAME.to_owned()); |
622 | 0 | match extension_type.serialize_metadata() { |
623 | 0 | Some(metadata) => self |
624 | 0 | .metadata |
625 | 0 | .insert(EXTENSION_TYPE_METADATA_KEY.to_owned(), metadata), |
626 | | // If this extension type has no metadata, we make sure to |
627 | | // clear previously set metadata. |
628 | 0 | None => self.metadata.remove(EXTENSION_TYPE_METADATA_KEY), |
629 | | }; |
630 | | |
631 | 0 | Ok(()) |
632 | 0 | } |
633 | | |
634 | | /// Updates the metadata of this [`Field`] with the [`ExtensionType::NAME`] |
635 | | /// and [`ExtensionType::metadata`] of the given [`ExtensionType`]. |
636 | | /// |
637 | | /// # Panics |
638 | | /// |
639 | | /// This calls [`Field::try_with_extension_type`] and panics when it |
640 | | /// returns an error. |
641 | 0 | pub fn with_extension_type<E: ExtensionType>(mut self, extension_type: E) -> Self { |
642 | 0 | self.try_with_extension_type(extension_type) |
643 | 0 | .unwrap_or_else(|e| panic!("{e}")); |
644 | 0 | self |
645 | 0 | } |
646 | | |
647 | | /// Returns the [`CanonicalExtensionType`] of this [`Field`], if set. |
648 | | /// |
649 | | /// # Errors |
650 | | /// |
651 | | /// Returns an error if |
652 | | /// - this field does not have a canonical extension type (mismatch or missing) |
653 | | /// - the canonical extension is not supported |
654 | | /// - the construction of the extension type fails |
655 | | #[cfg(feature = "canonical_extension_types")] |
656 | | pub fn try_canonical_extension_type(&self) -> Result<CanonicalExtensionType, ArrowError> { |
657 | | CanonicalExtensionType::try_from(self) |
658 | | } |
659 | | |
660 | | /// Indicates whether this [`Field`] supports null values. |
661 | | /// |
662 | | /// If true, the field *may* contain null values. |
663 | | #[inline] |
664 | 80.7k | pub const fn is_nullable(&self) -> bool { |
665 | 80.7k | self.nullable |
666 | 80.7k | } |
667 | | |
668 | | /// Set the `nullable` of this [`Field`]. |
669 | | /// |
670 | | /// ``` |
671 | | /// # use arrow_schema::*; |
672 | | /// let mut field = Field::new("c1", DataType::Int64, false); |
673 | | /// field.set_nullable(true); |
674 | | /// |
675 | | /// assert_eq!(field.is_nullable(), true); |
676 | | /// ``` |
677 | | #[inline] |
678 | 0 | pub fn set_nullable(&mut self, nullable: bool) { |
679 | 0 | self.nullable = nullable; |
680 | 0 | } |
681 | | |
682 | | /// Set `nullable` of the [`Field`] and returns self. |
683 | | /// |
684 | | /// ``` |
685 | | /// # use arrow_schema::*; |
686 | | /// let field = Field::new("c1", DataType::Int64, false) |
687 | | /// .with_nullable(true); |
688 | | /// |
689 | | /// assert_eq!(field.is_nullable(), true); |
690 | | /// ``` |
691 | 0 | pub fn with_nullable(mut self, nullable: bool) -> Self { |
692 | 0 | self.set_nullable(nullable); |
693 | 0 | self |
694 | 0 | } |
695 | | |
696 | | /// Returns a (flattened) [`Vec`] containing `self` and all child [`Field`]s |
697 | | /// nested within this field |
698 | 0 | pub(crate) fn fields(&self) -> Vec<&Field> { |
699 | 0 | let mut collected_fields = vec![self]; |
700 | 0 | collected_fields.append(&mut Field::_fields(&self.data_type)); |
701 | | |
702 | 0 | collected_fields |
703 | 0 | } |
704 | | |
705 | 0 | fn _fields(dt: &DataType) -> Vec<&Field> { |
706 | 0 | match dt { |
707 | 0 | DataType::Struct(fields) => fields.iter().flat_map(|f| f.fields()).collect(), |
708 | 0 | DataType::Union(fields, _) => fields.iter().flat_map(|(_, f)| f.fields()).collect(), |
709 | 0 | DataType::List(field) |
710 | 0 | | DataType::LargeList(field) |
711 | 0 | | DataType::FixedSizeList(field, _) |
712 | 0 | | DataType::Map(field, _) => field.fields(), |
713 | 0 | DataType::Dictionary(_, value_field) => Field::_fields(value_field.as_ref()), |
714 | 0 | DataType::RunEndEncoded(_, field) => field.fields(), |
715 | 0 | _ => vec![], |
716 | | } |
717 | 0 | } |
718 | | |
719 | | /// Returns a vector containing all (potentially nested) `Field` instances selected by the |
720 | | /// dictionary ID they use |
721 | | #[inline] |
722 | | #[deprecated( |
723 | | since = "54.0.0", |
724 | | note = "The ability to preserve dictionary IDs will be removed. With it, all fields related to it." |
725 | | )] |
726 | 0 | pub(crate) fn fields_with_dict_id(&self, id: i64) -> Vec<&Field> { |
727 | 0 | self.fields() |
728 | 0 | .into_iter() |
729 | 0 | .filter(|&field| { |
730 | | #[allow(deprecated)] |
731 | 0 | let matching_dict_id = field.dict_id == id; |
732 | 0 | matches!(field.data_type(), DataType::Dictionary(_, _)) && matching_dict_id |
733 | 0 | }) |
734 | 0 | .collect() |
735 | 0 | } |
736 | | |
737 | | /// Returns the dictionary ID, if this is a dictionary type. |
738 | | #[inline] |
739 | | #[deprecated( |
740 | | since = "54.0.0", |
741 | | note = "The ability to preserve dictionary IDs will be removed. With it, all fields related to it." |
742 | | )] |
743 | 0 | pub const fn dict_id(&self) -> Option<i64> { |
744 | 0 | match self.data_type { |
745 | | #[allow(deprecated)] |
746 | 0 | DataType::Dictionary(_, _) => Some(self.dict_id), |
747 | 0 | _ => None, |
748 | | } |
749 | 0 | } |
750 | | |
751 | | /// Returns whether this `Field`'s dictionary is ordered, if this is a dictionary type. |
752 | | /// |
753 | | /// # Example |
754 | | /// ``` |
755 | | /// # use arrow_schema::{DataType, Field}; |
756 | | /// // non-dictionary fields do not have a dict is ordered flag |
757 | | /// let field = Field::new("c1", DataType::Int64, false); |
758 | | /// assert_eq!(field.dict_is_ordered(), None); |
759 | | /// // by default dictionary is not ordered |
760 | | /// let field = Field::new("c1", DataType::Dictionary(Box::new(DataType::Int64), Box::new(DataType::Utf8)), false); |
761 | | /// assert_eq!(field.dict_is_ordered(), Some(false)); |
762 | | /// let field = field.with_dict_is_ordered(true); |
763 | | /// assert_eq!(field.dict_is_ordered(), Some(true)); |
764 | | /// ``` |
765 | | #[inline] |
766 | 0 | pub const fn dict_is_ordered(&self) -> Option<bool> { |
767 | 0 | match self.data_type { |
768 | 0 | DataType::Dictionary(_, _) => Some(self.dict_is_ordered), |
769 | 0 | _ => None, |
770 | | } |
771 | 0 | } |
772 | | |
773 | | /// Set the is ordered field for this `Field`, if it is a dictionary. |
774 | | /// |
775 | | /// Does nothing if this is not a dictionary type. |
776 | | /// |
777 | | /// See [`Field::dict_is_ordered`] for more information. |
778 | 0 | pub fn with_dict_is_ordered(mut self, dict_is_ordered: bool) -> Self { |
779 | 0 | if matches!(self.data_type, DataType::Dictionary(_, _)) { |
780 | 0 | self.dict_is_ordered = dict_is_ordered; |
781 | 0 | }; |
782 | 0 | self |
783 | 0 | } |
784 | | |
785 | | /// Merge this field into self if it is compatible. |
786 | | /// |
787 | | /// Struct fields are merged recursively. |
788 | | /// |
789 | | /// NOTE: `self` may be updated to a partial / unexpected state in case of merge failure. |
790 | | /// |
791 | | /// Example: |
792 | | /// |
793 | | /// ``` |
794 | | /// # use arrow_schema::*; |
795 | | /// let mut field = Field::new("c1", DataType::Int64, false); |
796 | | /// assert!(field.try_merge(&Field::new("c1", DataType::Int64, true)).is_ok()); |
797 | | /// assert!(field.is_nullable()); |
798 | | /// ``` |
799 | 0 | pub fn try_merge(&mut self, from: &Field) -> Result<(), ArrowError> { |
800 | 0 | if from.dict_is_ordered != self.dict_is_ordered { |
801 | 0 | return Err(ArrowError::SchemaError(format!( |
802 | 0 | "Fail to merge schema field '{}' because from dict_is_ordered = {} does not match {}", |
803 | 0 | self.name, from.dict_is_ordered, self.dict_is_ordered |
804 | 0 | ))); |
805 | 0 | } |
806 | | // merge metadata |
807 | 0 | match (self.metadata().is_empty(), from.metadata().is_empty()) { |
808 | | (false, false) => { |
809 | 0 | let mut merged = self.metadata().clone(); |
810 | 0 | for (key, from_value) in from.metadata() { |
811 | 0 | if let Some(self_value) = self.metadata.get(key) { |
812 | 0 | if self_value != from_value { |
813 | 0 | return Err(ArrowError::SchemaError(format!( |
814 | 0 | "Fail to merge field '{}' due to conflicting metadata data value for key {}. |
815 | 0 | From value = {} does not match {}", self.name, key, from_value, self_value), |
816 | 0 | )); |
817 | 0 | } |
818 | 0 | } else { |
819 | 0 | merged.insert(key.clone(), from_value.clone()); |
820 | 0 | } |
821 | | } |
822 | 0 | self.set_metadata(merged); |
823 | | } |
824 | 0 | (true, false) => { |
825 | 0 | self.set_metadata(from.metadata().clone()); |
826 | 0 | } |
827 | 0 | _ => {} |
828 | | } |
829 | 0 | match &mut self.data_type { |
830 | 0 | DataType::Struct(nested_fields) => match &from.data_type { |
831 | 0 | DataType::Struct(from_nested_fields) => { |
832 | 0 | let mut builder = SchemaBuilder::new(); |
833 | 0 | nested_fields |
834 | 0 | .iter() |
835 | 0 | .chain(from_nested_fields) |
836 | 0 | .try_for_each(|f| builder.try_merge(f))?; |
837 | 0 | *nested_fields = builder.finish().fields; |
838 | | } |
839 | | _ => { |
840 | 0 | return Err(ArrowError::SchemaError(format!( |
841 | 0 | "Fail to merge schema field '{}' because the from data_type = {} is not DataType::Struct", |
842 | 0 | self.name, from.data_type |
843 | 0 | ))); |
844 | | } |
845 | | }, |
846 | 0 | DataType::Union(nested_fields, _) => match &from.data_type { |
847 | 0 | DataType::Union(from_nested_fields, _) => { |
848 | 0 | nested_fields.try_merge(from_nested_fields)? |
849 | | } |
850 | | _ => { |
851 | 0 | return Err(ArrowError::SchemaError(format!( |
852 | 0 | "Fail to merge schema field '{}' because the from data_type = {} is not DataType::Union", |
853 | 0 | self.name, from.data_type |
854 | 0 | ))); |
855 | | } |
856 | | }, |
857 | 0 | DataType::List(field) => match &from.data_type { |
858 | 0 | DataType::List(from_field) => { |
859 | 0 | let mut f = (**field).clone(); |
860 | 0 | f.try_merge(from_field)?; |
861 | 0 | (*field) = Arc::new(f); |
862 | | } |
863 | | _ => { |
864 | 0 | return Err(ArrowError::SchemaError(format!( |
865 | 0 | "Fail to merge schema field '{}' because the from data_type = {} is not DataType::List", |
866 | 0 | self.name, from.data_type |
867 | 0 | ))); |
868 | | } |
869 | | }, |
870 | 0 | DataType::LargeList(field) => match &from.data_type { |
871 | 0 | DataType::LargeList(from_field) => { |
872 | 0 | let mut f = (**field).clone(); |
873 | 0 | f.try_merge(from_field)?; |
874 | 0 | (*field) = Arc::new(f); |
875 | | } |
876 | | _ => { |
877 | 0 | return Err(ArrowError::SchemaError(format!( |
878 | 0 | "Fail to merge schema field '{}' because the from data_type = {} is not DataType::LargeList", |
879 | 0 | self.name, from.data_type |
880 | 0 | ))); |
881 | | } |
882 | | }, |
883 | 0 | DataType::Null => { |
884 | 0 | self.nullable = true; |
885 | 0 | self.data_type = from.data_type.clone(); |
886 | 0 | } |
887 | | DataType::Boolean |
888 | | | DataType::Int8 |
889 | | | DataType::Int16 |
890 | | | DataType::Int32 |
891 | | | DataType::Int64 |
892 | | | DataType::UInt8 |
893 | | | DataType::UInt16 |
894 | | | DataType::UInt32 |
895 | | | DataType::UInt64 |
896 | | | DataType::Float16 |
897 | | | DataType::Float32 |
898 | | | DataType::Float64 |
899 | | | DataType::Timestamp(_, _) |
900 | | | DataType::Date32 |
901 | | | DataType::Date64 |
902 | | | DataType::Time32(_) |
903 | | | DataType::Time64(_) |
904 | | | DataType::Duration(_) |
905 | | | DataType::Binary |
906 | | | DataType::LargeBinary |
907 | | | DataType::BinaryView |
908 | | | DataType::Interval(_) |
909 | | | DataType::LargeListView(_) |
910 | | | DataType::ListView(_) |
911 | | | DataType::Map(_, _) |
912 | | | DataType::Dictionary(_, _) |
913 | | | DataType::RunEndEncoded(_, _) |
914 | | | DataType::FixedSizeList(_, _) |
915 | | | DataType::FixedSizeBinary(_) |
916 | | | DataType::Utf8 |
917 | | | DataType::LargeUtf8 |
918 | | | DataType::Utf8View |
919 | | | DataType::Decimal32(_, _) |
920 | | | DataType::Decimal64(_, _) |
921 | | | DataType::Decimal128(_, _) |
922 | | | DataType::Decimal256(_, _) => { |
923 | 0 | if from.data_type == DataType::Null { |
924 | 0 | self.nullable = true; |
925 | 0 | } else if self.data_type != from.data_type { |
926 | 0 | return Err(ArrowError::SchemaError(format!( |
927 | 0 | "Fail to merge schema field '{}' because the from data_type = {} does not equal {}", |
928 | 0 | self.name, from.data_type, self.data_type |
929 | 0 | ))); |
930 | 0 | } |
931 | | } |
932 | | } |
933 | 0 | self.nullable |= from.nullable; |
934 | | |
935 | 0 | Ok(()) |
936 | 0 | } |
937 | | |
938 | | /// Check to see if `self` is a superset of `other` field. Superset is defined as: |
939 | | /// |
940 | | /// * if nullability doesn't match, self needs to be nullable |
941 | | /// * self.metadata is a superset of other.metadata |
942 | | /// * all other fields are equal |
943 | 0 | pub fn contains(&self, other: &Field) -> bool { |
944 | 0 | self.name == other.name |
945 | 0 | && self.data_type.contains(&other.data_type) |
946 | 0 | && self.dict_is_ordered == other.dict_is_ordered |
947 | | // self need to be nullable or both of them are not nullable |
948 | 0 | && (self.nullable || !other.nullable) |
949 | | // make sure self.metadata is a superset of other.metadata |
950 | 0 | && other.metadata.iter().all(|(k, v1)| { |
951 | 0 | self.metadata.get(k).map(|v2| v1 == v2).unwrap_or_default() |
952 | 0 | }) |
953 | 0 | } |
954 | | |
955 | | /// Return size of this instance in bytes. |
956 | | /// |
957 | | /// Includes the size of `Self`. |
958 | 0 | pub fn size(&self) -> usize { |
959 | 0 | std::mem::size_of_val(self) - std::mem::size_of_val(&self.data_type) |
960 | 0 | + self.data_type.size() |
961 | 0 | + self.name.capacity() |
962 | 0 | + (std::mem::size_of::<(String, String)>() * self.metadata.capacity()) |
963 | 0 | + self |
964 | 0 | .metadata |
965 | 0 | .iter() |
966 | 0 | .map(|(k, v)| k.capacity() + v.capacity()) |
967 | 0 | .sum::<usize>() |
968 | 0 | } |
969 | | } |
970 | | |
971 | | impl std::fmt::Display for Field { |
972 | 0 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { |
973 | | #![expect(deprecated)] // Must still print dict_id, if set |
974 | | let Self { |
975 | 0 | name, |
976 | 0 | data_type, |
977 | 0 | nullable, |
978 | 0 | dict_id, |
979 | 0 | dict_is_ordered, |
980 | 0 | metadata, |
981 | 0 | } = self; |
982 | 0 | let maybe_nullable = if *nullable { "nullable " } else { "" }; |
983 | 0 | let metadata_str = if metadata.is_empty() { |
984 | 0 | String::new() |
985 | | } else { |
986 | 0 | format!(", metadata: {metadata:?}") |
987 | | }; |
988 | 0 | let dict_id_str = if dict_id == &0 { |
989 | 0 | String::new() |
990 | | } else { |
991 | 0 | format!(", dict_id: {dict_id}") |
992 | | }; |
993 | 0 | let dict_is_ordered_str = if *dict_is_ordered { |
994 | 0 | ", dict_is_ordered" |
995 | | } else { |
996 | 0 | "" |
997 | | }; |
998 | 0 | write!( |
999 | 0 | f, |
1000 | 0 | "Field {{ {name:?}: {maybe_nullable}{data_type}{dict_id_str}{dict_is_ordered_str}{metadata_str} }}" |
1001 | | ) |
1002 | 0 | } |
1003 | | } |
1004 | | |
1005 | | #[cfg(test)] |
1006 | | mod test { |
1007 | | use super::*; |
1008 | | use std::collections::hash_map::DefaultHasher; |
1009 | | |
/// `Field::new` must accept an owned `String` as well as a `&str`, and both
/// must produce the same field.
#[test]
fn test_new_with_string() {
    // Fields should allow owned Strings to support reuse
    let s = String::from("c1");
    let from_owned = Field::new(s.clone(), DataType::Int64, false);
    let from_borrowed = Field::new(s.as_str(), DataType::Int64, false);
    assert_eq!(from_owned, from_borrowed);
}
1016 | | |
/// `Field::new_dict` must accept an owned `String` as well as a `&str`, and
/// both must produce the same field.
#[test]
fn test_new_dict_with_string() {
    // Fields should allow owned Strings to support reuse
    let s = String::from("c1");
    #[allow(deprecated)]
    let from_owned = Field::new_dict(s.clone(), DataType::Int64, false, 4, false);
    #[allow(deprecated)]
    let from_borrowed = Field::new_dict(s.as_str(), DataType::Int64, false, 4, false);
    assert_eq!(from_owned, from_borrowed);
}
1024 | | |
/// Pins the `Debug` output of `Field` with inline `insta` snapshots.
#[test]
#[cfg_attr(miri, ignore)] // Can't handle the inlined strings of the assert_debug_snapshot macro
fn test_debug_format_field() {
    // Make sure the `Debug` formatting of `Field` is readable and not too long
    // Expected output for a non-nullable field named "item": only `data_type`
    // is listed (the name and nullability do not appear in this snapshot).
    insta::assert_debug_snapshot!(Field::new("item", DataType::UInt8, false), @r"
    Field {
        data_type: UInt8,
    }
    ");
    // Expected output for a nullable field named "column": name, data type
    // and nullability are all listed.
    insta::assert_debug_snapshot!(Field::new("column", DataType::LargeUtf8, true), @r#"
    Field {
        name: "column",
        data_type: LargeUtf8,
        nullable: true,
    }
    "#);
}
1042 | | |
/// Merging two fields whose primitive data types differ must fail with a
/// schema error naming both types.
#[test]
fn test_merge_incompatible_types() {
    let mut target = Field::new("c1", DataType::Int64, false);
    let incoming = Field::new("c1", DataType::Float32, true);

    let error = target.try_merge(&incoming).expect_err("should fail");

    assert_eq!(
        "Schema error: Fail to merge schema field 'c1' because the from data_type = Float32 does not equal Int64",
        error.to_string()
    );
}
1055 | | |
/// A `Null`-typed field merges with a typed field in either direction; the
/// result keeps the concrete type and is nullable.
#[test]
fn test_merge_with_null() {
    // Null <- Float32
    let mut null_target = Field::new("c1", DataType::Null, true);
    let float_source = Field::new("c1", DataType::Float32, false);
    null_target
        .try_merge(&float_source)
        .expect("should widen type to nullable float");
    let expected_float = Field::new("c1", DataType::Float32, true);
    assert_eq!(expected_float, null_target);

    // Utf8 <- Null
    let mut utf8_target = Field::new("c2", DataType::Utf8, false);
    let null_source = Field::new("c2", DataType::Null, true);
    utf8_target
        .try_merge(&null_source)
        .expect("should widen type to nullable utf8");
    let expected_utf8 = Field::new("c2", DataType::Utf8, true);
    assert_eq!(expected_utf8, utf8_target);
}
1070 | | |
/// A `Null`-typed child inside a struct, list or large list merges with a
/// typed child of the same name: the child keeps its concrete type and
/// becomes nullable, and the parent becomes nullable because the source
/// parent is nullable.
#[test]
fn test_merge_with_nested_null() {
    // Builds the child field shared by every container below.
    let inner = |data_type: DataType, nullable: bool| Field::new("inner", data_type, nullable);

    // Struct
    let mut struct_target = Field::new(
        "s1",
        DataType::Struct(Fields::from(vec![inner(DataType::Float32, false)])),
        false,
    );
    let struct_source = Field::new(
        "s2",
        DataType::Struct(Fields::from(vec![inner(DataType::Null, false)])),
        true,
    );
    struct_target
        .try_merge(&struct_source)
        .expect("should widen inner field's type to nullable float");
    let struct_expected = Field::new(
        "s1",
        DataType::Struct(Fields::from(vec![inner(DataType::Float32, true)])),
        true,
    );
    assert_eq!(struct_expected, struct_target);

    // List
    let mut list_target = Field::new(
        "l1",
        DataType::List(Arc::new(inner(DataType::Float32, false))),
        false,
    );
    let list_source = Field::new(
        "l2",
        DataType::List(Arc::new(inner(DataType::Null, false))),
        true,
    );
    list_target
        .try_merge(&list_source)
        .expect("should widen inner field's type to nullable float");
    let list_expected = Field::new(
        "l1",
        DataType::List(Arc::new(inner(DataType::Float32, true))),
        true,
    );
    assert_eq!(list_expected, list_target);

    // LargeList
    let mut large_list_target = Field::new(
        "ll1",
        DataType::LargeList(Arc::new(inner(DataType::Float32, false))),
        false,
    );
    let large_list_source = Field::new(
        "ll2",
        DataType::LargeList(Arc::new(inner(DataType::Null, false))),
        true,
    );
    large_list_target
        .try_merge(&large_list_source)
        .expect("should widen inner field's type to nullable float");
    let large_list_expected = Field::new(
        "ll1",
        DataType::LargeList(Arc::new(inner(DataType::Float32, true))),
        true,
    );
    assert_eq!(large_list_expected, large_list_target);
}
1157 | | |
/// Looks up dictionary fields by id inside a nested struct/list field and
/// checks that each lookup returns at least one field and that every returned
/// field equals the expected dictionary field.
#[test]
fn test_fields_with_dict_id() {
    #[allow(deprecated)]
    let dict1 = Field::new_dict(
        "dict1",
        DataType::Dictionary(DataType::Utf8.into(), DataType::Int32.into()),
        false,
        10,
        false,
    );
    #[allow(deprecated)]
    let dict2 = Field::new_dict(
        "dict2",
        DataType::Dictionary(DataType::Int32.into(), DataType::Int8.into()),
        false,
        20,
        false,
    );

    let field = Field::new(
        "struct<dict1, list[struct<dict2, list[struct<dict1]>]>",
        DataType::Struct(Fields::from(vec![
            dict1.clone(),
            Field::new(
                "list[struct<dict1, list[struct<dict2>]>]",
                DataType::List(Arc::new(Field::new(
                    "struct<dict1, list[struct<dict2>]>",
                    DataType::Struct(Fields::from(vec![
                        dict1.clone(),
                        Field::new(
                            "list[struct<dict2>]",
                            DataType::List(Arc::new(Field::new(
                                "struct<dict2>",
                                DataType::Struct(vec![dict2.clone()].into()),
                                false,
                            ))),
                            false,
                        ),
                    ])),
                    false,
                ))),
                false,
            ),
        ])),
        false,
    );

    // Count the matches so that an empty result fails the test instead of
    // silently skipping the assertions inside the loops.
    let mut matches_for_10 = 0;
    #[allow(deprecated)]
    for found in field.fields_with_dict_id(10) {
        assert_eq!(dict1, *found);
        matches_for_10 += 1;
    }
    assert!(matches_for_10 > 0, "no field found for dict_id 10");

    let mut matches_for_20 = 0;
    #[allow(deprecated)]
    for found in field.fields_with_dict_id(20) {
        assert_eq!(dict2, *found);
        matches_for_20 += 1;
    }
    assert!(matches_for_20 > 0, "no field found for dict_id 20");
}
1214 | | |
1215 | | fn get_field_hash(field: &Field) -> u64 { |
1216 | | let mut s = DefaultHasher::new(); |
1217 | | field.hash(&mut s); |
1218 | | s.finish() |
1219 | | } |
1220 | | |
/// Fields that differ only in `dict_id` compare and hash equal; fields that
/// differ in name do not.
#[test]
fn test_field_comparison_case() {
    // dictionary-encoding properties not used for field comparison
    #[allow(deprecated)]
    let dictionary_field = |name: &str, dict_id: i64| {
        Field::new_dict(
            name,
            DataType::Dictionary(DataType::Utf8.into(), DataType::Int32.into()),
            false,
            dict_id,
            false,
        )
    };

    let id_10 = dictionary_field("dict1", 10);
    let id_20 = dictionary_field("dict1", 20);

    assert_eq!(id_10, id_20);
    assert_eq!(get_field_hash(&id_10), get_field_hash(&id_20));

    let renamed = dictionary_field("dict0", 10);

    assert_ne!(renamed, id_20);
    assert_ne!(get_field_hash(&renamed), get_field_hash(&id_20));
}
1256 | | |
/// Fields that are identical except for their metadata are ordered by that
/// metadata.
#[test]
fn test_field_comparison_metadata() {
    // Builds the same binary field with the given two metadata entries.
    let binary_with = |entries: [(&str, &str); 2]| {
        let metadata: HashMap<String, String> = entries
            .into_iter()
            .map(|(key, value)| (String::from(key), String::from(value)))
            .collect();
        Field::new("x", DataType::Binary, false).with_metadata(metadata)
    };

    let f1 = binary_with([("k1", "v1"), ("k2", "v2")]);
    let f2 = binary_with([("k1", "v1"), ("k3", "v3")]);
    let f3 = binary_with([("k1", "v1"), ("k3", "v4")]);

    assert!(f1.cmp(&f2).is_lt());
    assert!(f2.cmp(&f3).is_lt());
    assert!(f1.cmp(&f3).is_lt());
}
1276 | | |
/// Compile-time check that `Field`, `Arc<Field>` and references to either can
/// be passed where `impl AsRef<Field>` is expected, both as a single value and
/// as the item type of a collection. The test has no runtime assertions; it
/// passes if it compiles.
#[test]
#[expect(clippy::needless_borrows_for_generic_args)] // intentional to exercise various references
fn test_field_as_ref() {
    // Factory closure: every call yields a fresh, identical field.
    let field = || Field::new("x", DataType::Binary, false);

    // AsRef can be used in a function accepting a field.
    // However, this case actually works a bit better when function takes `&Field`
    fn accept_ref(_: impl AsRef<Field>) {}

    // Owned value, reference, and reference-to-reference.
    accept_ref(field());
    accept_ref(&field());
    accept_ref(&&field());
    // The same three forms wrapped in an `Arc`.
    accept_ref(Arc::new(field()));
    accept_ref(&Arc::new(field()));
    accept_ref(&&Arc::new(field()));

    // AsRef can be used in a function accepting a collection of fields in any form,
    // such as &[Field], or &[Arc<Field>]
    fn accept_refs(_: impl IntoIterator<Item: AsRef<Field>>) {}

    // Owned vectors of each item form.
    accept_refs(vec![field()]);
    accept_refs(vec![&field()]);
    accept_refs(vec![Arc::new(field())]);
    accept_refs(vec![&Arc::new(field())]);
    // Borrowed vectors of each item form.
    accept_refs(&vec![field()]);
    accept_refs(&vec![&field()]);
    accept_refs(&vec![Arc::new(field())]);
    accept_refs(&vec![&Arc::new(field())]);
}
1306 | | |
/// A field with metadata must contain itself.
#[test]
fn test_contains_reflexivity() {
    let metadata = HashMap::from([
        (String::from("k0"), String::from("v0")),
        (String::from("k1"), String::from("v1")),
    ]);
    let field = Field::new("field1", DataType::Float16, false).with_metadata(metadata);

    assert!(field.contains(&field));
}
1316 | | |
/// Builds three struct fields where each later one is produced by merging the
/// previous one into it, then checks that `contains` holds from every later
/// field to every earlier one and never in the opposite direction.
#[test]
fn test_contains_transitivity() {
    // Single-entry metadata map.
    let metadata = |key: &str, value: &str| {
        HashMap::from([(String::from(key), String::from(value))])
    };
    let child_field = Field::new("child1", DataType::Float16, false);

    let field1 = Field::new(
        "field1",
        DataType::Struct(Fields::from(vec![child_field])),
        false,
    )
    .with_metadata(metadata("k1", "v1"));

    let mut field2 = Field::new("field1", DataType::Struct(Fields::default()), true)
        .with_metadata(metadata("k2", "v2"));
    field2.try_merge(&field1).unwrap();

    let mut field3 = Field::new("field1", DataType::Struct(Fields::default()), false)
        .with_metadata(metadata("k3", "v3"));
    field3.try_merge(&field2).unwrap();

    // Later fields contain earlier ones ...
    assert!(field2.contains(&field1));
    assert!(field3.contains(&field2));
    assert!(field3.contains(&field1));

    // ... but never the other way around.
    assert!(!field1.contains(&field2));
    assert!(!field1.contains(&field3));
    assert!(!field2.contains(&field3));
}
1344 | | |
/// A nullable field contains its non-nullable twin, but not vice versa.
#[test]
fn test_contains_nullable() {
    let nullable = Field::new("field1", DataType::Boolean, true);
    let required = Field::new("field1", DataType::Boolean, false);

    assert!(nullable.contains(&required));
    assert!(!required.contains(&nullable));
}
1352 | | |
/// `contains` requires nested children to line up: structs with different
/// child lists do not contain each other, and unions only match when their
/// type ids are the same.
#[test]
fn test_contains_must_have_same_fields() {
    let child_field1 = Field::new("child1", DataType::Float16, false);
    let child_field2 = Field::new("child2", DataType::Float16, false);

    // Nullable struct field named "field1" with the given children.
    let struct_of = |children: Vec<Field>| {
        Field::new("field1", DataType::Struct(Fields::from(children)), true)
    };
    let one_child = struct_of(vec![child_field1.clone()]);
    let two_children = struct_of(vec![child_field1, child_field2]);

    assert!(!one_child.contains(&two_children));
    assert!(!two_children.contains(&one_child));

    // Nullable dense union field named "field1"; only the type ids and the
    // nullability of the first child vary between the cases below.
    let dense_union = |type_ids: Vec<i8>, first_child_nullable: bool| {
        Field::new(
            "field1",
            DataType::Union(
                UnionFields::new(
                    type_ids,
                    vec![
                        Field::new("field1", DataType::UInt8, first_child_nullable),
                        Field::new("field3", DataType::Utf8, false),
                    ],
                ),
                UnionMode::Dense,
            ),
            true,
        )
    };

    // UnionFields with different type ID
    let ids_1_2 = dense_union(vec![1, 2], true);
    let ids_1_3 = dense_union(vec![1, 3], false);
    assert!(!ids_1_2.contains(&ids_1_3));

    // UnionFields with same type ID
    let wide = dense_union(vec![1, 2], true);
    let narrow = dense_union(vec![1, 2], false);
    assert!(wide.contains(&narrow));
}
1434 | | |
/// Encodes `field` with bincode's legacy configuration through serde, decodes
/// the bytes again and asserts that the result equals the input.
#[cfg(feature = "serde")]
fn assert_binary_serde_round_trip(field: Field) {
    let config = bincode::config::legacy();
    let bytes = bincode::serde::encode_to_vec(&field, config).unwrap();
    // The second tuple element returned by the decoder is not needed here.
    let round_tripped: Field = bincode::serde::decode_from_slice(&bytes, config)
        .unwrap()
        .0;
    assert_eq!(field, round_tripped)
}
1443 | | |
/// A field created without metadata survives a binary serde round trip.
#[cfg(feature = "serde")]
#[test]
fn test_field_without_metadata_serde() {
    assert_binary_serde_round_trip(Field::new("name", DataType::Boolean, true))
}
1450 | | |
/// A field with an explicitly empty metadata map survives a binary serde
/// round trip.
#[cfg(feature = "serde")]
#[test]
fn test_field_with_empty_metadata_serde() {
    let no_metadata = HashMap::new();
    let field = Field::new("name", DataType::Boolean, false).with_metadata(no_metadata);

    assert_binary_serde_round_trip(field)
}
1458 | | |
/// A field with one metadata entry (with an empty value) survives a binary
/// serde round trip.
#[cfg(feature = "serde")]
#[test]
fn test_field_with_nonempty_metadata_serde() {
    let metadata = HashMap::from([("hi".to_owned(), "".to_owned())]);
    let field = Field::new("name", DataType::Boolean, false).with_metadata(metadata);

    assert_binary_serde_round_trip(field)
}
1468 | | } |