/Users/andrewlamb/Software/arrow-rs/arrow-schema/src/field.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use crate::error::ArrowError; |
19 | | use std::cmp::Ordering; |
20 | | use std::collections::HashMap; |
21 | | use std::hash::{Hash, Hasher}; |
22 | | use std::sync::Arc; |
23 | | |
24 | | use crate::datatype::DataType; |
25 | | #[cfg(feature = "canonical_extension_types")] |
26 | | use crate::extension::CanonicalExtensionType; |
27 | | use crate::schema::SchemaBuilder; |
28 | | use crate::{ |
29 | | extension::{ExtensionType, EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_NAME_KEY}, |
30 | | Fields, UnionFields, UnionMode, |
31 | | }; |
32 | | |
33 | | /// A reference counted [`Field`] |
34 | | pub type FieldRef = Arc<Field>; |
35 | | |
36 | | /// Describes a single column in a [`Schema`](super::Schema). |
37 | | /// |
38 | | /// A [`Schema`](super::Schema) is an ordered collection of |
39 | | /// [`Field`] objects. Fields contain: |
40 | | /// * `name`: the name of the field |
41 | | /// * `data_type`: the type of the field |
42 | | /// * `nullable`: if the field is nullable |
43 | | /// * `metadata`: a map of key-value pairs containing additional custom metadata |
44 | | /// |
45 | | /// Arrow Extension types are encoded in a `Field`'s metadata. See |
46 | | /// [`Self::try_extension_type`] to retrieve the [`ExtensionType`], if any. |
47 | | #[derive(Debug, Clone)] |
48 | | #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] |
49 | | pub struct Field { |
50 | | name: String, |
51 | | data_type: DataType, |
52 | | nullable: bool, |
53 | | #[deprecated( |
54 | | since = "54.0.0", |
55 | | note = "The ability to preserve dictionary IDs will be removed. With it, all fields related to it." |
56 | | )] |
57 | | dict_id: i64, |
58 | | dict_is_ordered: bool, |
59 | | /// A map of key-value pairs containing additional custom metadata. |
60 | | metadata: HashMap<String, String>, |
61 | | } |
62 | | |
63 | | // An auto-derived `PartialEq` would pull `dict_id` and `dict_is_ordered` |
64 | | // into the comparison. However, these properties are only used in the IPC |
65 | | // context for matching dictionary-encoded data, and they do not need to be |
66 | | // equal for two schemas to be considered equal. For example, the C++ `Field` |
67 | | // implementation does not contain these dictionary properties either. |
68 | | impl PartialEq for Field { |
69 | 1.34k | fn eq(&self, other: &Self) -> bool { |
70 | 1.34k | self.name == other.name |
71 | 1.34k | && self.data_type == other.data_type |
72 | 1.34k | && self.nullable == other.nullable |
73 | 1.34k | && self.metadata == other.metadata |
74 | 1.34k | } |
75 | | } |
76 | | |
77 | | impl Eq for Field {} |
78 | | |
79 | | impl PartialOrd for Field { |
80 | 0 | fn partial_cmp(&self, other: &Self) -> Option<Ordering> { |
81 | 0 | Some(self.cmp(other)) |
82 | 0 | } |
83 | | } |
84 | | |
85 | | impl Ord for Field { |
86 | 0 | fn cmp(&self, other: &Self) -> Ordering { |
87 | 0 | self.name |
88 | 0 | .cmp(other.name()) |
89 | 0 | .then_with(|| self.data_type.cmp(other.data_type())) |
90 | 0 | .then_with(|| self.nullable.cmp(&other.nullable)) |
91 | 0 | .then_with(|| { |
92 | | // ensure deterministic key order |
93 | 0 | let mut keys: Vec<&String> = |
94 | 0 | self.metadata.keys().chain(other.metadata.keys()).collect(); |
95 | 0 | keys.sort(); |
96 | 0 | for k in keys { |
97 | 0 | match (self.metadata.get(k), other.metadata.get(k)) { |
98 | 0 | (None, None) => {} |
99 | | (Some(_), None) => { |
100 | 0 | return Ordering::Less; |
101 | | } |
102 | | (None, Some(_)) => { |
103 | 0 | return Ordering::Greater; |
104 | | } |
105 | 0 | (Some(v1), Some(v2)) => match v1.cmp(v2) { |
106 | 0 | Ordering::Equal => {} |
107 | 0 | other => { |
108 | 0 | return other; |
109 | | } |
110 | | }, |
111 | | } |
112 | | } |
113 | | |
114 | 0 | Ordering::Equal |
115 | 0 | }) |
116 | 0 | } |
117 | | } |
118 | | |
119 | | impl Hash for Field { |
120 | 0 | fn hash<H: Hasher>(&self, state: &mut H) { |
121 | 0 | self.name.hash(state); |
122 | 0 | self.data_type.hash(state); |
123 | 0 | self.nullable.hash(state); |
124 | | |
125 | | // ensure deterministic key order |
126 | 0 | let mut keys: Vec<&String> = self.metadata.keys().collect(); |
127 | 0 | keys.sort(); |
128 | 0 | for k in keys { |
129 | 0 | k.hash(state); |
130 | 0 | self.metadata.get(k).expect("key valid").hash(state); |
131 | 0 | } |
132 | 0 | } |
133 | | } |
134 | | |
135 | | impl Field { |
136 | | /// Default list member field name |
137 | | pub const LIST_FIELD_DEFAULT_NAME: &'static str = "item"; |
138 | | |
139 | | /// Creates a new field with the given name, data type, and nullability |
140 | | /// |
141 | | /// # Example |
142 | | /// ``` |
143 | | /// # use arrow_schema::{Field, DataType}; |
144 | | /// Field::new("field_name", DataType::Int32, true); |
145 | | /// ``` |
146 | 1.42k | pub fn new(name: impl Into<String>, data_type: DataType, nullable: bool) -> Self { |
147 | | #[allow(deprecated)] |
148 | 1.42k | Field { |
149 | 1.42k | name: name.into(), |
150 | 1.42k | data_type, |
151 | 1.42k | nullable, |
152 | 1.42k | dict_id: 0, |
153 | 1.42k | dict_is_ordered: false, |
154 | 1.42k | metadata: HashMap::default(), |
155 | 1.42k | } |
156 | 1.42k | } |
157 | | |
158 | | /// Creates a new `Field` suitable for [`DataType::List`] and |
159 | | /// [`DataType::LargeList`] |
160 | | /// |
161 | | /// While not required, this method follows the convention of naming the |
162 | | /// `Field` `"item"`. |
163 | | /// |
164 | | /// # Example |
165 | | /// ``` |
166 | | /// # use arrow_schema::{Field, DataType}; |
167 | | /// assert_eq!( |
168 | | /// Field::new("item", DataType::Int32, true), |
169 | | /// Field::new_list_field(DataType::Int32, true) |
170 | | /// ); |
171 | | /// ``` |
172 | 12 | pub fn new_list_field(data_type: DataType, nullable: bool) -> Self { |
173 | 12 | Self::new(Self::LIST_FIELD_DEFAULT_NAME, data_type, nullable) |
174 | 12 | } |
175 | | |
176 | | /// Creates a new field that has additional dictionary information |
177 | | #[deprecated( |
178 | | since = "54.0.0", |
179 | | note = "The ability to preserve dictionary IDs will be removed. With the dict_id field disappearing this function signature will change by removing the dict_id parameter." |
180 | | )] |
181 | 0 | pub fn new_dict( |
182 | 0 | name: impl Into<String>, |
183 | 0 | data_type: DataType, |
184 | 0 | nullable: bool, |
185 | 0 | dict_id: i64, |
186 | 0 | dict_is_ordered: bool, |
187 | 0 | ) -> Self { |
188 | | #[allow(deprecated)] |
189 | 0 | Field { |
190 | 0 | name: name.into(), |
191 | 0 | data_type, |
192 | 0 | nullable, |
193 | 0 | dict_id, |
194 | 0 | dict_is_ordered, |
195 | 0 | metadata: HashMap::default(), |
196 | 0 | } |
197 | 0 | } |
198 | | |
199 | | /// Create a new [`Field`] with [`DataType::Dictionary`] |
200 | | /// |
201 | | /// Use [`Self::new_dict`] for more advanced dictionary options |
202 | | /// |
203 | | /// # Panics |
204 | | /// |
205 | | /// Panics if [`!key.is_dictionary_key_type`][DataType::is_dictionary_key_type] |
206 | 0 | pub fn new_dictionary( |
207 | 0 | name: impl Into<String>, |
208 | 0 | key: DataType, |
209 | 0 | value: DataType, |
210 | 0 | nullable: bool, |
211 | 0 | ) -> Self { |
212 | 0 | assert!( |
213 | 0 | key.is_dictionary_key_type(), |
214 | 0 | "{key} is not a valid dictionary key" |
215 | | ); |
216 | 0 | let data_type = DataType::Dictionary(Box::new(key), Box::new(value)); |
217 | 0 | Self::new(name, data_type, nullable) |
218 | 0 | } |
219 | | |
220 | | /// Create a new [`Field`] with [`DataType::Struct`] |
221 | | /// |
222 | | /// - `name`: the name of the [`DataType::Struct`] field |
223 | | /// - `fields`: the description of each struct element |
224 | | /// - `nullable`: if the [`DataType::Struct`] array is nullable |
225 | 0 | pub fn new_struct(name: impl Into<String>, fields: impl Into<Fields>, nullable: bool) -> Self { |
226 | 0 | Self::new(name, DataType::Struct(fields.into()), nullable) |
227 | 0 | } |
228 | | |
229 | | /// Create a new [`Field`] with [`DataType::List`] |
230 | | /// |
231 | | /// - `name`: the name of the [`DataType::List`] field |
232 | | /// - `value`: the description of each list element |
233 | | /// - `nullable`: if the [`DataType::List`] array is nullable |
234 | 0 | pub fn new_list(name: impl Into<String>, value: impl Into<FieldRef>, nullable: bool) -> Self { |
235 | 0 | Self::new(name, DataType::List(value.into()), nullable) |
236 | 0 | } |
237 | | |
238 | | /// Create a new [`Field`] with [`DataType::LargeList`] |
239 | | /// |
240 | | /// - `name`: the name of the [`DataType::LargeList`] field |
241 | | /// - `value`: the description of each list element |
242 | | /// - `nullable`: if the [`DataType::LargeList`] array is nullable |
243 | 0 | pub fn new_large_list( |
244 | 0 | name: impl Into<String>, |
245 | 0 | value: impl Into<FieldRef>, |
246 | 0 | nullable: bool, |
247 | 0 | ) -> Self { |
248 | 0 | Self::new(name, DataType::LargeList(value.into()), nullable) |
249 | 0 | } |
250 | | |
251 | | /// Create a new [`Field`] with [`DataType::FixedSizeList`] |
252 | | /// |
253 | | /// - `name`: the name of the [`DataType::FixedSizeList`] field |
254 | | /// - `value`: the description of each list element |
255 | | /// - `size`: the size of the fixed size list |
256 | | /// - `nullable`: if the [`DataType::FixedSizeList`] array is nullable |
257 | 0 | pub fn new_fixed_size_list( |
258 | 0 | name: impl Into<String>, |
259 | 0 | value: impl Into<FieldRef>, |
260 | 0 | size: i32, |
261 | 0 | nullable: bool, |
262 | 0 | ) -> Self { |
263 | 0 | Self::new(name, DataType::FixedSizeList(value.into(), size), nullable) |
264 | 0 | } |
265 | | |
266 | | /// Create a new [`Field`] with [`DataType::Map`] |
267 | | /// |
268 | | /// - `name`: the name of the [`DataType::Map`] field |
269 | | /// - `entries`: the name of the inner [`DataType::Struct`] field |
270 | | /// - `keys`: the map keys |
271 | | /// - `values`: the map values |
272 | | /// - `sorted`: if the [`DataType::Map`] array is sorted |
273 | | /// - `nullable`: if the [`DataType::Map`] array is nullable |
274 | 0 | pub fn new_map( |
275 | 0 | name: impl Into<String>, |
276 | 0 | entries: impl Into<String>, |
277 | 0 | keys: impl Into<FieldRef>, |
278 | 0 | values: impl Into<FieldRef>, |
279 | 0 | sorted: bool, |
280 | 0 | nullable: bool, |
281 | 0 | ) -> Self { |
282 | 0 | let data_type = DataType::Map( |
283 | 0 | Arc::new(Field::new( |
284 | 0 | entries.into(), |
285 | 0 | DataType::Struct(Fields::from([keys.into(), values.into()])), |
286 | 0 | false, // The inner map field is always non-nullable (#1697), |
287 | 0 | )), |
288 | 0 | sorted, |
289 | 0 | ); |
290 | 0 | Self::new(name, data_type, nullable) |
291 | 0 | } |
292 | | |
293 | | /// Create a new [`Field`] with [`DataType::Union`] |
294 | | /// |
295 | | /// - `name`: the name of the [`DataType::Union`] field |
296 | | /// - `type_ids`: the union type ids |
297 | | /// - `fields`: the union fields |
298 | | /// - `mode`: the union mode |
299 | 0 | pub fn new_union<S, F, T>(name: S, type_ids: T, fields: F, mode: UnionMode) -> Self |
300 | 0 | where |
301 | 0 | S: Into<String>, |
302 | 0 | F: IntoIterator, |
303 | 0 | F::Item: Into<FieldRef>, |
304 | 0 | T: IntoIterator<Item = i8>, |
305 | | { |
306 | 0 | Self::new( |
307 | 0 | name, |
308 | 0 | DataType::Union(UnionFields::new(type_ids, fields), mode), |
309 | | false, // Unions cannot be nullable |
310 | | ) |
311 | 0 | } |
312 | | |
313 | | /// Sets the `Field`'s optional custom metadata. |
314 | | #[inline] |
315 | 937 | pub fn set_metadata(&mut self, metadata: HashMap<String, String>) { |
316 | 937 | self.metadata = metadata; |
317 | 937 | } |
318 | | |
319 | | /// Sets the metadata of this `Field` to be `metadata` and returns self |
320 | 937 | pub fn with_metadata(mut self, metadata: HashMap<String, String>) -> Self { |
321 | 937 | self.set_metadata(metadata); |
322 | 937 | self |
323 | 937 | } |
324 | | |
325 | | /// Returns an immutable reference to the `Field`'s optional custom metadata. |
326 | | #[inline] |
327 | 326 | pub const fn metadata(&self) -> &HashMap<String, String> { |
328 | 326 | &self.metadata |
329 | 326 | } |
330 | | |
331 | | /// Returns a mutable reference to the `Field`'s optional custom metadata. |
332 | | #[inline] |
333 | 0 | pub fn metadata_mut(&mut self) -> &mut HashMap<String, String> { |
334 | 0 | &mut self.metadata |
335 | 0 | } |
336 | | |
337 | | /// Returns an immutable reference to the `Field`'s name. |
338 | | #[inline] |
339 | 139 | pub const fn name(&self) -> &String { |
340 | 139 | &self.name |
341 | 139 | } |
342 | | |
343 | | /// Set the name of this [`Field`] |
344 | | #[inline] |
345 | 0 | pub fn set_name(&mut self, name: impl Into<String>) { |
346 | 0 | self.name = name.into(); |
347 | 0 | } |
348 | | |
349 | | /// Set the name of the [`Field`] and returns self. |
350 | | /// |
351 | | /// ``` |
352 | | /// # use arrow_schema::*; |
353 | | /// let field = Field::new("c1", DataType::Int64, false) |
354 | | /// .with_name("c2"); |
355 | | /// |
356 | | /// assert_eq!(field.name(), "c2"); |
357 | | /// ``` |
358 | 0 | pub fn with_name(mut self, name: impl Into<String>) -> Self { |
359 | 0 | self.set_name(name); |
360 | 0 | self |
361 | 0 | } |
362 | | |
363 | | /// Returns an immutable reference to the [`Field`]'s [`DataType`]. |
364 | | #[inline] |
365 | 1.88k | pub const fn data_type(&self) -> &DataType { |
366 | 1.88k | &self.data_type |
367 | 1.88k | } |
368 | | |
369 | | /// Set [`DataType`] of the [`Field`] |
370 | | /// |
371 | | /// ``` |
372 | | /// # use arrow_schema::*; |
373 | | /// let mut field = Field::new("c1", DataType::Int64, false); |
374 | | /// field.set_data_type(DataType::Utf8); |
375 | | /// |
376 | | /// assert_eq!(field.data_type(), &DataType::Utf8); |
377 | | /// ``` |
378 | | #[inline] |
379 | 0 | pub fn set_data_type(&mut self, data_type: DataType) { |
380 | 0 | self.data_type = data_type; |
381 | 0 | } |
382 | | |
383 | | /// Set [`DataType`] of the [`Field`] and returns self. |
384 | | /// |
385 | | /// ``` |
386 | | /// # use arrow_schema::*; |
387 | | /// let field = Field::new("c1", DataType::Int64, false) |
388 | | /// .with_data_type(DataType::Utf8); |
389 | | /// |
390 | | /// assert_eq!(field.data_type(), &DataType::Utf8); |
391 | | /// ``` |
392 | 0 | pub fn with_data_type(mut self, data_type: DataType) -> Self { |
393 | 0 | self.set_data_type(data_type); |
394 | 0 | self |
395 | 0 | } |
396 | | |
397 | | /// Returns the extension type name of this [`Field`], if set. |
398 | | /// |
399 | | /// This returns the value of [`EXTENSION_TYPE_NAME_KEY`], if set in |
400 | | /// [`Field::metadata`]. If the key is missing, there is no extension type |
401 | | /// name and this returns `None`. |
402 | | /// |
403 | | /// # Example |
404 | | /// |
405 | | /// ``` |
406 | | /// # use arrow_schema::{DataType, extension::EXTENSION_TYPE_NAME_KEY, Field}; |
407 | | /// |
408 | | /// let field = Field::new("", DataType::Null, false); |
409 | | /// assert_eq!(field.extension_type_name(), None); |
410 | | /// |
411 | | /// let field = Field::new("", DataType::Null, false).with_metadata( |
412 | | /// [(EXTENSION_TYPE_NAME_KEY.to_owned(), "example".to_owned())] |
413 | | /// .into_iter() |
414 | | /// .collect(), |
415 | | /// ); |
416 | | /// assert_eq!(field.extension_type_name(), Some("example")); |
417 | | /// ``` |
418 | 0 | pub fn extension_type_name(&self) -> Option<&str> { |
419 | 0 | self.metadata() |
420 | 0 | .get(EXTENSION_TYPE_NAME_KEY) |
421 | 0 | .map(String::as_ref) |
422 | 0 | } |
423 | | |
424 | | /// Returns the extension type metadata of this [`Field`], if set. |
425 | | /// |
426 | | /// This returns the value of [`EXTENSION_TYPE_METADATA_KEY`], if set in |
427 | | /// [`Field::metadata`]. If the key is missing, there is no extension type |
428 | | /// metadata and this returns `None`. |
429 | | /// |
430 | | /// # Example |
431 | | /// |
432 | | /// ``` |
433 | | /// # use arrow_schema::{DataType, extension::EXTENSION_TYPE_METADATA_KEY, Field}; |
434 | | /// |
435 | | /// let field = Field::new("", DataType::Null, false); |
436 | | /// assert_eq!(field.extension_type_metadata(), None); |
437 | | /// |
438 | | /// let field = Field::new("", DataType::Null, false).with_metadata( |
439 | | /// [(EXTENSION_TYPE_METADATA_KEY.to_owned(), "example".to_owned())] |
440 | | /// .into_iter() |
441 | | /// .collect(), |
442 | | /// ); |
443 | | /// assert_eq!(field.extension_type_metadata(), Some("example")); |
444 | | /// ``` |
445 | 0 | pub fn extension_type_metadata(&self) -> Option<&str> { |
446 | 0 | self.metadata() |
447 | 0 | .get(EXTENSION_TYPE_METADATA_KEY) |
448 | 0 | .map(String::as_ref) |
449 | 0 | } |
450 | | |
451 | | /// Returns an instance of the given [`ExtensionType`] of this [`Field`], |
452 | | /// if set in the [`Field::metadata`]. |
453 | | /// |
454 | | /// # Errors |
455 | | /// |
456 | | /// Returns an error if |
457 | | /// - this field does not have the name of this extension type |
458 | | /// ([`ExtensionType::NAME`]) in the [`Field::metadata`] (mismatch or |
459 | | /// missing) |
460 | | /// - the deserialization of the metadata |
461 | | /// ([`ExtensionType::deserialize_metadata`]) fails |
462 | | /// - the construction of the extension type ([`ExtensionType::try_new`]) |
463 | | /// fails (for example when the [`Field::data_type`] is not supported by |
464 | | /// the extension type ([`ExtensionType::supports_data_type`])) |
465 | 0 | pub fn try_extension_type<E: ExtensionType>(&self) -> Result<E, ArrowError> { |
466 | | // Check the extension name in the metadata |
467 | 0 | match self.extension_type_name() { |
468 | | // It should match the name of the given extension type |
469 | 0 | Some(name) if name == E::NAME => { |
470 | | // Deserialize the metadata and try to construct the extension |
471 | | // type |
472 | 0 | E::deserialize_metadata(self.extension_type_metadata()) |
473 | 0 | .and_then(|metadata| E::try_new(self.data_type(), metadata)) |
474 | | } |
475 | | // Name mismatch |
476 | 0 | Some(name) => Err(ArrowError::InvalidArgumentError(format!( |
477 | 0 | "Field extension type name mismatch, expected {}, found {name}", |
478 | 0 | E::NAME |
479 | 0 | ))), |
480 | | // Name missing |
481 | 0 | None => Err(ArrowError::InvalidArgumentError( |
482 | 0 | "Field extension type name missing".to_owned(), |
483 | 0 | )), |
484 | | } |
485 | 0 | } |
486 | | |
487 | | /// Returns an instance of the given [`ExtensionType`] of this [`Field`], |
488 | | /// panics if this [`Field`] does not have this extension type. |
489 | | /// |
490 | | /// # Panics |
491 | | /// |
492 | | /// This calls [`Field::try_extension_type`] and panics when it returns an |
493 | | /// error. |
494 | 0 | pub fn extension_type<E: ExtensionType>(&self) -> E { |
495 | 0 | self.try_extension_type::<E>() |
496 | 0 | .unwrap_or_else(|e| panic!("{e}")) |
497 | 0 | } |
498 | | |
499 | | /// Updates the metadata of this [`Field`] with the [`ExtensionType::NAME`] |
500 | | /// and [`ExtensionType::metadata`] of the given [`ExtensionType`], if the |
501 | | /// given extension type supports the [`Field::data_type`] of this field |
502 | | /// ([`ExtensionType::supports_data_type`]). |
503 | | /// |
504 | | /// If the given extension type defines no metadata, a previously set |
505 | | /// value of [`EXTENSION_TYPE_METADATA_KEY`] is cleared. |
506 | | /// |
507 | | /// # Errors |
508 | | /// |
509 | | /// This function returns an error if the data type of this field does not |
510 | | /// match any of the supported storage types of the given extension type. |
511 | 0 | pub fn try_with_extension_type<E: ExtensionType>( |
512 | 0 | &mut self, |
513 | 0 | extension_type: E, |
514 | 0 | ) -> Result<(), ArrowError> { |
515 | | // Make sure the data type of this field is supported |
516 | 0 | extension_type.supports_data_type(&self.data_type)?; |
517 | | |
518 | 0 | self.metadata |
519 | 0 | .insert(EXTENSION_TYPE_NAME_KEY.to_owned(), E::NAME.to_owned()); |
520 | 0 | match extension_type.serialize_metadata() { |
521 | 0 | Some(metadata) => self |
522 | 0 | .metadata |
523 | 0 | .insert(EXTENSION_TYPE_METADATA_KEY.to_owned(), metadata), |
524 | | // If this extension type has no metadata, we make sure to |
525 | | // clear previously set metadata. |
526 | 0 | None => self.metadata.remove(EXTENSION_TYPE_METADATA_KEY), |
527 | | }; |
528 | | |
529 | 0 | Ok(()) |
530 | 0 | } |
531 | | |
532 | | /// Updates the metadata of this [`Field`] with the [`ExtensionType::NAME`] |
533 | | /// and [`ExtensionType::metadata`] of the given [`ExtensionType`]. |
534 | | /// |
535 | | /// # Panics |
536 | | /// |
537 | | /// This calls [`Field::try_with_extension_type`] and panics when it |
538 | | /// returns an error. |
539 | 0 | pub fn with_extension_type<E: ExtensionType>(mut self, extension_type: E) -> Self { |
540 | 0 | self.try_with_extension_type(extension_type) |
541 | 0 | .unwrap_or_else(|e| panic!("{e}")); |
542 | 0 | self |
543 | 0 | } |
544 | | |
545 | | /// Returns the [`CanonicalExtensionType`] of this [`Field`], if set. |
546 | | /// |
547 | | /// # Errors |
548 | | /// |
549 | | /// Returns an error if |
550 | | /// - this field does not have a canonical extension type (mismatch or missing) |
551 | | /// - the canonical extension is not supported |
552 | | /// - the construction of the extension type fails |
553 | | #[cfg(feature = "canonical_extension_types")] |
554 | | pub fn try_canonical_extension_type(&self) -> Result<CanonicalExtensionType, ArrowError> { |
555 | | CanonicalExtensionType::try_from(self) |
556 | | } |
557 | | |
558 | | /// Indicates whether this [`Field`] supports null values. |
559 | | /// |
560 | | /// If true, the field *may* contain null values. |
561 | | #[inline] |
562 | 1.88k | pub const fn is_nullable(&self) -> bool { |
563 | 1.88k | self.nullable |
564 | 1.88k | } |
565 | | |
566 | | /// Set the `nullable` of this [`Field`]. |
567 | | /// |
568 | | /// ``` |
569 | | /// # use arrow_schema::*; |
570 | | /// let mut field = Field::new("c1", DataType::Int64, false); |
571 | | /// field.set_nullable(true); |
572 | | /// |
573 | | /// assert_eq!(field.is_nullable(), true); |
574 | | /// ``` |
575 | | #[inline] |
576 | 14 | pub fn set_nullable(&mut self, nullable: bool) { |
577 | 14 | self.nullable = nullable; |
578 | 14 | } |
579 | | |
580 | | /// Set `nullable` of the [`Field`] and returns self. |
581 | | /// |
582 | | /// ``` |
583 | | /// # use arrow_schema::*; |
584 | | /// let field = Field::new("c1", DataType::Int64, false) |
585 | | /// .with_nullable(true); |
586 | | /// |
587 | | /// assert_eq!(field.is_nullable(), true); |
588 | | /// ``` |
589 | 14 | pub fn with_nullable(mut self, nullable: bool) -> Self { |
590 | 14 | self.set_nullable(nullable); |
591 | 14 | self |
592 | 14 | } |
593 | | |
594 | | /// Returns a (flattened) [`Vec`] containing `self` and all child |
595 | | /// [`Field`]s nested within this field |
596 | 0 | pub(crate) fn fields(&self) -> Vec<&Field> { |
597 | 0 | let mut collected_fields = vec![self]; |
598 | 0 | collected_fields.append(&mut Field::_fields(&self.data_type)); |
599 | | |
600 | 0 | collected_fields |
601 | 0 | } |
602 | | |
603 | 0 | fn _fields(dt: &DataType) -> Vec<&Field> { |
604 | 0 | match dt { |
605 | 0 | DataType::Struct(fields) => fields.iter().flat_map(|f| f.fields()).collect(), |
606 | 0 | DataType::Union(fields, _) => fields.iter().flat_map(|(_, f)| f.fields()).collect(), |
607 | 0 | DataType::List(field) |
608 | 0 | | DataType::LargeList(field) |
609 | 0 | | DataType::FixedSizeList(field, _) |
610 | 0 | | DataType::Map(field, _) => field.fields(), |
611 | 0 | DataType::Dictionary(_, value_field) => Field::_fields(value_field.as_ref()), |
612 | 0 | DataType::RunEndEncoded(_, field) => field.fields(), |
613 | 0 | _ => vec![], |
614 | | } |
615 | 0 | } |
616 | | |
617 | | /// Returns a vector containing all (potentially nested) `Field` instances selected by the |
618 | | /// dictionary ID they use |
619 | | #[inline] |
620 | | #[deprecated( |
621 | | since = "54.0.0", |
622 | | note = "The ability to preserve dictionary IDs will be removed. With it, all fields related to it." |
623 | | )] |
624 | 0 | pub(crate) fn fields_with_dict_id(&self, id: i64) -> Vec<&Field> { |
625 | 0 | self.fields() |
626 | 0 | .into_iter() |
627 | 0 | .filter(|&field| { |
628 | | #[allow(deprecated)] |
629 | 0 | let matching_dict_id = field.dict_id == id; |
630 | 0 | matches!(field.data_type(), DataType::Dictionary(_, _)) && matching_dict_id |
631 | 0 | }) |
632 | 0 | .collect() |
633 | 0 | } |
634 | | |
635 | | /// Returns the dictionary ID, if this is a dictionary type. |
636 | | #[inline] |
637 | | #[deprecated( |
638 | | since = "54.0.0", |
639 | | note = "The ability to preserve dictionary IDs will be removed. With it, all fields related to it." |
640 | | )] |
641 | 0 | pub const fn dict_id(&self) -> Option<i64> { |
642 | 0 | match self.data_type { |
643 | | #[allow(deprecated)] |
644 | 0 | DataType::Dictionary(_, _) => Some(self.dict_id), |
645 | 0 | _ => None, |
646 | | } |
647 | 0 | } |
648 | | |
649 | | /// Returns whether this `Field`'s dictionary is ordered, if this is a dictionary type. |
650 | | /// |
651 | | /// # Example |
652 | | /// ``` |
653 | | /// # use arrow_schema::{DataType, Field}; |
654 | | /// // non-dictionary fields do not have a dict_is_ordered flag |
655 | | /// let field = Field::new("c1", DataType::Int64, false); |
656 | | /// assert_eq!(field.dict_is_ordered(), None); |
657 | | /// // by default dictionary is not ordered |
658 | | /// let field = Field::new("c1", DataType::Dictionary(Box::new(DataType::Int64), Box::new(DataType::Utf8)), false); |
659 | | /// assert_eq!(field.dict_is_ordered(), Some(false)); |
660 | | /// let field = field.with_dict_is_ordered(true); |
661 | | /// assert_eq!(field.dict_is_ordered(), Some(true)); |
662 | | /// ``` |
663 | | #[inline] |
664 | 0 | pub const fn dict_is_ordered(&self) -> Option<bool> { |
665 | 0 | match self.data_type { |
666 | 0 | DataType::Dictionary(_, _) => Some(self.dict_is_ordered), |
667 | 0 | _ => None, |
668 | | } |
669 | 0 | } |
670 | | |
671 | | /// Set the `dict_is_ordered` flag for this `Field`, if it is a dictionary. |
672 | | /// |
673 | | /// Does nothing if this is not a dictionary type. |
674 | | /// |
675 | | /// See [`Field::dict_is_ordered`] for more information. |
676 | 0 | pub fn with_dict_is_ordered(mut self, dict_is_ordered: bool) -> Self { |
677 | 0 | if matches!(self.data_type, DataType::Dictionary(_, _)) { |
678 | 0 | self.dict_is_ordered = dict_is_ordered; |
679 | 0 | }; |
680 | 0 | self |
681 | 0 | } |
682 | | |
683 | | /// Merge the `from` field into `self` if it is compatible. |
684 | | /// |
685 | | /// Struct fields are merged recursively. |
686 | | /// |
687 | | /// NOTE: `self` may be updated to a partial / unexpected state in case of merge failure. |
688 | | /// |
689 | | /// Example: |
690 | | /// |
691 | | /// ``` |
692 | | /// # use arrow_schema::*; |
693 | | /// let mut field = Field::new("c1", DataType::Int64, false); |
694 | | /// assert!(field.try_merge(&Field::new("c1", DataType::Int64, true)).is_ok()); |
695 | | /// assert!(field.is_nullable()); |
696 | | /// ``` |
697 | 0 | pub fn try_merge(&mut self, from: &Field) -> Result<(), ArrowError> { |
698 | 0 | if from.dict_is_ordered != self.dict_is_ordered { |
699 | 0 | return Err(ArrowError::SchemaError(format!( |
700 | 0 | "Fail to merge schema field '{}' because from dict_is_ordered = {} does not match {}", |
701 | 0 | self.name, from.dict_is_ordered, self.dict_is_ordered |
702 | 0 | ))); |
703 | 0 | } |
704 | | // merge metadata |
705 | 0 | match (self.metadata().is_empty(), from.metadata().is_empty()) { |
706 | | (false, false) => { |
707 | 0 | let mut merged = self.metadata().clone(); |
708 | 0 | for (key, from_value) in from.metadata() { |
709 | 0 | if let Some(self_value) = self.metadata.get(key) { |
710 | 0 | if self_value != from_value { |
711 | 0 | return Err(ArrowError::SchemaError(format!( |
712 | 0 | "Fail to merge field '{}' due to conflicting metadata data value for key {}. |
713 | 0 | From value = {} does not match {}", self.name, key, from_value, self_value), |
714 | 0 | )); |
715 | 0 | } |
716 | 0 | } else { |
717 | 0 | merged.insert(key.clone(), from_value.clone()); |
718 | 0 | } |
719 | | } |
720 | 0 | self.set_metadata(merged); |
721 | | } |
722 | 0 | (true, false) => { |
723 | 0 | self.set_metadata(from.metadata().clone()); |
724 | 0 | } |
725 | 0 | _ => {} |
726 | | } |
727 | 0 | match &mut self.data_type { |
728 | 0 | DataType::Struct(nested_fields) => match &from.data_type { |
729 | 0 | DataType::Struct(from_nested_fields) => { |
730 | 0 | let mut builder = SchemaBuilder::new(); |
731 | 0 | nested_fields.iter().chain(from_nested_fields).try_for_each(|f| builder.try_merge(f))?; |
732 | 0 | *nested_fields = builder.finish().fields; |
733 | | } |
734 | | _ => { |
735 | 0 | return Err(ArrowError::SchemaError( |
736 | 0 | format!("Fail to merge schema field '{}' because the from data_type = {} is not DataType::Struct", |
737 | 0 | self.name, from.data_type) |
738 | 0 | ))} |
739 | | }, |
740 | 0 | DataType::Union(nested_fields, _) => match &from.data_type { |
741 | 0 | DataType::Union(from_nested_fields, _) => { |
742 | 0 | nested_fields.try_merge(from_nested_fields)? |
743 | | } |
744 | | _ => { |
745 | 0 | return Err(ArrowError::SchemaError( |
746 | 0 | format!("Fail to merge schema field '{}' because the from data_type = {} is not DataType::Union", |
747 | 0 | self.name, from.data_type) |
748 | 0 | )); |
749 | | } |
750 | | }, |
751 | 0 | DataType::List(field) => match &from.data_type { |
752 | 0 | DataType::List(from_field) => { |
753 | 0 | let mut f = (**field).clone(); |
754 | 0 | f.try_merge(from_field)?; |
755 | 0 | (*field) = Arc::new(f); |
756 | | }, |
757 | | _ => { |
758 | 0 | return Err(ArrowError::SchemaError( |
759 | 0 | format!("Fail to merge schema field '{}' because the from data_type = {} is not DataType::List", |
760 | 0 | self.name, from.data_type) |
761 | 0 | ))} |
762 | | }, |
763 | 0 | DataType::LargeList(field) => match &from.data_type { |
764 | 0 | DataType::LargeList(from_field) => { |
765 | 0 | let mut f = (**field).clone(); |
766 | 0 | f.try_merge(from_field)?; |
767 | 0 | (*field) = Arc::new(f); |
768 | | }, |
769 | | _ => { |
770 | 0 | return Err(ArrowError::SchemaError( |
771 | 0 | format!("Fail to merge schema field '{}' because the from data_type = {} is not DataType::LargeList", |
772 | 0 | self.name, from.data_type) |
773 | 0 | ))} |
774 | | }, |
775 | 0 | DataType::Null => { |
776 | 0 | self.nullable = true; |
777 | 0 | self.data_type = from.data_type.clone(); |
778 | 0 | } |
779 | | | DataType::Boolean |
780 | | | DataType::Int8 |
781 | | | DataType::Int16 |
782 | | | DataType::Int32 |
783 | | | DataType::Int64 |
784 | | | DataType::UInt8 |
785 | | | DataType::UInt16 |
786 | | | DataType::UInt32 |
787 | | | DataType::UInt64 |
788 | | | DataType::Float16 |
789 | | | DataType::Float32 |
790 | | | DataType::Float64 |
791 | | | DataType::Timestamp(_, _) |
792 | | | DataType::Date32 |
793 | | | DataType::Date64 |
794 | | | DataType::Time32(_) |
795 | | | DataType::Time64(_) |
796 | | | DataType::Duration(_) |
797 | | | DataType::Binary |
798 | | | DataType::LargeBinary |
799 | | | DataType::BinaryView |
800 | | | DataType::Interval(_) |
801 | | | DataType::LargeListView(_) |
802 | | | DataType::ListView(_) |
803 | | | DataType::Map(_, _) |
804 | | | DataType::Dictionary(_, _) |
805 | | | DataType::RunEndEncoded(_, _) |
806 | | | DataType::FixedSizeList(_, _) |
807 | | | DataType::FixedSizeBinary(_) |
808 | | | DataType::Utf8 |
809 | | | DataType::LargeUtf8 |
810 | | | DataType::Utf8View |
811 | | | DataType::Decimal32(_, _) |
812 | | | DataType::Decimal64(_, _) |
813 | | | DataType::Decimal128(_, _) |
814 | | | DataType::Decimal256(_, _) => { |
815 | 0 | if from.data_type == DataType::Null { |
816 | 0 | self.nullable = true; |
817 | 0 | } else if self.data_type != from.data_type { |
818 | 0 | return Err(ArrowError::SchemaError( |
819 | 0 | format!("Fail to merge schema field '{}' because the from data_type = {} does not equal {}", |
820 | 0 | self.name, from.data_type, self.data_type) |
821 | 0 | )); |
822 | 0 | } |
823 | | } |
824 | | } |
825 | 0 | self.nullable |= from.nullable; |
826 | | |
827 | 0 | Ok(()) |
828 | 0 | } |
829 | | |
830 | | /// Check to see if `self` is a superset of `other` field. Superset is defined as: |
831 | | /// |
832 | | /// * if nullability doesn't match, self needs to be nullable |
833 | | /// * self.metadata is a superset of other.metadata |
834 | | /// * all other fields are equal |
835 | 0 | pub fn contains(&self, other: &Field) -> bool { |
836 | 0 | self.name == other.name |
837 | 0 | && self.data_type.contains(&other.data_type) |
838 | 0 | && self.dict_is_ordered == other.dict_is_ordered |
839 | | // self need to be nullable or both of them are not nullable |
840 | 0 | && (self.nullable || !other.nullable) |
841 | | // make sure self.metadata is a superset of other.metadata |
842 | 0 | && other.metadata.iter().all(|(k, v1)| { |
843 | 0 | self.metadata.get(k).map(|v2| v1 == v2).unwrap_or_default() |
844 | 0 | }) |
845 | 0 | } |
846 | | |
847 | | /// Return size of this instance in bytes. |
848 | | /// |
849 | | /// Includes the size of `Self`. |
850 | 0 | pub fn size(&self) -> usize { |
851 | 0 | std::mem::size_of_val(self) - std::mem::size_of_val(&self.data_type) |
852 | 0 | + self.data_type.size() |
853 | 0 | + self.name.capacity() |
854 | 0 | + (std::mem::size_of::<(String, String)>() * self.metadata.capacity()) |
855 | 0 | + self |
856 | 0 | .metadata |
857 | 0 | .iter() |
858 | 0 | .map(|(k, v)| k.capacity() + v.capacity()) |
859 | 0 | .sum::<usize>() |
860 | 0 | } |
861 | | } |
862 | | |
863 | | // TODO: improve display with crate https://crates.io/crates/derive_more ? |
864 | | impl std::fmt::Display for Field { |
865 | 0 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { |
866 | 0 | write!(f, "{self:?}") |
867 | 0 | } |
868 | | } |
869 | | |
870 | | #[cfg(test)] |
871 | | mod test { |
872 | | use super::*; |
873 | | use std::collections::hash_map::DefaultHasher; |
874 | | |
875 | | #[test] |
876 | | fn test_new_with_string() { |
877 | | // Fields should allow owned Strings to support reuse |
878 | | let s = "c1"; |
879 | | Field::new(s, DataType::Int64, false); |
880 | | } |
881 | | |
882 | | #[test] |
883 | | fn test_new_dict_with_string() { |
884 | | // Fields should allow owned Strings to support reuse |
885 | | let s = "c1"; |
886 | | #[allow(deprecated)] |
887 | | Field::new_dict(s, DataType::Int64, false, 4, false); |
888 | | } |
889 | | |
890 | | #[test] |
891 | | fn test_merge_incompatible_types() { |
892 | | let mut field = Field::new("c1", DataType::Int64, false); |
893 | | let result = field |
894 | | .try_merge(&Field::new("c1", DataType::Float32, true)) |
895 | | .expect_err("should fail") |
896 | | .to_string(); |
897 | | assert_eq!("Schema error: Fail to merge schema field 'c1' because the from data_type = Float32 does not equal Int64", result); |
898 | | } |
899 | | |
900 | | #[test] |
901 | | fn test_merge_with_null() { |
902 | | let mut field1 = Field::new("c1", DataType::Null, true); |
903 | | field1 |
904 | | .try_merge(&Field::new("c1", DataType::Float32, false)) |
905 | | .expect("should widen type to nullable float"); |
906 | | assert_eq!(Field::new("c1", DataType::Float32, true), field1); |
907 | | |
908 | | let mut field2 = Field::new("c2", DataType::Utf8, false); |
909 | | field2 |
910 | | .try_merge(&Field::new("c2", DataType::Null, true)) |
911 | | .expect("should widen type to nullable utf8"); |
912 | | assert_eq!(Field::new("c2", DataType::Utf8, true), field2); |
913 | | } |
914 | | |
915 | | #[test] |
916 | | fn test_merge_with_nested_null() { |
917 | | let mut struct1 = Field::new( |
918 | | "s1", |
919 | | DataType::Struct(Fields::from(vec![Field::new( |
920 | | "inner", |
921 | | DataType::Float32, |
922 | | false, |
923 | | )])), |
924 | | false, |
925 | | ); |
926 | | |
927 | | let struct2 = Field::new( |
928 | | "s2", |
929 | | DataType::Struct(Fields::from(vec![Field::new( |
930 | | "inner", |
931 | | DataType::Null, |
932 | | false, |
933 | | )])), |
934 | | true, |
935 | | ); |
936 | | |
937 | | struct1 |
938 | | .try_merge(&struct2) |
939 | | .expect("should widen inner field's type to nullable float"); |
940 | | assert_eq!( |
941 | | Field::new( |
942 | | "s1", |
943 | | DataType::Struct(Fields::from(vec![Field::new( |
944 | | "inner", |
945 | | DataType::Float32, |
946 | | true, |
947 | | )])), |
948 | | true, |
949 | | ), |
950 | | struct1 |
951 | | ); |
952 | | |
953 | | let mut list1 = Field::new( |
954 | | "l1", |
955 | | DataType::List(Field::new("inner", DataType::Float32, false).into()), |
956 | | false, |
957 | | ); |
958 | | |
959 | | let list2 = Field::new( |
960 | | "l2", |
961 | | DataType::List(Field::new("inner", DataType::Null, false).into()), |
962 | | true, |
963 | | ); |
964 | | |
965 | | list1 |
966 | | .try_merge(&list2) |
967 | | .expect("should widen inner field's type to nullable float"); |
968 | | assert_eq!( |
969 | | Field::new( |
970 | | "l1", |
971 | | DataType::List(Field::new("inner", DataType::Float32, true).into()), |
972 | | true, |
973 | | ), |
974 | | list1 |
975 | | ); |
976 | | |
977 | | let mut large_list1 = Field::new( |
978 | | "ll1", |
979 | | DataType::LargeList(Field::new("inner", DataType::Float32, false).into()), |
980 | | false, |
981 | | ); |
982 | | |
983 | | let large_list2 = Field::new( |
984 | | "ll2", |
985 | | DataType::LargeList(Field::new("inner", DataType::Null, false).into()), |
986 | | true, |
987 | | ); |
988 | | |
989 | | large_list1 |
990 | | .try_merge(&large_list2) |
991 | | .expect("should widen inner field's type to nullable float"); |
992 | | assert_eq!( |
993 | | Field::new( |
994 | | "ll1", |
995 | | DataType::LargeList(Field::new("inner", DataType::Float32, true).into()), |
996 | | true, |
997 | | ), |
998 | | large_list1 |
999 | | ); |
1000 | | } |
1001 | | |
1002 | | #[test] |
1003 | | fn test_fields_with_dict_id() { |
1004 | | #[allow(deprecated)] |
1005 | | let dict1 = Field::new_dict( |
1006 | | "dict1", |
1007 | | DataType::Dictionary(DataType::Utf8.into(), DataType::Int32.into()), |
1008 | | false, |
1009 | | 10, |
1010 | | false, |
1011 | | ); |
1012 | | #[allow(deprecated)] |
1013 | | let dict2 = Field::new_dict( |
1014 | | "dict2", |
1015 | | DataType::Dictionary(DataType::Int32.into(), DataType::Int8.into()), |
1016 | | false, |
1017 | | 20, |
1018 | | false, |
1019 | | ); |
1020 | | |
1021 | | let field = Field::new( |
1022 | | "struct<dict1, list[struct<dict2, list[struct<dict1]>]>", |
1023 | | DataType::Struct(Fields::from(vec![ |
1024 | | dict1.clone(), |
1025 | | Field::new( |
1026 | | "list[struct<dict1, list[struct<dict2>]>]", |
1027 | | DataType::List(Arc::new(Field::new( |
1028 | | "struct<dict1, list[struct<dict2>]>", |
1029 | | DataType::Struct(Fields::from(vec![ |
1030 | | dict1.clone(), |
1031 | | Field::new( |
1032 | | "list[struct<dict2>]", |
1033 | | DataType::List(Arc::new(Field::new( |
1034 | | "struct<dict2>", |
1035 | | DataType::Struct(vec![dict2.clone()].into()), |
1036 | | false, |
1037 | | ))), |
1038 | | false, |
1039 | | ), |
1040 | | ])), |
1041 | | false, |
1042 | | ))), |
1043 | | false, |
1044 | | ), |
1045 | | ])), |
1046 | | false, |
1047 | | ); |
1048 | | |
1049 | | #[allow(deprecated)] |
1050 | | for field in field.fields_with_dict_id(10) { |
1051 | | assert_eq!(dict1, *field); |
1052 | | } |
1053 | | #[allow(deprecated)] |
1054 | | for field in field.fields_with_dict_id(20) { |
1055 | | assert_eq!(dict2, *field); |
1056 | | } |
1057 | | } |
1058 | | |
1059 | | fn get_field_hash(field: &Field) -> u64 { |
1060 | | let mut s = DefaultHasher::new(); |
1061 | | field.hash(&mut s); |
1062 | | s.finish() |
1063 | | } |
1064 | | |
1065 | | #[test] |
1066 | | fn test_field_comparison_case() { |
1067 | | // dictionary-encoding properties not used for field comparison |
1068 | | #[allow(deprecated)] |
1069 | | let dict1 = Field::new_dict( |
1070 | | "dict1", |
1071 | | DataType::Dictionary(DataType::Utf8.into(), DataType::Int32.into()), |
1072 | | false, |
1073 | | 10, |
1074 | | false, |
1075 | | ); |
1076 | | #[allow(deprecated)] |
1077 | | let dict2 = Field::new_dict( |
1078 | | "dict1", |
1079 | | DataType::Dictionary(DataType::Utf8.into(), DataType::Int32.into()), |
1080 | | false, |
1081 | | 20, |
1082 | | false, |
1083 | | ); |
1084 | | |
1085 | | assert_eq!(dict1, dict2); |
1086 | | assert_eq!(get_field_hash(&dict1), get_field_hash(&dict2)); |
1087 | | |
1088 | | #[allow(deprecated)] |
1089 | | let dict1 = Field::new_dict( |
1090 | | "dict0", |
1091 | | DataType::Dictionary(DataType::Utf8.into(), DataType::Int32.into()), |
1092 | | false, |
1093 | | 10, |
1094 | | false, |
1095 | | ); |
1096 | | |
1097 | | assert_ne!(dict1, dict2); |
1098 | | assert_ne!(get_field_hash(&dict1), get_field_hash(&dict2)); |
1099 | | } |
1100 | | |
1101 | | #[test] |
1102 | | fn test_field_comparison_metadata() { |
1103 | | let f1 = Field::new("x", DataType::Binary, false).with_metadata(HashMap::from([ |
1104 | | (String::from("k1"), String::from("v1")), |
1105 | | (String::from("k2"), String::from("v2")), |
1106 | | ])); |
1107 | | let f2 = Field::new("x", DataType::Binary, false).with_metadata(HashMap::from([ |
1108 | | (String::from("k1"), String::from("v1")), |
1109 | | (String::from("k3"), String::from("v3")), |
1110 | | ])); |
1111 | | let f3 = Field::new("x", DataType::Binary, false).with_metadata(HashMap::from([ |
1112 | | (String::from("k1"), String::from("v1")), |
1113 | | (String::from("k3"), String::from("v4")), |
1114 | | ])); |
1115 | | |
1116 | | assert!(f1.cmp(&f2).is_lt()); |
1117 | | assert!(f2.cmp(&f3).is_lt()); |
1118 | | assert!(f1.cmp(&f3).is_lt()); |
1119 | | } |
1120 | | |
1121 | | #[test] |
1122 | | fn test_contains_reflexivity() { |
1123 | | let mut field = Field::new("field1", DataType::Float16, false); |
1124 | | field.set_metadata(HashMap::from([ |
1125 | | (String::from("k0"), String::from("v0")), |
1126 | | (String::from("k1"), String::from("v1")), |
1127 | | ])); |
1128 | | assert!(field.contains(&field)) |
1129 | | } |
1130 | | |
1131 | | #[test] |
1132 | | fn test_contains_transitivity() { |
1133 | | let child_field = Field::new("child1", DataType::Float16, false); |
1134 | | |
1135 | | let mut field1 = Field::new( |
1136 | | "field1", |
1137 | | DataType::Struct(Fields::from(vec![child_field])), |
1138 | | false, |
1139 | | ); |
1140 | | field1.set_metadata(HashMap::from([(String::from("k1"), String::from("v1"))])); |
1141 | | |
1142 | | let mut field2 = Field::new("field1", DataType::Struct(Fields::default()), true); |
1143 | | field2.set_metadata(HashMap::from([(String::from("k2"), String::from("v2"))])); |
1144 | | field2.try_merge(&field1).unwrap(); |
1145 | | |
1146 | | let mut field3 = Field::new("field1", DataType::Struct(Fields::default()), false); |
1147 | | field3.set_metadata(HashMap::from([(String::from("k3"), String::from("v3"))])); |
1148 | | field3.try_merge(&field2).unwrap(); |
1149 | | |
1150 | | assert!(field2.contains(&field1)); |
1151 | | assert!(field3.contains(&field2)); |
1152 | | assert!(field3.contains(&field1)); |
1153 | | |
1154 | | assert!(!field1.contains(&field2)); |
1155 | | assert!(!field1.contains(&field3)); |
1156 | | assert!(!field2.contains(&field3)); |
1157 | | } |
1158 | | |
1159 | | #[test] |
1160 | | fn test_contains_nullable() { |
1161 | | let field1 = Field::new("field1", DataType::Boolean, true); |
1162 | | let field2 = Field::new("field1", DataType::Boolean, false); |
1163 | | assert!(field1.contains(&field2)); |
1164 | | assert!(!field2.contains(&field1)); |
1165 | | } |
1166 | | |
1167 | | #[test] |
1168 | | fn test_contains_must_have_same_fields() { |
1169 | | let child_field1 = Field::new("child1", DataType::Float16, false); |
1170 | | let child_field2 = Field::new("child2", DataType::Float16, false); |
1171 | | |
1172 | | let field1 = Field::new( |
1173 | | "field1", |
1174 | | DataType::Struct(vec![child_field1.clone()].into()), |
1175 | | true, |
1176 | | ); |
1177 | | let field2 = Field::new( |
1178 | | "field1", |
1179 | | DataType::Struct(vec![child_field1, child_field2].into()), |
1180 | | true, |
1181 | | ); |
1182 | | |
1183 | | assert!(!field1.contains(&field2)); |
1184 | | assert!(!field2.contains(&field1)); |
1185 | | |
1186 | | // UnionFields with different type ID |
1187 | | let field1 = Field::new( |
1188 | | "field1", |
1189 | | DataType::Union( |
1190 | | UnionFields::new( |
1191 | | vec![1, 2], |
1192 | | vec![ |
1193 | | Field::new("field1", DataType::UInt8, true), |
1194 | | Field::new("field3", DataType::Utf8, false), |
1195 | | ], |
1196 | | ), |
1197 | | UnionMode::Dense, |
1198 | | ), |
1199 | | true, |
1200 | | ); |
1201 | | let field2 = Field::new( |
1202 | | "field1", |
1203 | | DataType::Union( |
1204 | | UnionFields::new( |
1205 | | vec![1, 3], |
1206 | | vec![ |
1207 | | Field::new("field1", DataType::UInt8, false), |
1208 | | Field::new("field3", DataType::Utf8, false), |
1209 | | ], |
1210 | | ), |
1211 | | UnionMode::Dense, |
1212 | | ), |
1213 | | true, |
1214 | | ); |
1215 | | assert!(!field1.contains(&field2)); |
1216 | | |
1217 | | // UnionFields with same type ID |
1218 | | let field1 = Field::new( |
1219 | | "field1", |
1220 | | DataType::Union( |
1221 | | UnionFields::new( |
1222 | | vec![1, 2], |
1223 | | vec![ |
1224 | | Field::new("field1", DataType::UInt8, true), |
1225 | | Field::new("field3", DataType::Utf8, false), |
1226 | | ], |
1227 | | ), |
1228 | | UnionMode::Dense, |
1229 | | ), |
1230 | | true, |
1231 | | ); |
1232 | | let field2 = Field::new( |
1233 | | "field1", |
1234 | | DataType::Union( |
1235 | | UnionFields::new( |
1236 | | vec![1, 2], |
1237 | | vec![ |
1238 | | Field::new("field1", DataType::UInt8, false), |
1239 | | Field::new("field3", DataType::Utf8, false), |
1240 | | ], |
1241 | | ), |
1242 | | UnionMode::Dense, |
1243 | | ), |
1244 | | true, |
1245 | | ); |
1246 | | assert!(field1.contains(&field2)); |
1247 | | } |
1248 | | |
1249 | | #[cfg(feature = "serde")] |
1250 | | fn assert_binary_serde_round_trip(field: Field) { |
1251 | | let serialized = bincode::serialize(&field).unwrap(); |
1252 | | let deserialized: Field = bincode::deserialize(&serialized).unwrap(); |
1253 | | assert_eq!(field, deserialized) |
1254 | | } |
1255 | | |
1256 | | #[cfg(feature = "serde")] |
1257 | | #[test] |
1258 | | fn test_field_without_metadata_serde() { |
1259 | | let field = Field::new("name", DataType::Boolean, true); |
1260 | | assert_binary_serde_round_trip(field) |
1261 | | } |
1262 | | |
1263 | | #[cfg(feature = "serde")] |
1264 | | #[test] |
1265 | | fn test_field_with_empty_metadata_serde() { |
1266 | | let field = Field::new("name", DataType::Boolean, false).with_metadata(HashMap::new()); |
1267 | | |
1268 | | assert_binary_serde_round_trip(field) |
1269 | | } |
1270 | | |
1271 | | #[cfg(feature = "serde")] |
1272 | | #[test] |
1273 | | fn test_field_with_nonempty_metadata_serde() { |
1274 | | let mut metadata = HashMap::new(); |
1275 | | metadata.insert("hi".to_owned(), "".to_owned()); |
1276 | | let field = Field::new("name", DataType::Boolean, false).with_metadata(metadata); |
1277 | | |
1278 | | assert_binary_serde_round_trip(field) |
1279 | | } |
1280 | | } |