/Users/andrewlamb/Software/arrow-rs/arrow-schema/src/datatype.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use std::fmt; |
19 | | use std::str::FromStr; |
20 | | use std::sync::Arc; |
21 | | |
22 | | use crate::{ArrowError, Field, FieldRef, Fields, UnionFields}; |
23 | | |
24 | | /// Datatypes supported by this implementation of Apache Arrow. |
25 | | /// |
26 | | /// The variants of this enum include primitive fixed size types as well as |
27 | | /// parametric or nested types. See [`Schema.fbs`] for Arrow's specification. |
28 | | /// |
29 | | /// # Examples |
30 | | /// |
31 | | /// Primitive types |
32 | | /// ``` |
33 | | /// # use arrow_schema::DataType; |
34 | | /// // create a new 32-bit signed integer |
35 | | /// let data_type = DataType::Int32; |
36 | | /// ``` |
37 | | /// |
38 | | /// Nested Types |
39 | | /// ``` |
40 | | /// # use arrow_schema::{DataType, Field}; |
41 | | /// # use std::sync::Arc; |
42 | | /// // create a new list of 32-bit signed integers directly |
43 | | /// let list_data_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))); |
44 | | /// // Create the same list type with constructor |
45 | | /// let list_data_type2 = DataType::new_list(DataType::Int32, true); |
46 | | /// assert_eq!(list_data_type, list_data_type2); |
47 | | /// ``` |
48 | | /// |
49 | | /// Dictionary Types |
50 | | /// ``` |
51 | | /// # use arrow_schema::{DataType}; |
52 | | /// // String Dictionary (key type Int32 and value type Utf8) |
53 | | /// let data_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)); |
54 | | /// ``` |
55 | | /// |
56 | | /// Timestamp Types |
57 | | /// ``` |
58 | | /// # use arrow_schema::{DataType, TimeUnit}; |
59 | | /// // timestamp with millisecond precision without timezone specified |
60 | | /// let data_type = DataType::Timestamp(TimeUnit::Millisecond, None); |
61 | | /// // timestamp with nanosecond precision in UTC timezone |
62 | | /// let data_type = DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".into())); |
63 | | /// ``` |
64 | | /// |
65 | | /// # Display and FromStr |
66 | | /// |
67 | | /// The `Display` and `FromStr` implementations for `DataType` are |
68 | | /// human-readable, parseable, and reversible. |
69 | | /// |
70 | | /// ``` |
71 | | /// # use arrow_schema::DataType; |
72 | | /// let data_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)); |
73 | | /// let data_type_string = data_type.to_string(); |
74 | | /// assert_eq!(data_type_string, "Dictionary(Int32, Utf8)"); |
75 | | /// // display can be parsed back into the original type |
76 | | /// let parsed_data_type: DataType = data_type.to_string().parse().unwrap(); |
77 | | /// assert_eq!(data_type, parsed_data_type); |
78 | | /// ``` |
79 | | /// |
80 | | /// # Nested Support |
81 | | /// Currently, the Rust implementation supports the following nested types: |
82 | | /// - `List<T>` |
83 | | /// - `LargeList<T>` |
84 | | /// - `FixedSizeList<T>` |
85 | | /// - `Struct<T, U, V, ...>` |
86 | | /// - `Union<T, U, V, ...>` |
87 | | /// - `Map<K, V>` |
88 | | /// |
89 | | /// Nested types can themselves be nested within other arrays. |
90 | | /// For more information on these types please see |
91 | | /// [the physical memory layout of Apache Arrow] |
92 | | /// |
93 | | /// [`Schema.fbs`]: https://github.com/apache/arrow/blob/main/format/Schema.fbs |
94 | | /// [the physical memory layout of Apache Arrow]: https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout |
95 | | #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] |
96 | | #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] |
97 | | pub enum DataType { |
98 | | /// Null type |
99 | | Null, |
100 | | /// A boolean datatype representing the values `true` and `false`. |
101 | | Boolean, |
102 | | /// A signed 8-bit integer. |
103 | | Int8, |
104 | | /// A signed 16-bit integer. |
105 | | Int16, |
106 | | /// A signed 32-bit integer. |
107 | | Int32, |
108 | | /// A signed 64-bit integer. |
109 | | Int64, |
110 | | /// An unsigned 8-bit integer. |
111 | | UInt8, |
112 | | /// An unsigned 16-bit integer. |
113 | | UInt16, |
114 | | /// An unsigned 32-bit integer. |
115 | | UInt32, |
116 | | /// An unsigned 64-bit integer. |
117 | | UInt64, |
118 | | /// A 16-bit floating point number. |
119 | | Float16, |
120 | | /// A 32-bit floating point number. |
121 | | Float32, |
122 | | /// A 64-bit floating point number. |
123 | | Float64, |
124 | | /// A timestamp with an optional timezone. |
125 | | /// |
126 | | /// Time is measured as a Unix epoch, counting the seconds from |
127 | | /// 00:00:00.000 on 1 January 1970, excluding leap seconds, |
128 | | /// as a signed 64-bit integer. |
129 | | /// |
130 | | /// The time zone is a string indicating the name of a time zone, one of: |
131 | | /// |
132 | | /// * As used in the Olson time zone database (the "tz database" or |
133 | | /// "tzdata"), such as "America/New_York" |
134 | | /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 |
135 | | /// |
136 | | /// Timestamps with a non-empty timezone |
137 | | /// ------------------------------------ |
138 | | /// |
139 | | /// If a Timestamp column has a non-empty timezone value, its epoch is |
140 | | /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in the *UTC* timezone |
141 | | /// (the Unix epoch), regardless of the Timestamp's own timezone. |
142 | | /// |
143 | | /// Therefore, timestamp values with a non-empty timezone correspond to |
144 | | /// physical points in time together with some additional information about |
145 | | /// how the data was obtained and/or how to display it (the timezone). |
146 | | /// |
147 | | /// For example, the timestamp value 0 with the timezone string "Europe/Paris" |
148 | | /// corresponds to "January 1st 1970, 00h00" in the UTC timezone, but the |
149 | | /// application may prefer to display it as "January 1st 1970, 01h00" in |
150 | | /// the Europe/Paris timezone (which is the same physical point in time). |
151 | | /// |
152 | | /// One consequence is that timestamp values with a non-empty timezone |
153 | | /// can be compared and ordered directly, since they all share the same |
154 | | /// well-known point of reference (the Unix epoch). |
155 | | /// |
156 | | /// Timestamps with an unset / empty timezone |
157 | | /// ----------------------------------------- |
158 | | /// |
159 | | /// If a Timestamp column has no timezone value, its epoch is |
160 | | /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in an *unknown* timezone. |
161 | | /// |
162 | | /// Therefore, timestamp values without a timezone cannot be meaningfully |
163 | | /// interpreted as physical points in time, but only as calendar / clock |
164 | | /// indications ("wall clock time") in an unspecified timezone. |
165 | | /// |
166 | | /// For example, the timestamp value 0 with an empty timezone string |
167 | | /// corresponds to "January 1st 1970, 00h00" in an unknown timezone: there |
168 | | /// is not enough information to interpret it as a well-defined physical |
169 | | /// point in time. |
170 | | /// |
171 | | /// One consequence is that timestamp values without a timezone cannot |
172 | | /// be reliably compared or ordered, since they may have different points of |
173 | | /// reference. In particular, it is *not* possible to interpret an unset |
174 | | /// or empty timezone as the same as "UTC". |
175 | | /// |
176 | | /// Conversion between timezones |
177 | | /// ---------------------------- |
178 | | /// |
179 | | /// If a Timestamp column has a non-empty timezone, changing the timezone |
180 | | /// to a different non-empty value is a metadata-only operation: |
181 | | /// the timestamp values need not change as their point of reference remains |
182 | | /// the same (the Unix epoch). |
183 | | /// |
184 | | /// However, if a Timestamp column has no timezone value, changing it to a |
185 | | /// non-empty value requires thinking about the desired semantics. |
186 | | /// One possibility is to assume that the original timestamp values are |
187 | | /// relative to the epoch of the timezone being set; timestamp values should |
188 | | /// then be adjusted to the Unix epoch (for example, changing the timezone from |
189 | | /// empty to "Europe/Paris" would require converting the timestamp values |
190 | | /// from "Europe/Paris" to "UTC", which seems counter-intuitive but is |
191 | | /// nevertheless correct). |
192 | | /// |
193 | | /// ``` |
194 | | /// # use arrow_schema::{DataType, TimeUnit}; |
195 | | /// DataType::Timestamp(TimeUnit::Second, None); |
196 | | /// DataType::Timestamp(TimeUnit::Second, Some("literal".into())); |
197 | | /// DataType::Timestamp(TimeUnit::Second, Some("string".to_string().into())); |
198 | | /// ``` |
199 | | /// |
200 | | /// # Timezone representation |
201 | | /// ---------------------------- |
202 | | /// It is possible to use either the timezone string representation, such as "UTC", or the absolute time zone offset "+00:00". |
203 | | /// For timezones with fixed offsets, such as "UTC" or "JST", the offset representation is recommended, as it is more explicit and less ambiguous. |
204 | | /// |
205 | | /// Most arrow-rs functionalities use the absolute offset representation, |
206 | | /// such as [`PrimitiveArray::with_timezone_utc`] that applies a |
207 | | /// UTC timezone to timestamp arrays. |
208 | | /// |
209 | | /// [`PrimitiveArray::with_timezone_utc`]: https://docs.rs/arrow/latest/arrow/array/struct.PrimitiveArray.html#method.with_timezone_utc |
210 | | /// |
211 | | /// Timezone string parsing |
212 | | /// ----------------------- |
213 | | /// When feature `chrono-tz` is not enabled, allowed timezone strings are fixed offsets of the form "+09:00", "-09" or "+0930". |
214 | | /// |
215 | | /// When feature `chrono-tz` is enabled, additional strings supported by [chrono_tz](https://docs.rs/chrono-tz/latest/chrono_tz/) |
216 | | /// are also allowed, which include [IANA database](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones) |
217 | | /// timezones. |
218 | | Timestamp(TimeUnit, Option<Arc<str>>), |
219 | | /// A signed 32-bit date representing the elapsed time since UNIX epoch (1970-01-01) |
220 | | /// in days. |
221 | | Date32, |
222 | | /// A signed 64-bit date representing the elapsed time since UNIX epoch (1970-01-01) |
223 | | /// in milliseconds. |
224 | | /// |
225 | | /// # Valid Ranges |
226 | | /// |
227 | | /// According to the Arrow specification ([Schema.fbs]), values of Date64 |
228 | | /// are treated as the number of *days*, in milliseconds, since the UNIX |
229 | | /// epoch. Therefore, values of this type must be evenly divisible by |
230 | | /// `86_400_000`, the number of milliseconds in a standard day. |
231 | | /// |
232 | | /// It is not valid to store milliseconds that do not represent an exact |
233 | | /// day. The reason for this restriction is compatibility with other |
234 | | /// language's native libraries (specifically Java), which historically |
235 | | /// lacked a dedicated date type and only supported timestamps. |
236 | | /// |
237 | | /// # Validation |
238 | | /// |
239 | | /// This library does not validate or enforce that Date64 values are evenly |
240 | | /// divisible by `86_400_000` for performance and usability reasons. Date64 |
241 | | /// values are treated similarly to `Timestamp(TimeUnit::Millisecond, |
242 | | /// None)`: values will be displayed with a time of day if the value does |
243 | | /// not represent an exact day, and arithmetic will be done at the |
244 | | /// millisecond granularity. |
245 | | /// |
246 | | /// # Recommendation |
247 | | /// |
248 | | /// Users should prefer [`Date32`] to cleanly represent the number |
249 | | /// of days, or one of the Timestamp variants to include time as part of the |
250 | | /// representation, depending on their use case. |
251 | | /// |
252 | | /// # Further Reading |
253 | | /// |
254 | | /// For more details, see [#5288](https://github.com/apache/arrow-rs/issues/5288). |
255 | | /// |
256 | | /// [`Date32`]: Self::Date32 |
257 | | /// [Schema.fbs]: https://github.com/apache/arrow/blob/main/format/Schema.fbs |
258 | | Date64, |
259 | | /// A signed 32-bit time representing the elapsed time since midnight in the unit of `TimeUnit`. |
260 | | /// Must be either seconds or milliseconds. |
261 | | Time32(TimeUnit), |
262 | | /// A signed 64-bit time representing the elapsed time since midnight in the unit of `TimeUnit`. |
263 | | /// Must be either microseconds or nanoseconds. |
264 | | Time64(TimeUnit), |
265 | | /// Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds. |
266 | | Duration(TimeUnit), |
267 | | /// A "calendar" interval which models types that don't necessarily |
268 | | /// have a precise duration without the context of a base timestamp (e.g. |
269 | | /// days can differ in length during day light savings time transitions). |
270 | | Interval(IntervalUnit), |
271 | | /// Opaque binary data of variable length. |
272 | | /// |
273 | | /// A single Binary array can store up to [`i32::MAX`] bytes |
274 | | /// of binary data in total. |
275 | | Binary, |
276 | | /// Opaque binary data of fixed size. |
277 | | /// Enum parameter specifies the number of bytes per value. |
278 | | FixedSizeBinary(i32), |
279 | | /// Opaque binary data of variable length and 64-bit offsets. |
280 | | /// |
281 | | /// A single LargeBinary array can store up to [`i64::MAX`] bytes |
282 | | /// of binary data in total. |
283 | | LargeBinary, |
284 | | /// Opaque binary data of variable length. |
285 | | /// |
286 | | /// Logically the same as [`Binary`], but the internal representation uses a view |
287 | | /// struct that contains the string length and either the string's entire data |
288 | | /// inline (for small strings) or an inlined prefix, an index of another buffer, |
289 | | /// and an offset pointing to a slice in that buffer (for non-small strings). |
290 | | /// |
291 | | /// [`Binary`]: Self::Binary |
292 | | BinaryView, |
293 | | /// A variable-length string in Unicode with UTF-8 encoding. |
294 | | /// |
295 | | /// A single Utf8 array can store up to [`i32::MAX`] bytes |
296 | | /// of string data in total. |
297 | | Utf8, |
298 | | /// A variable-length string in Unicode with UTF-8 encoding and 64-bit offsets. |
299 | | /// |
300 | | /// A single LargeUtf8 array can store up to [`i64::MAX`] bytes |
301 | | /// of string data in total. |
302 | | LargeUtf8, |
303 | | /// A variable-length string in Unicode with UTF-8 encoding |
304 | | /// |
305 | | /// Logically the same as [`Utf8`], but the internal representation uses a view |
306 | | /// struct that contains the string length and either the string's entire data |
307 | | /// inline (for small strings) or an inlined prefix, an index of another buffer, |
308 | | /// and an offset pointing to a slice in that buffer (for non-small strings). |
309 | | /// |
310 | | /// [`Utf8`]: Self::Utf8 |
311 | | Utf8View, |
312 | | /// A list of some logical data type with variable length. |
313 | | /// |
314 | | /// A single List array can store up to [`i32::MAX`] elements in total. |
315 | | List(FieldRef), |
316 | | |
317 | | /// (NOT YET FULLY SUPPORTED) A list of some logical data type with variable length. |
318 | | /// |
319 | | /// Logically the same as [`List`], but the internal representation differs in how child |
320 | | /// data is referenced, allowing flexibility in how data is laid out. |
321 | | /// |
322 | | /// Note this data type is not yet fully supported. Using it with arrow APIs may result in `panic`s. |
323 | | /// |
324 | | /// [`List`]: Self::List |
325 | | ListView(FieldRef), |
326 | | /// A list of some logical data type with fixed length. |
327 | | FixedSizeList(FieldRef, i32), |
328 | | /// A list of some logical data type with variable length and 64-bit offsets. |
329 | | /// |
330 | | /// A single LargeList array can store up to [`i64::MAX`] elements in total. |
331 | | LargeList(FieldRef), |
332 | | |
333 | | /// (NOT YET FULLY SUPPORTED) A list of some logical data type with variable length and 64-bit offsets. |
334 | | /// |
335 | | /// Logically the same as [`LargeList`], but the internal representation differs in how child |
336 | | /// data is referenced, allowing flexibility in how data is laid out. |
337 | | /// |
338 | | /// Note this data type is not yet fully supported. Using it with arrow APIs may result in `panic`s. |
339 | | /// |
340 | | /// [`LargeList`]: Self::LargeList |
341 | | LargeListView(FieldRef), |
342 | | /// A nested datatype that contains a number of sub-fields. |
343 | | Struct(Fields), |
344 | | /// A nested datatype that can represent slots of differing types. Components: |
345 | | /// |
346 | | /// 1. [`UnionFields`] |
347 | | /// 2. The type of union (Sparse or Dense) |
348 | | Union(UnionFields, UnionMode), |
349 | | /// A dictionary encoded array (`key_type`, `value_type`), where |
350 | | /// each array element is an index of `key_type` into an |
351 | | /// associated dictionary of `value_type`. |
352 | | /// |
353 | | /// Dictionary arrays are used to store columns of `value_type` |
354 | | /// that contain many repeated values using less memory, but with |
355 | | /// a higher CPU overhead for some operations. |
356 | | /// |
357 | | /// This type is mostly used to represent low cardinality string |
358 | | /// arrays or a limited set of primitive types as integers. |
359 | | Dictionary(Box<DataType>, Box<DataType>), |
360 | | /// Exact 32-bit width decimal value with precision and scale |
361 | | /// |
362 | | /// * precision is the total number of digits |
363 | | /// * scale is the number of digits past the decimal |
364 | | /// |
365 | | /// For example the number 123.45 has precision 5 and scale 2. |
366 | | /// |
367 | | /// In certain situations, scale could be a negative number. A |
368 | | /// negative scale is the number of padding zeros to the right |
369 | | /// of the digits. |
370 | | /// |
371 | | /// For example the number 12300 could be treated as a decimal |
372 | | /// that has precision 3 and scale -2. |
373 | | Decimal32(u8, i8), |
374 | | /// Exact 64-bit width decimal value with precision and scale |
375 | | /// |
376 | | /// * precision is the total number of digits |
377 | | /// * scale is the number of digits past the decimal |
378 | | /// |
379 | | /// For example the number 123.45 has precision 5 and scale 2. |
380 | | /// |
381 | | /// In certain situations, scale could be a negative number. A |
382 | | /// negative scale is the number of padding zeros to the right |
383 | | /// of the digits. |
384 | | /// |
385 | | /// For example the number 12300 could be treated as a decimal |
386 | | /// that has precision 3 and scale -2. |
387 | | Decimal64(u8, i8), |
388 | | /// Exact 128-bit width decimal value with precision and scale |
389 | | /// |
390 | | /// * precision is the total number of digits |
391 | | /// * scale is the number of digits past the decimal |
392 | | /// |
393 | | /// For example the number 123.45 has precision 5 and scale 2. |
394 | | /// |
395 | | /// In certain situations, scale could be a negative number. A |
396 | | /// negative scale is the number of padding zeros to the right |
397 | | /// of the digits. |
398 | | /// |
399 | | /// For example the number 12300 could be treated as a decimal |
400 | | /// that has precision 3 and scale -2. |
401 | | Decimal128(u8, i8), |
402 | | /// Exact 256-bit width decimal value with precision and scale |
403 | | /// |
404 | | /// * precision is the total number of digits |
405 | | /// * scale is the number of digits past the decimal |
406 | | /// |
407 | | /// For example the number 123.45 has precision 5 and scale 2. |
408 | | /// |
409 | | /// In certain situations, scale could be a negative number. A |
410 | | /// negative scale is the number of padding zeros to the right |
411 | | /// of the digits. |
412 | | /// |
413 | | /// For example the number 12300 could be treated as a decimal |
414 | | /// that has precision 3 and scale -2. |
415 | | Decimal256(u8, i8), |
416 | | /// A Map is a logical nested type that is represented as |
417 | | /// |
418 | | /// `List<entries: Struct<key: K, value: V>>` |
419 | | /// |
420 | | /// The keys and values are each respectively contiguous. |
421 | | /// The key and value types are not constrained, but keys should be |
422 | | /// hashable and unique. |
423 | | /// Whether the keys are sorted can be set in the `bool` after the `Field`. |
424 | | /// |
425 | | /// In a field with Map type, the field has a child Struct field, which then |
426 | | /// has two children: the first the key type and the second the value type. The names of the |
427 | | /// child fields may be respectively "entries", "key", and "value", but this is |
428 | | /// not enforced. |
429 | | Map(FieldRef, bool), |
430 | | /// A run-end encoding (REE) is a variation of run-length encoding (RLE). These |
431 | | /// encodings are well-suited for representing data containing sequences of the |
432 | | /// same value, called runs. Each run is represented as a value and an integer giving |
433 | | /// the index in the array where the run ends. |
434 | | /// |
435 | | /// A run-end encoded array has no buffers by itself, but has two child arrays. The |
436 | | /// first child array, called the run ends array, holds either 16, 32, or 64-bit |
437 | | /// signed integers. The actual values of each run are held in the second child array. |
438 | | /// |
439 | | /// These child arrays are prescribed the standard names of "run_ends" and "values" |
440 | | /// respectively. |
441 | | RunEndEncoded(FieldRef, FieldRef), |
442 | | } |
443 | | |
444 | | /// The unit of a temporal value: seconds, milliseconds, microseconds or nanoseconds. |
445 | | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] |
446 | | #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] |
447 | | pub enum TimeUnit { |
448 | | /// Time in seconds. |
449 | | Second, |
450 | | /// Time in milliseconds. |
451 | | Millisecond, |
452 | | /// Time in microseconds. |
453 | | Microsecond, |
454 | | /// Time in nanoseconds. |
455 | | Nanosecond, |
456 | | } |
457 | | |
458 | | /// The unit of an interval, in SQL style: YEAR_MONTH, DAY_TIME, or MONTH_DAY_NANO. |
459 | | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] |
460 | | #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] |
461 | | pub enum IntervalUnit { |
462 | | /// Indicates the number of elapsed whole months, stored as 4-byte integers. |
463 | | YearMonth, |
464 | | /// Indicates the number of elapsed days and milliseconds, |
465 | | /// stored as 2 contiguous 32-bit integers (days, milliseconds) (8 bytes in total). |
466 | | DayTime, |
467 | | /// A triple of the number of elapsed months, days, and nanoseconds. |
468 | | /// The values are stored contiguously in 16-byte blocks. Months and |
469 | | /// days are encoded as 32-bit integers and nanoseconds is encoded as a |
470 | | /// 64-bit integer. All integers are signed. Each field is independent |
471 | | /// (e.g. there is no constraint that nanoseconds have the same sign |
472 | | /// as days or that the quantity of nanoseconds represents less |
473 | | /// than a day's worth of time). |
474 | | MonthDayNano, |
475 | | } |
476 | | |
477 | | /// The layout of a union: Sparse or Dense. |
478 | | #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Copy)] |
479 | | #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] |
480 | | pub enum UnionMode { |
481 | | /// Sparse union layout |
482 | | Sparse, |
483 | | /// Dense union layout |
484 | | Dense, |
485 | | } |
486 | | |
487 | | impl fmt::Display for DataType { |
488 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
489 | 0 | match &self { |
490 | 0 | DataType::Struct(fields) => { /* rendered as `Struct(name type, name type, ...)` */ |
491 | 0 | write!(f, "Struct(")?; |
492 | 0 | if !fields.is_empty() { |
493 | 0 | let fields_str = fields |
494 | 0 | .iter() |
495 | 0 | .map(|f| format!("{} {}", f.name(), f.data_type())) |
496 | 0 | .collect::<Vec<_>>() |
497 | 0 | .join(", "); |
498 | 0 | write!(f, "{fields_str}")?; |
499 | 0 | } |
500 | 0 | write!(f, ")")?; |
501 | 0 | Ok(()) |
502 | | } |
503 | 0 | _ => write!(f, "{self:?}"), /* every other variant reuses its `Debug` form */ |
504 | | } |
505 | 0 | } |
506 | | } |
507 | | |
508 | | /// Parses `str` into a `DataType`. |
509 | | /// |
510 | | /// This is the reverse of [`DataType`]'s `Display` |
511 | | /// impl, and maintains the invariant that |
512 | | /// `data_type.to_string().parse::<DataType>().unwrap() == data_type` |
513 | | /// |
514 | | /// # Example |
515 | | /// ``` |
516 | | /// use arrow_schema::DataType; |
517 | | /// |
518 | | /// let data_type: DataType = "Int32".parse().unwrap(); |
519 | | /// assert_eq!(data_type, DataType::Int32); |
520 | | /// ``` |
521 | | impl FromStr for DataType { |
522 | | type Err = ArrowError; |
523 | | |
524 | 0 | fn from_str(s: &str) -> Result<Self, Self::Err> { |
525 | 0 | crate::datatype_parse::parse_data_type(s) |
526 | 0 | } |
527 | | } |
528 | | |
529 | | impl TryFrom<&str> for DataType { |
530 | | type Error = ArrowError; |
531 | | |
532 | 0 | fn try_from(value: &str) -> Result<Self, Self::Error> { |
533 | 0 | value.parse() /* delegates to the `FromStr` impl for `DataType` */ |
534 | 0 | } |
535 | | } |
536 | | |
537 | | impl DataType { |
538 | | /// Returns true if this type is primitive, i.e. numeric ([`Self::is_numeric`]) or temporal ([`Self::is_temporal`]). |
539 | | #[inline] |
540 | 0 | pub fn is_primitive(&self) -> bool { |
541 | 0 | self.is_numeric() || self.is_temporal() |
542 | 0 | } |
543 | | |
544 | | /// Returns true if this type is numeric: an integer (UInt*, Int*), floating point (Float*), or decimal (Decimal*) type. |
545 | | #[inline] |
546 | 0 | pub fn is_numeric(&self) -> bool { |
547 | | use DataType::*; |
548 | 0 | matches!( |
549 | 0 | self, |
550 | | UInt8 |
551 | | | UInt16 |
552 | | | UInt32 |
553 | | | UInt64 |
554 | | | Int8 |
555 | | | Int16 |
556 | | | Int32 |
557 | | | Int64 |
558 | | | Float16 |
559 | | | Float32 |
560 | | | Float64 |
561 | | | Decimal32(_, _) |
562 | | | Decimal64(_, _) |
563 | | | Decimal128(_, _) |
564 | | | Decimal256(_, _) |
565 | | ) |
566 | 0 | } |
567 | | |
568 | | /// Returns true if this type is temporal: (Date*, Timestamp, Time*, Duration, or Interval). |
569 | | #[inline] |
570 | 0 | pub fn is_temporal(&self) -> bool { |
571 | | use DataType::*; |
572 | 0 | matches!( |
573 | 0 | self, |
574 | | Date32 | Date64 | Timestamp(_, _) | Time32(_) | Time64(_) | Duration(_) | Interval(_) |
575 | | ) |
576 | 0 | } |
577 | | |
578 | | /// Returns true if this type is a floating point type: (Float16, Float32, or Float64). |
579 | | #[inline] |
580 | 0 | pub fn is_floating(&self) -> bool { |
581 | | use DataType::*; |
582 | 0 | matches!(self, Float16 | Float32 | Float64) |
583 | 0 | } |
584 | | |
585 | | /// Returns true if this type is an integer, signed or unsigned: (Int*, UInt*). |
586 | | #[inline] |
587 | 0 | pub fn is_integer(&self) -> bool { |
588 | 0 | self.is_signed_integer() || self.is_unsigned_integer() |
589 | 0 | } |
590 | | |
591 | | /// Returns true if this type is a signed integer: (Int8, Int16, Int32, or Int64). |
592 | | #[inline] |
593 | 0 | pub fn is_signed_integer(&self) -> bool { |
594 | | use DataType::*; |
595 | 0 | matches!(self, Int8 | Int16 | Int32 | Int64) |
596 | 0 | } |
597 | | |
598 | | /// Returns true if this type is an unsigned integer: (UInt8, UInt16, UInt32, or UInt64). |
599 | | #[inline] |
600 | 0 | pub fn is_unsigned_integer(&self) -> bool { |
601 | | use DataType::*; |
602 | 0 | matches!(self, UInt8 | UInt16 | UInt32 | UInt64) |
603 | 0 | } |
604 | | |
605 | | /// Returns true if this type is valid as a dictionary key, i.e. any signed or unsigned integer type. |
606 | | #[inline] |
607 | 0 | pub fn is_dictionary_key_type(&self) -> bool { |
608 | 0 | self.is_integer() |
609 | 0 | } |
610 | | |
611 | | /// Returns true if this type is valid for the run-ends array of a RunArray: (Int16, Int32, or Int64). |
612 | | #[inline] |
613 | 0 | pub fn is_run_ends_type(&self) -> bool { |
614 | | use DataType::*; |
615 | 0 | matches!(self, Int16 | Int32 | Int64) |
616 | 0 | } |
617 | | |
618 | | /// Returns true if this type is nested (List, FixedSizeList, LargeList, ListView, LargeListView, Struct, Union, |
619 | | /// or Map), or a dictionary or run-end encoded type whose values are of a nested type |
620 | | #[inline] |
621 | 0 | pub fn is_nested(&self) -> bool { |
622 | | use DataType::*; |
623 | 0 | match self { |
624 | 0 | Dictionary(_, v) => DataType::is_nested(v.as_ref()), |
625 | 0 | RunEndEncoded(_, v) => DataType::is_nested(v.data_type()), |
626 | | List(_) |
627 | | | FixedSizeList(_, _) |
628 | | | LargeList(_) |
629 | | | ListView(_) |
630 | | | LargeListView(_) |
631 | | | Struct(_) |
632 | | | Union(_, _) |
633 | 0 | | Map(_, _) => true, |
634 | 0 | _ => false, |
635 | | } |
636 | 0 | } |
637 | | |
638 | | /// Returns true if this type is [`DataType::Null`]. |
639 | | #[inline] |
640 | 0 | pub fn is_null(&self) -> bool { |
641 | | use DataType::*; |
642 | 0 | matches!(self, Null) |
643 | 0 | } |
644 | | |
645 | | /// Compares the datatype with another, ignoring nested field names |
646 | | /// and metadata; nullability of nested fields must still match. |
647 | 0 | pub fn equals_datatype(&self, other: &DataType) -> bool { |
648 | 0 | match (&self, other) { |
649 | 0 | (DataType::List(a), DataType::List(b)) |
650 | 0 | | (DataType::LargeList(a), DataType::LargeList(b)) |
651 | 0 | | (DataType::ListView(a), DataType::ListView(b)) |
652 | 0 | | (DataType::LargeListView(a), DataType::LargeListView(b)) => { |
653 | 0 | a.is_nullable() == b.is_nullable() && a.data_type().equals_datatype(b.data_type()) |
654 | | } |
655 | 0 | (DataType::FixedSizeList(a, a_size), DataType::FixedSizeList(b, b_size)) => { |
656 | 0 | a_size == b_size |
657 | 0 | && a.is_nullable() == b.is_nullable() |
658 | 0 | && a.data_type().equals_datatype(b.data_type()) |
659 | | } |
660 | 0 | (DataType::Struct(a), DataType::Struct(b)) => { |
661 | 0 | a.len() == b.len() |
662 | 0 | && a.iter().zip(b).all(|(a, b)| { |
663 | 0 | a.is_nullable() == b.is_nullable() |
664 | 0 | && a.data_type().equals_datatype(b.data_type()) |
665 | 0 | }) |
666 | | } |
667 | 0 | (DataType::Map(a_field, a_is_sorted), DataType::Map(b_field, b_is_sorted)) => { |
668 | 0 | a_field.is_nullable() == b_field.is_nullable() |
669 | 0 | && a_field.data_type().equals_datatype(b_field.data_type()) |
670 | 0 | && a_is_sorted == b_is_sorted |
671 | | } |
672 | 0 | (DataType::Dictionary(a_key, a_value), DataType::Dictionary(b_key, b_value)) => { |
673 | 0 | a_key.equals_datatype(b_key) && a_value.equals_datatype(b_value) |
674 | | } |
675 | | ( |
676 | 0 | DataType::RunEndEncoded(a_run_ends, a_values), |
677 | 0 | DataType::RunEndEncoded(b_run_ends, b_values), |
678 | | ) => { |
679 | 0 | a_run_ends.is_nullable() == b_run_ends.is_nullable() |
680 | 0 | && a_run_ends |
681 | 0 | .data_type() |
682 | 0 | .equals_datatype(b_run_ends.data_type()) |
683 | 0 | && a_values.is_nullable() == b_values.is_nullable() |
684 | 0 | && a_values.data_type().equals_datatype(b_values.data_type()) |
685 | | } |
686 | | ( |
687 | 0 | DataType::Union(a_union_fields, a_union_mode), |
688 | 0 | DataType::Union(b_union_fields, b_union_mode), |
689 | | ) => { |
690 | 0 | a_union_mode == b_union_mode |
691 | 0 | && a_union_fields.len() == b_union_fields.len() |
692 | 0 | && a_union_fields.iter().all(|a| { |
693 | 0 | b_union_fields.iter().any(|b| { |
694 | 0 | a.0 == b.0 |
695 | 0 | && a.1.is_nullable() == b.1.is_nullable() |
696 | 0 | && a.1.data_type().equals_datatype(b.1.data_type()) |
697 | 0 | }) |
698 | 0 | }) |
699 | | } |
700 | 0 | _ => self == other, |
701 | | } |
702 | 0 | } |
703 | | |
704 | | /// Returns the byte width of this type if it is a primitive type |
705 | | /// |
706 | | /// Returns `None` if not a primitive type |
707 | | #[inline] |
708 | 3 | pub fn primitive_width(&self) -> Option<usize> { |
709 | 0 | match self { |
710 | 0 | DataType::Null => None, |
711 | 0 | DataType::Boolean => None, |
712 | 0 | DataType::Int8 | DataType::UInt8 => Some(1), |
713 | 0 | DataType::Int16 | DataType::UInt16 | DataType::Float16 => Some(2), |
714 | 2 | DataType::Int32 | DataType::UInt32 | DataType::Float32 => Some(4), |
715 | 1 | DataType::Int64 | DataType::UInt64 | DataType::Float64 => Some(8), |
716 | 0 | DataType::Timestamp(_, _) => Some(8), |
717 | 0 | DataType::Date32 | DataType::Time32(_) => Some(4), |
718 | 0 | DataType::Date64 | DataType::Time64(_) => Some(8), |
719 | 0 | DataType::Duration(_) => Some(8), |
720 | 0 | DataType::Interval(IntervalUnit::YearMonth) => Some(4), |
721 | 0 | DataType::Interval(IntervalUnit::DayTime) => Some(8), |
722 | 0 | DataType::Interval(IntervalUnit::MonthDayNano) => Some(16), |
723 | 0 | DataType::Decimal32(_, _) => Some(4), |
724 | 0 | DataType::Decimal64(_, _) => Some(8), |
725 | 0 | DataType::Decimal128(_, _) => Some(16), |
726 | 0 | DataType::Decimal256(_, _) => Some(32), |
727 | 0 | DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => None, |
728 | 0 | DataType::Binary | DataType::LargeBinary | DataType::BinaryView => None, |
729 | 0 | DataType::FixedSizeBinary(_) => None, |
730 | | DataType::List(_) |
731 | | | DataType::ListView(_) |
732 | | | DataType::LargeList(_) |
733 | | | DataType::LargeListView(_) |
734 | 0 | | DataType::Map(_, _) => None, |
735 | 0 | DataType::FixedSizeList(_, _) => None, |
736 | 0 | DataType::Struct(_) => None, |
737 | 0 | DataType::Union(_, _) => None, |
738 | 0 | DataType::Dictionary(_, _) => None, |
739 | 0 | DataType::RunEndEncoded(_, _) => None, |
740 | | } |
741 | 3 | } |
742 | | |
743 | | /// Return size of this instance in bytes. |
744 | | /// |
745 | | /// Includes the size of `Self`. |
746 | 0 | pub fn size(&self) -> usize { |
747 | 0 | std::mem::size_of_val(self) |
748 | 0 | + match self { |
749 | | DataType::Null |
750 | | | DataType::Boolean |
751 | | | DataType::Int8 |
752 | | | DataType::Int16 |
753 | | | DataType::Int32 |
754 | | | DataType::Int64 |
755 | | | DataType::UInt8 |
756 | | | DataType::UInt16 |
757 | | | DataType::UInt32 |
758 | | | DataType::UInt64 |
759 | | | DataType::Float16 |
760 | | | DataType::Float32 |
761 | | | DataType::Float64 |
762 | | | DataType::Date32 |
763 | | | DataType::Date64 |
764 | | | DataType::Time32(_) |
765 | | | DataType::Time64(_) |
766 | | | DataType::Duration(_) |
767 | | | DataType::Interval(_) |
768 | | | DataType::Binary |
769 | | | DataType::FixedSizeBinary(_) |
770 | | | DataType::LargeBinary |
771 | | | DataType::BinaryView |
772 | | | DataType::Utf8 |
773 | | | DataType::LargeUtf8 |
774 | | | DataType::Utf8View |
775 | | | DataType::Decimal32(_, _) |
776 | | | DataType::Decimal64(_, _) |
777 | | | DataType::Decimal128(_, _) |
778 | 0 | | DataType::Decimal256(_, _) => 0, |
779 | 0 | DataType::Timestamp(_, s) => s.as_ref().map(|s| s.len()).unwrap_or_default(), |
780 | 0 | DataType::List(field) |
781 | 0 | | DataType::ListView(field) |
782 | 0 | | DataType::FixedSizeList(field, _) |
783 | 0 | | DataType::LargeList(field) |
784 | 0 | | DataType::LargeListView(field) |
785 | 0 | | DataType::Map(field, _) => field.size(), |
786 | 0 | DataType::Struct(fields) => fields.size(), |
787 | 0 | DataType::Union(fields, _) => fields.size(), |
788 | 0 | DataType::Dictionary(dt1, dt2) => dt1.size() + dt2.size(), |
789 | 0 | DataType::RunEndEncoded(run_ends, values) => { |
790 | 0 | run_ends.size() - std::mem::size_of_val(run_ends) + values.size() |
791 | 0 | - std::mem::size_of_val(values) |
792 | | } |
793 | | } |
794 | 0 | } |
795 | | |
796 | | /// Check to see if `self` is a superset of `other` |
797 | | /// |
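798 | | /// For list, fixed-size list, map, struct, union and dictionary types, the check recurses into the nested fields or types (fixed sizes, map sortedness and union modes must also match). |
799 | | /// For every other type, or when the two variants differ, `self` contains `other` only if the two types are equal. |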
800 | 0 | pub fn contains(&self, other: &DataType) -> bool { |
801 | 0 | match (self, other) { |
802 | 0 | (DataType::List(f1), DataType::List(f2)) |
803 | 0 | | (DataType::LargeList(f1), DataType::LargeList(f2)) |
804 | 0 | | (DataType::ListView(f1), DataType::ListView(f2)) |
805 | 0 | | (DataType::LargeListView(f1), DataType::LargeListView(f2)) => f1.contains(f2), |
806 | 0 | (DataType::FixedSizeList(f1, s1), DataType::FixedSizeList(f2, s2)) => { |
807 | 0 | s1 == s2 && f1.contains(f2) |
808 | | } |
809 | 0 | (DataType::Map(f1, s1), DataType::Map(f2, s2)) => s1 == s2 && f1.contains(f2), |
810 | 0 | (DataType::Struct(f1), DataType::Struct(f2)) => f1.contains(f2), |
811 | 0 | (DataType::Union(f1, s1), DataType::Union(f2, s2)) => { |
812 | 0 | s1 == s2 |
813 | 0 | && f1 |
814 | 0 | .iter() |
815 | 0 | .all(|f1| f2.iter().any(|f2| f1.0 == f2.0 && f1.1.contains(f2.1))) |
816 | | } |
817 | 0 | (DataType::Dictionary(k1, v1), DataType::Dictionary(k2, v2)) => { |
818 | 0 | k1.contains(k2) && v1.contains(v2) |
819 | | } |
820 | 0 | _ => self == other, |
821 | | } |
822 | 0 | } |
823 | | |
824 | | /// Create a [`DataType::List`] with elements of the specified type |
825 | | /// and nullability, and conventionally named inner [`Field`] (`"item"`). |
826 | | /// |
827 | | /// To specify field level metadata, construct the inner [`Field`] |
828 | | /// directly via [`Field::new`] or [`Field::new_list_field`]. |
829 | 0 | pub fn new_list(data_type: DataType, nullable: bool) -> Self { |
830 | 0 | DataType::List(Arc::new(Field::new_list_field(data_type, nullable))) |
831 | 0 | } |
832 | | |
833 | | /// Create a [`DataType::LargeList`] with elements of the specified type |
834 | | /// and nullability, and conventionally named inner [`Field`] (`"item"`). |
835 | | /// |
836 | | /// To specify field level metadata, construct the inner [`Field`] |
837 | | /// directly via [`Field::new`] or [`Field::new_list_field`]. |
838 | 0 | pub fn new_large_list(data_type: DataType, nullable: bool) -> Self { |
839 | 0 | DataType::LargeList(Arc::new(Field::new_list_field(data_type, nullable))) |
840 | 0 | } |
841 | | |
842 | | /// Create a [`DataType::FixedSizeList`] with elements of the specified type, size |
843 | | /// and nullability, and conventionally named inner [`Field`] (`"item"`). |
844 | | /// |
845 | | /// To specify field level metadata, construct the inner [`Field`] |
846 | | /// directly via [`Field::new`] or [`Field::new_list_field`]. |
847 | 0 | pub fn new_fixed_size_list(data_type: DataType, size: i32, nullable: bool) -> Self { |
848 | 0 | DataType::FixedSizeList(Arc::new(Field::new_list_field(data_type, nullable)), size) |
849 | 0 | } |
850 | | } |
851 | | |
852 | | /// The maximum precision for [DataType::Decimal32] values |
853 | | pub const DECIMAL32_MAX_PRECISION: u8 = 9; |
854 | | |
855 | | /// The maximum scale for [DataType::Decimal32] values |
856 | | pub const DECIMAL32_MAX_SCALE: i8 = 9; |
857 | | |
858 | | /// The maximum precision for [DataType::Decimal64] values |
859 | | pub const DECIMAL64_MAX_PRECISION: u8 = 18; |
860 | | |
861 | | /// The maximum scale for [DataType::Decimal64] values |
862 | | pub const DECIMAL64_MAX_SCALE: i8 = 18; |
863 | | |
864 | | /// The maximum precision for [DataType::Decimal128] values |
865 | | pub const DECIMAL128_MAX_PRECISION: u8 = 38; |
866 | | |
867 | | /// The maximum scale for [DataType::Decimal128] values |
868 | | pub const DECIMAL128_MAX_SCALE: i8 = 38; |
869 | | |
870 | | /// The maximum precision for [DataType::Decimal256] values |
871 | | pub const DECIMAL256_MAX_PRECISION: u8 = 76; |
872 | | |
873 | | /// The maximum scale for [DataType::Decimal256] values |
874 | | pub const DECIMAL256_MAX_SCALE: i8 = 76; |
875 | | |
876 | | /// The default scale for [DataType::Decimal32] values |
877 | | pub const DECIMAL32_DEFAULT_SCALE: i8 = 2; |
878 | | |
879 | | /// The default scale for [DataType::Decimal64] values |
880 | | pub const DECIMAL64_DEFAULT_SCALE: i8 = 6; |
881 | | |
882 | | /// The default scale for [DataType::Decimal128] and [DataType::Decimal256] |
883 | | /// values |
884 | | pub const DECIMAL_DEFAULT_SCALE: i8 = 10; |
885 | | |
886 | | #[cfg(test)] |
887 | | mod tests { |
888 | | use super::*; |
889 | | |
890 | | #[test] |
891 | | #[cfg(feature = "serde")] |
892 | | fn serde_struct_type() { |
893 | | use std::collections::HashMap; |
894 | | |
895 | | let kv_array = [("k".to_string(), "v".to_string())]; |
896 | | let field_metadata: HashMap<String, String> = kv_array.iter().cloned().collect(); |
897 | | |
898 | | // Non-empty map: should be converted as JSON obj { ... } |
899 | | let first_name = |
900 | | Field::new("first_name", DataType::Utf8, false).with_metadata(field_metadata); |
901 | | |
902 | | // Empty map: still serialized, as an empty JSON object `{}`. |
903 | | let last_name = |
904 | | Field::new("last_name", DataType::Utf8, false).with_metadata(HashMap::default()); |
905 | | |
906 | | let person = DataType::Struct(Fields::from(vec![ |
907 | | first_name, |
908 | | last_name, |
909 | | Field::new( |
910 | | "address", |
911 | | DataType::Struct(Fields::from(vec![ |
912 | | Field::new("street", DataType::Utf8, false), |
913 | | Field::new("zip", DataType::UInt16, false), |
914 | | ])), |
915 | | false, |
916 | | ), |
917 | | ])); |
918 | | |
919 | | let serialized = serde_json::to_string(&person).unwrap(); |
920 | | |
921 | | // NOTE that this is testing the default (derived) serialization format, not the |
922 | | // JSON format specified in metadata.md |
923 | | |
924 | | assert_eq!( |
925 | | "{\"Struct\":[\ |
926 | | {\"name\":\"first_name\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{\"k\":\"v\"}},\ |
927 | | {\"name\":\"last_name\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{}},\ |
928 | | {\"name\":\"address\",\"data_type\":{\"Struct\":\ |
929 | | [{\"name\":\"street\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{}},\ |
930 | | {\"name\":\"zip\",\"data_type\":\"UInt16\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{}}\ |
931 | | ]},\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{}}]}", |
932 | | serialized |
933 | | ); |
934 | | |
935 | | let deserialized = serde_json::from_str(&serialized).unwrap(); |
936 | | |
937 | | assert_eq!(person, deserialized); |
938 | | } |
939 | | |
940 | | #[test] |
941 | | fn test_list_datatype_equality() { |
942 | | // tests that `equals_datatype` compares nested types while ignoring the names of their child fields |
943 | | let list_a = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))); |
944 | | let list_b = DataType::List(Arc::new(Field::new("array", DataType::Int32, true))); |
945 | | let list_c = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, false))); |
946 | | let list_d = DataType::List(Arc::new(Field::new_list_field(DataType::UInt32, true))); |
947 | | assert!(list_a.equals_datatype(&list_b)); |
948 | | assert!(!list_a.equals_datatype(&list_c)); |
949 | | assert!(!list_b.equals_datatype(&list_c)); |
950 | | assert!(!list_a.equals_datatype(&list_d)); |
951 | | |
952 | | let list_e = |
953 | | DataType::FixedSizeList(Arc::new(Field::new_list_field(list_a.clone(), false)), 3); |
954 | | let list_f = |
955 | | DataType::FixedSizeList(Arc::new(Field::new("array", list_b.clone(), false)), 3); |
956 | | let list_g = DataType::FixedSizeList( |
957 | | Arc::new(Field::new_list_field(DataType::FixedSizeBinary(3), true)), |
958 | | 3, |
959 | | ); |
960 | | assert!(list_e.equals_datatype(&list_f)); |
961 | | assert!(!list_e.equals_datatype(&list_g)); |
962 | | assert!(!list_f.equals_datatype(&list_g)); |
963 | | |
964 | | let list_h = DataType::Struct(Fields::from(vec![Field::new("f1", list_e, true)])); |
965 | | let list_i = DataType::Struct(Fields::from(vec![Field::new("f1", list_f.clone(), true)])); |
966 | | let list_j = DataType::Struct(Fields::from(vec![Field::new("f1", list_f.clone(), false)])); |
967 | | let list_k = DataType::Struct(Fields::from(vec![ |
968 | | Field::new("f1", list_f.clone(), false), |
969 | | Field::new("f2", list_g.clone(), false), |
970 | | Field::new("f3", DataType::Utf8, true), |
971 | | ])); |
972 | | let list_l = DataType::Struct(Fields::from(vec![ |
973 | | Field::new("ff1", list_f.clone(), false), |
974 | | Field::new("ff2", list_g.clone(), false), |
975 | | Field::new("ff3", DataType::LargeUtf8, true), |
976 | | ])); |
977 | | let list_m = DataType::Struct(Fields::from(vec![ |
978 | | Field::new("ff1", list_f, false), |
979 | | Field::new("ff2", list_g, false), |
980 | | Field::new("ff3", DataType::Utf8, true), |
981 | | ])); |
982 | | assert!(list_h.equals_datatype(&list_i)); |
983 | | assert!(!list_h.equals_datatype(&list_j)); |
984 | | assert!(!list_k.equals_datatype(&list_l)); |
985 | | assert!(list_k.equals_datatype(&list_m)); |
986 | | |
987 | | let list_n = DataType::Map(Arc::new(Field::new("f1", list_a.clone(), true)), true); |
988 | | let list_o = DataType::Map(Arc::new(Field::new("f2", list_b.clone(), true)), true); |
989 | | let list_p = DataType::Map(Arc::new(Field::new("f2", list_b.clone(), true)), false); |
990 | | let list_q = DataType::Map(Arc::new(Field::new("f2", list_c.clone(), true)), true); |
991 | | let list_r = DataType::Map(Arc::new(Field::new("f1", list_a.clone(), false)), true); |
992 | | |
993 | | assert!(list_n.equals_datatype(&list_o)); |
994 | | assert!(!list_n.equals_datatype(&list_p)); |
995 | | assert!(!list_n.equals_datatype(&list_q)); |
996 | | assert!(!list_n.equals_datatype(&list_r)); |
997 | | |
998 | | let list_s = DataType::Dictionary(Box::new(DataType::UInt8), Box::new(list_a)); |
999 | | let list_t = DataType::Dictionary(Box::new(DataType::UInt8), Box::new(list_b.clone())); |
1000 | | let list_u = DataType::Dictionary(Box::new(DataType::Int8), Box::new(list_b)); |
1001 | | let list_v = DataType::Dictionary(Box::new(DataType::UInt8), Box::new(list_c)); |
1002 | | |
1003 | | assert!(list_s.equals_datatype(&list_t)); |
1004 | | assert!(!list_s.equals_datatype(&list_u)); |
1005 | | assert!(!list_s.equals_datatype(&list_v)); |
1006 | | |
1007 | | let union_a = DataType::Union( |
1008 | | UnionFields::new( |
1009 | | vec![1, 2], |
1010 | | vec![ |
1011 | | Field::new("f1", DataType::Utf8, false), |
1012 | | Field::new("f2", DataType::UInt8, false), |
1013 | | ], |
1014 | | ), |
1015 | | UnionMode::Sparse, |
1016 | | ); |
1017 | | let union_b = DataType::Union( |
1018 | | UnionFields::new( |
1019 | | vec![1, 2], |
1020 | | vec![ |
1021 | | Field::new("ff1", DataType::Utf8, false), |
1022 | | Field::new("ff2", DataType::UInt8, false), |
1023 | | ], |
1024 | | ), |
1025 | | UnionMode::Sparse, |
1026 | | ); |
1027 | | let union_c = DataType::Union( |
1028 | | UnionFields::new( |
1029 | | vec![2, 1], |
1030 | | vec![ |
1031 | | Field::new("fff2", DataType::UInt8, false), |
1032 | | Field::new("fff1", DataType::Utf8, false), |
1033 | | ], |
1034 | | ), |
1035 | | UnionMode::Sparse, |
1036 | | ); |
1037 | | let union_d = DataType::Union( |
1038 | | UnionFields::new( |
1039 | | vec![2, 1], |
1040 | | vec![ |
1041 | | Field::new("fff1", DataType::Int8, false), |
1042 | | Field::new("fff2", DataType::UInt8, false), |
1043 | | ], |
1044 | | ), |
1045 | | UnionMode::Sparse, |
1046 | | ); |
1047 | | let union_e = DataType::Union( |
1048 | | UnionFields::new( |
1049 | | vec![1, 2], |
1050 | | vec![ |
1051 | | Field::new("f1", DataType::Utf8, true), |
1052 | | Field::new("f2", DataType::UInt8, false), |
1053 | | ], |
1054 | | ), |
1055 | | UnionMode::Sparse, |
1056 | | ); |
1057 | | |
1058 | | assert!(union_a.equals_datatype(&union_b)); |
1059 | | assert!(union_a.equals_datatype(&union_c)); |
1060 | | assert!(!union_a.equals_datatype(&union_d)); |
1061 | | assert!(!union_a.equals_datatype(&union_e)); |
1062 | | |
1063 | | let list_w = DataType::RunEndEncoded( |
1064 | | Arc::new(Field::new("f1", DataType::Int64, true)), |
1065 | | Arc::new(Field::new("f2", DataType::Utf8, true)), |
1066 | | ); |
1067 | | let list_x = DataType::RunEndEncoded( |
1068 | | Arc::new(Field::new("ff1", DataType::Int64, true)), |
1069 | | Arc::new(Field::new("ff2", DataType::Utf8, true)), |
1070 | | ); |
1071 | | let list_y = DataType::RunEndEncoded( |
1072 | | Arc::new(Field::new("ff1", DataType::UInt16, true)), |
1073 | | Arc::new(Field::new("ff2", DataType::Utf8, true)), |
1074 | | ); |
1075 | | let list_z = DataType::RunEndEncoded( |
1076 | | Arc::new(Field::new("f1", DataType::Int64, false)), |
1077 | | Arc::new(Field::new("f2", DataType::Utf8, true)), |
1078 | | ); |
1079 | | |
1080 | | assert!(list_w.equals_datatype(&list_x)); |
1081 | | assert!(!list_w.equals_datatype(&list_y)); |
1082 | | assert!(!list_w.equals_datatype(&list_z)); |
1083 | | } |
1084 | | |
1085 | | #[test] |
1086 | | fn create_struct_type() { |
1087 | | let _person = DataType::Struct(Fields::from(vec![ |
1088 | | Field::new("first_name", DataType::Utf8, false), |
1089 | | Field::new("last_name", DataType::Utf8, false), |
1090 | | Field::new( |
1091 | | "address", |
1092 | | DataType::Struct(Fields::from(vec![ |
1093 | | Field::new("street", DataType::Utf8, false), |
1094 | | Field::new("zip", DataType::UInt16, false), |
1095 | | ])), |
1096 | | false, |
1097 | | ), |
1098 | | ])); |
1099 | | } |
1100 | | |
1101 | | #[test] |
1102 | | fn test_nested() { |
1103 | | let list = DataType::List(Arc::new(Field::new("foo", DataType::Utf8, true))); |
1104 | | let list_view = DataType::ListView(Arc::new(Field::new("foo", DataType::Utf8, true))); |
1105 | | let large_list_view = |
1106 | | DataType::LargeListView(Arc::new(Field::new("foo", DataType::Utf8, true))); |
1107 | | |
1108 | | assert!(!DataType::is_nested(&DataType::Boolean)); |
1109 | | assert!(!DataType::is_nested(&DataType::Int32)); |
1110 | | assert!(!DataType::is_nested(&DataType::Utf8)); |
1111 | | assert!(DataType::is_nested(&list)); |
1112 | | assert!(DataType::is_nested(&list_view)); |
1113 | | assert!(DataType::is_nested(&large_list_view)); |
1114 | | |
1115 | | assert!(!DataType::is_nested(&DataType::Dictionary( |
1116 | | Box::new(DataType::Int32), |
1117 | | Box::new(DataType::Boolean) |
1118 | | ))); |
1119 | | assert!(!DataType::is_nested(&DataType::Dictionary( |
1120 | | Box::new(DataType::Int32), |
1121 | | Box::new(DataType::Int64) |
1122 | | ))); |
1123 | | assert!(!DataType::is_nested(&DataType::Dictionary( |
1124 | | Box::new(DataType::Int32), |
1125 | | Box::new(DataType::LargeUtf8) |
1126 | | ))); |
1127 | | assert!(DataType::is_nested(&DataType::Dictionary( |
1128 | | Box::new(DataType::Int32), |
1129 | | Box::new(list) |
1130 | | ))); |
1131 | | } |
1132 | | |
1133 | | #[test] |
1134 | | fn test_integer() { |
1135 | | // is_integer |
1136 | | assert!(DataType::is_integer(&DataType::Int32)); |
1137 | | assert!(DataType::is_integer(&DataType::UInt64)); |
1138 | | assert!(!DataType::is_integer(&DataType::Float16)); |
1139 | | |
1140 | | // is_signed_integer |
1141 | | assert!(DataType::is_signed_integer(&DataType::Int32)); |
1142 | | assert!(!DataType::is_signed_integer(&DataType::UInt64)); |
1143 | | assert!(!DataType::is_signed_integer(&DataType::Float16)); |
1144 | | |
1145 | | // is_unsigned_integer |
1146 | | assert!(!DataType::is_unsigned_integer(&DataType::Int32)); |
1147 | | assert!(DataType::is_unsigned_integer(&DataType::UInt64)); |
1148 | | assert!(!DataType::is_unsigned_integer(&DataType::Float16)); |
1149 | | |
1150 | | // is_dictionary_key_type |
1151 | | assert!(DataType::is_dictionary_key_type(&DataType::Int32)); |
1152 | | assert!(DataType::is_dictionary_key_type(&DataType::UInt64)); |
1153 | | assert!(!DataType::is_dictionary_key_type(&DataType::Float16)); |
1154 | | } |
1155 | | |
1156 | | #[test] |
1157 | | fn test_floating() { |
1158 | | assert!(DataType::is_floating(&DataType::Float16)); |
1159 | | assert!(!DataType::is_floating(&DataType::Int32)); |
1160 | | } |
1161 | | |
1162 | | #[test] |
1163 | | fn test_datatype_is_null() { |
1164 | | assert!(DataType::is_null(&DataType::Null)); |
1165 | | assert!(!DataType::is_null(&DataType::Int32)); |
1166 | | } |
1167 | | |
1168 | | #[test] |
1169 | | fn size_should_not_regress() { |
1170 | | assert_eq!(std::mem::size_of::<DataType>(), 24); |
1171 | | } |
1172 | | |
1173 | | #[test] |
1174 | | #[should_panic(expected = "duplicate type id: 1")] |
1175 | | fn test_union_with_duplicated_type_id() { |
1176 | | let type_ids = vec![1, 1]; |
1177 | | let _union = DataType::Union( |
1178 | | UnionFields::new( |
1179 | | type_ids, |
1180 | | vec![ |
1181 | | Field::new("f1", DataType::Int32, false), |
1182 | | Field::new("f2", DataType::Utf8, false), |
1183 | | ], |
1184 | | ), |
1185 | | UnionMode::Dense, |
1186 | | ); |
1187 | | } |
1188 | | |
1189 | | #[test] |
1190 | | fn test_try_from_str() { |
1191 | | let data_type: DataType = "Int32".try_into().unwrap(); |
1192 | | assert_eq!(data_type, DataType::Int32); |
1193 | | } |
1194 | | |
1195 | | #[test] |
1196 | | fn test_from_str() { |
1197 | | let data_type: DataType = "UInt64".parse().unwrap(); |
1198 | | assert_eq!(data_type, DataType::UInt64); |
1199 | | } |
1200 | | } |