/Users/andrewlamb/Software/arrow-rs/arrow-schema/src/datatype.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use std::str::FromStr; |
19 | | use std::sync::Arc; |
20 | | |
21 | | use crate::{ArrowError, Field, FieldRef, Fields, UnionFields}; |
22 | | |
23 | | /// Datatypes supported by this implementation of Apache Arrow. |
24 | | /// |
25 | | /// The variants of this enum include primitive fixed size types as well as |
26 | | /// parametric or nested types. See [`Schema.fbs`] for Arrow's specification. |
27 | | /// |
28 | | /// # Examples |
29 | | /// |
30 | | /// Primitive types |
31 | | /// ``` |
32 | | /// # use arrow_schema::DataType; |
33 | | /// // create a new 32-bit signed integer |
34 | | /// let data_type = DataType::Int32; |
35 | | /// ``` |
36 | | /// |
37 | | /// Nested Types |
38 | | /// ``` |
39 | | /// # use arrow_schema::{DataType, Field}; |
40 | | /// # use std::sync::Arc; |
41 | | /// // create a new list of 32-bit signed integers directly |
42 | | /// let list_data_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))); |
43 | | /// // Create the same list type with constructor |
44 | | /// let list_data_type2 = DataType::new_list(DataType::Int32, true); |
45 | | /// assert_eq!(list_data_type, list_data_type2); |
46 | | /// ``` |
47 | | /// |
48 | | /// Dictionary Types |
49 | | /// ``` |
50 | | /// # use arrow_schema::{DataType}; |
51 | | /// // String Dictionary (key type Int32 and value type Utf8) |
52 | | /// let data_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)); |
53 | | /// ``` |
54 | | /// |
55 | | /// Timestamp Types |
56 | | /// ``` |
57 | | /// # use arrow_schema::{DataType, TimeUnit}; |
58 | | /// // timestamp with millisecond precision without timezone specified |
59 | | /// let data_type = DataType::Timestamp(TimeUnit::Millisecond, None); |
60 | | /// // timestamp with nanosecond precision in UTC timezone |
61 | | /// let data_type = DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".into())); |
62 | | /// ``` |
63 | | /// |
64 | | /// # Display and FromStr |
65 | | /// |
66 | | /// The `Display` and `FromStr` implementations for `DataType` are |
67 | | /// human-readable, parseable, and reversible. |
68 | | /// |
69 | | /// ``` |
70 | | /// # use arrow_schema::DataType; |
71 | | /// let data_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)); |
72 | | /// let data_type_string = data_type.to_string(); |
73 | | /// assert_eq!(data_type_string, "Dictionary(Int32, Utf8)"); |
74 | | /// // display can be parsed back into the original type |
75 | | /// let parsed_data_type: DataType = data_type.to_string().parse().unwrap(); |
76 | | /// assert_eq!(data_type, parsed_data_type); |
77 | | /// ``` |
78 | | /// |
79 | | /// # Nested Support |
80 | | /// Currently, the Rust implementation supports the following nested types: |
81 | | /// - `List<T>` |
82 | | /// - `LargeList<T>` |
83 | | /// - `FixedSizeList<T>` |
84 | | /// - `Struct<T, U, V, ...>` |
85 | | /// - `Union<T, U, V, ...>` |
86 | | /// - `Map<K, V>` |
87 | | /// |
88 | | /// Nested types can themselves be nested within other arrays. |
89 | | /// For more information on these types please see |
90 | | /// [the physical memory layout of Apache Arrow] |
91 | | /// |
92 | | /// [`Schema.fbs`]: https://github.com/apache/arrow/blob/main/format/Schema.fbs |
93 | | /// [the physical memory layout of Apache Arrow]: https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout |
94 | | #[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] |
95 | | #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] |
96 | | pub enum DataType { |
97 | | /// Null type |
98 | | Null, |
99 | | /// A boolean datatype representing the values `true` and `false`. |
100 | | Boolean, |
101 | | /// A signed 8-bit integer. |
102 | | Int8, |
103 | | /// A signed 16-bit integer. |
104 | | Int16, |
105 | | /// A signed 32-bit integer. |
106 | | Int32, |
107 | | /// A signed 64-bit integer. |
108 | | Int64, |
109 | | /// An unsigned 8-bit integer. |
110 | | UInt8, |
111 | | /// An unsigned 16-bit integer. |
112 | | UInt16, |
113 | | /// An unsigned 32-bit integer. |
114 | | UInt32, |
115 | | /// An unsigned 64-bit integer. |
116 | | UInt64, |
117 | | /// A 16-bit floating point number. |
118 | | Float16, |
119 | | /// A 32-bit floating point number. |
120 | | Float32, |
121 | | /// A 64-bit floating point number. |
122 | | Float64, |
123 | | /// A timestamp with an optional timezone. |
124 | | /// |
125 | | /// Time is measured as a Unix epoch, counting the seconds from |
126 | | /// 00:00:00.000 on 1 January 1970, excluding leap seconds, |
127 | | /// as a signed 64-bit integer. |
128 | | /// |
129 | | /// The time zone is a string indicating the name of a time zone, one of: |
130 | | /// |
131 | | /// * As used in the Olson time zone database (the "tz database" or |
132 | | /// "tzdata"), such as "America/New_York" |
133 | | /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 |
134 | | /// |
135 | | /// Timestamps with a non-empty timezone |
136 | | /// ------------------------------------ |
137 | | /// |
138 | | /// If a Timestamp column has a non-empty timezone value, its epoch is |
139 | | /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in the *UTC* timezone |
140 | | /// (the Unix epoch), regardless of the Timestamp's own timezone. |
141 | | /// |
142 | | /// Therefore, timestamp values with a non-empty timezone correspond to |
143 | | /// physical points in time together with some additional information about |
144 | | /// how the data was obtained and/or how to display it (the timezone). |
145 | | /// |
146 | | /// For example, the timestamp value 0 with the timezone string "Europe/Paris" |
147 | | /// corresponds to "January 1st 1970, 00h00" in the UTC timezone, but the |
148 | | /// application may prefer to display it as "January 1st 1970, 01h00" in |
149 | | /// the Europe/Paris timezone (which is the same physical point in time). |
150 | | /// |
151 | | /// One consequence is that timestamp values with a non-empty timezone |
152 | | /// can be compared and ordered directly, since they all share the same |
153 | | /// well-known point of reference (the Unix epoch). |
154 | | /// |
155 | | /// Timestamps with an unset / empty timezone |
156 | | /// ----------------------------------------- |
157 | | /// |
158 | | /// If a Timestamp column has no timezone value, its epoch is |
159 | | /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in an *unknown* timezone. |
160 | | /// |
161 | | /// Therefore, timestamp values without a timezone cannot be meaningfully |
162 | | /// interpreted as physical points in time, but only as calendar / clock |
163 | | /// indications ("wall clock time") in an unspecified timezone. |
164 | | /// |
165 | | /// For example, the timestamp value 0 with an empty timezone string |
166 | | /// corresponds to "January 1st 1970, 00h00" in an unknown timezone: there |
167 | | /// is not enough information to interpret it as a well-defined physical |
168 | | /// point in time. |
169 | | /// |
170 | | /// One consequence is that timestamp values without a timezone cannot |
171 | | /// be reliably compared or ordered, since they may have different points of |
172 | | /// reference. In particular, it is *not* possible to interpret an unset |
173 | | /// or empty timezone as the same as "UTC". |
174 | | /// |
175 | | /// Conversion between timezones |
176 | | /// ---------------------------- |
177 | | /// |
178 | | /// If a Timestamp column has a non-empty timezone, changing the timezone |
179 | | /// to a different non-empty value is a metadata-only operation: |
180 | | /// the timestamp values need not change as their point of reference remains |
181 | | /// the same (the Unix epoch). |
182 | | /// |
183 | | /// However, if a Timestamp column has no timezone value, changing it to a |
184 | | /// non-empty value requires thinking about the desired semantics. |
185 | | /// One possibility is to assume that the original timestamp values are |
186 | | /// relative to the epoch of the timezone being set; timestamp values should |
187 | | /// then be adjusted to the Unix epoch (for example, changing the timezone from |
188 | | /// empty to "Europe/Paris" would require converting the timestamp values |
189 | | /// from "Europe/Paris" to "UTC", which seems counter-intuitive but is |
190 | | /// nevertheless correct). |
191 | | /// |
192 | | /// ``` |
193 | | /// # use arrow_schema::{DataType, TimeUnit}; |
194 | | /// DataType::Timestamp(TimeUnit::Second, None); |
195 | | /// DataType::Timestamp(TimeUnit::Second, Some("literal".into())); |
196 | | /// DataType::Timestamp(TimeUnit::Second, Some("string".to_string().into())); |
197 | | /// ``` |
198 | | /// |
199 | | /// Timezone representation |
200 | | /// ---------------------------- |
201 | | /// It is possible to use either the timezone string representation, such as "UTC", or the absolute time zone offset "+00:00". |
202 | | /// For timezones with fixed offsets, such as "UTC" or "JST", the offset representation is recommended, as it is more explicit and less ambiguous. |
203 | | /// |
204 | | /// Most arrow-rs functionalities use the absolute offset representation, |
205 | | /// such as [`PrimitiveArray::with_timezone_utc`] that applies a |
206 | | /// UTC timezone to timestamp arrays. |
207 | | /// |
208 | | /// [`PrimitiveArray::with_timezone_utc`]: https://docs.rs/arrow/latest/arrow/array/struct.PrimitiveArray.html#method.with_timezone_utc |
209 | | /// |
210 | | /// Timezone string parsing |
211 | | /// ----------------------- |
212 | | /// When feature `chrono-tz` is not enabled, allowed timezone strings are fixed offsets of the form "+09:00", "-09" or "+0930". |
213 | | /// |
214 | | /// When feature `chrono-tz` is enabled, additional strings supported by [chrono_tz](https://docs.rs/chrono-tz/latest/chrono_tz/) |
215 | | /// are also allowed, which include [IANA database](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones) |
216 | | /// timezones. |
217 | | Timestamp(TimeUnit, Option<Arc<str>>), |
218 | | /// A signed 32-bit date representing the elapsed time since UNIX epoch (1970-01-01) |
219 | | /// in days. |
220 | | Date32, |
221 | | /// A signed 64-bit date representing the elapsed time since UNIX epoch (1970-01-01) |
222 | | /// in milliseconds. |
223 | | /// |
224 | | /// # Valid Ranges |
225 | | /// |
226 | | /// According to the Arrow specification ([Schema.fbs]), values of Date64 |
227 | | /// are treated as the number of *days*, in milliseconds, since the UNIX |
228 | | /// epoch. Therefore, values of this type must be evenly divisible by |
229 | | /// `86_400_000`, the number of milliseconds in a standard day. |
230 | | /// |
231 | | /// It is not valid to store milliseconds that do not represent an exact |
232 | | /// day. The reason for this restriction is compatibility with other |
233 | | /// languages' native libraries (specifically Java), which historically |
234 | | /// lacked a dedicated date type and only supported timestamps. |
235 | | /// |
236 | | /// # Validation |
237 | | /// |
238 | | /// This library does not validate or enforce that Date64 values are evenly |
239 | | /// divisible by `86_400_000` for performance and usability reasons. Date64 |
240 | | /// values are treated similarly to `Timestamp(TimeUnit::Millisecond, |
241 | | /// None)`: values will be displayed with a time of day if the value does |
242 | | /// not represent an exact day, and arithmetic will be done at the |
243 | | /// millisecond granularity. |
244 | | /// |
245 | | /// # Recommendation |
246 | | /// |
247 | | /// Users should prefer [`Date32`] to cleanly represent the number |
248 | | /// of days, or one of the Timestamp variants to include time as part of the |
249 | | /// representation, depending on their use case. |
250 | | /// |
251 | | /// # Further Reading |
252 | | /// |
253 | | /// For more details, see [#5288](https://github.com/apache/arrow-rs/issues/5288). |
254 | | /// |
255 | | /// [`Date32`]: Self::Date32 |
256 | | /// [Schema.fbs]: https://github.com/apache/arrow/blob/main/format/Schema.fbs |
257 | | Date64, |
258 | | /// A signed 32-bit time representing the elapsed time since midnight in the unit of `TimeUnit`. |
259 | | /// Must be either seconds or milliseconds. |
260 | | Time32(TimeUnit), |
261 | | /// A signed 64-bit time representing the elapsed time since midnight in the unit of `TimeUnit`. |
262 | | /// Must be either microseconds or nanoseconds. |
263 | | Time64(TimeUnit), |
264 | | /// Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds. |
265 | | Duration(TimeUnit), |
266 | | /// A "calendar" interval which models types that don't necessarily |
267 | | /// have a precise duration without the context of a base timestamp (e.g. |
268 | | /// days can differ in length during daylight saving time transitions). |
269 | | Interval(IntervalUnit), |
270 | | /// Opaque binary data of variable length. |
271 | | /// |
272 | | /// A single Binary array can store up to [`i32::MAX`] bytes |
273 | | /// of binary data in total. |
274 | | Binary, |
275 | | /// Opaque binary data of fixed size. |
276 | | /// Enum parameter specifies the number of bytes per value. |
277 | | FixedSizeBinary(i32), |
278 | | /// Opaque binary data of variable length and 64-bit offsets. |
279 | | /// |
280 | | /// A single LargeBinary array can store up to [`i64::MAX`] bytes |
281 | | /// of binary data in total. |
282 | | LargeBinary, |
283 | | /// Opaque binary data of variable length. |
284 | | /// |
285 | | /// Logically the same as [`Binary`], but the internal representation uses a view |
286 | | /// struct that contains the string length and either the string's entire data |
287 | | /// inline (for small strings) or an inlined prefix, an index of another buffer, |
288 | | /// and an offset pointing to a slice in that buffer (for non-small strings). |
289 | | /// |
290 | | /// [`Binary`]: Self::Binary |
291 | | BinaryView, |
292 | | /// A variable-length string in Unicode with UTF-8 encoding. |
293 | | /// |
294 | | /// A single Utf8 array can store up to [`i32::MAX`] bytes |
295 | | /// of string data in total. |
296 | | Utf8, |
297 | | /// A variable-length string in Unicode with UTF-8 encoding and 64-bit offsets. |
298 | | /// |
299 | | /// A single LargeUtf8 array can store up to [`i64::MAX`] bytes |
300 | | /// of string data in total. |
301 | | LargeUtf8, |
302 | | /// A variable-length string in Unicode with UTF-8 encoding. |
303 | | /// |
304 | | /// Logically the same as [`Utf8`], but the internal representation uses a view |
305 | | /// struct that contains the string length and either the string's entire data |
306 | | /// inline (for small strings) or an inlined prefix, an index of another buffer, |
307 | | /// and an offset pointing to a slice in that buffer (for non-small strings). |
308 | | /// |
309 | | /// [`Utf8`]: Self::Utf8 |
310 | | Utf8View, |
311 | | /// A list of some logical data type with variable length. |
312 | | /// |
313 | | /// A single List array can store up to [`i32::MAX`] elements in total. |
314 | | List(FieldRef), |
315 | | |
316 | | /// (NOT YET FULLY SUPPORTED) A list of some logical data type with variable length. |
317 | | /// |
318 | | /// Logically the same as [`List`], but the internal representation differs in how child |
319 | | /// data is referenced, allowing flexibility in how data is laid out. |
320 | | /// |
321 | | /// Note this data type is not yet fully supported. Using it with arrow APIs may result in `panic`s. |
322 | | /// |
323 | | /// [`List`]: Self::List |
324 | | ListView(FieldRef), |
325 | | /// A list of some logical data type with fixed length. |
326 | | FixedSizeList(FieldRef, i32), |
327 | | /// A list of some logical data type with variable length and 64-bit offsets. |
328 | | /// |
329 | | /// A single LargeList array can store up to [`i64::MAX`] elements in total. |
330 | | LargeList(FieldRef), |
331 | | |
332 | | /// (NOT YET FULLY SUPPORTED) A list of some logical data type with variable length and 64-bit offsets. |
333 | | /// |
334 | | /// Logically the same as [`LargeList`], but the internal representation differs in how child |
335 | | /// data is referenced, allowing flexibility in how data is laid out. |
336 | | /// |
337 | | /// Note this data type is not yet fully supported. Using it with arrow APIs may result in `panic`s. |
338 | | /// |
339 | | /// [`LargeList`]: Self::LargeList |
340 | | LargeListView(FieldRef), |
341 | | /// A nested datatype that contains a number of sub-fields. |
342 | | Struct(Fields), |
343 | | /// A nested datatype that can represent slots of differing types. Components: |
344 | | /// |
345 | | /// 1. [`UnionFields`] |
346 | | /// 2. The type of union (Sparse or Dense) |
347 | | Union(UnionFields, UnionMode), |
348 | | /// A dictionary encoded array (`key_type`, `value_type`), where |
349 | | /// each array element is an index of `key_type` into an |
350 | | /// associated dictionary of `value_type`. |
351 | | /// |
352 | | /// Dictionary arrays are used to store columns of `value_type` |
353 | | /// that contain many repeated values using less memory, but with |
354 | | /// a higher CPU overhead for some operations. |
355 | | /// |
356 | | /// This type is mostly used to represent low cardinality string |
357 | | /// arrays or a limited set of primitive types as integers. |
358 | | Dictionary(Box<DataType>, Box<DataType>), |
359 | | /// Exact 32-bit width decimal value with precision and scale |
360 | | /// |
361 | | /// * precision is the total number of digits |
362 | | /// * scale is the number of digits past the decimal |
363 | | /// |
364 | | /// For example the number 123.45 has precision 5 and scale 2. |
365 | | /// |
366 | | /// In certain situations, scale could be a negative number. For |
367 | | /// negative scale, it is the number of padding 0s to the right |
368 | | /// of the digits. |
369 | | /// |
370 | | /// For example the number 12300 could be treated as a decimal |
371 | | /// that has precision 3 and scale -2. |
372 | | Decimal32(u8, i8), |
373 | | /// Exact 64-bit width decimal value with precision and scale |
374 | | /// |
375 | | /// * precision is the total number of digits |
376 | | /// * scale is the number of digits past the decimal |
377 | | /// |
378 | | /// For example the number 123.45 has precision 5 and scale 2. |
379 | | /// |
380 | | /// In certain situations, scale could be a negative number. For |
381 | | /// negative scale, it is the number of padding 0s to the right |
382 | | /// of the digits. |
383 | | /// |
384 | | /// For example the number 12300 could be treated as a decimal |
385 | | /// that has precision 3 and scale -2. |
386 | | Decimal64(u8, i8), |
387 | | /// Exact 128-bit width decimal value with precision and scale |
388 | | /// |
389 | | /// * precision is the total number of digits |
390 | | /// * scale is the number of digits past the decimal |
391 | | /// |
392 | | /// For example the number 123.45 has precision 5 and scale 2. |
393 | | /// |
394 | | /// In certain situations, scale could be a negative number. For |
395 | | /// negative scale, it is the number of padding 0s to the right |
396 | | /// of the digits. |
397 | | /// |
398 | | /// For example the number 12300 could be treated as a decimal |
399 | | /// that has precision 3 and scale -2. |
400 | | Decimal128(u8, i8), |
401 | | /// Exact 256-bit width decimal value with precision and scale |
402 | | /// |
403 | | /// * precision is the total number of digits |
404 | | /// * scale is the number of digits past the decimal |
405 | | /// |
406 | | /// For example the number 123.45 has precision 5 and scale 2. |
407 | | /// |
408 | | /// In certain situations, scale could be a negative number. For |
409 | | /// negative scale, it is the number of padding 0s to the right |
410 | | /// of the digits. |
411 | | /// |
412 | | /// For example the number 12300 could be treated as a decimal |
413 | | /// that has precision 3 and scale -2. |
414 | | Decimal256(u8, i8), |
415 | | /// A Map is a logical nested type that is represented as |
416 | | /// |
417 | | /// `List<entries: Struct<key: K, value: V>>` |
418 | | /// |
419 | | /// The keys and values are each respectively contiguous. |
420 | | /// The key and value types are not constrained, but keys should be |
421 | | /// hashable and unique. |
422 | | /// Whether the keys are sorted can be set in the `bool` after the `Field`. |
423 | | /// |
424 | | /// In a field with Map type, the field has a child Struct field, which then |
425 | | /// has two children: the first the key type and the second the value type. The names of the |
426 | | /// child fields may be respectively "entries", "key", and "value", but this is |
427 | | /// not enforced. |
428 | | Map(FieldRef, bool), |
429 | | /// A run-end encoding (REE) is a variation of run-length encoding (RLE). These |
430 | | /// encodings are well-suited for representing data containing sequences of the |
431 | | /// same value, called runs. Each run is represented as a value and an integer giving |
432 | | /// the index in the array where the run ends. |
433 | | /// |
434 | | /// A run-end encoded array has no buffers by itself, but has two child arrays. The |
435 | | /// first child array, called the run ends array, holds either 16, 32, or 64-bit |
436 | | /// signed integers. The actual values of each run are held in the second child array. |
437 | | /// |
438 | | /// These child arrays are prescribed the standard names of "run_ends" and "values" |
439 | | /// respectively. |
440 | | RunEndEncoded(FieldRef, FieldRef), |
441 | | } |
442 | | |
443 | | /// An absolute length of time in seconds, milliseconds, microseconds or nanoseconds. |
444 | | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] |
445 | | #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] |
446 | | pub enum TimeUnit { |
447 | | /// Time in seconds. |
448 | | Second, |
449 | | /// Time in milliseconds. |
450 | | Millisecond, |
451 | | /// Time in microseconds. |
452 | | Microsecond, |
453 | | /// Time in nanoseconds. |
454 | | Nanosecond, |
455 | | } |
456 | | |
457 | | impl std::fmt::Display for TimeUnit { |
458 | 0 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { |
459 | 0 | match self { |
460 | 0 | TimeUnit::Second => write!(f, "s"), |
461 | 0 | TimeUnit::Millisecond => write!(f, "ms"), |
462 | 0 | TimeUnit::Microsecond => write!(f, "µs"), |
463 | 0 | TimeUnit::Nanosecond => write!(f, "ns"), |
464 | | } |
465 | 0 | } |
466 | | } |
467 | | |
468 | | /// YEAR_MONTH, DAY_TIME, MONTH_DAY_NANO interval in SQL style. |
469 | | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] |
470 | | #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] |
471 | | pub enum IntervalUnit { |
472 | | /// Indicates the number of elapsed whole months, stored as 4-byte integers. |
473 | | YearMonth, |
474 | | /// Indicates the number of elapsed days and milliseconds, |
475 | | /// stored as 2 contiguous 32-bit integers (days, milliseconds) (8-bytes in total). |
476 | | DayTime, |
477 | | /// A triple of the number of elapsed months, days, and nanoseconds. |
478 | | /// The values are stored contiguously in 16 byte blocks. Months and |
479 | | /// days are encoded as 32 bit integers and nanoseconds is encoded as a |
480 | | /// 64 bit integer. All integers are signed. Each field is independent |
481 | | /// (e.g. there is no constraint that nanoseconds have the same sign |
482 | | /// as days or that the quantity of nanoseconds represents less |
483 | | /// than a day's worth of time). |
484 | | MonthDayNano, |
485 | | } |
486 | | |
487 | | /// Sparse or Dense union layouts |
488 | | #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Copy)] |
489 | | #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] |
490 | | pub enum UnionMode { |
491 | | /// Sparse union layout |
492 | | Sparse, |
493 | | /// Dense union layout |
494 | | Dense, |
495 | | } |
496 | | |
497 | | /// Parses `str` into a `DataType`. |
498 | | /// |
499 | | /// This is the reverse of [`DataType`]'s `Display` |
500 | | /// impl, and maintains the invariant that |
501 | | /// `DataType::try_from(data_type.to_string().as_str()).unwrap() == data_type` |
502 | | /// |
503 | | /// # Example |
504 | | /// ``` |
505 | | /// use arrow_schema::DataType; |
506 | | /// |
507 | | /// let data_type: DataType = "Int32".parse().unwrap(); |
508 | | /// assert_eq!(data_type, DataType::Int32); |
509 | | /// ``` |
510 | | impl FromStr for DataType { |
511 | | type Err = ArrowError; |
512 | | |
513 | 0 | fn from_str(s: &str) -> Result<Self, Self::Err> { |
514 | 0 | crate::datatype_parse::parse_data_type(s) |
515 | 0 | } |
516 | | } |
517 | | |
518 | | impl TryFrom<&str> for DataType { |
519 | | type Error = ArrowError; |
520 | | |
521 | 0 | fn try_from(value: &str) -> Result<Self, Self::Error> { |
522 | 0 | value.parse() |
523 | 0 | } |
524 | | } |
525 | | |
526 | | impl DataType { |
527 | | /// Returns true if the type is primitive: (numeric, temporal). |
528 | | #[inline] |
529 | 2 | pub fn is_primitive(&self) -> bool { |
530 | 2 | self.is_numeric() || self0 .is_temporal0 () |
531 | 2 | } |
532 | | |
533 | | /// Returns true if this type is numeric: (UInt*, Int*, Float*, Decimal*). |
534 | | #[inline] |
535 | 2 | pub fn is_numeric(&self) -> bool { |
536 | | use DataType::*; |
537 | 0 | matches!( |
538 | 2 | self, |
539 | | UInt8 |
540 | | | UInt16 |
541 | | | UInt32 |
542 | | | UInt64 |
543 | | | Int8 |
544 | | | Int16 |
545 | | | Int32 |
546 | | | Int64 |
547 | | | Float16 |
548 | | | Float32 |
549 | | | Float64 |
550 | | | Decimal32(_, _) |
551 | | | Decimal64(_, _) |
552 | | | Decimal128(_, _) |
553 | | | Decimal256(_, _) |
554 | | ) |
555 | 2 | } |
556 | | |
557 | | /// Returns true if this type is temporal: (Date*, Time*, Duration, or Interval). |
558 | | #[inline] |
559 | 0 | pub fn is_temporal(&self) -> bool { |
560 | | use DataType::*; |
561 | 0 | matches!( |
562 | 0 | self, |
563 | | Date32 | Date64 | Timestamp(_, _) | Time32(_) | Time64(_) | Duration(_) | Interval(_) |
564 | | ) |
565 | 0 | } |
566 | | |
567 | | /// Returns true if this type is floating: (Float*). |
568 | | #[inline] |
569 | 0 | pub fn is_floating(&self) -> bool { |
570 | | use DataType::*; |
571 | 0 | matches!(self, Float16 | Float32 | Float64) |
572 | 0 | } |
573 | | |
574 | | /// Returns true if this type is integer: (Int*, UInt*). |
575 | | #[inline] |
576 | 0 | pub fn is_integer(&self) -> bool { |
577 | 0 | self.is_signed_integer() || self.is_unsigned_integer() |
578 | 0 | } |
579 | | |
580 | | /// Returns true if this type is signed integer: (Int*). |
581 | | #[inline] |
582 | 0 | pub fn is_signed_integer(&self) -> bool { |
583 | | use DataType::*; |
584 | 0 | matches!(self, Int8 | Int16 | Int32 | Int64) |
585 | 0 | } |
586 | | |
587 | | /// Returns true if this type is unsigned integer: (UInt*). |
588 | | #[inline] |
589 | 0 | pub fn is_unsigned_integer(&self) -> bool { |
590 | | use DataType::*; |
591 | 0 | matches!(self, UInt8 | UInt16 | UInt32 | UInt64) |
592 | 0 | } |
593 | | |
594 | | /// Returns true if this type is valid as a dictionary key |
595 | | #[inline] |
596 | 0 | pub fn is_dictionary_key_type(&self) -> bool { |
597 | 0 | self.is_integer() |
598 | 0 | } |
599 | | |
600 | | /// Returns true if this type is valid for run-ends array in RunArray |
601 | | #[inline] |
602 | 40 | pub fn is_run_ends_type(&self) -> bool { |
603 | | use DataType::*; |
604 | 40 | matches!0 (self, Int16 | Int32 | Int64) |
605 | 40 | } |
606 | | |
607 | | /// Returns true if this type is nested (List, FixedSizeList, LargeList, ListView, LargeListView, Struct, Union, |
608 | | /// or Map), or a dictionary or run-end encoded type whose values are nested |
609 | | #[inline] |
610 | 0 | pub fn is_nested(&self) -> bool { |
611 | | use DataType::*; |
612 | 0 | match self { |
613 | 0 | Dictionary(_, v) => DataType::is_nested(v.as_ref()), |
614 | 0 | RunEndEncoded(_, v) => DataType::is_nested(v.data_type()), |
615 | | List(_) |
616 | | | FixedSizeList(_, _) |
617 | | | LargeList(_) |
618 | | | ListView(_) |
619 | | | LargeListView(_) |
620 | | | Struct(_) |
621 | | | Union(_, _) |
622 | 0 | | Map(_, _) => true, |
623 | 0 | _ => false, |
624 | | } |
625 | 0 | } |
626 | | |
627 | | /// Returns true if this type is DataType::Null. |
628 | | #[inline] |
629 | 21 | pub fn is_null(&self) -> bool { |
630 | | use DataType::*; |
631 | 21 | matches!20 (self, Null) |
632 | 21 | } |
633 | | |
634 | | /// Compares the datatype with another, ignoring nested field names |
635 | | /// and metadata. |
636 | 0 | pub fn equals_datatype(&self, other: &DataType) -> bool { |
637 | 0 | match (&self, other) { |
638 | 0 | (DataType::List(a), DataType::List(b)) |
639 | 0 | | (DataType::LargeList(a), DataType::LargeList(b)) |
640 | 0 | | (DataType::ListView(a), DataType::ListView(b)) |
641 | 0 | | (DataType::LargeListView(a), DataType::LargeListView(b)) => { |
642 | 0 | a.is_nullable() == b.is_nullable() && a.data_type().equals_datatype(b.data_type()) |
643 | | } |
644 | 0 | (DataType::FixedSizeList(a, a_size), DataType::FixedSizeList(b, b_size)) => { |
645 | 0 | a_size == b_size |
646 | 0 | && a.is_nullable() == b.is_nullable() |
647 | 0 | && a.data_type().equals_datatype(b.data_type()) |
648 | | } |
649 | 0 | (DataType::Struct(a), DataType::Struct(b)) => { |
650 | 0 | a.len() == b.len() |
651 | 0 | && a.iter().zip(b).all(|(a, b)| { |
652 | 0 | a.is_nullable() == b.is_nullable() |
653 | 0 | && a.data_type().equals_datatype(b.data_type()) |
654 | 0 | }) |
655 | | } |
656 | 0 | (DataType::Map(a_field, a_is_sorted), DataType::Map(b_field, b_is_sorted)) => { |
657 | 0 | a_field.is_nullable() == b_field.is_nullable() |
658 | 0 | && a_field.data_type().equals_datatype(b_field.data_type()) |
659 | 0 | && a_is_sorted == b_is_sorted |
660 | | } |
661 | 0 | (DataType::Dictionary(a_key, a_value), DataType::Dictionary(b_key, b_value)) => { |
662 | 0 | a_key.equals_datatype(b_key) && a_value.equals_datatype(b_value) |
663 | | } |
664 | | ( |
665 | 0 | DataType::RunEndEncoded(a_run_ends, a_values), |
666 | 0 | DataType::RunEndEncoded(b_run_ends, b_values), |
667 | | ) => { |
668 | 0 | a_run_ends.is_nullable() == b_run_ends.is_nullable() |
669 | 0 | && a_run_ends |
670 | 0 | .data_type() |
671 | 0 | .equals_datatype(b_run_ends.data_type()) |
672 | 0 | && a_values.is_nullable() == b_values.is_nullable() |
673 | 0 | && a_values.data_type().equals_datatype(b_values.data_type()) |
674 | | } |
675 | | ( |
676 | 0 | DataType::Union(a_union_fields, a_union_mode), |
677 | 0 | DataType::Union(b_union_fields, b_union_mode), |
678 | | ) => { |
679 | 0 | a_union_mode == b_union_mode |
680 | 0 | && a_union_fields.len() == b_union_fields.len() |
681 | 0 | && a_union_fields.iter().all(|a| { |
682 | 0 | b_union_fields.iter().any(|b| { |
683 | 0 | a.0 == b.0 |
684 | 0 | && a.1.is_nullable() == b.1.is_nullable() |
685 | 0 | && a.1.data_type().equals_datatype(b.1.data_type()) |
686 | 0 | }) |
687 | 0 | }) |
688 | | } |
689 | 0 | _ => self == other, |
690 | | } |
691 | 0 | } |
692 | | |
693 | | /// Returns the byte width of this type if it is a primitive type |
694 | | /// |
695 | | /// Returns `None` if not a primitive type |
696 | | #[inline] |
697 | 166 | pub fn primitive_width(&self) -> Option<usize> { |
698 | 0 | match self { |
699 | 1 | DataType::Null => None, |
700 | 1 | DataType::Boolean => None, |
701 | 3 | DataType::Int8 | DataType::UInt8 => Some(1), |
702 | 1 | DataType::Int16 | DataType::UInt16 | DataType::Float16 => Some(2), |
703 | 64 | DataType::Int32 | DataType::UInt32 | DataType::Float32 => Some(4), |
704 | 12 | DataType::Int64 | DataType::UInt64 | DataType::Float64 => Some(8), |
705 | 16 | DataType::Timestamp(_, _) => Some(8), |
706 | 0 | DataType::Date32 | DataType::Time32(_) => Some(4), |
707 | 0 | DataType::Date64 | DataType::Time64(_) => Some(8), |
708 | 0 | DataType::Duration(_) => Some(8), |
709 | 0 | DataType::Interval(IntervalUnit::YearMonth) => Some(4), |
710 | 0 | DataType::Interval(IntervalUnit::DayTime) => Some(8), |
711 | 0 | DataType::Interval(IntervalUnit::MonthDayNano) => Some(16), |
712 | 0 | DataType::Decimal32(_, _) => Some(4), |
713 | 0 | DataType::Decimal64(_, _) => Some(8), |
714 | 16 | DataType::Decimal128(_, _) => Some(16), |
715 | 0 | DataType::Decimal256(_, _) => Some(32), |
716 | 33 | DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => None, |
717 | 0 | DataType::Binary | DataType::LargeBinary | DataType::BinaryView => None, |
718 | 1 | DataType::FixedSizeBinary(_) => None, |
719 | | DataType::List(_) |
720 | | | DataType::ListView(_) |
721 | | | DataType::LargeList(_) |
722 | | | DataType::LargeListView(_) |
723 | 0 | | DataType::Map(_, _) => None, |
724 | 0 | DataType::FixedSizeList(_, _) => None, |
725 | 1 | DataType::Struct(_) => None, |
726 | 5 | DataType::Union(_, _) => None, |
727 | 11 | DataType::Dictionary(_, _) => None, |
728 | 1 | DataType::RunEndEncoded(_, _) => None, |
729 | | } |
730 | 166 | } |
731 | | |
732 | | /// Return size of this instance in bytes. |
733 | | /// |
734 | | /// Includes the size of `Self`. |
735 | 0 | pub fn size(&self) -> usize { |
736 | 0 | std::mem::size_of_val(self) |
737 | 0 | + match self { |
738 | | DataType::Null |
739 | | | DataType::Boolean |
740 | | | DataType::Int8 |
741 | | | DataType::Int16 |
742 | | | DataType::Int32 |
743 | | | DataType::Int64 |
744 | | | DataType::UInt8 |
745 | | | DataType::UInt16 |
746 | | | DataType::UInt32 |
747 | | | DataType::UInt64 |
748 | | | DataType::Float16 |
749 | | | DataType::Float32 |
750 | | | DataType::Float64 |
751 | | | DataType::Date32 |
752 | | | DataType::Date64 |
753 | | | DataType::Time32(_) |
754 | | | DataType::Time64(_) |
755 | | | DataType::Duration(_) |
756 | | | DataType::Interval(_) |
757 | | | DataType::Binary |
758 | | | DataType::FixedSizeBinary(_) |
759 | | | DataType::LargeBinary |
760 | | | DataType::BinaryView |
761 | | | DataType::Utf8 |
762 | | | DataType::LargeUtf8 |
763 | | | DataType::Utf8View |
764 | | | DataType::Decimal32(_, _) |
765 | | | DataType::Decimal64(_, _) |
766 | | | DataType::Decimal128(_, _) |
767 | 0 | | DataType::Decimal256(_, _) => 0, |
768 | 0 | DataType::Timestamp(_, s) => s.as_ref().map(|s| s.len()).unwrap_or_default(), |
769 | 0 | DataType::List(field) |
770 | 0 | | DataType::ListView(field) |
771 | 0 | | DataType::FixedSizeList(field, _) |
772 | 0 | | DataType::LargeList(field) |
773 | 0 | | DataType::LargeListView(field) |
774 | 0 | | DataType::Map(field, _) => field.size(), |
775 | 0 | DataType::Struct(fields) => fields.size(), |
776 | 0 | DataType::Union(fields, _) => fields.size(), |
777 | 0 | DataType::Dictionary(dt1, dt2) => dt1.size() + dt2.size(), |
778 | 0 | DataType::RunEndEncoded(run_ends, values) => { |
779 | 0 | run_ends.size() - std::mem::size_of_val(run_ends) + values.size() |
780 | 0 | - std::mem::size_of_val(values) |
781 | | } |
782 | | } |
783 | 0 | } |
784 | | |
785 | | /// Check to see if `self` is a superset of `other` |
786 | | /// |
787 | | /// If DataType is a nested type, then it will check to see if the nested type is a superset of the other nested type |
788 | | /// else it will check to see if the DataType is equal to the other DataType |
789 | 0 | pub fn contains(&self, other: &DataType) -> bool { |
790 | 0 | match (self, other) { |
791 | 0 | (DataType::List(f1), DataType::List(f2)) |
792 | 0 | | (DataType::LargeList(f1), DataType::LargeList(f2)) |
793 | 0 | | (DataType::ListView(f1), DataType::ListView(f2)) |
794 | 0 | | (DataType::LargeListView(f1), DataType::LargeListView(f2)) => f1.contains(f2), |
795 | 0 | (DataType::FixedSizeList(f1, s1), DataType::FixedSizeList(f2, s2)) => { |
796 | 0 | s1 == s2 && f1.contains(f2) |
797 | | } |
798 | 0 | (DataType::Map(f1, s1), DataType::Map(f2, s2)) => s1 == s2 && f1.contains(f2), |
799 | 0 | (DataType::Struct(f1), DataType::Struct(f2)) => f1.contains(f2), |
800 | 0 | (DataType::Union(f1, s1), DataType::Union(f2, s2)) => { |
801 | 0 | s1 == s2 |
802 | 0 | && f1 |
803 | 0 | .iter() |
804 | 0 | .all(|f1| f2.iter().any(|f2| f1.0 == f2.0 && f1.1.contains(f2.1))) |
805 | | } |
806 | 0 | (DataType::Dictionary(k1, v1), DataType::Dictionary(k2, v2)) => { |
807 | 0 | k1.contains(k2) && v1.contains(v2) |
808 | | } |
809 | 0 | _ => self == other, |
810 | | } |
811 | 0 | } |
812 | | |
813 | | /// Create a [`DataType::List`] with elements of the specified type |
814 | | /// and nullability, and conventionally named inner [`Field`] (`"item"`). |
815 | | /// |
816 | | /// To specify field level metadata, construct the inner [`Field`] |
817 | | /// directly via [`Field::new`] or [`Field::new_list_field`]. |
818 | 0 | pub fn new_list(data_type: DataType, nullable: bool) -> Self { |
819 | 0 | DataType::List(Arc::new(Field::new_list_field(data_type, nullable))) |
820 | 0 | } |
821 | | |
822 | | /// Create a [`DataType::LargeList`] with elements of the specified type |
823 | | /// and nullability, and conventionally named inner [`Field`] (`"item"`). |
824 | | /// |
825 | | /// To specify field level metadata, construct the inner [`Field`] |
826 | | /// directly via [`Field::new`] or [`Field::new_list_field`]. |
827 | 0 | pub fn new_large_list(data_type: DataType, nullable: bool) -> Self { |
828 | 0 | DataType::LargeList(Arc::new(Field::new_list_field(data_type, nullable))) |
829 | 0 | } |
830 | | |
831 | | /// Create a [`DataType::FixedSizeList`] with elements of the specified type, size |
832 | | /// and nullability, and conventionally named inner [`Field`] (`"item"`). |
833 | | /// |
834 | | /// To specify field level metadata, construct the inner [`Field`] |
835 | | /// directly via [`Field::new`] or [`Field::new_list_field`]. |
836 | 2 | pub fn new_fixed_size_list(data_type: DataType, size: i32, nullable: bool) -> Self { |
837 | 2 | DataType::FixedSizeList(Arc::new(Field::new_list_field(data_type, nullable)), size) |
838 | 2 | } |
839 | | } |
840 | | |
// Limits and defaults for [DataType::Decimal32]

/// Largest precision a [DataType::Decimal32] value can have
pub const DECIMAL32_MAX_PRECISION: u8 = 9;

/// Largest scale a [DataType::Decimal32] value can have
pub const DECIMAL32_MAX_SCALE: i8 = 9;

/// Scale used by default for [DataType::Decimal32] values
pub const DECIMAL32_DEFAULT_SCALE: i8 = 2;

// Limits and defaults for [DataType::Decimal64]

/// Largest precision a [DataType::Decimal64] value can have
pub const DECIMAL64_MAX_PRECISION: u8 = 18;

/// Largest scale a [DataType::Decimal64] value can have
pub const DECIMAL64_MAX_SCALE: i8 = 18;

/// Scale used by default for [DataType::Decimal64] values
pub const DECIMAL64_DEFAULT_SCALE: i8 = 6;

// Limits for [DataType::Decimal128]

/// Largest precision a [DataType::Decimal128] value can have
pub const DECIMAL128_MAX_PRECISION: u8 = 38;

/// Largest scale a [DataType::Decimal128] value can have
pub const DECIMAL128_MAX_SCALE: i8 = 38;

// Limits for [DataType::Decimal256]

/// Largest precision a [DataType::Decimal256] value can have
pub const DECIMAL256_MAX_PRECISION: u8 = 76;

/// Largest scale a [DataType::Decimal256] value can have
pub const DECIMAL256_MAX_SCALE: i8 = 76;

/// Scale used by default for both [DataType::Decimal128] and
/// [DataType::Decimal256] values
pub const DECIMAL_DEFAULT_SCALE: i8 = 10;
874 | | |
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    #[cfg(feature = "serde")]
    fn serde_struct_type() {
        use std::collections::HashMap;

        // A field with one metadata entry: the entry shows up as a JSON object { ... }
        let metadata = HashMap::from([("k".to_string(), "v".to_string())]);
        let first_name = Field::new("first_name", DataType::Utf8, false).with_metadata(metadata);

        // A field with an empty metadata map: the expected output below contains it
        // as an empty JSON object `{}`
        let last_name = Field::new("last_name", DataType::Utf8, false).with_metadata(HashMap::new());

        let address = Field::new(
            "address",
            DataType::Struct(Fields::from(vec![
                Field::new("street", DataType::Utf8, false),
                Field::new("zip", DataType::UInt16, false),
            ])),
            false,
        );

        let person = DataType::Struct(Fields::from(vec![first_name, last_name, address]));

        let serialized = serde_json::to_string(&person).unwrap();

        // NOTE: what is checked here is the default (derived) serialization format,
        // not the JSON format specified in metadata.md
        let expected = "{\"Struct\":[\
             {\"name\":\"first_name\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{\"k\":\"v\"}},\
             {\"name\":\"last_name\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{}},\
             {\"name\":\"address\",\"data_type\":{\"Struct\":\
             [{\"name\":\"street\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{}},\
             {\"name\":\"zip\",\"data_type\":\"UInt16\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{}}\
             ]},\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{}}]}";
        assert_eq!(expected, serialized);

        // The serialized form must round-trip back to the same type
        let deserialized: DataType = serde_json::from_str(&serialized).unwrap();
        assert_eq!(person, deserialized);
    }

    #[test]
    fn test_list_datatype_equality() {
        // `equals_datatype` must ignore the names of nested fields while still
        // comparing their types and nullability

        // --- List ---
        let list_a = DataType::new_list(DataType::Int32, true);
        let list_b = DataType::List(Arc::new(Field::new("array", DataType::Int32, true)));
        let list_c = DataType::new_list(DataType::Int32, false);
        let list_d = DataType::new_list(DataType::UInt32, true);
        assert!(list_a.equals_datatype(&list_b));
        assert!(!list_a.equals_datatype(&list_c));
        assert!(!list_b.equals_datatype(&list_c));
        assert!(!list_a.equals_datatype(&list_d));

        // --- FixedSizeList ---
        let list_e = DataType::new_fixed_size_list(list_a.clone(), 3, false);
        let list_f =
            DataType::FixedSizeList(Arc::new(Field::new("array", list_b.clone(), false)), 3);
        let list_g = DataType::new_fixed_size_list(DataType::FixedSizeBinary(3), 3, true);
        assert!(list_e.equals_datatype(&list_f));
        assert!(!list_e.equals_datatype(&list_g));
        assert!(!list_f.equals_datatype(&list_g));

        // --- Struct ---
        let struct_of = |fields: Vec<Field>| DataType::Struct(Fields::from(fields));
        let list_h = struct_of(vec![Field::new("f1", list_e.clone(), true)]);
        let list_i = struct_of(vec![Field::new("f1", list_f.clone(), true)]);
        let list_j = struct_of(vec![Field::new("f1", list_f.clone(), false)]);
        let list_k = struct_of(vec![
            Field::new("f1", list_f.clone(), false),
            Field::new("f2", list_g.clone(), false),
            Field::new("f3", DataType::Utf8, true),
        ]);
        let list_l = struct_of(vec![
            Field::new("ff1", list_f.clone(), false),
            Field::new("ff2", list_g.clone(), false),
            Field::new("ff3", DataType::LargeUtf8, true),
        ]);
        let list_m = struct_of(vec![
            Field::new("ff1", list_f.clone(), false),
            Field::new("ff2", list_g.clone(), false),
            Field::new("ff3", DataType::Utf8, true),
        ]);
        assert!(list_h.equals_datatype(&list_i));
        assert!(!list_h.equals_datatype(&list_j));
        assert!(!list_k.equals_datatype(&list_l));
        assert!(list_k.equals_datatype(&list_m));

        // --- Map ---
        let map_of = |name: &str, entries: &DataType, nullable: bool, sorted: bool| {
            DataType::Map(Arc::new(Field::new(name, entries.clone(), nullable)), sorted)
        };
        let list_n = map_of("f1", &list_a, true, true);
        let list_o = map_of("f2", &list_b, true, true);
        let list_p = map_of("f2", &list_b, true, false);
        let list_q = map_of("f2", &list_c, true, true);
        let list_r = map_of("f1", &list_a, false, true);
        assert!(list_n.equals_datatype(&list_o));
        assert!(!list_n.equals_datatype(&list_p));
        assert!(!list_n.equals_datatype(&list_q));
        assert!(!list_n.equals_datatype(&list_r));

        // --- Dictionary ---
        let dictionary_of = |key: DataType, value: &DataType| {
            DataType::Dictionary(Box::new(key), Box::new(value.clone()))
        };
        let list_s = dictionary_of(DataType::UInt8, &list_a);
        let list_t = dictionary_of(DataType::UInt8, &list_b);
        let list_u = dictionary_of(DataType::Int8, &list_b);
        let list_v = dictionary_of(DataType::UInt8, &list_c);
        assert!(list_s.equals_datatype(&list_t));
        assert!(!list_s.equals_datatype(&list_u));
        assert!(!list_s.equals_datatype(&list_v));

        // --- Union ---
        let sparse_union = |type_ids: Vec<i8>, fields: Vec<Field>| {
            DataType::Union(UnionFields::new(type_ids, fields), UnionMode::Sparse)
        };
        let union_a = sparse_union(
            vec![1, 2],
            vec![
                Field::new("f1", DataType::Utf8, false),
                Field::new("f2", DataType::UInt8, false),
            ],
        );
        // same as `union_a` apart from the field names
        let union_b = sparse_union(
            vec![1, 2],
            vec![
                Field::new("ff1", DataType::Utf8, false),
                Field::new("ff2", DataType::UInt8, false),
            ],
        );
        // same (type id, type) pairs as `union_a`, listed in the opposite order
        let union_c = sparse_union(
            vec![2, 1],
            vec![
                Field::new("fff2", DataType::UInt8, false),
                Field::new("fff1", DataType::Utf8, false),
            ],
        );
        // type ids paired with different types than in `union_a`
        let union_d = sparse_union(
            vec![2, 1],
            vec![
                Field::new("fff1", DataType::Int8, false),
                Field::new("fff2", DataType::UInt8, false),
            ],
        );
        // differs from `union_a` only in the nullability of the first field
        let union_e = sparse_union(
            vec![1, 2],
            vec![
                Field::new("f1", DataType::Utf8, true),
                Field::new("f2", DataType::UInt8, false),
            ],
        );
        assert!(union_a.equals_datatype(&union_b));
        assert!(union_a.equals_datatype(&union_c));
        assert!(!union_a.equals_datatype(&union_d));
        assert!(!union_a.equals_datatype(&union_e));

        // --- RunEndEncoded ---
        let run_end_encoded = |run_ends: Field, values: Field| {
            DataType::RunEndEncoded(Arc::new(run_ends), Arc::new(values))
        };
        let list_w = run_end_encoded(
            Field::new("f1", DataType::Int64, true),
            Field::new("f2", DataType::Utf8, true),
        );
        let list_x = run_end_encoded(
            Field::new("ff1", DataType::Int64, true),
            Field::new("ff2", DataType::Utf8, true),
        );
        let list_y = run_end_encoded(
            Field::new("ff1", DataType::UInt16, true),
            Field::new("ff2", DataType::Utf8, true),
        );
        let list_z = run_end_encoded(
            Field::new("f1", DataType::Int64, false),
            Field::new("f2", DataType::Utf8, true),
        );
        assert!(list_w.equals_datatype(&list_x));
        assert!(!list_w.equals_datatype(&list_y));
        assert!(!list_w.equals_datatype(&list_z));
    }

    #[test]
    fn create_struct_type() {
        // Only checks that a nested struct type can be constructed
        let address = DataType::Struct(Fields::from(vec![
            Field::new("street", DataType::Utf8, false),
            Field::new("zip", DataType::UInt16, false),
        ]));
        let _person = DataType::Struct(Fields::from(vec![
            Field::new("first_name", DataType::Utf8, false),
            Field::new("last_name", DataType::Utf8, false),
            Field::new("address", address, false),
        ]));
    }

    #[test]
    fn test_nested() {
        let utf8_item = || Arc::new(Field::new("foo", DataType::Utf8, true));
        let list = DataType::List(utf8_item());
        let list_view = DataType::ListView(utf8_item());
        let large_list_view = DataType::LargeListView(utf8_item());

        // Plain types are not nested
        assert!(!DataType::Boolean.is_nested());
        assert!(!DataType::Int32.is_nested());
        assert!(!DataType::Utf8.is_nested());

        // List flavours are nested
        assert!(list.is_nested());
        assert!(list_view.is_nested());
        assert!(large_list_view.is_nested());

        // For the dictionaries below, the result follows the value type
        let int32_keyed =
            |values: DataType| DataType::Dictionary(Box::new(DataType::Int32), Box::new(values));
        assert!(!int32_keyed(DataType::Boolean).is_nested());
        assert!(!int32_keyed(DataType::Int64).is_nested());
        assert!(!int32_keyed(DataType::LargeUtf8).is_nested());
        assert!(int32_keyed(list).is_nested());
    }

    #[test]
    fn test_integer() {
        let (signed, unsigned, float) = (DataType::Int32, DataType::UInt64, DataType::Float16);

        // is_integer
        assert!(signed.is_integer());
        assert!(unsigned.is_integer());
        assert!(!float.is_integer());

        // is_signed_integer
        assert!(signed.is_signed_integer());
        assert!(!unsigned.is_signed_integer());
        assert!(!float.is_signed_integer());

        // is_unsigned_integer
        assert!(!signed.is_unsigned_integer());
        assert!(unsigned.is_unsigned_integer());
        assert!(!float.is_unsigned_integer());

        // is_dictionary_key_type
        assert!(signed.is_dictionary_key_type());
        assert!(unsigned.is_dictionary_key_type());
        assert!(!float.is_dictionary_key_type());
    }

    #[test]
    fn test_floating() {
        assert!(DataType::Float16.is_floating());
        assert!(!DataType::Int32.is_floating());
    }

    #[test]
    fn test_datatype_is_null() {
        assert!(DataType::Null.is_null());
        assert!(!DataType::Int32.is_null());
    }

    #[test]
    fn size_should_not_regress() {
        // Pins the in-memory size of `DataType` so growth is noticed
        let bytes = std::mem::size_of::<DataType>();
        assert_eq!(bytes, 24);
    }

    #[test]
    #[should_panic(expected = "duplicate type id: 1")]
    fn test_union_with_duplicated_type_id() {
        // Both fields are assigned type id 1
        let fields = vec![
            Field::new("f1", DataType::Int32, false),
            Field::new("f2", DataType::Utf8, false),
        ];
        let _union = DataType::Union(UnionFields::new(vec![1, 1], fields), UnionMode::Dense);
    }

    #[test]
    fn test_try_from_str() {
        let parsed: DataType = "Int32".try_into().unwrap();
        assert_eq!(parsed, DataType::Int32);
    }

    #[test]
    fn test_from_str() {
        let parsed = "UInt64".parse::<DataType>().unwrap();
        assert_eq!(parsed, DataType::UInt64);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Can't handle the inlined strings of the assert_debug_snapshot macro
    fn test_debug_format_field() {
        // The `Debug` output of a `DataType` should stay short and readable
        insta::assert_debug_snapshot!(DataType::new_list(DataType::Int8, false), @r"
        List(
            Field {
                data_type: Int8,
            },
        )
        ");
    }
}