Coverage Report

Created: 2025-08-26 07:03

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/Users/andrewlamb/Software/arrow-rs/arrow-schema/src/datatype.rs
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
use std::fmt;
19
use std::str::FromStr;
20
use std::sync::Arc;
21
22
use crate::{ArrowError, Field, FieldRef, Fields, UnionFields};
23
24
/// Datatypes supported by this implementation of Apache Arrow.
25
///
26
/// The variants of this enum include primitive fixed size types as well as
27
/// parametric or nested types. See [`Schema.fbs`] for Arrow's specification.
28
///
29
/// # Examples
30
///
31
/// Primitive types
32
/// ```
33
/// # use arrow_schema::DataType;
34
/// // create a new 32-bit signed integer
35
/// let data_type = DataType::Int32;
36
/// ```
37
///
38
/// Nested Types
39
/// ```
40
/// # use arrow_schema::{DataType, Field};
41
/// # use std::sync::Arc;
42
/// // create a new list of 32-bit signed integers directly
43
/// let list_data_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true)));
44
/// // Create the same list type with constructor
45
/// let list_data_type2 = DataType::new_list(DataType::Int32, true);
46
/// assert_eq!(list_data_type, list_data_type2);
47
/// ```
48
///
49
/// Dictionary Types
50
/// ```
51
/// # use arrow_schema::{DataType};
52
/// // String Dictionary (key type Int32 and value type Utf8)
53
/// let data_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));
54
/// ```
55
///
56
/// Timestamp Types
57
/// ```
58
/// # use arrow_schema::{DataType, TimeUnit};
59
/// // timestamp with millisecond precision without timezone specified
60
/// let data_type = DataType::Timestamp(TimeUnit::Millisecond, None);
61
/// // timestamp with nanosecond precision in UTC timezone
62
/// let data_type = DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".into()));
63
/// ```
64
///
65
/// # Display and FromStr
66
///
67
/// The `Display` and `FromStr` implementations for `DataType` are
68
/// human-readable, parseable, and reversible.
69
///
70
/// ```
71
/// # use arrow_schema::DataType;
72
/// let data_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));
73
/// let data_type_string = data_type.to_string();
74
/// assert_eq!(data_type_string, "Dictionary(Int32, Utf8)");
75
/// // display can be parsed back into the original type
76
/// let parsed_data_type: DataType = data_type.to_string().parse().unwrap();
77
/// assert_eq!(data_type, parsed_data_type);
78
/// ```
79
///
80
/// # Nested Support
81
/// Currently, the Rust implementation supports the following nested types:
82
///  - `List<T>`
83
///  - `LargeList<T>`
84
///  - `FixedSizeList<T>`
85
///  - `Struct<T, U, V, ...>`
86
///  - `Union<T, U, V, ...>`
87
///  - `Map<K, V>`
88
///
89
/// Nested types can themselves be nested within other arrays.
90
/// For more information on these types please see
91
/// [the physical memory layout of Apache Arrow]
92
///
93
/// [`Schema.fbs`]: https://github.com/apache/arrow/blob/main/format/Schema.fbs
94
/// [the physical memory layout of Apache Arrow]: https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout
95
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
96
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
97
pub enum DataType {
98
    /// Null type
99
    Null,
100
    /// A boolean datatype representing the values `true` and `false`.
101
    Boolean,
102
    /// A signed 8-bit integer.
103
    Int8,
104
    /// A signed 16-bit integer.
105
    Int16,
106
    /// A signed 32-bit integer.
107
    Int32,
108
    /// A signed 64-bit integer.
109
    Int64,
110
    /// An unsigned 8-bit integer.
111
    UInt8,
112
    /// An unsigned 16-bit integer.
113
    UInt16,
114
    /// An unsigned 32-bit integer.
115
    UInt32,
116
    /// An unsigned 64-bit integer.
117
    UInt64,
118
    /// A 16-bit floating point number.
119
    Float16,
120
    /// A 32-bit floating point number.
121
    Float32,
122
    /// A 64-bit floating point number.
123
    Float64,
124
    /// A timestamp with an optional timezone.
125
    ///
126
    /// Time is measured as a Unix epoch, counting the seconds from
127
    /// 00:00:00.000 on 1 January 1970, excluding leap seconds,
128
    /// as a signed 64-bit integer.
129
    ///
130
    /// The time zone is a string indicating the name of a time zone, one of:
131
    ///
132
    /// * As used in the Olson time zone database (the "tz database" or
133
    ///   "tzdata"), such as "America/New_York"
134
    /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
135
    ///
136
    /// Timestamps with a non-empty timezone
137
    /// ------------------------------------
138
    ///
139
    /// If a Timestamp column has a non-empty timezone value, its epoch is
140
    /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in the *UTC* timezone
141
    /// (the Unix epoch), regardless of the Timestamp's own timezone.
142
    ///
143
    /// Therefore, timestamp values with a non-empty timezone correspond to
144
    /// physical points in time together with some additional information about
145
    /// how the data was obtained and/or how to display it (the timezone).
146
    ///
147
    ///   For example, the timestamp value 0 with the timezone string "Europe/Paris"
148
    ///   corresponds to "January 1st 1970, 00h00" in the UTC timezone, but the
149
    ///   application may prefer to display it as "January 1st 1970, 01h00" in
150
    ///   the Europe/Paris timezone (which is the same physical point in time).
151
    ///
152
    /// One consequence is that timestamp values with a non-empty timezone
153
    /// can be compared and ordered directly, since they all share the same
154
    /// well-known point of reference (the Unix epoch).
155
    ///
156
    /// Timestamps with an unset / empty timezone
157
    /// -----------------------------------------
158
    ///
159
    /// If a Timestamp column has no timezone value, its epoch is
160
    /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in an *unknown* timezone.
161
    ///
162
    /// Therefore, timestamp values without a timezone cannot be meaningfully
163
    /// interpreted as physical points in time, but only as calendar / clock
164
    /// indications ("wall clock time") in an unspecified timezone.
165
    ///
166
    ///   For example, the timestamp value 0 with an empty timezone string
167
    ///   corresponds to "January 1st 1970, 00h00" in an unknown timezone: there
168
    ///   is not enough information to interpret it as a well-defined physical
169
    ///   point in time.
170
    ///
171
    /// One consequence is that timestamp values without a timezone cannot
172
    /// be reliably compared or ordered, since they may have different points of
173
    /// reference.  In particular, it is *not* possible to interpret an unset
174
    /// or empty timezone as the same as "UTC".
175
    ///
176
    /// Conversion between timezones
177
    /// ----------------------------
178
    ///
179
    /// If a Timestamp column has a non-empty timezone, changing the timezone
180
    /// to a different non-empty value is a metadata-only operation:
181
    /// the timestamp values need not change as their point of reference remains
182
    /// the same (the Unix epoch).
183
    ///
184
    /// However, if a Timestamp column has no timezone value, changing it to a
185
    /// non-empty value requires thinking about the desired semantics.
186
    /// One possibility is to assume that the original timestamp values are
187
    /// relative to the epoch of the timezone being set; timestamp values should
188
    /// then be adjusted to the Unix epoch (for example, changing the timezone from
189
    /// empty to "Europe/Paris" would require converting the timestamp values
190
    /// from "Europe/Paris" to "UTC", which seems counter-intuitive but is
191
    /// nevertheless correct).
192
    ///
193
    /// ```
194
    /// # use arrow_schema::{DataType, TimeUnit};
195
    /// DataType::Timestamp(TimeUnit::Second, None);
196
    /// DataType::Timestamp(TimeUnit::Second, Some("literal".into()));
197
    /// DataType::Timestamp(TimeUnit::Second, Some("string".to_string().into()));
198
    /// ```
199
    ///
200
    /// Timezone representation
201
    /// ----------------------------
202
    /// It is possible to use either the timezone string representation, such as "UTC", or the absolute time zone offset "+00:00".
203
    /// For timezones with fixed offsets, such as "UTC" or "JST", the offset representation is recommended, as it is more explicit and less ambiguous.
204
    ///
205
    /// Most arrow-rs functionalities use the absolute offset representation,
206
    /// such as [`PrimitiveArray::with_timezone_utc`] that applies a
207
    /// UTC timezone to timestamp arrays.
208
    ///
209
    /// [`PrimitiveArray::with_timezone_utc`]: https://docs.rs/arrow/latest/arrow/array/struct.PrimitiveArray.html#method.with_timezone_utc
210
    ///
211
    /// Timezone string parsing
212
    /// -----------------------
213
    /// When feature `chrono-tz` is not enabled, allowed timezone strings are fixed offsets of the form "+09:00", "-09" or "+0930".
214
    ///
215
    /// When feature `chrono-tz` is enabled, additional strings supported by [chrono_tz](https://docs.rs/chrono-tz/latest/chrono_tz/)
216
    /// are also allowed, which include [IANA database](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones)
217
    /// timezones.
218
    Timestamp(TimeUnit, Option<Arc<str>>),
219
    /// A signed 32-bit date representing the elapsed time since UNIX epoch (1970-01-01)
220
    /// in days.
221
    Date32,
222
    /// A signed 64-bit date representing the elapsed time since UNIX epoch (1970-01-01)
223
    /// in milliseconds.
224
    ///
225
    /// # Valid Ranges
226
    ///
227
    /// According to the Arrow specification ([Schema.fbs]), values of Date64
228
    /// are treated as the number of *days*, in milliseconds, since the UNIX
229
    /// epoch. Therefore, values of this type must be evenly divisible by
230
    /// `86_400_000`, the number of milliseconds in a standard day.
231
    ///
232
    /// It is not valid to store milliseconds that do not represent an exact
233
    /// day. The reason for this restriction is compatibility with other
234
    /// languages' native libraries (specifically Java), which historically
235
    /// lacked a dedicated date type and only supported timestamps.
236
    ///
237
    /// # Validation
238
    ///
239
    /// This library does not validate or enforce that Date64 values are evenly
240
    /// divisible by `86_400_000` for performance and usability reasons. Date64
241
    /// values are treated similarly to `Timestamp(TimeUnit::Millisecond,
242
    /// None)`: values will be displayed with a time of day if the value does
243
    /// not represent an exact day, and arithmetic will be done at the
244
    /// millisecond granularity.
245
    ///
246
    /// # Recommendation
247
    ///
248
    /// Users should prefer [`Date32`] to cleanly represent the number
249
    /// of days, or one of the Timestamp variants to include time as part of the
250
    /// representation, depending on their use case.
251
    ///
252
    /// # Further Reading
253
    ///
254
    /// For more details, see [#5288](https://github.com/apache/arrow-rs/issues/5288).
255
    ///
256
    /// [`Date32`]: Self::Date32
257
    /// [Schema.fbs]: https://github.com/apache/arrow/blob/main/format/Schema.fbs
258
    Date64,
259
    /// A signed 32-bit time representing the elapsed time since midnight in the unit of `TimeUnit`.
260
    /// Must be either seconds or milliseconds.
261
    Time32(TimeUnit),
262
    /// A signed 64-bit time representing the elapsed time since midnight in the unit of `TimeUnit`.
263
    /// Must be either microseconds or nanoseconds.
264
    Time64(TimeUnit),
265
    /// Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds.
266
    Duration(TimeUnit),
267
    /// A "calendar" interval which models types that don't necessarily
268
    /// have a precise duration without the context of a base timestamp (e.g.
269
    /// days can differ in length during day light savings time transitions).
270
    Interval(IntervalUnit),
271
    /// Opaque binary data of variable length.
272
    ///
273
    /// A single Binary array can store up to [`i32::MAX`] bytes
274
    /// of binary data in total.
275
    Binary,
276
    /// Opaque binary data of fixed size.
277
    /// Enum parameter specifies the number of bytes per value.
278
    FixedSizeBinary(i32),
279
    /// Opaque binary data of variable length and 64-bit offsets.
280
    ///
281
    /// A single LargeBinary array can store up to [`i64::MAX`] bytes
282
    /// of binary data in total.
283
    LargeBinary,
284
    /// Opaque binary data of variable length.
285
    ///
286
    /// Logically the same as [`Binary`], but the internal representation uses a view
287
    /// struct that contains the string length and either the string's entire data
288
    /// inline (for small strings) or an inlined prefix, an index of another buffer,
289
    /// and an offset pointing to a slice in that buffer (for non-small strings).
290
    ///
291
    /// [`Binary`]: Self::Binary
292
    BinaryView,
293
    /// A variable-length string in Unicode with UTF-8 encoding.
294
    ///
295
    /// A single Utf8 array can store up to [`i32::MAX`] bytes
296
    /// of string data in total.
297
    Utf8,
298
    /// A variable-length string in Unicode with UTF-8 encoding and 64-bit offsets.
299
    ///
300
    /// A single LargeUtf8 array can store up to [`i64::MAX`] bytes
301
    /// of string data in total.
302
    LargeUtf8,
303
    /// A variable-length string in Unicode with UTF-8 encoding
304
    ///
305
    /// Logically the same as [`Utf8`], but the internal representation uses a view
306
    /// struct that contains the string length and either the string's entire data
307
    /// inline (for small strings) or an inlined prefix, an index of another buffer,
308
    /// and an offset pointing to a slice in that buffer (for non-small strings).
309
    ///
310
    /// [`Utf8`]: Self::Utf8
311
    Utf8View,
312
    /// A list of some logical data type with variable length.
313
    ///
314
    /// A single List array can store up to [`i32::MAX`] elements in total.
315
    List(FieldRef),
316
317
    /// (NOT YET FULLY SUPPORTED)  A list of some logical data type with variable length.
318
    ///
319
    /// Logically the same as [`List`], but the internal representation differs in how child
320
    /// data is referenced, allowing flexibility in how data is laid out.
321
    ///
322
    /// Note this data type is not yet fully supported. Using it with arrow APIs may result in `panic`s.
323
    ///
324
    /// [`List`]: Self::List
325
    ListView(FieldRef),
326
    /// A list of some logical data type with fixed length.
327
    FixedSizeList(FieldRef, i32),
328
    /// A list of some logical data type with variable length and 64-bit offsets.
329
    ///
330
    /// A single LargeList array can store up to [`i64::MAX`] elements in total.
331
    LargeList(FieldRef),
332
333
    /// (NOT YET FULLY SUPPORTED)  A list of some logical data type with variable length and 64-bit offsets.
334
    ///
335
    /// Logically the same as [`LargeList`], but the internal representation differs in how child
336
    /// data is referenced, allowing flexibility in how data is laid out.
337
    ///
338
    /// Note this data type is not yet fully supported. Using it with arrow APIs may result in `panic`s.
339
    ///
340
    /// [`LargeList`]: Self::LargeList
341
    LargeListView(FieldRef),
342
    /// A nested datatype that contains a number of sub-fields.
343
    Struct(Fields),
344
    /// A nested datatype that can represent slots of differing types. Components:
345
    ///
346
    /// 1. [`UnionFields`]
347
    /// 2. The type of union (Sparse or Dense)
348
    Union(UnionFields, UnionMode),
349
    /// A dictionary encoded array (`key_type`, `value_type`), where
350
    /// each array element is an index of `key_type` into an
351
    /// associated dictionary of `value_type`.
352
    ///
353
    /// Dictionary arrays are used to store columns of `value_type`
354
    /// that contain many repeated values using less memory, but with
355
    /// a higher CPU overhead for some operations.
356
    ///
357
    /// This type is mostly used to represent low cardinality string
358
    /// arrays or a limited set of primitive types as integers.
359
    Dictionary(Box<DataType>, Box<DataType>),
360
    /// Exact 32-bit width decimal value with precision and scale
361
    ///
362
    /// * precision is the total number of digits
363
    /// * scale is the number of digits past the decimal
364
    ///
365
    /// For example the number 123.45 has precision 5 and scale 2.
366
    ///
367
    /// In certain situations, scale could be a negative number. For
368
    /// negative scale, it is the number of padding 0 to the right
369
    /// of the digits.
370
    ///
371
    /// For example the number 12300 could be treated as a decimal
372
    /// with precision 3 and scale -2.
373
    Decimal32(u8, i8),
374
    /// Exact 64-bit width decimal value with precision and scale
375
    ///
376
    /// * precision is the total number of digits
377
    /// * scale is the number of digits past the decimal
378
    ///
379
    /// For example the number 123.45 has precision 5 and scale 2.
380
    ///
381
    /// In certain situations, scale could be a negative number. For
382
    /// negative scale, it is the number of padding 0 to the right
383
    /// of the digits.
384
    ///
385
    /// For example the number 12300 could be treated as a decimal
386
    /// with precision 3 and scale -2.
387
    Decimal64(u8, i8),
388
    /// Exact 128-bit width decimal value with precision and scale
389
    ///
390
    /// * precision is the total number of digits
391
    /// * scale is the number of digits past the decimal
392
    ///
393
    /// For example the number 123.45 has precision 5 and scale 2.
394
    ///
395
    /// In certain situations, scale could be a negative number. For
396
    /// negative scale, it is the number of padding 0 to the right
397
    /// of the digits.
398
    ///
399
    /// For example the number 12300 could be treated as a decimal
400
    /// with precision 3 and scale -2.
401
    Decimal128(u8, i8),
402
    /// Exact 256-bit width decimal value with precision and scale
403
    ///
404
    /// * precision is the total number of digits
405
    /// * scale is the number of digits past the decimal
406
    ///
407
    /// For example the number 123.45 has precision 5 and scale 2.
408
    ///
409
    /// In certain situations, scale could be a negative number. For
410
    /// negative scale, it is the number of padding 0 to the right
411
    /// of the digits.
412
    ///
413
    /// For example the number 12300 could be treated as a decimal
414
    /// with precision 3 and scale -2.
415
    Decimal256(u8, i8),
416
    /// A Map is a logical nested type that is represented as
417
    ///
418
    /// `List<entries: Struct<key: K, value: V>>`
419
    ///
420
    /// The keys and values are each respectively contiguous.
421
    /// The key and value types are not constrained, but keys should be
422
    /// hashable and unique.
423
    /// Whether the keys are sorted can be set in the `bool` after the `Field`.
424
    ///
425
    /// In a field with Map type, the field has a child Struct field, which then
426
    /// has two children: the first the key type and the second the value type. The names of the
427
    /// child fields may be respectively "entries", "key", and "value", but this is
428
    /// not enforced.
429
    Map(FieldRef, bool),
430
    /// A run-end encoding (REE) is a variation of run-length encoding (RLE). These
431
    /// encodings are well-suited for representing data containing sequences of the
432
    /// same value, called runs. Each run is represented as a value and an integer giving
433
    /// the index in the array where the run ends.
434
    ///
435
    /// A run-end encoded array has no buffers by itself, but has two child arrays. The
436
    /// first child array, called the run ends array, holds either 16, 32, or 64-bit
437
    /// signed integers. The actual values of each run are held in the second child array.
438
    ///
439
    /// These child arrays are prescribed the standard names of "run_ends" and "values"
440
    /// respectively.
441
    RunEndEncoded(FieldRef, FieldRef),
442
}
443
444
/// The unit in which a temporal value is counted: seconds, milliseconds, microseconds or nanoseconds.
///
/// Used as the resolution parameter of [`DataType::Timestamp`], [`DataType::Time32`],
/// [`DataType::Time64`] and [`DataType::Duration`].
445
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
446
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
447
pub enum TimeUnit {
448
    /// Time in seconds.
449
    Second,
450
    /// Time in milliseconds.
451
    Millisecond,
452
    /// Time in microseconds.
453
    Microsecond,
454
    /// Time in nanoseconds.
455
    Nanosecond,
456
}
457
458
/// The representation of a [`DataType::Interval`] value: YEAR_MONTH, DAY_TIME or MONTH_DAY_NANO, in SQL style.
459
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
460
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
461
pub enum IntervalUnit {
462
    /// Indicates the number of elapsed whole months, stored as 4-byte integers.
463
    YearMonth,
464
    /// Indicates the number of elapsed days and milliseconds,
465
    /// stored as 2 contiguous 32-bit integers (days, milliseconds) (8-bytes in total).
466
    DayTime,
467
    /// A triple of the number of elapsed months, days, and nanoseconds.
468
    /// The values are stored contiguously in 16 byte blocks. Months and
469
    /// days are encoded as 32 bit integers and nanoseconds is encoded as a
470
    /// 64 bit integer. All integers are signed. Each field is independent
471
    /// (e.g. there is no constraint that nanoseconds have the same sign
472
    /// as days or that the quantity of nanoseconds represents less
473
    /// than a day's worth of time).
474
    MonthDayNano,
475
}
476
477
/// The layout of a [`DataType::Union`] (its second component): sparse or dense.
478
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Copy)]
479
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
480
pub enum UnionMode {
481
    /// Sparse union layout
482
    Sparse,
483
    /// Dense union layout
484
    Dense,
485
}
486
487
/// Human-readable rendering of a [`DataType`].
///
/// `Struct` is written as `Struct(name type, name type, ...)`: each field's name and
/// data type are formatted with `{}`, separated by a space, and the fields are joined
/// with `", "`. A struct without fields is written as `Struct()`. Every other variant
/// falls back to its `Debug` representation.
impl fmt::Display for DataType {
488
0
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
489
0
        match &self {
490
0
            DataType::Struct(fields) => {
491
0
                write!(f, "Struct(")?;
492
0
                if !fields.is_empty() {
493
0
                    let fields_str = fields
494
0
                        .iter()
495
0
                        .map(|f| format!("{} {}", f.name(), f.data_type())) // `f` is the field here; it shadows the formatter inside the closure
496
0
                        .collect::<Vec<_>>()
497
0
                        .join(", ");
498
0
                    write!(f, "{fields_str}")?;
499
0
                }
500
0
                write!(f, ")")?;
501
0
                Ok(())
502
            }
503
0
            _ => write!(f, "{self:?}"), // all non-struct variants reuse the derived `Debug` output
504
        }
505
0
    }
506
}
507
508
/// Parses `str` into a `DataType`.
509
///
510
/// This is the reverse of [`DataType`]'s `Display`
511
/// impl, and maintains the invariant that
512
/// `data_type.to_string().parse::<DataType>().unwrap() == data_type`
513
///
514
/// # Example
515
/// ```
516
/// use arrow_schema::DataType;
517
///
518
/// let data_type: DataType = "Int32".parse().unwrap();
519
/// assert_eq!(data_type, DataType::Int32);
520
/// ```
521
impl FromStr for DataType {
522
    type Err = ArrowError;
523
524
0
    fn from_str(s: &str) -> Result<Self, Self::Err> {
525
0
        crate::datatype_parse::parse_data_type(s) // all parsing (and any error reporting) is delegated to the `datatype_parse` module
526
0
    }
527
}
528
529
/// Converts a string slice into a [`DataType`].
///
/// This is a thin wrapper around the [`FromStr`] implementation: it calls
/// `value.parse()` and returns whatever that returns.
impl TryFrom<&str> for DataType {
530
    type Error = ArrowError;
531
532
0
    fn try_from(value: &str) -> Result<Self, Self::Error> {
533
0
        value.parse()
534
0
    }
535
}
536
537
impl DataType {
538
    /// Returns true if the type is primitive, i.e. numeric ([`Self::is_numeric`]) or temporal ([`Self::is_temporal`]).
539
    #[inline]
540
0
    pub fn is_primitive(&self) -> bool {
541
0
        self.is_numeric() || self.is_temporal()
542
0
    }
543
544
    /// Returns true if this type is numeric: any of the `UInt*`, `Int*`, `Float*` or `Decimal*` variants.
545
    #[inline]
546
0
    pub fn is_numeric(&self) -> bool {
547
        use DataType::*;
548
0
        matches!(
549
0
            self,
550
            UInt8
551
                | UInt16
552
                | UInt32
553
                | UInt64
554
                | Int8
555
                | Int16
556
                | Int32
557
                | Int64
558
                | Float16
559
                | Float32
560
                | Float64
561
                | Decimal32(_, _)
562
                | Decimal64(_, _)
563
                | Decimal128(_, _)
564
                | Decimal256(_, _)
565
        )
566
0
    }
567
568
    /// Returns true if this type is temporal: (`Date32`, `Date64`, `Timestamp`, `Time32`, `Time64`, `Duration`, or `Interval`).
569
    #[inline]
570
0
    pub fn is_temporal(&self) -> bool {
571
        use DataType::*;
572
0
        matches!(
573
0
            self,
574
            Date32 | Date64 | Timestamp(_, _) | Time32(_) | Time64(_) | Duration(_) | Interval(_)
575
        )
576
0
    }
577
578
    /// Returns true if this type is a floating point type: (`Float16`, `Float32` or `Float64`).
579
    #[inline]
580
0
    pub fn is_floating(&self) -> bool {
581
        use DataType::*;
582
0
        matches!(self, Float16 | Float32 | Float64)
583
0
    }
584
585
    /// Returns true if this type is an integer, signed or unsigned: (`Int*`, `UInt*`).
586
    #[inline]
587
0
    pub fn is_integer(&self) -> bool {
588
0
        self.is_signed_integer() || self.is_unsigned_integer()
589
0
    }
590
591
    /// Returns true if this type is a signed integer: (`Int8`, `Int16`, `Int32` or `Int64`).
592
    #[inline]
593
0
    pub fn is_signed_integer(&self) -> bool {
594
        use DataType::*;
595
0
        matches!(self, Int8 | Int16 | Int32 | Int64)
596
0
    }
597
598
    /// Returns true if this type is an unsigned integer: (`UInt8`, `UInt16`, `UInt32` or `UInt64`).
599
    #[inline]
600
0
    pub fn is_unsigned_integer(&self) -> bool {
601
        use DataType::*;
602
0
        matches!(self, UInt8 | UInt16 | UInt32 | UInt64)
603
0
    }
604
605
    /// Returns true if this type is valid as a dictionary key, i.e. it is any signed or unsigned integer type (see [`Self::is_integer`]).
606
    #[inline]
607
0
    pub fn is_dictionary_key_type(&self) -> bool {
608
0
        self.is_integer()
609
0
    }
610
611
    /// Returns true if this type is valid for the run-ends array in a RunArray: (`Int16`, `Int32` or `Int64`).
612
    #[inline]
613
0
    pub fn is_run_ends_type(&self) -> bool {
614
        use DataType::*;
615
0
        matches!(self, Int16 | Int32 | Int64)
616
0
    }
617
618
    /// Returns true if this type is nested (List, FixedSizeList, LargeList, ListView, LargeListView, Struct, Union,
619
    /// or Map), or a dictionary or run-end encoded type whose values are of a nested type
620
    #[inline]
621
0
    pub fn is_nested(&self) -> bool {
622
        use DataType::*;
623
0
        match self {
624
0
            Dictionary(_, v) => DataType::is_nested(v.as_ref()), // decided by the value type; the key type is ignored
625
0
            RunEndEncoded(_, v) => DataType::is_nested(v.data_type()), // decided by the values field; the run-ends field is ignored
626
            List(_)
627
            | FixedSizeList(_, _)
628
            | LargeList(_)
629
            | ListView(_)
630
            | LargeListView(_)
631
            | Struct(_)
632
            | Union(_, _)
633
0
            | Map(_, _) => true,
634
0
            _ => false,
635
        }
636
0
    }
637
638
    /// Returns true if this type is [`DataType::Null`].
639
    #[inline]
640
0
    pub fn is_null(&self) -> bool {
641
        use DataType::*;
642
0
        matches!(self, Null)
643
0
    }
644
645
    /// Compares the datatype with another, ignoring nested field names
646
    /// and metadata.
    ///
    /// For nested types the child fields are compared by nullability and, recursively,
    /// by data type only:
    ///
    /// * list-like types (`List`, `LargeList`, `ListView`, `LargeListView`) must be the
    ///   same variant on both sides; `FixedSizeList` additionally requires equal sizes
    /// * `Struct` requires the same number of fields, compared pairwise by position
    /// * `Map` additionally requires the same "keys sorted" flag
    /// * `Dictionary` compares key and value types recursively
    /// * `RunEndEncoded` compares both the run-ends and the values fields
    /// * `Union` requires the same mode and number of fields; each field of `self` must
    ///   have a matching entry somewhere in `other`, so the order of entries is not significant
    ///
    /// Any other combination (including two different variants) falls back to `==`.
647
0
    pub fn equals_datatype(&self, other: &DataType) -> bool {
648
0
        match (&self, other) {
649
0
            (DataType::List(a), DataType::List(b))
650
0
            | (DataType::LargeList(a), DataType::LargeList(b))
651
0
            | (DataType::ListView(a), DataType::ListView(b))
652
0
            | (DataType::LargeListView(a), DataType::LargeListView(b)) => {
653
0
                a.is_nullable() == b.is_nullable() && a.data_type().equals_datatype(b.data_type())
654
            }
655
0
            (DataType::FixedSizeList(a, a_size), DataType::FixedSizeList(b, b_size)) => {
656
0
                a_size == b_size
657
0
                    && a.is_nullable() == b.is_nullable()
658
0
                    && a.data_type().equals_datatype(b.data_type())
659
            }
660
0
            (DataType::Struct(a), DataType::Struct(b)) => {
661
0
                a.len() == b.len() // checked first because `zip` alone would stop at the shorter side
662
0
                    && a.iter().zip(b).all(|(a, b)| {
663
0
                        a.is_nullable() == b.is_nullable()
664
0
                            && a.data_type().equals_datatype(b.data_type())
665
0
                    })
666
            }
667
0
            (DataType::Map(a_field, a_is_sorted), DataType::Map(b_field, b_is_sorted)) => {
668
0
                a_field.is_nullable() == b_field.is_nullable()
669
0
                    && a_field.data_type().equals_datatype(b_field.data_type())
670
0
                    && a_is_sorted == b_is_sorted
671
            }
672
0
            (DataType::Dictionary(a_key, a_value), DataType::Dictionary(b_key, b_value)) => {
673
0
                a_key.equals_datatype(b_key) && a_value.equals_datatype(b_value)
674
            }
675
            (
676
0
                DataType::RunEndEncoded(a_run_ends, a_values),
677
0
                DataType::RunEndEncoded(b_run_ends, b_values),
678
            ) => {
679
0
                a_run_ends.is_nullable() == b_run_ends.is_nullable()
680
0
                    && a_run_ends
681
0
                        .data_type()
682
0
                        .equals_datatype(b_run_ends.data_type())
683
0
                    && a_values.is_nullable() == b_values.is_nullable()
684
0
                    && a_values.data_type().equals_datatype(b_values.data_type())
685
            }
686
            (
687
0
                DataType::Union(a_union_fields, a_union_mode),
688
0
                DataType::Union(b_union_fields, b_union_mode),
689
            ) => {
690
0
                a_union_mode == b_union_mode
691
0
                    && a_union_fields.len() == b_union_fields.len()
692
0
                    && a_union_fields.iter().all(|a| {
693
0
                        b_union_fields.iter().any(|b| {
694
0
                            a.0 == b.0 // first tuple element; presumably the union type id - confirm against `UnionFields::iter`
695
0
                                && a.1.is_nullable() == b.1.is_nullable()
696
0
                                && a.1.data_type().equals_datatype(b.1.data_type())
697
0
                        })
698
0
                    })
699
            }
700
0
            _ => self == other, // non-nested types and mismatched variants: plain equality
701
        }
702
0
    }
703
704
    /// Returns the byte width of this type if it is a primitive type
705
    ///
706
    /// Returns `None` if not a primitive type. This includes `Null`, `Boolean`,
    /// the string and binary types (also `FixedSizeBinary`), and all nested,
    /// dictionary and run-end encoded types.
    ///
    /// The `match` deliberately has no wildcard arm, so adding a new [`DataType`]
    /// variant forces this function to be updated.
707
    #[inline]
708
3
    pub fn primitive_width(&self) -> Option<usize> {
709
0
        match self {
710
0
            DataType::Null => None,
711
0
            DataType::Boolean => None,
712
0
            DataType::Int8 | DataType::UInt8 => Some(1),
713
0
            DataType::Int16 | DataType::UInt16 | DataType::Float16 => Some(2),
714
2
            DataType::Int32 | DataType::UInt32 | DataType::Float32 => Some(4),
715
1
            DataType::Int64 | DataType::UInt64 | DataType::Float64 => Some(8),
716
0
            DataType::Timestamp(_, _) => Some(8),
717
0
            DataType::Date32 | DataType::Time32(_) => Some(4),
718
0
            DataType::Date64 | DataType::Time64(_) => Some(8),
719
0
            DataType::Duration(_) => Some(8),
720
0
            DataType::Interval(IntervalUnit::YearMonth) => Some(4),
721
0
            DataType::Interval(IntervalUnit::DayTime) => Some(8),
722
0
            DataType::Interval(IntervalUnit::MonthDayNano) => Some(16),
723
0
            DataType::Decimal32(_, _) => Some(4),
724
0
            DataType::Decimal64(_, _) => Some(8),
725
0
            DataType::Decimal128(_, _) => Some(16),
726
0
            DataType::Decimal256(_, _) => Some(32),
727
0
            DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => None,
728
0
            DataType::Binary | DataType::LargeBinary | DataType::BinaryView => None,
729
0
            DataType::FixedSizeBinary(_) => None,
730
            DataType::List(_)
731
            | DataType::ListView(_)
732
            | DataType::LargeList(_)
733
            | DataType::LargeListView(_)
734
0
            | DataType::Map(_, _) => None,
735
0
            DataType::FixedSizeList(_, _) => None,
736
0
            DataType::Struct(_) => None,
737
0
            DataType::Union(_, _) => None,
738
0
            DataType::Dictionary(_, _) => None,
739
0
            DataType::RunEndEncoded(_, _) => None,
740
        }
741
3
    }
742
743
    /// Return size of this instance in bytes.
744
    ///
745
    /// Includes the size of `Self`.
746
0
    pub fn size(&self) -> usize {
747
0
        std::mem::size_of_val(self)
748
0
            + match self {
749
                DataType::Null
750
                | DataType::Boolean
751
                | DataType::Int8
752
                | DataType::Int16
753
                | DataType::Int32
754
                | DataType::Int64
755
                | DataType::UInt8
756
                | DataType::UInt16
757
                | DataType::UInt32
758
                | DataType::UInt64
759
                | DataType::Float16
760
                | DataType::Float32
761
                | DataType::Float64
762
                | DataType::Date32
763
                | DataType::Date64
764
                | DataType::Time32(_)
765
                | DataType::Time64(_)
766
                | DataType::Duration(_)
767
                | DataType::Interval(_)
768
                | DataType::Binary
769
                | DataType::FixedSizeBinary(_)
770
                | DataType::LargeBinary
771
                | DataType::BinaryView
772
                | DataType::Utf8
773
                | DataType::LargeUtf8
774
                | DataType::Utf8View
775
                | DataType::Decimal32(_, _)
776
                | DataType::Decimal64(_, _)
777
                | DataType::Decimal128(_, _)
778
0
                | DataType::Decimal256(_, _) => 0,
779
0
                DataType::Timestamp(_, s) => s.as_ref().map(|s| s.len()).unwrap_or_default(),
780
0
                DataType::List(field)
781
0
                | DataType::ListView(field)
782
0
                | DataType::FixedSizeList(field, _)
783
0
                | DataType::LargeList(field)
784
0
                | DataType::LargeListView(field)
785
0
                | DataType::Map(field, _) => field.size(),
786
0
                DataType::Struct(fields) => fields.size(),
787
0
                DataType::Union(fields, _) => fields.size(),
788
0
                DataType::Dictionary(dt1, dt2) => dt1.size() + dt2.size(),
789
0
                DataType::RunEndEncoded(run_ends, values) => {
790
0
                    run_ends.size() - std::mem::size_of_val(run_ends) + values.size()
791
0
                        - std::mem::size_of_val(values)
792
                }
793
            }
794
0
    }
795
796
    /// Check to see if `self` is a superset of `other`
797
    ///
798
    /// If DataType is a nested type, then it will check to see if the nested type is a superset of the other nested type
799
    /// else it will check to see if the DataType is equal to the other DataType
800
0
    pub fn contains(&self, other: &DataType) -> bool {
801
0
        match (self, other) {
802
0
            (DataType::List(f1), DataType::List(f2))
803
0
            | (DataType::LargeList(f1), DataType::LargeList(f2))
804
0
            | (DataType::ListView(f1), DataType::ListView(f2))
805
0
            | (DataType::LargeListView(f1), DataType::LargeListView(f2)) => f1.contains(f2),
806
0
            (DataType::FixedSizeList(f1, s1), DataType::FixedSizeList(f2, s2)) => {
807
0
                s1 == s2 && f1.contains(f2)
808
            }
809
0
            (DataType::Map(f1, s1), DataType::Map(f2, s2)) => s1 == s2 && f1.contains(f2),
810
0
            (DataType::Struct(f1), DataType::Struct(f2)) => f1.contains(f2),
811
0
            (DataType::Union(f1, s1), DataType::Union(f2, s2)) => {
812
0
                s1 == s2
813
0
                    && f1
814
0
                        .iter()
815
0
                        .all(|f1| f2.iter().any(|f2| f1.0 == f2.0 && f1.1.contains(f2.1)))
816
            }
817
0
            (DataType::Dictionary(k1, v1), DataType::Dictionary(k2, v2)) => {
818
0
                k1.contains(k2) && v1.contains(v2)
819
            }
820
0
            _ => self == other,
821
        }
822
0
    }
823
824
    /// Create a [`DataType::List`] with elements of the specified type
825
    /// and nullability, and conventionally named inner [`Field`] (`"item"`).
826
    ///
827
    /// To specify field level metadata, construct the inner [`Field`]
828
    /// directly via [`Field::new`] or [`Field::new_list_field`].
829
0
    pub fn new_list(data_type: DataType, nullable: bool) -> Self {
830
0
        DataType::List(Arc::new(Field::new_list_field(data_type, nullable)))
831
0
    }
832
833
    /// Create a [`DataType::LargeList`] with elements of the specified type
834
    /// and nullability, and conventionally named inner [`Field`] (`"item"`).
835
    ///
836
    /// To specify field level metadata, construct the inner [`Field`]
837
    /// directly via [`Field::new`] or [`Field::new_list_field`].
838
0
    pub fn new_large_list(data_type: DataType, nullable: bool) -> Self {
839
0
        DataType::LargeList(Arc::new(Field::new_list_field(data_type, nullable)))
840
0
    }
841
842
    /// Create a [`DataType::FixedSizeList`] with elements of the specified type, size
843
    /// and nullability, and conventionally named inner [`Field`] (`"item"`).
844
    ///
845
    /// To specify field level metadata, construct the inner [`Field`]
846
    /// directly via [`Field::new`] or [`Field::new_list_field`].
847
0
    pub fn new_fixed_size_list(data_type: DataType, size: i32, nullable: bool) -> Self {
848
0
        DataType::FixedSizeList(Arc::new(Field::new_list_field(data_type, nullable)), size)
849
0
    }
850
}
851
852
/// The maximum precision for [`DataType::Decimal32`] values
853
pub const DECIMAL32_MAX_PRECISION: u8 = 9;
854
855
/// The maximum scale for [`DataType::Decimal32`] values
856
pub const DECIMAL32_MAX_SCALE: i8 = 9;
857
858
/// The maximum precision for [`DataType::Decimal64`] values
859
pub const DECIMAL64_MAX_PRECISION: u8 = 18;
860
861
/// The maximum scale for [`DataType::Decimal64`] values
862
pub const DECIMAL64_MAX_SCALE: i8 = 18;
863
864
/// The maximum precision for [`DataType::Decimal128`] values
865
pub const DECIMAL128_MAX_PRECISION: u8 = 38;
866
867
/// The maximum scale for [`DataType::Decimal128`] values
868
pub const DECIMAL128_MAX_SCALE: i8 = 38;
869
870
/// The maximum precision for [`DataType::Decimal256`] values
871
pub const DECIMAL256_MAX_PRECISION: u8 = 76;
872
873
/// The maximum scale for [`DataType::Decimal256`] values
874
pub const DECIMAL256_MAX_SCALE: i8 = 76;
875
876
/// The default scale for [`DataType::Decimal32`] values
877
pub const DECIMAL32_DEFAULT_SCALE: i8 = 2;
878
879
/// The default scale for [`DataType::Decimal64`] values
880
pub const DECIMAL64_DEFAULT_SCALE: i8 = 6;
881
882
/// The default scale for [`DataType::Decimal128`] and [`DataType::Decimal256`]
883
/// values
884
pub const DECIMAL_DEFAULT_SCALE: i8 = 10;
885
886
#[cfg(test)]
887
mod tests {
888
    use super::*;
889
890
    #[test]
    #[cfg(feature = "serde")]
    fn serde_struct_type() {
        use std::collections::HashMap;

        // A field with non-empty metadata: expected to serialize as a JSON object `{ ... }`.
        let populated: HashMap<String, String> =
            HashMap::from([("k".to_string(), "v".to_string())]);
        let first_name = Field::new("first_name", DataType::Utf8, false).with_metadata(populated);

        // A field with an explicitly empty metadata map.
        let last_name = Field::new("last_name", DataType::Utf8, false).with_metadata(HashMap::new());

        let address = Field::new(
            "address",
            DataType::Struct(Fields::from(vec![
                Field::new("street", DataType::Utf8, false),
                Field::new("zip", DataType::UInt16, false),
            ])),
            false,
        );

        let person = DataType::Struct(Fields::from(vec![first_name, last_name, address]));

        // NOTE: this pins the default (derived) serialization format, not the
        // JSON format specified in metadata.md
        let expected = "{\"Struct\":[\
             {\"name\":\"first_name\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{\"k\":\"v\"}},\
             {\"name\":\"last_name\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{}},\
             {\"name\":\"address\",\"data_type\":{\"Struct\":\
             [{\"name\":\"street\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{}},\
             {\"name\":\"zip\",\"data_type\":\"UInt16\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{}}\
             ]},\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{}}]}";

        let serialized = serde_json::to_string(&person).unwrap();
        assert_eq!(expected, serialized);

        // The serialized form must round-trip back to an equal type.
        let round_tripped: DataType = serde_json::from_str(&serialized).unwrap();
        assert_eq!(person, round_tripped);
    }
939
940
    #[test]
    fn test_list_datatype_equality() {
        // `equals_datatype` is expected to ignore child field names while
        // still distinguishing child data types and nullability.

        // Small builders so each case below only spells out what varies.
        let list_of = |child: Field| DataType::List(Arc::new(child));
        let fixed_list_of = |child: Field| DataType::FixedSizeList(Arc::new(child), 3);
        let struct_of = |children: Vec<Field>| DataType::Struct(Fields::from(children));
        let map_of = |entries: Field, flag: bool| DataType::Map(Arc::new(entries), flag);
        let dictionary_of = |key: DataType, value: DataType| {
            DataType::Dictionary(Box::new(key), Box::new(value))
        };
        let sparse_union = |type_ids: Vec<i8>, children: Vec<Field>| {
            DataType::Union(UnionFields::new(type_ids, children), UnionMode::Sparse)
        };
        let run_end_encoded = |run_ends: Field, values: Field| {
            DataType::RunEndEncoded(Arc::new(run_ends), Arc::new(values))
        };

        // --- List ---
        let item_i32 = list_of(Field::new_list_field(DataType::Int32, true));
        let array_i32 = list_of(Field::new("array", DataType::Int32, true));
        let item_i32_non_null = list_of(Field::new_list_field(DataType::Int32, false));
        let item_u32 = list_of(Field::new_list_field(DataType::UInt32, true));

        assert!(item_i32.equals_datatype(&array_i32));
        assert!(!item_i32.equals_datatype(&item_i32_non_null));
        assert!(!array_i32.equals_datatype(&item_i32_non_null));
        assert!(!item_i32.equals_datatype(&item_u32));

        // --- FixedSizeList ---
        let fixed_item = fixed_list_of(Field::new_list_field(item_i32.clone(), false));
        let fixed_array = fixed_list_of(Field::new("array", array_i32.clone(), false));
        let fixed_binary =
            fixed_list_of(Field::new_list_field(DataType::FixedSizeBinary(3), true));

        assert!(fixed_item.equals_datatype(&fixed_array));
        assert!(!fixed_item.equals_datatype(&fixed_binary));
        assert!(!fixed_array.equals_datatype(&fixed_binary));

        // --- Struct ---
        let struct_item = struct_of(vec![Field::new("f1", fixed_item.clone(), true)]);
        let struct_array = struct_of(vec![Field::new("f1", fixed_array.clone(), true)]);
        let struct_array_non_null = struct_of(vec![Field::new("f1", fixed_array.clone(), false)]);
        let struct_three = struct_of(vec![
            Field::new("f1", fixed_array.clone(), false),
            Field::new("f2", fixed_binary.clone(), false),
            Field::new("f3", DataType::Utf8, true),
        ]);
        let struct_three_large_utf8 = struct_of(vec![
            Field::new("ff1", fixed_array.clone(), false),
            Field::new("ff2", fixed_binary.clone(), false),
            Field::new("ff3", DataType::LargeUtf8, true),
        ]);
        let struct_three_renamed = struct_of(vec![
            Field::new("ff1", fixed_array.clone(), false),
            Field::new("ff2", fixed_binary.clone(), false),
            Field::new("ff3", DataType::Utf8, true),
        ]);

        assert!(struct_item.equals_datatype(&struct_array));
        assert!(!struct_item.equals_datatype(&struct_array_non_null));
        assert!(!struct_three.equals_datatype(&struct_three_large_utf8));
        assert!(struct_three.equals_datatype(&struct_three_renamed));

        // --- Map ---
        let map_base = map_of(Field::new("f1", item_i32.clone(), true), true);
        let map_renamed = map_of(Field::new("f2", array_i32.clone(), true), true);
        let map_other_flag = map_of(Field::new("f2", array_i32.clone(), true), false);
        let map_other_child = map_of(Field::new("f2", item_i32_non_null.clone(), true), true);
        let map_non_null = map_of(Field::new("f1", item_i32.clone(), false), true);

        assert!(map_base.equals_datatype(&map_renamed));
        assert!(!map_base.equals_datatype(&map_other_flag));
        assert!(!map_base.equals_datatype(&map_other_child));
        assert!(!map_base.equals_datatype(&map_non_null));

        // --- Dictionary ---
        let dict_base = dictionary_of(DataType::UInt8, item_i32.clone());
        let dict_renamed = dictionary_of(DataType::UInt8, array_i32.clone());
        let dict_other_key = dictionary_of(DataType::Int8, array_i32.clone());
        let dict_other_value = dictionary_of(DataType::UInt8, item_i32_non_null.clone());

        assert!(dict_base.equals_datatype(&dict_renamed));
        assert!(!dict_base.equals_datatype(&dict_other_key));
        assert!(!dict_base.equals_datatype(&dict_other_value));

        // --- Union ---
        let union_base = sparse_union(
            vec![1, 2],
            vec![
                Field::new("f1", DataType::Utf8, false),
                Field::new("f2", DataType::UInt8, false),
            ],
        );
        let union_renamed = sparse_union(
            vec![1, 2],
            vec![
                Field::new("ff1", DataType::Utf8, false),
                Field::new("ff2", DataType::UInt8, false),
            ],
        );
        // Same (type id, type) pairs, listed in the opposite order.
        let union_reordered = sparse_union(
            vec![2, 1],
            vec![
                Field::new("fff2", DataType::UInt8, false),
                Field::new("fff1", DataType::Utf8, false),
            ],
        );
        let union_other_types = sparse_union(
            vec![2, 1],
            vec![
                Field::new("fff1", DataType::Int8, false),
                Field::new("fff2", DataType::UInt8, false),
            ],
        );
        let union_other_nullability = sparse_union(
            vec![1, 2],
            vec![
                Field::new("f1", DataType::Utf8, true),
                Field::new("f2", DataType::UInt8, false),
            ],
        );

        assert!(union_base.equals_datatype(&union_renamed));
        assert!(union_base.equals_datatype(&union_reordered));
        assert!(!union_base.equals_datatype(&union_other_types));
        assert!(!union_base.equals_datatype(&union_other_nullability));

        // --- RunEndEncoded ---
        let ree_base = run_end_encoded(
            Field::new("f1", DataType::Int64, true),
            Field::new("f2", DataType::Utf8, true),
        );
        let ree_renamed = run_end_encoded(
            Field::new("ff1", DataType::Int64, true),
            Field::new("ff2", DataType::Utf8, true),
        );
        let ree_other_run_ends = run_end_encoded(
            Field::new("ff1", DataType::UInt16, true),
            Field::new("ff2", DataType::Utf8, true),
        );
        let ree_non_null_run_ends = run_end_encoded(
            Field::new("f1", DataType::Int64, false),
            Field::new("f2", DataType::Utf8, true),
        );

        assert!(ree_base.equals_datatype(&ree_renamed));
        assert!(!ree_base.equals_datatype(&ree_other_run_ends));
        assert!(!ree_base.equals_datatype(&ree_non_null_run_ends));
    }
1084
1085
    #[test]
    fn create_struct_type() {
        // Smoke test: constructing a struct type that nests another struct
        // type only has to succeed; the value itself is not inspected.
        let address = DataType::Struct(Fields::from(vec![
            Field::new("street", DataType::Utf8, false),
            Field::new("zip", DataType::UInt16, false),
        ]));

        let members = vec![
            Field::new("first_name", DataType::Utf8, false),
            Field::new("last_name", DataType::Utf8, false),
            Field::new("address", address, false),
        ];

        let _person = DataType::Struct(Fields::from(members));
    }
1100
1101
    #[test]
    fn test_nested() {
        let utf8_child = || Arc::new(Field::new("foo", DataType::Utf8, true));
        let dictionary_with_i32_keys =
            |value: DataType| DataType::Dictionary(Box::new(DataType::Int32), Box::new(value));

        let list = DataType::List(utf8_child());
        let list_view = DataType::ListView(utf8_child());
        let large_list_view = DataType::LargeListView(utf8_child());

        // Flat types are not nested.
        for flat in [DataType::Boolean, DataType::Int32, DataType::Utf8] {
            assert!(!DataType::is_nested(&flat));
        }

        // List-like types are nested.
        for nested in [&list, &list_view, &large_list_view] {
            assert!(DataType::is_nested(nested));
        }

        // A dictionary is not nested when its value type is flat ...
        for flat_value in [DataType::Boolean, DataType::Int64, DataType::LargeUtf8] {
            assert!(!DataType::is_nested(&dictionary_with_i32_keys(flat_value)));
        }

        // ... and is nested when its value type is.
        assert!(DataType::is_nested(&dictionary_with_i32_keys(list)));
    }
1132
1133
    #[test]
    fn test_integer() {
        let signed = DataType::Int32;
        let unsigned = DataType::UInt64;
        let float = DataType::Float16;

        // is_integer: true for either signedness, false for floats
        for integer in [&signed, &unsigned] {
            assert!(DataType::is_integer(integer));
        }
        assert!(!DataType::is_integer(&float));

        // is_signed_integer
        assert!(DataType::is_signed_integer(&signed));
        for other in [&unsigned, &float] {
            assert!(!DataType::is_signed_integer(other));
        }

        // is_unsigned_integer
        assert!(DataType::is_unsigned_integer(&unsigned));
        for other in [&signed, &float] {
            assert!(!DataType::is_unsigned_integer(other));
        }

        // is_dictionary_key_type
        for key in [&signed, &unsigned] {
            assert!(DataType::is_dictionary_key_type(key));
        }
        assert!(!DataType::is_dictionary_key_type(&float));
    }
1155
1156
    #[test]
    fn test_floating() {
        // A floating point type is reported as floating; an integer type is not.
        let float = DataType::Float16;
        let integer = DataType::Int32;

        assert!(DataType::is_floating(&float));
        assert!(!DataType::is_floating(&integer));
    }
1161
1162
    #[test]
    fn test_datatype_is_null() {
        // Only the `Null` type is reported as null.
        let null = DataType::Null;
        let integer = DataType::Int32;

        assert!(DataType::is_null(&null));
        assert!(!DataType::is_null(&integer));
    }
1167
1168
    #[test]
    fn size_should_not_regress() {
        // Pins the in-memory size of `DataType` so growth is noticed.
        let actual = std::mem::size_of::<DataType>();
        assert_eq!(actual, 24);
    }
1172
1173
    #[test]
    #[should_panic(expected = "duplicate type id: 1")]
    fn test_union_with_duplicated_type_id() {
        // Both children are given type id 1, which must be rejected with a panic.
        let children = vec![
            Field::new("f1", DataType::Int32, false),
            Field::new("f2", DataType::Utf8, false),
        ];
        let duplicated_ids = vec![1, 1];

        let union_fields = UnionFields::new(duplicated_ids, children);
        let _union = DataType::Union(union_fields, UnionMode::Dense);
    }
1188
1189
    #[test]
    fn test_try_from_str() {
        // A type name converts into the matching `DataType` via `TryFrom<&str>`.
        let parsed = DataType::try_from("Int32").unwrap();
        assert_eq!(parsed, DataType::Int32);
    }
1194
1195
    #[test]
    fn test_from_str() {
        // A type name parses into the matching `DataType` via `str::parse`.
        let parsed = "UInt64".parse::<DataType>().unwrap();
        assert_eq!(parsed, DataType::UInt64);
    }
1200
}