Coverage Report

Created: 2025-11-17 14:14

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/Users/andrewlamb/Software/arrow-rs/arrow-schema/src/datatype.rs
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
use std::str::FromStr;
19
use std::sync::Arc;
20
21
use crate::{ArrowError, Field, FieldRef, Fields, UnionFields};
22
23
/// Datatypes supported by this implementation of Apache Arrow.
24
///
25
/// The variants of this enum include primitive fixed size types as well as
26
/// parametric or nested types. See [`Schema.fbs`] for Arrow's specification.
27
///
28
/// # Examples
29
///
30
/// Primitive types
31
/// ```
32
/// # use arrow_schema::DataType;
33
/// // create a new 32-bit signed integer
34
/// let data_type = DataType::Int32;
35
/// ```
36
///
37
/// Nested Types
38
/// ```
39
/// # use arrow_schema::{DataType, Field};
40
/// # use std::sync::Arc;
41
/// // create a new list of 32-bit signed integers directly
42
/// let list_data_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true)));
43
/// // Create the same list type with constructor
44
/// let list_data_type2 = DataType::new_list(DataType::Int32, true);
45
/// assert_eq!(list_data_type, list_data_type2);
46
/// ```
47
///
48
/// Dictionary Types
49
/// ```
50
/// # use arrow_schema::{DataType};
51
/// // String Dictionary (key type Int32 and value type Utf8)
52
/// let data_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));
53
/// ```
54
///
55
/// Timestamp Types
56
/// ```
57
/// # use arrow_schema::{DataType, TimeUnit};
58
/// // timestamp with millisecond precision without timezone specified
59
/// let data_type = DataType::Timestamp(TimeUnit::Millisecond, None);
60
/// // timestamp with nanosecond precision in UTC timezone
61
/// let data_type = DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".into()));
62
/// ```
63
///
64
/// # Display and FromStr
65
///
66
/// The `Display` and `FromStr` implementations for `DataType` are
67
/// human-readable, parseable, and reversible.
68
///
69
/// ```
70
/// # use arrow_schema::DataType;
71
/// let data_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));
72
/// let data_type_string = data_type.to_string();
73
/// assert_eq!(data_type_string, "Dictionary(Int32, Utf8)");
74
/// // display can be parsed back into the original type
75
/// let parsed_data_type: DataType = data_type.to_string().parse().unwrap();
76
/// assert_eq!(data_type, parsed_data_type);
77
/// ```
78
///
79
/// # Nested Support
80
/// Currently, the Rust implementation supports the following nested types:
81
///  - `List<T>`
82
///  - `LargeList<T>`
83
///  - `FixedSizeList<T>`
84
///  - `Struct<T, U, V, ...>`
85
///  - `Union<T, U, V, ...>`
86
///  - `Map<K, V>`
87
///
88
/// Nested types can themselves be nested within other arrays.
89
/// For more information on these types please see
90
/// [the physical memory layout of Apache Arrow]
91
///
92
/// [`Schema.fbs`]: https://github.com/apache/arrow/blob/main/format/Schema.fbs
93
/// [the physical memory layout of Apache Arrow]: https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout
94
#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
95
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
96
pub enum DataType {
97
    /// Null type
98
    Null,
99
    /// A boolean datatype representing the values `true` and `false`.
100
    Boolean,
101
    /// A signed 8-bit integer.
102
    Int8,
103
    /// A signed 16-bit integer.
104
    Int16,
105
    /// A signed 32-bit integer.
106
    Int32,
107
    /// A signed 64-bit integer.
108
    Int64,
109
    /// An unsigned 8-bit integer.
110
    UInt8,
111
    /// An unsigned 16-bit integer.
112
    UInt16,
113
    /// An unsigned 32-bit integer.
114
    UInt32,
115
    /// An unsigned 64-bit integer.
116
    UInt64,
117
    /// A 16-bit floating point number.
118
    Float16,
119
    /// A 32-bit floating point number.
120
    Float32,
121
    /// A 64-bit floating point number.
122
    Float64,
123
    /// A timestamp with an optional timezone.
124
    ///
125
    /// Time is measured as a Unix epoch, counting the seconds from
126
    /// 00:00:00.000 on 1 January 1970, excluding leap seconds,
127
    /// as a signed 64-bit integer.
128
    ///
129
    /// The time zone is a string indicating the name of a time zone, one of:
130
    ///
131
    /// * As used in the Olson time zone database (the "tz database" or
132
    ///   "tzdata"), such as "America/New_York"
133
    /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
134
    ///
135
    /// Timestamps with a non-empty timezone
136
    /// ------------------------------------
137
    ///
138
    /// If a Timestamp column has a non-empty timezone value, its epoch is
139
    /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in the *UTC* timezone
140
    /// (the Unix epoch), regardless of the Timestamp's own timezone.
141
    ///
142
    /// Therefore, timestamp values with a non-empty timezone correspond to
143
    /// physical points in time together with some additional information about
144
    /// how the data was obtained and/or how to display it (the timezone).
145
    ///
146
    ///   For example, the timestamp value 0 with the timezone string "Europe/Paris"
147
    ///   corresponds to "January 1st 1970, 00h00" in the UTC timezone, but the
148
    ///   application may prefer to display it as "January 1st 1970, 01h00" in
149
    ///   the Europe/Paris timezone (which is the same physical point in time).
150
    ///
151
    /// One consequence is that timestamp values with a non-empty timezone
152
    /// can be compared and ordered directly, since they all share the same
153
    /// well-known point of reference (the Unix epoch).
154
    ///
155
    /// Timestamps with an unset / empty timezone
156
    /// -----------------------------------------
157
    ///
158
    /// If a Timestamp column has no timezone value, its epoch is
159
    /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in an *unknown* timezone.
160
    ///
161
    /// Therefore, timestamp values without a timezone cannot be meaningfully
162
    /// interpreted as physical points in time, but only as calendar / clock
163
    /// indications ("wall clock time") in an unspecified timezone.
164
    ///
165
    ///   For example, the timestamp value 0 with an empty timezone string
166
    ///   corresponds to "January 1st 1970, 00h00" in an unknown timezone: there
167
    ///   is not enough information to interpret it as a well-defined physical
168
    ///   point in time.
169
    ///
170
    /// One consequence is that timestamp values without a timezone cannot
171
    /// be reliably compared or ordered, since they may have different points of
172
    /// reference.  In particular, it is *not* possible to interpret an unset
173
    /// or empty timezone as the same as "UTC".
174
    ///
175
    /// Conversion between timezones
176
    /// ----------------------------
177
    ///
178
    /// If a Timestamp column has a non-empty timezone, changing the timezone
179
    /// to a different non-empty value is a metadata-only operation:
180
    /// the timestamp values need not change as their point of reference remains
181
    /// the same (the Unix epoch).
182
    ///
183
    /// However, if a Timestamp column has no timezone value, changing it to a
184
    /// non-empty value requires thinking about the desired semantics.
185
    /// One possibility is to assume that the original timestamp values are
186
    /// relative to the epoch of the timezone being set; timestamp values should
187
    /// then be adjusted to the Unix epoch (for example, changing the timezone from
188
    /// empty to "Europe/Paris" would require converting the timestamp values
189
    /// from "Europe/Paris" to "UTC", which seems counter-intuitive but is
190
    /// nevertheless correct).
191
    ///
192
    /// ```
193
    /// # use arrow_schema::{DataType, TimeUnit};
194
    /// DataType::Timestamp(TimeUnit::Second, None);
195
    /// DataType::Timestamp(TimeUnit::Second, Some("literal".into()));
196
    /// DataType::Timestamp(TimeUnit::Second, Some("string".to_string().into()));
197
    /// ```
198
    ///
199
    /// # Timezone representation
200
    /// ----------------------------
201
    /// It is possible to use either the timezone string representation, such as "UTC", or the absolute time zone offset "+00:00".
202
    /// For timezones with fixed offsets, such as "UTC" or "JST", the offset representation is recommended, as it is more explicit and less ambiguous.
203
    ///
204
    /// Most arrow-rs functionalities use the absolute offset representation,
205
    /// such as [`PrimitiveArray::with_timezone_utc`] that applies a
206
    /// UTC timezone to timestamp arrays.
207
    ///
208
    /// [`PrimitiveArray::with_timezone_utc`]: https://docs.rs/arrow/latest/arrow/array/struct.PrimitiveArray.html#method.with_timezone_utc
209
    ///
210
    /// Timezone string parsing
211
    /// -----------------------
212
    /// When feature `chrono-tz` is not enabled, allowed timezone strings are fixed offsets of the form "+09:00", "-09" or "+0930".
213
    ///
214
    /// When feature `chrono-tz` is enabled, additional strings supported by [chrono_tz](https://docs.rs/chrono-tz/latest/chrono_tz/)
215
    /// are also allowed, which include [IANA database](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones)
216
    /// timezones.
217
    Timestamp(TimeUnit, Option<Arc<str>>),
218
    /// A signed 32-bit date representing the elapsed time since UNIX epoch (1970-01-01)
219
    /// in days.
220
    Date32,
221
    /// A signed 64-bit date representing the elapsed time since UNIX epoch (1970-01-01)
222
    /// in milliseconds.
223
    ///
224
    /// # Valid Ranges
225
    ///
226
    /// According to the Arrow specification ([Schema.fbs]), values of Date64
227
    /// are treated as the number of *days*, in milliseconds, since the UNIX
228
    /// epoch. Therefore, values of this type  must be evenly divisible by
229
    /// `86_400_000`, the number of milliseconds in a standard day.
230
    ///
231
    /// It is not valid to store milliseconds that do not represent an exact
232
    /// day. The reason for this restriction is compatibility with other
233
    /// languages' native libraries (specifically Java), which historically
234
    /// lacked a dedicated date type and only supported timestamps.
235
    ///
236
    /// # Validation
237
    ///
238
    /// This library does not validate or enforce that Date64 values are evenly
239
    /// divisible by `86_400_000`  for performance and usability reasons. Date64
240
    /// values are treated similarly to `Timestamp(TimeUnit::Millisecond,
241
    /// None)`: values will be displayed with a time of day if the value does
242
    /// not represent an exact day, and arithmetic will be done at the
243
    /// millisecond granularity.
244
    ///
245
    /// # Recommendation
246
    ///
247
    /// Users should prefer [`Date32`] to cleanly represent the number
248
    /// of days, or one of the Timestamp variants to include time as part of the
249
    /// representation, depending on their use case.
250
    ///
251
    /// # Further Reading
252
    ///
253
    /// For more details, see [#5288](https://github.com/apache/arrow-rs/issues/5288).
254
    ///
255
    /// [`Date32`]: Self::Date32
256
    /// [Schema.fbs]: https://github.com/apache/arrow/blob/main/format/Schema.fbs
257
    Date64,
258
    /// A signed 32-bit time representing the elapsed time since midnight in the unit of `TimeUnit`.
259
    /// Must be either seconds or milliseconds.
260
    Time32(TimeUnit),
261
    /// A signed 64-bit time representing the elapsed time since midnight in the unit of `TimeUnit`.
262
    /// Must be either microseconds or nanoseconds.
263
    Time64(TimeUnit),
264
    /// Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds.
265
    Duration(TimeUnit),
266
    /// A "calendar" interval which models types that don't necessarily
267
    /// have a precise duration without the context of a base timestamp (e.g.
268
    /// days can differ in length during day light savings time transitions).
269
    Interval(IntervalUnit),
270
    /// Opaque binary data of variable length.
271
    ///
272
    /// A single Binary array can store up to [`i32::MAX`] bytes
273
    /// of binary data in total.
274
    Binary,
275
    /// Opaque binary data of fixed size.
276
    /// Enum parameter specifies the number of bytes per value.
277
    FixedSizeBinary(i32),
278
    /// Opaque binary data of variable length and 64-bit offsets.
279
    ///
280
    /// A single LargeBinary array can store up to [`i64::MAX`] bytes
281
    /// of binary data in total.
282
    LargeBinary,
283
    /// Opaque binary data of variable length.
284
    ///
285
    /// Logically the same as [`Binary`], but the internal representation uses a view
286
    /// struct that contains the string length and either the string's entire data
287
    /// inline (for small strings) or an inlined prefix, an index of another buffer,
288
    /// and an offset pointing to a slice in that buffer (for non-small strings).
289
    ///
290
    /// [`Binary`]: Self::Binary
291
    BinaryView,
292
    /// A variable-length string in Unicode with UTF-8 encoding.
293
    ///
294
    /// A single Utf8 array can store up to [`i32::MAX`] bytes
295
    /// of string data in total.
296
    Utf8,
297
    /// A variable-length string in Unicode with UTF-8 encoding and 64-bit offsets.
298
    ///
299
    /// A single LargeUtf8 array can store up to [`i64::MAX`] bytes
300
    /// of string data in total.
301
    LargeUtf8,
302
    /// A variable-length string in Unicode with UTF-8 encoding
303
    ///
304
    /// Logically the same as [`Utf8`], but the internal representation uses a view
305
    /// struct that contains the string length and either the string's entire data
306
    /// inline (for small strings) or an inlined prefix, an index of another buffer,
307
    /// and an offset pointing to a slice in that buffer (for non-small strings).
308
    ///
309
    /// [`Utf8`]: Self::Utf8
310
    Utf8View,
311
    /// A list of some logical data type with variable length.
312
    ///
313
    /// A single List array can store up to [`i32::MAX`] elements in total.
314
    List(FieldRef),
315
316
    /// (NOT YET FULLY SUPPORTED)  A list of some logical data type with variable length.
317
    ///
318
    /// Logically the same as [`List`], but the internal representation differs in how child
319
    /// data is referenced, allowing flexibility in how data is laid out.
320
    ///
321
    /// Note this data type is not yet fully supported. Using it with arrow APIs may result in `panic`s.
322
    ///
323
    /// [`List`]: Self::List
324
    ListView(FieldRef),
325
    /// A list of some logical data type with fixed length.
326
    FixedSizeList(FieldRef, i32),
327
    /// A list of some logical data type with variable length and 64-bit offsets.
328
    ///
329
    /// A single LargeList array can store up to [`i64::MAX`] elements in total.
330
    LargeList(FieldRef),
331
332
    /// (NOT YET FULLY SUPPORTED)  A list of some logical data type with variable length and 64-bit offsets.
333
    ///
334
    /// Logically the same as [`LargeList`], but the internal representation differs in how child
335
    /// data is referenced, allowing flexibility in how data is laid out.
336
    ///
337
    /// Note this data type is not yet fully supported. Using it with arrow APIs may result in `panic`s.
338
    ///
339
    /// [`LargeList`]: Self::LargeList
340
    LargeListView(FieldRef),
341
    /// A nested datatype that contains a number of sub-fields.
342
    Struct(Fields),
343
    /// A nested datatype that can represent slots of differing types. Components:
344
    ///
345
    /// 1. [`UnionFields`]
346
    /// 2. The type of union (Sparse or Dense)
347
    Union(UnionFields, UnionMode),
348
    /// A dictionary encoded array (`key_type`, `value_type`), where
349
    /// each array element is an index of `key_type` into an
350
    /// associated dictionary of `value_type`.
351
    ///
352
    /// Dictionary arrays are used to store columns of `value_type`
353
    /// that contain many repeated values using less memory, but with
354
    /// a higher CPU overhead for some operations.
355
    ///
356
    /// This type is mostly used to represent low cardinality string
357
    /// arrays or a limited set of primitive types as integers.
358
    Dictionary(Box<DataType>, Box<DataType>),
359
    /// Exact 32-bit width decimal value with precision and scale
360
    ///
361
    /// * precision is the total number of digits
362
    /// * scale is the number of digits past the decimal
363
    ///
364
    /// For example the number 123.45 has precision 5 and scale 2.
365
    ///
366
    /// In certain situations, scale could be a negative number. For
367
    /// negative scale, it is the number of padding 0 to the right
368
    /// of the digits.
369
    ///
370
    /// For example the number 12300 could be treated as a decimal
371
    /// that has precision 3 and scale -2.
372
    Decimal32(u8, i8),
373
    /// Exact 64-bit width decimal value with precision and scale
374
    ///
375
    /// * precision is the total number of digits
376
    /// * scale is the number of digits past the decimal
377
    ///
378
    /// For example the number 123.45 has precision 5 and scale 2.
379
    ///
380
    /// In certain situations, scale could be a negative number. For
381
    /// negative scale, it is the number of padding 0 to the right
382
    /// of the digits.
383
    ///
384
    /// For example the number 12300 could be treated as a decimal
385
    /// that has precision 3 and scale -2.
386
    Decimal64(u8, i8),
387
    /// Exact 128-bit width decimal value with precision and scale
388
    ///
389
    /// * precision is the total number of digits
390
    /// * scale is the number of digits past the decimal
391
    ///
392
    /// For example the number 123.45 has precision 5 and scale 2.
393
    ///
394
    /// In certain situations, scale could be a negative number. For
395
    /// negative scale, it is the number of padding 0 to the right
396
    /// of the digits.
397
    ///
398
    /// For example the number 12300 could be treated as a decimal
399
    /// that has precision 3 and scale -2.
400
    Decimal128(u8, i8),
401
    /// Exact 256-bit width decimal value with precision and scale
402
    ///
403
    /// * precision is the total number of digits
404
    /// * scale is the number of digits past the decimal
405
    ///
406
    /// For example the number 123.45 has precision 5 and scale 2.
407
    ///
408
    /// In certain situations, scale could be a negative number. For
409
    /// negative scale, it is the number of padding 0 to the right
410
    /// of the digits.
411
    ///
412
    /// For example the number 12300 could be treated as a decimal
413
    /// that has precision 3 and scale -2.
414
    Decimal256(u8, i8),
415
    /// A Map is a logical nested type that is represented as
416
    ///
417
    /// `List<entries: Struct<key: K, value: V>>`
418
    ///
419
    /// The keys and values are each respectively contiguous.
420
    /// The key and value types are not constrained, but keys should be
421
    /// hashable and unique.
422
    /// Whether the keys are sorted can be set in the `bool` after the `Field`.
423
    ///
424
    /// In a field with Map type, the field has a child Struct field, which then
425
    /// has two children: the first is the key type and the second is the value type. The names of the
426
    /// child fields may be respectively "entries", "key", and "value", but this is
427
    /// not enforced.
428
    Map(FieldRef, bool),
429
    /// A run-end encoding (REE) is a variation of run-length encoding (RLE). These
430
    /// encodings are well-suited for representing data containing sequences of the
431
    /// same value, called runs. Each run is represented as a value and an integer giving
432
    /// the index in the array where the run ends.
433
    ///
434
    /// A run-end encoded array has no buffers by itself, but has two child arrays. The
435
    /// first child array, called the run ends array, holds either 16, 32, or 64-bit
436
    /// signed integers. The actual values of each run are held in the second child array.
437
    ///
438
    /// These child arrays are prescribed the standard names of "run_ends" and "values"
439
    /// respectively.
440
    RunEndEncoded(FieldRef, FieldRef),
441
}
442
443
/// The unit a temporal value is expressed in: seconds, milliseconds,
/// microseconds or nanoseconds.
///
/// Used as a parameter of [`DataType::Timestamp`], [`DataType::Time32`],
/// [`DataType::Time64`] and [`DataType::Duration`].
444
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
445
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
446
pub enum TimeUnit {
447
    /// Time in seconds (displayed as `s`).
448
    Second,
449
    /// Time in milliseconds (displayed as `ms`).
450
    Millisecond,
451
    /// Time in microseconds (displayed as `µs`).
452
    Microsecond,
453
    /// Time in nanoseconds (displayed as `ns`).
454
    Nanosecond,
455
}
456
457
/// Formats a [`TimeUnit`] as its short abbreviation:
/// `s`, `ms`, `µs` or `ns`.
impl std::fmt::Display for TimeUnit {
458
0
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
459
0
        match self {
460
0
            TimeUnit::Second => write!(f, "s"),
461
0
            TimeUnit::Millisecond => write!(f, "ms"),
462
0
            TimeUnit::Microsecond => write!(f, "µs"),
463
0
            TimeUnit::Nanosecond => write!(f, "ns"),
464
        }
465
0
    }
466
}
467
468
/// The unit of a [`DataType::Interval`]: `YEAR_MONTH`, `DAY_TIME` or
/// `MONTH_DAY_NANO`, in SQL style.
469
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
470
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
471
pub enum IntervalUnit {
472
    /// Indicates the number of elapsed whole months, stored as 4-byte integers.
473
    YearMonth,
474
    /// Indicates the number of elapsed days and milliseconds,
475
    /// stored as 2 contiguous 32-bit integers (days, milliseconds) (8 bytes in total).
476
    DayTime,
477
    /// A triple of the number of elapsed months, days, and nanoseconds.
478
    /// The values are stored contiguously in 16-byte blocks. Months and
479
    /// days are encoded as 32-bit integers and nanoseconds is encoded as a
480
    /// 64-bit integer. All integers are signed. Each field is independent
481
    /// (e.g. there is no constraint that nanoseconds have the same sign
482
    /// as days or that the quantity of nanoseconds represents less
483
    /// than a day's worth of time).
484
    MonthDayNano,
485
}
486
487
/// The layout of a [`DataType::Union`]: sparse or dense.
488
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Copy)]
489
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
490
pub enum UnionMode {
491
    /// Sparse union layout
492
    Sparse,
493
    /// Dense union layout
494
    Dense,
495
}
496
497
/// Parses `str` into a `DataType`.
498
///
499
/// This is the reverse of [`DataType`]'s `Display`
500
/// impl, and maintains the invariant that
501
/// `data_type.to_string().parse::<DataType>().unwrap() == data_type`
502
///
/// Parsing is delegated to `crate::datatype_parse::parse_data_type`; any
/// error it reports is returned unchanged as an [`ArrowError`].
///
503
/// # Example
504
/// ```
505
/// use arrow_schema::DataType;
506
///
507
/// let data_type: DataType = "Int32".parse().unwrap();
508
/// assert_eq!(data_type, DataType::Int32);
509
/// ```
510
impl FromStr for DataType {
511
    type Err = ArrowError;
512
513
0
    fn from_str(s: &str) -> Result<Self, Self::Err> {
514
0
        crate::datatype_parse::parse_data_type(s)
515
0
    }
516
}
517
518
/// Converts a `&str` into a [`DataType`].
///
/// This simply calls [`str::parse`], so it behaves exactly like the
/// [`FromStr`] implementation and returns the same [`ArrowError`] on failure.
impl TryFrom<&str> for DataType {
519
    type Error = ArrowError;
520
521
0
    fn try_from(value: &str) -> Result<Self, Self::Error> {
522
0
        value.parse()
523
0
    }
524
}
525
526
impl DataType {
527
    /// Returns true if the type is primitive, i.e. it is either numeric
    /// (see [`Self::is_numeric`]) or temporal (see [`Self::is_temporal`]).
    ///
    /// Note that `Boolean` is neither numeric nor temporal, so this returns
    /// `false` for it.
528
    #[inline]
529
2
    pub fn is_primitive(&self) -> bool {
530
2
        self.is_numeric() || 
self0
.
is_temporal0
()
531
2
    }
532
533
    /// Returns true if this type is numeric: any unsigned integer (`UInt*`),
    /// signed integer (`Int*`), floating point (`Float*`) or decimal
    /// (`Decimal*`) type, regardless of the decimal's precision and scale.
534
    #[inline]
535
2
    pub fn is_numeric(&self) -> bool {
536
        use DataType::*;
537
0
        matches!(
538
2
            self,
539
            UInt8
540
                | UInt16
541
                | UInt32
542
                | UInt64
543
                | Int8
544
                | Int16
545
                | Int32
546
                | Int64
547
                | Float16
548
                | Float32
549
                | Float64
550
                | Decimal32(_, _)
551
                | Decimal64(_, _)
552
                | Decimal128(_, _)
553
                | Decimal256(_, _)
554
        )
555
2
    }
556
557
    /// Returns true if this type is temporal: (Date*, Timestamp, Time*, Duration, or Interval),
    /// regardless of the time unit, interval unit or timezone.
558
    #[inline]
559
0
    pub fn is_temporal(&self) -> bool {
560
        use DataType::*;
561
0
        matches!(
562
0
            self,
563
            Date32 | Date64 | Timestamp(_, _) | Time32(_) | Time64(_) | Duration(_) | Interval(_)
564
        )
565
0
    }
566
567
    /// Returns true if this type is a floating point type:
    /// `Float16`, `Float32` or `Float64`.
568
    #[inline]
569
0
    pub fn is_floating(&self) -> bool {
570
        use DataType::*;
571
0
        matches!(self, Float16 | Float32 | Float64)
572
0
    }
573
574
    /// Returns true if this type is an integer, either signed (`Int*`, see
    /// [`Self::is_signed_integer`]) or unsigned (`UInt*`, see
    /// [`Self::is_unsigned_integer`]).
575
    #[inline]
576
0
    pub fn is_integer(&self) -> bool {
577
0
        self.is_signed_integer() || self.is_unsigned_integer()
578
0
    }
579
580
    /// Returns true if this type is a signed integer:
    /// `Int8`, `Int16`, `Int32` or `Int64`.
581
    #[inline]
582
0
    pub fn is_signed_integer(&self) -> bool {
583
        use DataType::*;
584
0
        matches!(self, Int8 | Int16 | Int32 | Int64)
585
0
    }
586
587
    /// Returns true if this type is an unsigned integer:
    /// `UInt8`, `UInt16`, `UInt32` or `UInt64`.
588
    #[inline]
589
0
    pub fn is_unsigned_integer(&self) -> bool {
590
        use DataType::*;
591
0
        matches!(self, UInt8 | UInt16 | UInt32 | UInt64)
592
0
    }
593
594
    /// Returns true if this type is valid as a dictionary key, which is the
    /// case for every signed or unsigned integer type (see [`Self::is_integer`]).
595
    #[inline]
596
0
    pub fn is_dictionary_key_type(&self) -> bool {
597
0
        self.is_integer()
598
0
    }
599
600
    /// Returns true if this type is valid for the run-ends array of a
    /// run-end encoded array: `Int16`, `Int32` or `Int64`.
601
    #[inline]
602
40
    pub fn is_run_ends_type(&self) -> bool {
603
        use DataType::*;
604
40
        
matches!0
(self, Int16 | Int32 | Int64)
605
40
    }
606
607
    /// Returns true if this type is nested (List, FixedSizeList, LargeList, ListView, LargeListView, Struct, Union,
608
    /// or Map), or a dictionary or run-end encoded type whose values are of a nested type.
    ///
    /// For `Dictionary` and `RunEndEncoded` only the value type is inspected
    /// (recursively); the key / run-ends type is ignored.
609
    #[inline]
610
0
    pub fn is_nested(&self) -> bool {
611
        use DataType::*;
612
0
        match self {
613
0
            Dictionary(_, v) => DataType::is_nested(v.as_ref()),
614
0
            RunEndEncoded(_, v) => DataType::is_nested(v.data_type()),
615
            List(_)
616
            | FixedSizeList(_, _)
617
            | LargeList(_)
618
            | ListView(_)
619
            | LargeListView(_)
620
            | Struct(_)
621
            | Union(_, _)
622
0
            | Map(_, _) => true,
623
0
            _ => false,
624
        }
625
0
    }
626
627
    /// Returns true if this type is [`DataType::Null`], and false for every
    /// other variant.
628
    #[inline]
629
21
    pub fn is_null(&self) -> bool {
630
        use DataType::*;
631
21
        
matches!20
(self, Null)
632
21
    }
633
634
    /// Compares the datatype with another, ignoring nested field names
635
    /// and metadata.
    ///
    /// Both sides must be the same variant; the comparison then recurses
    /// into child types as follows:
    ///
    /// * `List`, `LargeList`, `ListView`, `LargeListView`: the child fields
    ///   must agree on nullability and have equal data types.
    /// * `FixedSizeList`: as above, and the list sizes must match.
    /// * `Struct`: same number of fields, and fields are compared pairwise
    ///   in order (nullability and data type).
    /// * `Map`: the entries field must agree on nullability and data type,
    ///   and the "keys sorted" flags must match.
    /// * `Dictionary`: key types and value types must both be equal.
    /// * `RunEndEncoded`: the run-ends fields and the values fields must
    ///   each agree on nullability and data type.
    /// * `Union`: same mode and number of fields, and every entry of `self`
    ///   must have some entry in `other` (in any position) with an equal
    ///   first tuple element (presumably the type id), equal nullability and
    ///   an equal data type.
    ///
    /// Every other combination falls back to `==`, so non-nested types must
    /// match exactly and differing variants are never equal.
636
0
    pub fn equals_datatype(&self, other: &DataType) -> bool {
637
0
        match (&self, other) {
638
0
            (DataType::List(a), DataType::List(b))
639
0
            | (DataType::LargeList(a), DataType::LargeList(b))
640
0
            | (DataType::ListView(a), DataType::ListView(b))
641
0
            | (DataType::LargeListView(a), DataType::LargeListView(b)) => {
642
0
                a.is_nullable() == b.is_nullable() && a.data_type().equals_datatype(b.data_type())
643
            }
644
0
            (DataType::FixedSizeList(a, a_size), DataType::FixedSizeList(b, b_size)) => {
645
0
                a_size == b_size
646
0
                    && a.is_nullable() == b.is_nullable()
647
0
                    && a.data_type().equals_datatype(b.data_type())
648
            }
649
0
            (DataType::Struct(a), DataType::Struct(b)) => {
650
0
                a.len() == b.len()
651
0
                    && a.iter().zip(b).all(|(a, b)| {
652
0
                        a.is_nullable() == b.is_nullable()
653
0
                            && a.data_type().equals_datatype(b.data_type())
654
0
                    })
655
            }
656
0
            (DataType::Map(a_field, a_is_sorted), DataType::Map(b_field, b_is_sorted)) => {
657
0
                a_field.is_nullable() == b_field.is_nullable()
658
0
                    && a_field.data_type().equals_datatype(b_field.data_type())
659
0
                    && a_is_sorted == b_is_sorted
660
            }
661
0
            (DataType::Dictionary(a_key, a_value), DataType::Dictionary(b_key, b_value)) => {
662
0
                a_key.equals_datatype(b_key) && a_value.equals_datatype(b_value)
663
            }
664
            (
665
0
                DataType::RunEndEncoded(a_run_ends, a_values),
666
0
                DataType::RunEndEncoded(b_run_ends, b_values),
667
            ) => {
668
0
                a_run_ends.is_nullable() == b_run_ends.is_nullable()
669
0
                    && a_run_ends
670
0
                        .data_type()
671
0
                        .equals_datatype(b_run_ends.data_type())
672
0
                    && a_values.is_nullable() == b_values.is_nullable()
673
0
                    && a_values.data_type().equals_datatype(b_values.data_type())
674
            }
675
            (
676
0
                DataType::Union(a_union_fields, a_union_mode),
677
0
                DataType::Union(b_union_fields, b_union_mode),
678
            ) => {
679
0
                a_union_mode == b_union_mode
680
0
                    && a_union_fields.len() == b_union_fields.len()
681
0
                    && a_union_fields.iter().all(|a| {
682
0
                        b_union_fields.iter().any(|b| {
683
0
                            a.0 == b.0
684
0
                                && a.1.is_nullable() == b.1.is_nullable()
685
0
                                && a.1.data_type().equals_datatype(b.1.data_type())
686
0
                        })
687
0
                    })
688
            }
689
0
            _ => self == other,
690
        }
691
0
    }
692
693
    /// Returns the byte width of this type if it is a primitive type
694
    ///
695
    /// Returns `None` if not a primitive type
    ///
    /// A width is reported for the integer, floating point, temporal
    /// (timestamp, date, time, duration, interval) and decimal types. `Null`,
    /// `Boolean`, `FixedSizeBinary`, the string and binary types, and all
    /// nested, dictionary and run-end encoded types return `None`.
696
    #[inline]
697
166
    pub fn primitive_width(&self) -> Option<usize> {
698
0
        match self {
699
1
            DataType::Null => None,
700
1
            DataType::Boolean => None,
701
3
            DataType::Int8 | DataType::UInt8 => Some(1),
702
1
            DataType::Int16 | DataType::UInt16 | DataType::Float16 => Some(2),
703
64
            DataType::Int32 | DataType::UInt32 | DataType::Float32 => Some(4),
704
12
            DataType::Int64 | DataType::UInt64 | DataType::Float64 => Some(8),
705
16
            DataType::Timestamp(_, _) => Some(8),
706
0
            DataType::Date32 | DataType::Time32(_) => Some(4),
707
0
            DataType::Date64 | DataType::Time64(_) => Some(8),
708
0
            DataType::Duration(_) => Some(8),
709
0
            DataType::Interval(IntervalUnit::YearMonth) => Some(4),
710
0
            DataType::Interval(IntervalUnit::DayTime) => Some(8),
711
0
            DataType::Interval(IntervalUnit::MonthDayNano) => Some(16),
712
0
            DataType::Decimal32(_, _) => Some(4),
713
0
            DataType::Decimal64(_, _) => Some(8),
714
16
            DataType::Decimal128(_, _) => Some(16),
715
0
            DataType::Decimal256(_, _) => Some(32),
716
33
            DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => None,
717
0
            DataType::Binary | DataType::LargeBinary | DataType::BinaryView => None,
718
1
            DataType::FixedSizeBinary(_) => None,
719
            DataType::List(_)
720
            | DataType::ListView(_)
721
            | DataType::LargeList(_)
722
            | DataType::LargeListView(_)
723
0
            | DataType::Map(_, _) => None,
724
0
            DataType::FixedSizeList(_, _) => None,
725
1
            DataType::Struct(_) => None,
726
5
            DataType::Union(_, _) => None,
727
11
            DataType::Dictionary(_, _) => None,
728
1
            DataType::RunEndEncoded(_, _) => None,
729
        }
730
166
    }
731
732
    /// Return size of this instance in bytes.
733
    ///
734
    /// Includes the size of `Self`.
    ///
    /// The result is `size_of_val(self)` plus a variant dependent amount:
    ///
    /// * `0` for all types without children or string payload
    /// * the length of the optional string of a [`DataType::Timestamp`]
    ///   (presumably the timezone name)
    /// * `size()` of the child field for list, list view, fixed size list and
    ///   map types
    /// * `size()` of the fields for [`DataType::Struct`] and [`DataType::Union`]
    /// * the sum of `size()` of the key and value types for
    ///   [`DataType::Dictionary`]
    /// * `size()` of both child fields, each reduced by the `size_of_val` of
    ///   its handle, for [`DataType::RunEndEncoded`]
735
0
    pub fn size(&self) -> usize {
736
0
        std::mem::size_of_val(self)
737
0
            + match self {
                // Variants that contribute nothing beyond `size_of_val(self)`.
738
                DataType::Null
739
                | DataType::Boolean
740
                | DataType::Int8
741
                | DataType::Int16
742
                | DataType::Int32
743
                | DataType::Int64
744
                | DataType::UInt8
745
                | DataType::UInt16
746
                | DataType::UInt32
747
                | DataType::UInt64
748
                | DataType::Float16
749
                | DataType::Float32
750
                | DataType::Float64
751
                | DataType::Date32
752
                | DataType::Date64
753
                | DataType::Time32(_)
754
                | DataType::Time64(_)
755
                | DataType::Duration(_)
756
                | DataType::Interval(_)
757
                | DataType::Binary
758
                | DataType::FixedSizeBinary(_)
759
                | DataType::LargeBinary
760
                | DataType::BinaryView
761
                | DataType::Utf8
762
                | DataType::LargeUtf8
763
                | DataType::Utf8View
764
                | DataType::Decimal32(_, _)
765
                | DataType::Decimal64(_, _)
766
                | DataType::Decimal128(_, _)
767
0
                | DataType::Decimal256(_, _) => 0,
                // Adds the string length when the optional string is present, 0 otherwise.
768
0
                DataType::Timestamp(_, s) => s.as_ref().map(|s| s.len()).unwrap_or_default(),
769
0
                DataType::List(field)
770
0
                | DataType::ListView(field)
771
0
                | DataType::FixedSizeList(field, _)
772
0
                | DataType::LargeList(field)
773
0
                | DataType::LargeListView(field)
774
0
                | DataType::Map(field, _) => field.size(),
775
0
                DataType::Struct(fields) => fields.size(),
776
0
                DataType::Union(fields, _) => fields.size(),
                // Recurses into this method for both the key and the value type.
777
0
                DataType::Dictionary(dt1, dt2) => dt1.size() + dt2.size(),
                // NOTE(review): unlike the list arms above, the `size_of_val` of each
                // child handle is subtracted here; presumably to avoid counting the
                // handles twice, since they are already part of `size_of_val(self)`.
                // Confirm that this asymmetry with the list arms is intended.
778
0
                DataType::RunEndEncoded(run_ends, values) => {
779
0
                    run_ends.size() - std::mem::size_of_val(run_ends) + values.size()
780
0
                        - std::mem::size_of_val(values)
781
                }
782
            }
783
0
    }
784
785
    /// Check to see if `self` is a superset of `other`
786
    ///
787
    /// If both types are the same kind of nested type (list, list view, fixed size list, map, struct, union or dictionary), the check is delegated to `contains` on their children;
788
    /// otherwise this checks whether the two types are equal (`self == other`).
    ///
    /// Additional requirements for specific nested types:
    ///
    /// * [`DataType::FixedSizeList`]: the list sizes must be equal
    /// * [`DataType::Map`]: the second component of both types must be equal
    /// * [`DataType::Union`]: the modes must be equal, and every entry of
    ///   `self` must have an entry in `other` with an equal first element
    ///   (presumably the type id) whose field it contains
    /// * [`DataType::Dictionary`]: both the key and the value type of `self`
    ///   must contain the respective type of `other`
    ///
    /// Types of different kinds, and kinds not listed above such as
    /// [`DataType::RunEndEncoded`], fall back to the equality comparison.
789
0
    pub fn contains(&self, other: &DataType) -> bool {
790
0
        match (self, other) {
791
0
            (DataType::List(f1), DataType::List(f2))
792
0
            | (DataType::LargeList(f1), DataType::LargeList(f2))
793
0
            | (DataType::ListView(f1), DataType::ListView(f2))
794
0
            | (DataType::LargeListView(f1), DataType::LargeListView(f2)) => f1.contains(f2),
795
0
            (DataType::FixedSizeList(f1, s1), DataType::FixedSizeList(f2, s2)) => {
796
0
                s1 == s2 && f1.contains(f2)
797
            }
798
0
            (DataType::Map(f1, s1), DataType::Map(f2, s2)) => s1 == s2 && f1.contains(f2),
799
0
            (DataType::Struct(f1), DataType::Struct(f2)) => f1.contains(f2),
800
0
            (DataType::Union(f1, s1), DataType::Union(f2, s2)) => {
                // Entries are matched on their first tuple element, not on their
                // position. Note that only the entries of `self` are iterated, so
                // entries that exist only in `other` are not inspected.
801
0
                s1 == s2
802
0
                    && f1
803
0
                        .iter()
804
0
                        .all(|f1| f2.iter().any(|f2| f1.0 == f2.0 && f1.1.contains(f2.1)))
805
            }
806
0
            (DataType::Dictionary(k1, v1), DataType::Dictionary(k2, v2)) => {
                // Recurses into this method for both the key and the value type.
807
0
                k1.contains(k2) && v1.contains(v2)
808
            }
809
0
            _ => self == other,
810
        }
811
0
    }
812
813
    /// Create a [`DataType::List`] with elements of the specified type
814
    /// and nullability, and conventionally named inner [`Field`] (`"item"`).
    ///
    /// Equivalent to
    /// `DataType::List(Arc::new(Field::new_list_field(data_type, nullable)))`.
815
    ///
816
    /// To specify field level metadata, construct the inner [`Field`]
817
    /// directly via [`Field::new`] or [`Field::new_list_field`].
818
0
    pub fn new_list(data_type: DataType, nullable: bool) -> Self {
819
0
        DataType::List(Arc::new(Field::new_list_field(data_type, nullable)))
820
0
    }
821
822
    /// Create a [`DataType::LargeList`] with elements of the specified type
823
    /// and nullability, and conventionally named inner [`Field`] (`"item"`).
    ///
    /// Equivalent to
    /// `DataType::LargeList(Arc::new(Field::new_list_field(data_type, nullable)))`.
824
    ///
825
    /// To specify field level metadata, construct the inner [`Field`]
826
    /// directly via [`Field::new`] or [`Field::new_list_field`].
827
0
    pub fn new_large_list(data_type: DataType, nullable: bool) -> Self {
828
0
        DataType::LargeList(Arc::new(Field::new_list_field(data_type, nullable)))
829
0
    }
830
831
    /// Create a [`DataType::FixedSizeList`] with elements of the specified type, size
832
    /// and nullability, and conventionally named inner [`Field`] (`"item"`).
    ///
    /// Equivalent to
    /// `DataType::FixedSizeList(Arc::new(Field::new_list_field(data_type, nullable)), size)`.
    /// The `size` is stored as given; no validation is performed here.
833
    ///
834
    /// To specify field level metadata, construct the inner [`Field`]
835
    /// directly via [`Field::new`] or [`Field::new_list_field`].
836
2
    pub fn new_fixed_size_list(data_type: DataType, size: i32, nullable: bool) -> Self {
837
2
        DataType::FixedSizeList(Arc::new(Field::new_list_field(data_type, nullable)), size)
838
2
    }
839
}
840
841
/// The maximum precision for [`DataType::Decimal32`] values
842
pub const DECIMAL32_MAX_PRECISION: u8 = 9;
843
844
/// The maximum scale for [`DataType::Decimal32`] values
845
pub const DECIMAL32_MAX_SCALE: i8 = 9;
846
847
/// The maximum precision for [`DataType::Decimal64`] values
848
pub const DECIMAL64_MAX_PRECISION: u8 = 18;
849
850
/// The maximum scale for [`DataType::Decimal64`] values
851
pub const DECIMAL64_MAX_SCALE: i8 = 18;
852
853
/// The maximum precision for [`DataType::Decimal128`] values
854
pub const DECIMAL128_MAX_PRECISION: u8 = 38;
855
856
/// The maximum scale for [`DataType::Decimal128`] values
857
pub const DECIMAL128_MAX_SCALE: i8 = 38;
858
859
/// The maximum precision for [`DataType::Decimal256`] values
860
pub const DECIMAL256_MAX_PRECISION: u8 = 76;
861
862
/// The maximum scale for [`DataType::Decimal256`] values
863
pub const DECIMAL256_MAX_SCALE: i8 = 76;
864
865
/// The default scale for [`DataType::Decimal32`] values
866
pub const DECIMAL32_DEFAULT_SCALE: i8 = 2;
867
868
/// The default scale for [`DataType::Decimal64`] values
869
pub const DECIMAL64_DEFAULT_SCALE: i8 = 6;
870
871
/// The default scale for [`DataType::Decimal128`] and [`DataType::Decimal256`]
872
/// values
873
pub const DECIMAL_DEFAULT_SCALE: i8 = 10;
874
875
#[cfg(test)]
876
mod tests {
877
    use super::*;
878
879
    // Serializes a nested struct type with `serde_json`, compares the JSON text
    // with the expected string, and checks that deserializing it yields an equal
    // `DataType`. Only compiled when the `serde` feature is enabled.
    #[test]
880
    #[cfg(feature = "serde")]
881
    fn serde_struct_type() {
882
        use std::collections::HashMap;
883
884
        let kv_array = [("k".to_string(), "v".to_string())];
885
        let field_metadata: HashMap<String, String> = kv_array.iter().cloned().collect();
886
887
        // Non-empty map: should be converted as JSON obj { ... }
888
        let first_name =
889
            Field::new("first_name", DataType::Utf8, false).with_metadata(field_metadata);
890
891
        // Empty map: still serialized, as an empty JSON obj {} (see expected string below)
892
        let last_name =
893
            Field::new("last_name", DataType::Utf8, false).with_metadata(HashMap::default());
894
895
        let person = DataType::Struct(Fields::from(vec![
896
            first_name,
897
            last_name,
898
            Field::new(
899
                "address",
900
                DataType::Struct(Fields::from(vec![
901
                    Field::new("street", DataType::Utf8, false),
902
                    Field::new("zip", DataType::UInt16, false),
903
                ])),
904
                false,
905
            ),
906
        ]));
907
908
        let serialized = serde_json::to_string(&person).unwrap();
909
910
        // NOTE that this is testing the default (derived) serialization format, not the
911
        // JSON format specified in metadata.md
912
913
        assert_eq!(
914
            "{\"Struct\":[\
915
             {\"name\":\"first_name\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{\"k\":\"v\"}},\
916
             {\"name\":\"last_name\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{}},\
917
             {\"name\":\"address\",\"data_type\":{\"Struct\":\
918
             [{\"name\":\"street\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{}},\
919
             {\"name\":\"zip\",\"data_type\":\"UInt16\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{}}\
920
             ]},\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{}}]}",
921
            serialized
922
        );
923
        // The serialized text must deserialize back into an equal `DataType`.
924
        let deserialized = serde_json::from_str(&serialized).unwrap();
925
926
        assert_eq!(person, deserialized);
927
    }
928
929
    // Exercises `DataType::equals_datatype` for list, fixed size list, struct,
    // map, dictionary, union and run-end encoded types.
    #[test]
930
    fn test_list_datatype_equality() {
931
        // tests that list type equality is checked while ignoring list names
932
        let list_a = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true)));
933
        let list_b = DataType::List(Arc::new(Field::new("array", DataType::Int32, true)));
934
        let list_c = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, false)));
935
        let list_d = DataType::List(Arc::new(Field::new_list_field(DataType::UInt32, true)));
936
        assert!(list_a.equals_datatype(&list_b));
937
        assert!(!list_a.equals_datatype(&list_c));
938
        assert!(!list_b.equals_datatype(&list_c));
939
        assert!(!list_a.equals_datatype(&list_d));
940
        // FixedSizeList: child names are ignored, the child type is not
941
        let list_e =
942
            DataType::FixedSizeList(Arc::new(Field::new_list_field(list_a.clone(), false)), 3);
943
        let list_f =
944
            DataType::FixedSizeList(Arc::new(Field::new("array", list_b.clone(), false)), 3);
945
        let list_g = DataType::FixedSizeList(
946
            Arc::new(Field::new_list_field(DataType::FixedSizeBinary(3), true)),
947
            3,
948
        );
949
        assert!(list_e.equals_datatype(&list_f));
950
        assert!(!list_e.equals_datatype(&list_g));
951
        assert!(!list_f.equals_datatype(&list_g));
952
        // Struct: field names are ignored, field types and nullability are not
953
        let list_h = DataType::Struct(Fields::from(vec![Field::new("f1", list_e, true)]));
954
        let list_i = DataType::Struct(Fields::from(vec![Field::new("f1", list_f.clone(), true)]));
955
        let list_j = DataType::Struct(Fields::from(vec![Field::new("f1", list_f.clone(), false)]));
956
        let list_k = DataType::Struct(Fields::from(vec![
957
            Field::new("f1", list_f.clone(), false),
958
            Field::new("f2", list_g.clone(), false),
959
            Field::new("f3", DataType::Utf8, true),
960
        ]));
961
        let list_l = DataType::Struct(Fields::from(vec![
962
            Field::new("ff1", list_f.clone(), false),
963
            Field::new("ff2", list_g.clone(), false),
964
            Field::new("ff3", DataType::LargeUtf8, true),
965
        ]));
966
        let list_m = DataType::Struct(Fields::from(vec![
967
            Field::new("ff1", list_f, false),
968
            Field::new("ff2", list_g, false),
969
            Field::new("ff3", DataType::Utf8, true),
970
        ]));
971
        assert!(list_h.equals_datatype(&list_i));
972
        assert!(!list_h.equals_datatype(&list_j));
973
        assert!(!list_k.equals_datatype(&list_l));
974
        assert!(list_k.equals_datatype(&list_m));
975
        // Map: the second component, the entry nullability and the entry type must match
976
        let list_n = DataType::Map(Arc::new(Field::new("f1", list_a.clone(), true)), true);
977
        let list_o = DataType::Map(Arc::new(Field::new("f2", list_b.clone(), true)), true);
978
        let list_p = DataType::Map(Arc::new(Field::new("f2", list_b.clone(), true)), false);
979
        let list_q = DataType::Map(Arc::new(Field::new("f2", list_c.clone(), true)), true);
980
        let list_r = DataType::Map(Arc::new(Field::new("f1", list_a.clone(), false)), true);
981
982
        assert!(list_n.equals_datatype(&list_o));
983
        assert!(!list_n.equals_datatype(&list_p));
984
        assert!(!list_n.equals_datatype(&list_q));
985
        assert!(!list_n.equals_datatype(&list_r));
986
        // Dictionary: both the key type and the value type must match
987
        let list_s = DataType::Dictionary(Box::new(DataType::UInt8), Box::new(list_a));
988
        let list_t = DataType::Dictionary(Box::new(DataType::UInt8), Box::new(list_b.clone()));
989
        let list_u = DataType::Dictionary(Box::new(DataType::Int8), Box::new(list_b));
990
        let list_v = DataType::Dictionary(Box::new(DataType::UInt8), Box::new(list_c));
991
992
        assert!(list_s.equals_datatype(&list_t));
993
        assert!(!list_s.equals_datatype(&list_u));
994
        assert!(!list_s.equals_datatype(&list_v));
995
        // Union: field names and the order of the entries are ignored; the type
        // associated with each id and the field nullability are not
996
        let union_a = DataType::Union(
997
            UnionFields::new(
998
                vec![1, 2],
999
                vec![
1000
                    Field::new("f1", DataType::Utf8, false),
1001
                    Field::new("f2", DataType::UInt8, false),
1002
                ],
1003
            ),
1004
            UnionMode::Sparse,
1005
        );
        // Same as `union_a` apart from the field names
1006
        let union_b = DataType::Union(
1007
            UnionFields::new(
1008
                vec![1, 2],
1009
                vec![
1010
                    Field::new("ff1", DataType::Utf8, false),
1011
                    Field::new("ff2", DataType::UInt8, false),
1012
                ],
1013
            ),
1014
            UnionMode::Sparse,
1015
        );
        // Same id to type association as `union_a`, listed in reverse order
1016
        let union_c = DataType::Union(
1017
            UnionFields::new(
1018
                vec![2, 1],
1019
                vec![
1020
                    Field::new("fff2", DataType::UInt8, false),
1021
                    Field::new("fff1", DataType::Utf8, false),
1022
                ],
1023
            ),
1024
            UnionMode::Sparse,
1025
        );
        // Ids 2 and 1 are associated with different types than in `union_a`
1026
        let union_d = DataType::Union(
1027
            UnionFields::new(
1028
                vec![2, 1],
1029
                vec![
1030
                    Field::new("fff1", DataType::Int8, false),
1031
                    Field::new("fff2", DataType::UInt8, false),
1032
                ],
1033
            ),
1034
            UnionMode::Sparse,
1035
        );
        // Same as `union_a` except that the first field is nullable
1036
        let union_e = DataType::Union(
1037
            UnionFields::new(
1038
                vec![1, 2],
1039
                vec![
1040
                    Field::new("f1", DataType::Utf8, true),
1041
                    Field::new("f2", DataType::UInt8, false),
1042
                ],
1043
            ),
1044
            UnionMode::Sparse,
1045
        );
1046
1047
        assert!(union_a.equals_datatype(&union_b));
1048
        assert!(union_a.equals_datatype(&union_c));
1049
        assert!(!union_a.equals_datatype(&union_d));
1050
        assert!(!union_a.equals_datatype(&union_e));
1051
        // RunEndEncoded: field names are ignored, the run end type and nullability are not
1052
        let list_w = DataType::RunEndEncoded(
1053
            Arc::new(Field::new("f1", DataType::Int64, true)),
1054
            Arc::new(Field::new("f2", DataType::Utf8, true)),
1055
        );
1056
        let list_x = DataType::RunEndEncoded(
1057
            Arc::new(Field::new("ff1", DataType::Int64, true)),
1058
            Arc::new(Field::new("ff2", DataType::Utf8, true)),
1059
        );
1060
        let list_y = DataType::RunEndEncoded(
1061
            Arc::new(Field::new("ff1", DataType::UInt16, true)),
1062
            Arc::new(Field::new("ff2", DataType::Utf8, true)),
1063
        );
1064
        let list_z = DataType::RunEndEncoded(
1065
            Arc::new(Field::new("f1", DataType::Int64, false)),
1066
            Arc::new(Field::new("f2", DataType::Utf8, true)),
1067
        );
1068
1069
        assert!(list_w.equals_datatype(&list_x));
1070
        assert!(!list_w.equals_datatype(&list_y));
1071
        assert!(!list_w.equals_datatype(&list_z));
1072
    }
1073
1074
    // Smoke test: constructing a struct type that contains another struct type
    // must not panic. The resulting value is not inspected.
    #[test]
1075
    fn create_struct_type() {
1076
        let _person = DataType::Struct(Fields::from(vec![
1077
            Field::new("first_name", DataType::Utf8, false),
1078
            Field::new("last_name", DataType::Utf8, false),
1079
            Field::new(
1080
                "address",
1081
                DataType::Struct(Fields::from(vec![
1082
                    Field::new("street", DataType::Utf8, false),
1083
                    Field::new("zip", DataType::UInt16, false),
1084
                ])),
1085
                false,
1086
            ),
1087
        ]));
1088
    }
1089
1090
    // Checks `DataType::is_nested` for primitive, list, list view and dictionary types.
    #[test]
1091
    fn test_nested() {
1092
        let list = DataType::List(Arc::new(Field::new("foo", DataType::Utf8, true)));
1093
        let list_view = DataType::ListView(Arc::new(Field::new("foo", DataType::Utf8, true)));
1094
        let large_list_view =
1095
            DataType::LargeListView(Arc::new(Field::new("foo", DataType::Utf8, true)));
1096
        // Primitive and string types are not nested; list and list view types are
1097
        assert!(!DataType::is_nested(&DataType::Boolean));
1098
        assert!(!DataType::is_nested(&DataType::Int32));
1099
        assert!(!DataType::is_nested(&DataType::Utf8));
1100
        assert!(DataType::is_nested(&list));
1101
        assert!(DataType::is_nested(&list_view));
1102
        assert!(DataType::is_nested(&large_list_view));
1103
        // A dictionary is expected to be nested only when its value type is nested
1104
        assert!(!DataType::is_nested(&DataType::Dictionary(
1105
            Box::new(DataType::Int32),
1106
            Box::new(DataType::Boolean)
1107
        )));
1108
        assert!(!DataType::is_nested(&DataType::Dictionary(
1109
            Box::new(DataType::Int32),
1110
            Box::new(DataType::Int64)
1111
        )));
1112
        assert!(!DataType::is_nested(&DataType::Dictionary(
1113
            Box::new(DataType::Int32),
1114
            Box::new(DataType::LargeUtf8)
1115
        )));
1116
        assert!(DataType::is_nested(&DataType::Dictionary(
1117
            Box::new(DataType::Int32),
1118
            Box::new(list)
1119
        )));
1120
    }
1121
1122
    // Checks the integer classification predicates with one signed integer,
    // one unsigned integer and one floating point type each.
    #[test]
1123
    fn test_integer() {
1124
        // is_integer
1125
        assert!(DataType::is_integer(&DataType::Int32));
1126
        assert!(DataType::is_integer(&DataType::UInt64));
1127
        assert!(!DataType::is_integer(&DataType::Float16));
1128
1129
        // is_signed_integer
1130
        assert!(DataType::is_signed_integer(&DataType::Int32));
1131
        assert!(!DataType::is_signed_integer(&DataType::UInt64));
1132
        assert!(!DataType::is_signed_integer(&DataType::Float16));
1133
1134
        // is_unsigned_integer
1135
        assert!(!DataType::is_unsigned_integer(&DataType::Int32));
1136
        assert!(DataType::is_unsigned_integer(&DataType::UInt64));
1137
        assert!(!DataType::is_unsigned_integer(&DataType::Float16));
1138
1139
        // is_dictionary_key_type: signed and unsigned integers qualify, floats do not
1140
        assert!(DataType::is_dictionary_key_type(&DataType::Int32));
1141
        assert!(DataType::is_dictionary_key_type(&DataType::UInt64));
1142
        assert!(!DataType::is_dictionary_key_type(&DataType::Float16));
1143
    }
1144
1145
    // `is_floating` must accept a floating point type and reject an integer type.
    #[test]
1146
    fn test_floating() {
1147
        assert!(DataType::is_floating(&DataType::Float16));
1148
        assert!(!DataType::is_floating(&DataType::Int32));
1149
    }
1150
1151
    // `is_null` must be true for `DataType::Null` and false for a non-null type.
    #[test]
1152
    fn test_datatype_is_null() {
1153
        assert!(DataType::is_null(&DataType::Null));
1154
        assert!(!DataType::is_null(&DataType::Int32));
1155
    }
1156
1157
    // Pins the in-memory size of the `DataType` enum to 24 bytes so that a
    // change which grows the enum is noticed.
    #[test]
1158
    fn size_should_not_regress() {
1159
        assert_eq!(std::mem::size_of::<DataType>(), 24);
1160
    }
1161
1162
    // Building union fields with a repeated type id is expected to panic with
    // a message containing "duplicate type id: 1".
    #[test]
1163
    #[should_panic(expected = "duplicate type id: 1")]
1164
    fn test_union_with_duplicated_type_id() {
        // Both fields are given the id 1
1165
        let type_ids = vec![1, 1];
1166
        let _union = DataType::Union(
1167
            UnionFields::new(
1168
                type_ids,
1169
                vec![
1170
                    Field::new("f1", DataType::Int32, false),
1171
                    Field::new("f2", DataType::Utf8, false),
1172
                ],
1173
            ),
1174
            UnionMode::Dense,
1175
        );
1176
    }
1177
1178
    // A `DataType` can be obtained from a string slice via `try_into`.
    #[test]
1179
    fn test_try_from_str() {
1180
        let data_type: DataType = "Int32".try_into().unwrap();
1181
        assert_eq!(data_type, DataType::Int32);
1182
    }
1183
1184
    // A `DataType` can be obtained from a string slice via `str::parse`.
    #[test]
1185
    fn test_from_str() {
1186
        let data_type: DataType = "UInt64".parse().unwrap();
1187
        assert_eq!(data_type, DataType::UInt64);
1188
    }
1189
1190
    // Compares the `Debug` output of a list type with an inline `insta`
    // snapshot. The expected snapshot shows only `data_type` for the inner field.
    #[test]
1191
    #[cfg_attr(miri, ignore)] // Can't handle the inlined strings of the assert_debug_snapshot macro
1192
    fn test_debug_format_field() {
1193
        // Make sure the `Debug` formatting of `DataType` is readable and not too long
1194
        insta::assert_debug_snapshot!(DataType::new_list(DataType::Int8, false), @r"
1195
        List(
1196
            Field {
1197
                data_type: Int8,
1198
            },
1199
        )
1200
        ");
1201
    }
1202
}