Coverage Report

Created: 2025-11-17 14:14

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/Users/andrewlamb/Software/arrow-rs/arrow-schema/src/datatype_parse.rs
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
use std::{fmt::Display, iter::Peekable, str::Chars, sync::Arc};
19
20
use crate::{ArrowError, DataType, Field, Fields, IntervalUnit, TimeUnit, UnionFields, UnionMode};
21
22
/// Parses a DataType from a string representation
23
///
24
/// For example, the string "Int32" would be parsed into [`DataType::Int32`]
25
0
pub(crate) fn parse_data_type(val: &str) -> ArrowResult<DataType> {
26
0
    Parser::new(val).parse()
27
0
}
28
29
type ArrowResult<T> = Result<T, ArrowError>;
30
31
0
fn make_error(val: &str, msg: &str) -> ArrowError {
32
0
    let msg = format!(
33
0
        "Unsupported type '{val}'. Must be a supported arrow type name such as 'Int32' or 'Timestamp(ns)'. Error {msg}"
34
    );
35
0
    ArrowError::ParseError(msg)
36
0
}
37
38
0
fn make_error_expected(val: &str, expected: &Token, actual: &Token) -> ArrowError {
39
0
    make_error(val, &format!("Expected '{expected}', got '{actual}'"))
40
0
}
41
42
/// Implementation of `parse_data_type`, modeled after <https://github.com/sqlparser-rs/sqlparser-rs>
43
#[derive(Debug)]
44
struct Parser<'a> {
45
    val: &'a str,
46
    tokenizer: Peekable<Tokenizer<'a>>,
47
}
48
49
impl<'a> Parser<'a> {
50
0
    fn new(val: &'a str) -> Self {
51
0
        Self {
52
0
            val,
53
0
            tokenizer: Tokenizer::new(val).peekable(),
54
0
        }
55
0
    }
56
57
0
    fn parse(mut self) -> ArrowResult<DataType> {
58
0
        let data_type = self.parse_next_type()?;
59
        // ensure that there is no trailing content
60
0
        if self.tokenizer.next().is_some() {
61
0
            Err(make_error(
62
0
                self.val,
63
0
                &format!("checking trailing content after parsing '{data_type}'"),
64
0
            ))
65
        } else {
66
0
            Ok(data_type)
67
        }
68
0
    }
69
70
    /// parses the next full DataType
71
0
    fn parse_next_type(&mut self) -> ArrowResult<DataType> {
72
0
        match self.next_token()? {
73
0
            Token::SimpleType(data_type) => Ok(data_type),
74
0
            Token::Timestamp => self.parse_timestamp(),
75
0
            Token::Time32 => self.parse_time32(),
76
0
            Token::Time64 => self.parse_time64(),
77
0
            Token::Duration => self.parse_duration(),
78
0
            Token::Interval => self.parse_interval(),
79
0
            Token::FixedSizeBinary => self.parse_fixed_size_binary(),
80
0
            Token::Decimal32 => self.parse_decimal_32(),
81
0
            Token::Decimal64 => self.parse_decimal_64(),
82
0
            Token::Decimal128 => self.parse_decimal_128(),
83
0
            Token::Decimal256 => self.parse_decimal_256(),
84
0
            Token::Dictionary => self.parse_dictionary(),
85
0
            Token::List => self.parse_list(),
86
0
            Token::ListView => self.parse_list_view(),
87
0
            Token::LargeList => self.parse_large_list(),
88
0
            Token::LargeListView => self.parse_large_list_view(),
89
0
            Token::FixedSizeList => self.parse_fixed_size_list(),
90
0
            Token::Struct => self.parse_struct(),
91
0
            Token::Union => self.parse_union(),
92
0
            Token::Map => self.parse_map(),
93
0
            Token::RunEndEncoded => self.parse_run_end_encoded(),
94
0
            tok => Err(make_error(
95
0
                self.val,
96
0
                &format!("finding next type, got unexpected '{tok}'"),
97
0
            )),
98
        }
99
0
    }
100
101
    /// parses Field, this is the inversion of `format_field` in `datatype_display.rs`.
102
    /// E.g: "a": nullable Int64
103
    ///
104
    /// TODO: support metadata: `"a": nullable Int64 metadata: {"foo": "value"}`
105
0
    fn parse_field(&mut self) -> ArrowResult<Field> {
106
0
        let name = self.parse_double_quoted_string("Field")?;
107
0
        self.expect_token(Token::Colon)?;
108
0
        let nullable = self.parse_opt_nullable();
109
0
        let data_type = self.parse_next_type()?;
110
0
        Ok(Field::new(name, data_type, nullable))
111
0
    }
112
113
    /// Parses field inside a list. Use `Field::LIST_FIELD_DEFAULT_NAME`
114
    /// if no field name is specified.
115
    /// E.g: `nullable Int64, field: 'foo'` or `nullable Int64`
116
    ///
117
    /// TODO: support metadata: `nullable Int64, metadata: {"foo2": "value"}`
118
0
    fn parse_list_field(&mut self, context: &str) -> ArrowResult<Field> {
119
0
        let nullable = self.parse_opt_nullable();
120
0
        let data_type = self.parse_next_type()?;
121
122
        // the field name (if exists) must be after a comma
123
0
        let field_name = if self
124
0
            .tokenizer
125
0
            .next_if(|next| matches!(next, Ok(Token::Comma)))
126
0
            .is_none()
127
        {
128
0
            Field::LIST_FIELD_DEFAULT_NAME.into()
129
        } else {
130
            // expects: `field: 'field_name'`.
131
0
            self.expect_token(Token::Field)?;
132
0
            self.expect_token(Token::Colon)?;
133
0
            self.parse_single_quoted_string(context)?
134
        };
135
136
0
        Ok(Field::new(field_name, data_type, nullable))
137
0
    }
138
139
    /// Parses the List type (called after `List` has been consumed)
140
    /// E.g: List(nullable Int64, field: 'foo')
141
0
    fn parse_list(&mut self) -> ArrowResult<DataType> {
142
0
        self.expect_token(Token::LParen)?;
143
0
        let field = self.parse_list_field("List")?;
144
0
        self.expect_token(Token::RParen)?;
145
0
        Ok(DataType::List(Arc::new(field)))
146
0
    }
147
148
    /// Parses the ListView type (called after `ListView` has been consumed)
149
    /// E.g: ListView(nullable Int64, field: 'foo')
150
0
    fn parse_list_view(&mut self) -> ArrowResult<DataType> {
151
0
        self.expect_token(Token::LParen)?;
152
0
        let field = self.parse_list_field("ListView")?;
153
0
        self.expect_token(Token::RParen)?;
154
0
        Ok(DataType::ListView(Arc::new(field)))
155
0
    }
156
157
    /// Parses the LargeList type (called after `LargeList` has been consumed)
158
    /// E.g: LargeList(nullable Int64, field: 'foo')
159
0
    fn parse_large_list(&mut self) -> ArrowResult<DataType> {
160
0
        self.expect_token(Token::LParen)?;
161
0
        let field = self.parse_list_field("LargeList")?;
162
0
        self.expect_token(Token::RParen)?;
163
0
        Ok(DataType::LargeList(Arc::new(field)))
164
0
    }
165
166
    /// Parses the LargeListView type (called after `LargeListView` has been consumed)
167
    /// E.g: LargeListView(nullable Int64, field: 'foo')
168
0
    fn parse_large_list_view(&mut self) -> ArrowResult<DataType> {
169
0
        self.expect_token(Token::LParen)?;
170
0
        let field = self.parse_list_field("LargeListView")?;
171
0
        self.expect_token(Token::RParen)?;
172
0
        Ok(DataType::LargeListView(Arc::new(field)))
173
0
    }
174
175
    /// Parses the FixedSizeList type (called after `FixedSizeList` has been consumed)
176
    /// E.g: FixedSizeList(5 x nullable Int64, field: 'foo')
177
0
    fn parse_fixed_size_list(&mut self) -> ArrowResult<DataType> {
178
0
        self.expect_token(Token::LParen)?;
179
0
        let length = self.parse_i32("FixedSizeList")?;
180
0
        self.expect_token(Token::X)?;
181
0
        let field = self.parse_list_field("FixedSizeList")?;
182
0
        self.expect_token(Token::RParen)?;
183
0
        Ok(DataType::FixedSizeList(Arc::new(field), length))
184
0
    }
185
186
    /// Parses the next timeunit
187
0
    fn parse_time_unit(&mut self, context: &str) -> ArrowResult<TimeUnit> {
188
0
        match self.next_token()? {
189
0
            Token::TimeUnit(time_unit) => Ok(time_unit),
190
0
            tok => Err(make_error(
191
0
                self.val,
192
0
                &format!("finding TimeUnit for {context}, got {tok}"),
193
0
            )),
194
        }
195
0
    }
196
197
    /// Parses the next double quoted string
198
0
    fn parse_double_quoted_string(&mut self, context: &str) -> ArrowResult<String> {
199
0
        let token = self.next_token()?;
200
0
        if let Token::DoubleQuotedString(string) = token {
201
0
            Ok(string)
202
        } else {
203
0
            Err(make_error(
204
0
                self.val,
205
0
                &format!("expected double quoted string for {context}, got '{token}'"),
206
0
            ))
207
        }
208
0
    }
209
210
    /// Parses the next single quoted string
211
0
    fn parse_single_quoted_string(&mut self, context: &str) -> ArrowResult<String> {
212
0
        let token = self.next_token()?;
213
0
        if let Token::SingleQuotedString(string) = token {
214
0
            Ok(string)
215
        } else {
216
0
            Err(make_error(
217
0
                self.val,
218
0
                &format!("expected single quoted string for {context}, got '{token}'"),
219
0
            ))
220
        }
221
0
    }
222
223
    /// Parses the next integer value
224
0
    fn parse_i64(&mut self, context: &str) -> ArrowResult<i64> {
225
0
        match self.next_token()? {
226
0
            Token::Integer(v) => Ok(v),
227
0
            tok => Err(make_error(
228
0
                self.val,
229
0
                &format!("finding i64 for {context}, got '{tok}'"),
230
0
            )),
231
        }
232
0
    }
233
234
    /// Parses the next i32 integer value
235
0
    fn parse_i32(&mut self, context: &str) -> ArrowResult<i32> {
236
0
        let length = self.parse_i64(context)?;
237
0
        length.try_into().map_err(|e| {
238
0
            make_error(
239
0
                self.val,
240
0
                &format!("converting {length} into i32 for {context}: {e}"),
241
            )
242
0
        })
243
0
    }
244
245
    /// Parses the next i8 integer value
246
0
    fn parse_i8(&mut self, context: &str) -> ArrowResult<i8> {
247
0
        let length = self.parse_i64(context)?;
248
0
        length.try_into().map_err(|e| {
249
0
            make_error(
250
0
                self.val,
251
0
                &format!("converting {length} into i8 for {context}: {e}"),
252
            )
253
0
        })
254
0
    }
255
256
    /// Parses the next u8 integer value
257
0
    fn parse_u8(&mut self, context: &str) -> ArrowResult<u8> {
258
0
        let length = self.parse_i64(context)?;
259
0
        length.try_into().map_err(|e| {
260
0
            make_error(
261
0
                self.val,
262
0
                &format!("converting {length} into u8 for {context}: {e}"),
263
            )
264
0
        })
265
0
    }
266
267
    /// Parses the next timestamp (called after `Timestamp` has been consumed)
268
0
    fn parse_timestamp(&mut self) -> ArrowResult<DataType> {
269
0
        self.expect_token(Token::LParen)?;
270
0
        let time_unit = self.parse_time_unit("Timestamp")?;
271
272
        let timezone;
273
0
        match self.next_token()? {
274
            Token::Comma => {
275
0
                match self.next_token()? {
276
                    // Support old style `Timestamp(Nanosecond, None)`
277
0
                    Token::None => {
278
0
                        timezone = None;
279
0
                    }
280
                    // Support old style `Timestamp(Nanosecond, Some("Timezone"))`
281
                    Token::Some => {
282
0
                        self.expect_token(Token::LParen)?;
283
0
                        timezone = Some(self.parse_double_quoted_string("Timezone")?);
284
0
                        self.expect_token(Token::RParen)?;
285
                    }
286
0
                    Token::DoubleQuotedString(tz) => {
287
0
                        // Support new style `Timestamp(Nanosecond, "Timezone")`
288
0
                        timezone = Some(tz);
289
0
                    }
290
0
                    tok => {
291
0
                        return Err(make_error(
292
0
                            self.val,
293
0
                            &format!("Expected None, Some, or a timezone string, got {tok:?}"),
294
0
                        ));
295
                    }
296
                };
297
0
                self.expect_token(Token::RParen)?;
298
            }
299
            // No timezone (e.g `Timestamp(ns)`)
300
0
            Token::RParen => {
301
0
                timezone = None;
302
0
            }
303
0
            next_token => {
304
0
                return Err(make_error(
305
0
                    self.val,
306
0
                    &format!("Expected comma followed by a timezone, or an ), got {next_token:?}"),
307
0
                ));
308
            }
309
        }
310
0
        Ok(DataType::Timestamp(time_unit, timezone.map(Into::into)))
311
0
    }
312
313
    /// Parses the next Time32 (called after `Time32` has been consumed)
314
0
    fn parse_time32(&mut self) -> ArrowResult<DataType> {
315
0
        self.expect_token(Token::LParen)?;
316
0
        let time_unit = self.parse_time_unit("Time32")?;
317
0
        self.expect_token(Token::RParen)?;
318
0
        Ok(DataType::Time32(time_unit))
319
0
    }
320
321
    /// Parses the next Time64 (called after `Time64` has been consumed)
322
0
    fn parse_time64(&mut self) -> ArrowResult<DataType> {
323
0
        self.expect_token(Token::LParen)?;
324
0
        let time_unit = self.parse_time_unit("Time64")?;
325
0
        self.expect_token(Token::RParen)?;
326
0
        Ok(DataType::Time64(time_unit))
327
0
    }
328
329
    /// Parses the next Duration (called after `Duration` has been consumed)
330
0
    fn parse_duration(&mut self) -> ArrowResult<DataType> {
331
0
        self.expect_token(Token::LParen)?;
332
0
        let time_unit = self.parse_time_unit("Duration")?;
333
0
        self.expect_token(Token::RParen)?;
334
0
        Ok(DataType::Duration(time_unit))
335
0
    }
336
337
    /// Parses the next Interval (called after `Interval` has been consumed)
338
0
    fn parse_interval(&mut self) -> ArrowResult<DataType> {
339
0
        self.expect_token(Token::LParen)?;
340
0
        let interval_unit = match self.next_token()? {
341
0
            Token::IntervalUnit(interval_unit) => interval_unit,
342
0
            tok => {
343
0
                return Err(make_error(
344
0
                    self.val,
345
0
                    &format!("finding IntervalUnit for Interval, got {tok}"),
346
0
                ));
347
            }
348
        };
349
0
        self.expect_token(Token::RParen)?;
350
0
        Ok(DataType::Interval(interval_unit))
351
0
    }
352
353
    /// Parses the next FixedSizeBinary (called after `FixedSizeBinary` has been consumed)
354
0
    fn parse_fixed_size_binary(&mut self) -> ArrowResult<DataType> {
355
0
        self.expect_token(Token::LParen)?;
356
0
        let length = self.parse_i32("FixedSizeBinary")?;
357
0
        self.expect_token(Token::RParen)?;
358
0
        Ok(DataType::FixedSizeBinary(length))
359
0
    }
360
361
    /// Parses the next Decimal32 (called after `Decimal32` has been consumed)
362
0
    fn parse_decimal_32(&mut self) -> ArrowResult<DataType> {
363
0
        self.expect_token(Token::LParen)?;
364
0
        let precision = self.parse_u8("Decimal32")?;
365
0
        self.expect_token(Token::Comma)?;
366
0
        let scale = self.parse_i8("Decimal32")?;
367
0
        self.expect_token(Token::RParen)?;
368
0
        Ok(DataType::Decimal32(precision, scale))
369
0
    }
370
371
    /// Parses the next Decimal64 (called after `Decimal64` has been consumed)
372
0
    fn parse_decimal_64(&mut self) -> ArrowResult<DataType> {
373
0
        self.expect_token(Token::LParen)?;
374
0
        let precision = self.parse_u8("Decimal64")?;
375
0
        self.expect_token(Token::Comma)?;
376
0
        let scale = self.parse_i8("Decimal64")?;
377
0
        self.expect_token(Token::RParen)?;
378
0
        Ok(DataType::Decimal64(precision, scale))
379
0
    }
380
381
    /// Parses the next Decimal128 (called after `Decimal128` has been consumed)
382
0
    fn parse_decimal_128(&mut self) -> ArrowResult<DataType> {
383
0
        self.expect_token(Token::LParen)?;
384
0
        let precision = self.parse_u8("Decimal128")?;
385
0
        self.expect_token(Token::Comma)?;
386
0
        let scale = self.parse_i8("Decimal128")?;
387
0
        self.expect_token(Token::RParen)?;
388
0
        Ok(DataType::Decimal128(precision, scale))
389
0
    }
390
391
    /// Parses the next Decimal256 (called after `Decimal256` has been consumed)
392
0
    fn parse_decimal_256(&mut self) -> ArrowResult<DataType> {
393
0
        self.expect_token(Token::LParen)?;
394
0
        let precision = self.parse_u8("Decimal256")?;
395
0
        self.expect_token(Token::Comma)?;
396
0
        let scale = self.parse_i8("Decimal256")?;
397
0
        self.expect_token(Token::RParen)?;
398
0
        Ok(DataType::Decimal256(precision, scale))
399
0
    }
400
401
    /// Parses the next Dictionary (called after `Dictionary` has been consumed)
402
0
    fn parse_dictionary(&mut self) -> ArrowResult<DataType> {
403
0
        self.expect_token(Token::LParen)?;
404
0
        let key_type = self.parse_next_type()?;
405
0
        self.expect_token(Token::Comma)?;
406
0
        let value_type = self.parse_next_type()?;
407
0
        self.expect_token(Token::RParen)?;
408
0
        Ok(DataType::Dictionary(
409
0
            Box::new(key_type),
410
0
            Box::new(value_type),
411
0
        ))
412
0
    }
413
414
    /// Parses the next Struct (called after `Struct` has been consumed)
415
0
    fn parse_struct(&mut self) -> ArrowResult<DataType> {
416
0
        self.expect_token(Token::LParen)?;
417
0
        let mut fields = Vec::new();
418
        loop {
419
0
            if self
420
0
                .tokenizer
421
0
                .next_if(|next| matches!(next, Ok(Token::RParen)))
422
0
                .is_some()
423
            {
424
0
                break;
425
0
            }
426
427
0
            let field = self.parse_field()?;
428
0
            fields.push(Arc::new(field));
429
0
            match self.next_token()? {
430
0
                Token::Comma => continue,
431
0
                Token::RParen => break,
432
0
                tok => {
433
0
                    return Err(make_error(
434
0
                        self.val,
435
0
                        &format!(
436
0
                            "Unexpected token while parsing Struct fields. Expected ',' or ')', but got '{tok}'"
437
0
                        ),
438
0
                    ));
439
                }
440
            }
441
        }
442
0
        Ok(DataType::Struct(Fields::from(fields)))
443
0
    }
444
445
    /// Parses the next Union (called after `Union` has been consumed)
446
    /// E.g: Union(Sparse, 0: ("a": Int32), 1: ("b": nullable Utf8))
447
0
    fn parse_union(&mut self) -> ArrowResult<DataType> {
448
0
        self.expect_token(Token::LParen)?;
449
0
        let union_mode = self.parse_union_mode()?;
450
0
        let mut type_ids = vec![];
451
0
        let mut fields = vec![];
452
        loop {
453
0
            if self
454
0
                .tokenizer
455
0
                .next_if(|next| matches!(next, Ok(Token::RParen)))
456
0
                .is_some()
457
            {
458
0
                break;
459
0
            }
460
0
            self.expect_token(Token::Comma)?;
461
0
            let (type_id, field) = self.parse_union_field()?;
462
0
            type_ids.push(type_id);
463
0
            fields.push(field);
464
        }
465
0
        Ok(DataType::Union(
466
0
            UnionFields::new(type_ids, fields),
467
0
            union_mode,
468
0
        ))
469
0
    }
470
471
    /// Parses the next UnionMode
472
0
    fn parse_union_mode(&mut self) -> ArrowResult<UnionMode> {
473
0
        match self.next_token()? {
474
0
            Token::UnionMode(union_mode) => Ok(union_mode),
475
0
            tok => Err(make_error(
476
0
                self.val,
477
0
                &format!("finding UnionMode for Union, got {tok}"),
478
0
            )),
479
        }
480
0
    }
481
482
    /// Parses the next UnionField
483
    /// 0: ("a": nullable Int32)
484
0
    fn parse_union_field(&mut self) -> ArrowResult<(i8, Field)> {
485
0
        let type_id = self.parse_i8("UnionField")?;
486
0
        self.expect_token(Token::Colon)?;
487
0
        self.expect_token(Token::LParen)?;
488
0
        let field = self.parse_field()?;
489
0
        self.expect_token(Token::RParen)?;
490
0
        Ok((type_id, field))
491
0
    }
492
493
    /// Parses the next Map (called after `Map` has been consumed)
494
    /// E.g: Map("entries": Struct("key": Utf8, "value": nullable Int32), sorted)
495
0
    fn parse_map(&mut self) -> ArrowResult<DataType> {
496
0
        self.expect_token(Token::LParen)?;
497
0
        let field = self.parse_field()?;
498
0
        self.expect_token(Token::Comma)?;
499
0
        let sorted = self.parse_map_sorted()?;
500
0
        self.expect_token(Token::RParen)?;
501
0
        Ok(DataType::Map(Arc::new(field), sorted))
502
0
    }
503
504
    /// Parses map's sorted
505
0
    fn parse_map_sorted(&mut self) -> ArrowResult<bool> {
506
0
        match self.next_token()? {
507
0
            Token::MapSorted(sorted) => Ok(sorted),
508
0
            tok => Err(make_error(
509
0
                self.val,
510
0
                &format!("Expected sorted or unsorted for a map; got {tok:?}"),
511
0
            )),
512
        }
513
0
    }
514
515
    /// Parses the next RunEndEncoded (called after `RunEndEncoded` has been consumed)
516
    /// E.g: RunEndEncoded("run_ends": UInt32, "values": nullable Int32)
517
0
    fn parse_run_end_encoded(&mut self) -> ArrowResult<DataType> {
518
0
        self.expect_token(Token::LParen)?;
519
0
        let run_ends = self.parse_field()?;
520
0
        self.expect_token(Token::Comma)?;
521
0
        let values = self.parse_field()?;
522
0
        self.expect_token(Token::RParen)?;
523
0
        Ok(DataType::RunEndEncoded(
524
0
            Arc::new(run_ends),
525
0
            Arc::new(values),
526
0
        ))
527
0
    }
528
529
    /// return and consume if the next token is `Token::Nullable`
530
0
    fn parse_opt_nullable(&mut self) -> bool {
531
0
        self.tokenizer
532
0
            .next_if(|next| matches!(next, Ok(Token::Nullable)))
533
0
            .is_some()
534
0
    }
535
536
    /// return the next token, or an error if there are none left
537
0
    fn next_token(&mut self) -> ArrowResult<Token> {
538
0
        match self.tokenizer.next() {
539
0
            None => Err(make_error(self.val, "finding next token")),
540
0
            Some(token) => token,
541
        }
542
0
    }
543
544
    /// consume the next token, returning OK(()) if it matches tok, and Err if not
545
0
    fn expect_token(&mut self, tok: Token) -> ArrowResult<()> {
546
0
        let next_token = self.next_token()?;
547
0
        if next_token == tok {
548
0
            Ok(())
549
        } else {
550
0
            Err(make_error_expected(self.val, &tok, &next_token))
551
        }
552
0
    }
553
}
554
555
/// Returns true if `c` is a separator.
///
/// The separators are `(`, `)`, `,`, `:` and the space character.
fn is_separator(c: char) -> bool {
    matches!(c, '(' | ')' | ',' | ':' | ' ')
}
559
560
/// The quote character that delimits a quoted string token
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum QuoteType {
    /// The string is delimited by `"`
    Double,
    /// The string is delimited by `'`
    Single,
}
564
565
/// Splits a string like `Dictionary(Int32, Int64)` into tokens suitable for parsing
///
/// For example the string "Timestamp(ns)" would be parsed into:
///
/// * Token::Timestamp
/// * Token::LParen
/// * Token::TimeUnit(TimeUnit::Nanosecond)
/// * Token::RParen
#[derive(Debug)]
struct Tokenizer<'a> {
    /// The complete input string; passed to `make_error` for error messages
    val: &'a str,
    /// The characters of `val` not yet consumed, with one character of lookahead
    chars: Peekable<Chars<'a>>,
    /// temporary buffer for parsing words
    word: String,
}
580
581
impl<'a> Tokenizer<'a> {
582
0
    fn new(val: &'a str) -> Self {
583
0
        Self {
584
0
            val,
585
0
            chars: val.chars().peekable(),
586
0
            word: String::new(),
587
0
        }
588
0
    }
589
590
    /// returns the next char, without consuming it
591
0
    fn peek_next_char(&mut self) -> Option<char> {
592
0
        self.chars.peek().copied()
593
0
    }
594
595
    /// returns the next char, and consuming it
596
0
    fn next_char(&mut self) -> Option<char> {
597
0
        self.chars.next()
598
0
    }
599
600
    /// parse the characters in val starting at pos, until the next
601
    /// `,`, `(`, or `)` or end of line
602
0
    fn parse_word(&mut self) -> ArrowResult<Token> {
603
        // reset temp space
604
0
        self.word.clear();
605
        loop {
606
0
            match self.peek_next_char() {
607
0
                None => break,
608
0
                Some(c) if is_separator(c) => break,
609
0
                Some(c) => {
610
0
                    self.next_char();
611
0
                    self.word.push(c);
612
0
                }
613
            }
614
        }
615
616
0
        if let Some(c) = self.word.chars().next() {
617
            // if it started with a number, try parsing it as an integer
618
0
            if c == '-' || c.is_numeric() {
619
0
                let val: i64 = self.word.parse().map_err(|e| {
620
0
                    make_error(self.val, &format!("parsing {} as integer: {e}", self.word))
621
0
                })?;
622
0
                return Ok(Token::Integer(val));
623
0
            }
624
0
        }
625
626
        // figure out what the word was
627
0
        let token = match self.word.as_str() {
628
0
            "Null" => Token::SimpleType(DataType::Null),
629
0
            "Boolean" => Token::SimpleType(DataType::Boolean),
630
631
0
            "Int8" => Token::SimpleType(DataType::Int8),
632
0
            "Int16" => Token::SimpleType(DataType::Int16),
633
0
            "Int32" => Token::SimpleType(DataType::Int32),
634
0
            "Int64" => Token::SimpleType(DataType::Int64),
635
636
0
            "UInt8" => Token::SimpleType(DataType::UInt8),
637
0
            "UInt16" => Token::SimpleType(DataType::UInt16),
638
0
            "UInt32" => Token::SimpleType(DataType::UInt32),
639
0
            "UInt64" => Token::SimpleType(DataType::UInt64),
640
641
0
            "Utf8" => Token::SimpleType(DataType::Utf8),
642
0
            "LargeUtf8" => Token::SimpleType(DataType::LargeUtf8),
643
0
            "Utf8View" => Token::SimpleType(DataType::Utf8View),
644
0
            "Binary" => Token::SimpleType(DataType::Binary),
645
0
            "BinaryView" => Token::SimpleType(DataType::BinaryView),
646
0
            "LargeBinary" => Token::SimpleType(DataType::LargeBinary),
647
648
0
            "Float16" => Token::SimpleType(DataType::Float16),
649
0
            "Float32" => Token::SimpleType(DataType::Float32),
650
0
            "Float64" => Token::SimpleType(DataType::Float64),
651
652
0
            "Date32" => Token::SimpleType(DataType::Date32),
653
0
            "Date64" => Token::SimpleType(DataType::Date64),
654
655
0
            "List" => Token::List,
656
0
            "ListView" => Token::ListView,
657
0
            "LargeList" => Token::LargeList,
658
0
            "LargeListView" => Token::LargeListView,
659
0
            "FixedSizeList" => Token::FixedSizeList,
660
661
0
            "s" | "Second" => Token::TimeUnit(TimeUnit::Second),
662
0
            "ms" | "Millisecond" => Token::TimeUnit(TimeUnit::Millisecond),
663
0
            "µs" | "us" | "Microsecond" => Token::TimeUnit(TimeUnit::Microsecond),
664
0
            "ns" | "Nanosecond" => Token::TimeUnit(TimeUnit::Nanosecond),
665
666
0
            "Timestamp" => Token::Timestamp,
667
0
            "Time32" => Token::Time32,
668
0
            "Time64" => Token::Time64,
669
0
            "Duration" => Token::Duration,
670
0
            "Interval" => Token::Interval,
671
0
            "Dictionary" => Token::Dictionary,
672
673
0
            "FixedSizeBinary" => Token::FixedSizeBinary,
674
675
0
            "Decimal32" => Token::Decimal32,
676
0
            "Decimal64" => Token::Decimal64,
677
0
            "Decimal128" => Token::Decimal128,
678
0
            "Decimal256" => Token::Decimal256,
679
680
0
            "YearMonth" => Token::IntervalUnit(IntervalUnit::YearMonth),
681
0
            "DayTime" => Token::IntervalUnit(IntervalUnit::DayTime),
682
0
            "MonthDayNano" => Token::IntervalUnit(IntervalUnit::MonthDayNano),
683
684
0
            "Some" => Token::Some,
685
0
            "None" => Token::None,
686
687
0
            "nullable" => Token::Nullable,
688
0
            "field" => Token::Field,
689
0
            "x" => Token::X,
690
691
0
            "Struct" => Token::Struct,
692
693
0
            "Union" => Token::Union,
694
0
            "Sparse" => Token::UnionMode(UnionMode::Sparse),
695
0
            "Dense" => Token::UnionMode(UnionMode::Dense),
696
697
0
            "Map" => Token::Map,
698
0
            "sorted" => Token::MapSorted(true),
699
0
            "unsorted" => Token::MapSorted(false),
700
701
0
            "RunEndEncoded" => Token::RunEndEncoded,
702
703
0
            token => {
704
0
                return Err(make_error(self.val, &format!("unknown token: {token}")));
705
            }
706
        };
707
0
        Ok(token)
708
0
    }
709
710
    /// Parses e.g. `"foo bar"`, `'foo bar'`
711
0
    fn parse_quoted_string(&mut self, quote_type: QuoteType) -> ArrowResult<Token> {
712
0
        let quote = match quote_type {
713
0
            QuoteType::Double => '\"',
714
0
            QuoteType::Single => '\'',
715
        };
716
717
0
        if self.next_char() != Some(quote) {
718
0
            return Err(make_error(self.val, "Expected \""));
719
0
        }
720
721
        // reset temp space
722
0
        self.word.clear();
723
724
0
        let mut is_escaped = false;
725
726
        loop {
727
0
            match self.next_char() {
728
                None => {
729
0
                    return Err(ArrowError::ParseError(format!(
730
0
                        "Unterminated string at: \"{}",
731
0
                        self.word
732
0
                    )));
733
                }
734
0
                Some(c) => match c {
735
0
                    '\\' => {
736
0
                        is_escaped = true;
737
0
                        self.word.push(c);
738
0
                    }
739
0
                    c if c == quote => {
740
0
                        if is_escaped {
741
0
                            self.word.push(c);
742
0
                            is_escaped = false;
743
0
                        } else {
744
0
                            break;
745
                        }
746
                    }
747
0
                    c => {
748
0
                        self.word.push(c);
749
0
                    }
750
                },
751
            }
752
        }
753
754
0
        let val: String = self.word.parse().map_err(|err| {
755
0
            ArrowError::ParseError(format!("Failed to parse string: \"{}\": {err}", self.word))
756
0
        })?;
757
758
0
        if val.is_empty() {
759
            // Using empty strings as field names is just asking for trouble
760
0
            return Err(make_error(self.val, "empty strings aren't allowed"));
761
0
        }
762
763
0
        match quote_type {
764
0
            QuoteType::Double => Ok(Token::DoubleQuotedString(val)),
765
0
            QuoteType::Single => Ok(Token::SingleQuotedString(val)),
766
        }
767
0
    }
768
}
769
770
impl Iterator for Tokenizer<'_> {
771
    type Item = ArrowResult<Token>;
772
773
0
    fn next(&mut self) -> Option<Self::Item> {
774
        loop {
775
0
            match self.peek_next_char()? {
776
                ' ' => {
777
                    // skip whitespace
778
0
                    self.next_char();
779
0
                    continue;
780
                }
781
                '"' => {
782
0
                    return Some(self.parse_quoted_string(QuoteType::Double));
783
                }
784
                '\'' => {
785
0
                    return Some(self.parse_quoted_string(QuoteType::Single));
786
                }
787
                '(' => {
788
0
                    self.next_char();
789
0
                    return Some(Ok(Token::LParen));
790
                }
791
                ')' => {
792
0
                    self.next_char();
793
0
                    return Some(Ok(Token::RParen));
794
                }
795
                ',' => {
796
0
                    self.next_char();
797
0
                    return Some(Ok(Token::Comma));
798
                }
799
                ':' => {
800
0
                    self.next_char();
801
0
                    return Some(Ok(Token::Colon));
802
                }
803
0
                _ => return Some(self.parse_word()),
804
            }
805
        }
806
0
    }
807
}
808
809
/// Grammar is
810
///
811
#[derive(Debug, PartialEq)]
812
enum Token {
813
    // Null, or Int32
814
    SimpleType(DataType),
815
    Timestamp,
816
    Time32,
817
    Time64,
818
    Duration,
819
    Interval,
820
    FixedSizeBinary,
821
    Decimal32,
822
    Decimal64,
823
    Decimal128,
824
    Decimal256,
825
    Dictionary,
826
    TimeUnit(TimeUnit),
827
    IntervalUnit(IntervalUnit),
828
    LParen,
829
    RParen,
830
    Comma,
831
    Colon,
832
    Some,
833
    None,
834
    Integer(i64),
835
    DoubleQuotedString(String),
836
    SingleQuotedString(String),
837
    List,
838
    ListView,
839
    LargeList,
840
    LargeListView,
841
    FixedSizeList,
842
    Struct,
843
    Union,
844
    UnionMode(UnionMode),
845
    Map,
846
    MapSorted(bool),
847
    RunEndEncoded,
848
    Nullable,
849
    Field,
850
    X,
851
}
852
853
impl Display for Token {
854
0
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
855
0
        match self {
856
0
            Token::SimpleType(t) => write!(f, "{t}"),
857
0
            Token::List => write!(f, "List"),
858
0
            Token::ListView => write!(f, "ListView"),
859
0
            Token::LargeList => write!(f, "LargeList"),
860
0
            Token::LargeListView => write!(f, "LargeListView"),
861
0
            Token::FixedSizeList => write!(f, "FixedSizeList"),
862
0
            Token::Timestamp => write!(f, "Timestamp"),
863
0
            Token::Time32 => write!(f, "Time32"),
864
0
            Token::Time64 => write!(f, "Time64"),
865
0
            Token::Duration => write!(f, "Duration"),
866
0
            Token::Interval => write!(f, "Interval"),
867
0
            Token::TimeUnit(u) => write!(f, "TimeUnit({u:?})"),
868
0
            Token::IntervalUnit(u) => write!(f, "IntervalUnit({u:?})"),
869
0
            Token::LParen => write!(f, "("),
870
0
            Token::RParen => write!(f, ")"),
871
0
            Token::Comma => write!(f, ","),
872
0
            Token::Colon => write!(f, ":"),
873
0
            Token::Some => write!(f, "Some"),
874
0
            Token::None => write!(f, "None"),
875
0
            Token::FixedSizeBinary => write!(f, "FixedSizeBinary"),
876
0
            Token::Decimal32 => write!(f, "Decimal32"),
877
0
            Token::Decimal64 => write!(f, "Decimal64"),
878
0
            Token::Decimal128 => write!(f, "Decimal128"),
879
0
            Token::Decimal256 => write!(f, "Decimal256"),
880
0
            Token::Dictionary => write!(f, "Dictionary"),
881
0
            Token::Integer(v) => write!(f, "Integer({v})"),
882
0
            Token::DoubleQuotedString(s) => write!(f, "DoubleQuotedString({s})"),
883
0
            Token::SingleQuotedString(s) => write!(f, "SingleQuotedString({s})"),
884
0
            Token::Struct => write!(f, "Struct"),
885
0
            Token::Union => write!(f, "Union"),
886
0
            Token::UnionMode(m) => write!(f, "{m:?}"),
887
0
            Token::Map => write!(f, "Map"),
888
0
            Token::MapSorted(sorted) => {
889
0
                write!(f, "{}", if *sorted { "sorted" } else { "unsorted" })
890
            }
891
0
            Token::RunEndEncoded => write!(f, "RunEndEncoded"),
892
0
            Token::Nullable => write!(f, "nullable"),
893
0
            Token::Field => write!(f, "field"),
894
0
            Token::X => write!(f, "x"),
895
        }
896
0
    }
897
}
898
899
#[cfg(test)]
900
mod test {
901
    use super::*;
902
903
    #[test]
    fn test_parse_data_type() {
        // every listed type must be parsed back correctly from its string representation
        list_datatypes().into_iter().for_each(round_trip);
    }
910
911
    /// Ensure we converting data_type to a string, and then parse it as a type
912
    /// verifying it is the same
913
    fn round_trip(data_type: DataType) {
914
        let data_type_string = data_type.to_string();
915
        println!("Input '{data_type_string}' ({data_type:?})");
916
        let parsed_type = parse_data_type(&data_type_string).unwrap();
917
        assert_eq!(
918
            data_type, parsed_type,
919
            "Mismatch parsing {data_type_string}"
920
        );
921
    }
922
923
    fn list_datatypes() -> Vec<DataType> {
924
        vec![
925
            // ---------
926
            // Non Nested types
927
            // ---------
928
            DataType::Null,
929
            DataType::Boolean,
930
            DataType::Int8,
931
            DataType::Int16,
932
            DataType::Int32,
933
            DataType::Int64,
934
            DataType::UInt8,
935
            DataType::UInt16,
936
            DataType::UInt32,
937
            DataType::UInt64,
938
            DataType::Float16,
939
            DataType::Float32,
940
            DataType::Float64,
941
            DataType::Timestamp(TimeUnit::Second, None),
942
            DataType::Timestamp(TimeUnit::Millisecond, None),
943
            DataType::Timestamp(TimeUnit::Microsecond, None),
944
            DataType::Timestamp(TimeUnit::Nanosecond, None),
945
            // we can't cover all possible timezones, here we only test utc and +08:00
946
            DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".into())),
947
            DataType::Timestamp(TimeUnit::Microsecond, Some("+00:00".into())),
948
            DataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".into())),
949
            DataType::Timestamp(TimeUnit::Second, Some("+00:00".into())),
950
            DataType::Timestamp(TimeUnit::Nanosecond, Some("+08:00".into())),
951
            DataType::Timestamp(TimeUnit::Microsecond, Some("+08:00".into())),
952
            DataType::Timestamp(TimeUnit::Millisecond, Some("+08:00".into())),
953
            DataType::Timestamp(TimeUnit::Second, Some("+08:00".into())),
954
            DataType::Date32,
955
            DataType::Date64,
956
            DataType::Time32(TimeUnit::Second),
957
            DataType::Time32(TimeUnit::Millisecond),
958
            DataType::Time32(TimeUnit::Microsecond),
959
            DataType::Time32(TimeUnit::Nanosecond),
960
            DataType::Time64(TimeUnit::Second),
961
            DataType::Time64(TimeUnit::Millisecond),
962
            DataType::Time64(TimeUnit::Microsecond),
963
            DataType::Time64(TimeUnit::Nanosecond),
964
            DataType::Duration(TimeUnit::Second),
965
            DataType::Duration(TimeUnit::Millisecond),
966
            DataType::Duration(TimeUnit::Microsecond),
967
            DataType::Duration(TimeUnit::Nanosecond),
968
            DataType::Interval(IntervalUnit::YearMonth),
969
            DataType::Interval(IntervalUnit::DayTime),
970
            DataType::Interval(IntervalUnit::MonthDayNano),
971
            DataType::Binary,
972
            DataType::BinaryView,
973
            DataType::FixedSizeBinary(0),
974
            DataType::FixedSizeBinary(1234),
975
            DataType::FixedSizeBinary(-432),
976
            DataType::LargeBinary,
977
            DataType::Utf8,
978
            DataType::Utf8View,
979
            DataType::LargeUtf8,
980
            DataType::Decimal32(7, 8),
981
            DataType::Decimal64(6, 9),
982
            DataType::Decimal128(7, 12),
983
            DataType::Decimal256(6, 13),
984
            // ---------
985
            // Nested types
986
            // ---------
987
            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
988
            DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)),
989
            DataType::Dictionary(
990
                Box::new(DataType::Int8),
991
                Box::new(DataType::Timestamp(TimeUnit::Nanosecond, None)),
992
            ),
993
            DataType::Dictionary(
994
                Box::new(DataType::Int8),
995
                Box::new(DataType::FixedSizeBinary(23)),
996
            ),
997
            DataType::Dictionary(
998
                Box::new(DataType::Int8),
999
                Box::new(
1000
                    // nested dictionaries are probably a bad idea but they are possible
1001
                    DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)),
1002
                ),
1003
            ),
1004
            DataType::Struct(Fields::from(vec![
1005
                Field::new("f1", DataType::Int64, true),
1006
                Field::new("f2", DataType::Float64, true),
1007
                Field::new(
1008
                    "f3",
1009
                    DataType::Timestamp(TimeUnit::Second, Some("+08:00".into())),
1010
                    true,
1011
                ),
1012
                Field::new(
1013
                    "f4",
1014
                    DataType::Dictionary(
1015
                        Box::new(DataType::Int8),
1016
                        Box::new(DataType::FixedSizeBinary(23)),
1017
                    ),
1018
                    true,
1019
                ),
1020
            ])),
1021
            DataType::Struct(Fields::from(vec![
1022
                Field::new("Int64", DataType::Int64, true),
1023
                Field::new("Float64", DataType::Float64, true),
1024
            ])),
1025
            DataType::Struct(Fields::from(vec![
1026
                Field::new("f1", DataType::Int64, true),
1027
                Field::new(
1028
                    "nested_struct",
1029
                    DataType::Struct(Fields::from(vec![Field::new("n1", DataType::Int64, true)])),
1030
                    true,
1031
                ),
1032
            ])),
1033
            DataType::Struct(Fields::from(vec![Field::new("f1", DataType::Int64, true)])),
1034
            DataType::Struct(Fields::empty()),
1035
            DataType::List(Arc::new(Field::new_list_field(DataType::Int64, true))),
1036
            DataType::List(Arc::new(Field::new_list_field(DataType::Int64, false))),
1037
            DataType::List(Arc::new(Field::new("Int64", DataType::Int64, true))),
1038
            DataType::List(Arc::new(Field::new("Int64", DataType::Int64, false))),
1039
            DataType::List(Arc::new(Field::new(
1040
                "nested_list",
1041
                DataType::List(Arc::new(Field::new("Int64", DataType::Int64, true))),
1042
                true,
1043
            ))),
1044
            DataType::ListView(Arc::new(Field::new_list_field(DataType::Int64, true))),
1045
            DataType::ListView(Arc::new(Field::new_list_field(DataType::Int64, false))),
1046
            DataType::ListView(Arc::new(Field::new("Int64", DataType::Int64, true))),
1047
            DataType::ListView(Arc::new(Field::new("Int64", DataType::Int64, false))),
1048
            DataType::ListView(Arc::new(Field::new(
1049
                "nested_list_view",
1050
                DataType::ListView(Arc::new(Field::new("Int64", DataType::Int64, true))),
1051
                true,
1052
            ))),
1053
            DataType::LargeList(Arc::new(Field::new_list_field(DataType::Int64, true))),
1054
            DataType::LargeList(Arc::new(Field::new_list_field(DataType::Int64, false))),
1055
            DataType::LargeList(Arc::new(Field::new("Int64", DataType::Int64, true))),
1056
            DataType::LargeList(Arc::new(Field::new("Int64", DataType::Int64, false))),
1057
            DataType::LargeList(Arc::new(Field::new(
1058
                "nested_large_list",
1059
                DataType::LargeList(Arc::new(Field::new("Int64", DataType::Int64, true))),
1060
                true,
1061
            ))),
1062
            DataType::LargeListView(Arc::new(Field::new_list_field(DataType::Int64, true))),
1063
            DataType::LargeListView(Arc::new(Field::new_list_field(DataType::Int64, false))),
1064
            DataType::LargeListView(Arc::new(Field::new("Int64", DataType::Int64, true))),
1065
            DataType::LargeListView(Arc::new(Field::new("Int64", DataType::Int64, false))),
1066
            DataType::LargeListView(Arc::new(Field::new(
1067
                "nested_large_list_view",
1068
                DataType::LargeListView(Arc::new(Field::new("Int64", DataType::Int64, true))),
1069
                true,
1070
            ))),
1071
            DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int64, true)), 2),
1072
            DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int64, false)), 2),
1073
            DataType::FixedSizeList(Arc::new(Field::new("Int64", DataType::Int64, true)), 2),
1074
            DataType::FixedSizeList(Arc::new(Field::new("Int64", DataType::Int64, false)), 2),
1075
            DataType::FixedSizeList(
1076
                Arc::new(Field::new(
1077
                    "nested_fixed_size_list",
1078
                    DataType::FixedSizeList(
1079
                        Arc::new(Field::new("Int64", DataType::Int64, true)),
1080
                        2,
1081
                    ),
1082
                    true,
1083
                )),
1084
                2,
1085
            ),
1086
            DataType::Union(
1087
                UnionFields::new(
1088
                    vec![0, 1],
1089
                    vec![
1090
                        Field::new("Int32", DataType::Int32, false),
1091
                        Field::new("Utf8", DataType::Utf8, true),
1092
                    ],
1093
                ),
1094
                UnionMode::Sparse,
1095
            ),
1096
            DataType::Union(
1097
                UnionFields::new(
1098
                    vec![0, 1],
1099
                    vec![
1100
                        Field::new("Int32", DataType::Int32, false),
1101
                        Field::new("Utf8", DataType::Utf8, true),
1102
                    ],
1103
                ),
1104
                UnionMode::Dense,
1105
            ),
1106
            DataType::Union(
1107
                UnionFields::new(
1108
                    vec![0, 1],
1109
                    vec![
1110
                        Field::new_union(
1111
                            "nested_union",
1112
                            vec![0, 1],
1113
                            vec![
1114
                                Field::new("Int32", DataType::Int32, false),
1115
                                Field::new("Utf8", DataType::Utf8, true),
1116
                            ],
1117
                            UnionMode::Dense,
1118
                        ),
1119
                        Field::new("Utf8", DataType::Utf8, true),
1120
                    ],
1121
                ),
1122
                UnionMode::Sparse,
1123
            ),
1124
            DataType::Union(
1125
                UnionFields::new(vec![0], vec![Field::new("Int32", DataType::Int32, false)]),
1126
                UnionMode::Dense,
1127
            ),
1128
            DataType::Union(
1129
                UnionFields::new(Vec::<i8>::new(), Vec::<Field>::new()),
1130
                UnionMode::Sparse,
1131
            ),
1132
            DataType::Map(Arc::new(Field::new("Int64", DataType::Int64, true)), true),
1133
            DataType::Map(Arc::new(Field::new("Int64", DataType::Int64, true)), false),
1134
            DataType::Map(
1135
                Arc::new(Field::new_map(
1136
                    "nested_map",
1137
                    "entries",
1138
                    Field::new("key", DataType::Utf8, false),
1139
                    Field::new("value", DataType::Int32, true),
1140
                    false,
1141
                    true,
1142
                )),
1143
                true,
1144
            ),
1145
            DataType::RunEndEncoded(
1146
                Arc::new(Field::new("run_ends", DataType::UInt32, false)),
1147
                Arc::new(Field::new("values", DataType::Int32, true)),
1148
            ),
1149
            DataType::RunEndEncoded(
1150
                Arc::new(Field::new(
1151
                    "nested_run_end_encoded",
1152
                    DataType::RunEndEncoded(
1153
                        Arc::new(Field::new("run_ends", DataType::UInt32, false)),
1154
                        Arc::new(Field::new("values", DataType::Int32, true)),
1155
                    ),
1156
                    true,
1157
                )),
1158
                Arc::new(Field::new("values", DataType::Int32, true)),
1159
            ),
1160
        ]
1161
    }
1162
1163
    #[test]
    fn test_parse_data_type_whitespace_tolerance() {
        // Extra spaces before, between and after tokens must not change the result
        let timestamp_ns = || DataType::Timestamp(TimeUnit::Nanosecond, None);

        // (string to parse, expected DataType)
        let cases = [
            ("Int8", DataType::Int8),
            ("Timestamp        (ns)", timestamp_ns()),
            ("Timestamp        (ns)  ", timestamp_ns()),
            (
                "          Timestamp        (ns               )",
                timestamp_ns(),
            ),
            ("Timestamp        (ns               )  ", timestamp_ns()),
        ];

        for (input, expected) in cases {
            let parsed = parse_data_type(input).unwrap();
            assert_eq!(parsed, expected, "Parsing '{input}', expecting '{expected}'");
        }
    }
1194
1195
    /// Checks that type strings in the older formats are still accepted
    #[test]
    fn test_parse_data_type_backwards_compatibility() {
        use DataType::*;
        use IntervalUnit::*;
        use TimeUnit::*;

        // Shorthand for the shapes that recur in the expected values
        let with_tz = |unit: TimeUnit, tz: &str| Timestamp(unit, Some(tz.into()));
        let dictionary = |key: DataType, value: DataType| Dictionary(Box::new(key), Box::new(value));
        let nullable = |name: &str, data_type: DataType| Field::new(name, data_type, true);

        // List below created with:
        // for t in list_datatypes() {
        // println!(r#"("{t}", {t:?}),"#)
        // }
        // (string to parse, expected DataType)
        let cases = [
            ("Timestamp(Nanosecond, None)", Timestamp(Nanosecond, None)),
            ("Timestamp(Microsecond, None)", Timestamp(Microsecond, None)),
            ("Timestamp(Millisecond, None)", Timestamp(Millisecond, None)),
            ("Timestamp(Second, None)", Timestamp(Second, None)),
            ("Timestamp(Nanosecond, None)", Timestamp(Nanosecond, None)),
            // Timezones
            (
                r#"Timestamp(Nanosecond, Some("+00:00"))"#,
                with_tz(Nanosecond, "+00:00"),
            ),
            (
                r#"Timestamp(Microsecond, Some("+00:00"))"#,
                with_tz(Microsecond, "+00:00"),
            ),
            (
                r#"Timestamp(Millisecond, Some("+00:00"))"#,
                with_tz(Millisecond, "+00:00"),
            ),
            (
                r#"Timestamp(Second, Some("+00:00"))"#,
                with_tz(Second, "+00:00"),
            ),
            ("Null", Null),
            ("Boolean", Boolean),
            ("Int8", Int8),
            ("Int16", Int16),
            ("Int32", Int32),
            ("Int64", Int64),
            ("UInt8", UInt8),
            ("UInt16", UInt16),
            ("UInt32", UInt32),
            ("UInt64", UInt64),
            ("Float16", Float16),
            ("Float32", Float32),
            ("Float64", Float64),
            ("Timestamp(s)", Timestamp(Second, None)),
            ("Timestamp(ms)", Timestamp(Millisecond, None)),
            ("Timestamp(µs)", Timestamp(Microsecond, None)),
            ("Timestamp(ns)", Timestamp(Nanosecond, None)),
            (r#"Timestamp(ns, "+00:00")"#, with_tz(Nanosecond, "+00:00")),
            (r#"Timestamp(µs, "+00:00")"#, with_tz(Microsecond, "+00:00")),
            (r#"Timestamp(ms, "+00:00")"#, with_tz(Millisecond, "+00:00")),
            (r#"Timestamp(s, "+00:00")"#, with_tz(Second, "+00:00")),
            (r#"Timestamp(ns, "+08:00")"#, with_tz(Nanosecond, "+08:00")),
            (r#"Timestamp(µs, "+08:00")"#, with_tz(Microsecond, "+08:00")),
            (r#"Timestamp(ms, "+08:00")"#, with_tz(Millisecond, "+08:00")),
            (r#"Timestamp(s, "+08:00")"#, with_tz(Second, "+08:00")),
            ("Date32", Date32),
            ("Date64", Date64),
            ("Time32(s)", Time32(Second)),
            ("Time32(ms)", Time32(Millisecond)),
            ("Time32(µs)", Time32(Microsecond)),
            ("Time32(ns)", Time32(Nanosecond)),
            ("Time64(s)", Time64(Second)),
            ("Time64(ms)", Time64(Millisecond)),
            ("Time64(µs)", Time64(Microsecond)),
            ("Time64(ns)", Time64(Nanosecond)),
            ("Duration(s)", Duration(Second)),
            ("Duration(ms)", Duration(Millisecond)),
            ("Duration(µs)", Duration(Microsecond)),
            ("Duration(ns)", Duration(Nanosecond)),
            ("Interval(YearMonth)", Interval(YearMonth)),
            ("Interval(DayTime)", Interval(DayTime)),
            ("Interval(MonthDayNano)", Interval(MonthDayNano)),
            ("Binary", Binary),
            ("BinaryView", BinaryView),
            ("FixedSizeBinary(0)", FixedSizeBinary(0)),
            ("FixedSizeBinary(1234)", FixedSizeBinary(1234)),
            ("FixedSizeBinary(-432)", FixedSizeBinary(-432)),
            ("LargeBinary", LargeBinary),
            ("Utf8", Utf8),
            ("Utf8View", Utf8View),
            ("LargeUtf8", LargeUtf8),
            ("Decimal32(7, 8)", Decimal32(7, 8)),
            ("Decimal64(6, 9)", Decimal64(6, 9)),
            ("Decimal128(7, 12)", Decimal128(7, 12)),
            ("Decimal256(6, 13)", Decimal256(6, 13)),
            ("Dictionary(Int32, Utf8)", dictionary(Int32, Utf8)),
            ("Dictionary(Int8, Utf8)", dictionary(Int8, Utf8)),
            (
                "Dictionary(Int8, Timestamp(ns))",
                dictionary(Int8, Timestamp(Nanosecond, None)),
            ),
            (
                "Dictionary(Int8, FixedSizeBinary(23))",
                dictionary(Int8, FixedSizeBinary(23)),
            ),
            (
                "Dictionary(Int8, Dictionary(Int8, Utf8))",
                dictionary(Int8, dictionary(Int8, Utf8)),
            ),
            (
                r#"Struct("f1": nullable Int64, "f2": nullable Float64, "f3": nullable Timestamp(s, "+08:00"), "f4": nullable Dictionary(Int8, FixedSizeBinary(23)))"#,
                Struct(Fields::from(vec![
                    nullable("f1", Int64),
                    nullable("f2", Float64),
                    nullable("f3", with_tz(Second, "+08:00")),
                    nullable("f4", dictionary(Int8, FixedSizeBinary(23))),
                ])),
            ),
            (
                r#"Struct("Int64": nullable Int64, "Float64": nullable Float64)"#,
                Struct(Fields::from(vec![
                    nullable("Int64", Int64),
                    nullable("Float64", Float64),
                ])),
            ),
            (
                r#"Struct("f1": nullable Int64, "nested_struct": nullable Struct("n1": nullable Int64))"#,
                Struct(Fields::from(vec![
                    nullable("f1", Int64),
                    nullable(
                        "nested_struct",
                        Struct(Fields::from(vec![nullable("n1", Int64)])),
                    ),
                ])),
            ),
            (r#"Struct()"#, Struct(Fields::empty())),
        ];

        for (input, expected) in cases {
            let parsed = parse_data_type(input).unwrap();
            assert_eq!(parsed, expected, "Parsing '{input}', expecting '{expected}'");
        }
    }
1373
1374
    #[test]
    fn parse_data_type_errors() {
        // Text every error message is expected to carry, except the
        // "Unterminated string" errors
        const HELP: &str =
            "Must be a supported arrow type name such as 'Int32' or 'Timestamp(ns)'";

        // (string to parse, expected error message)
        let cases = [
            ("", "Unsupported type ''"),
            ("", "Error finding next token"),
            ("null", "Unsupported type 'null'"),
            ("Nu", "Unsupported type 'Nu'"),
            (r#"Timestamp(ns, +00:00)"#, "Error unknown token: +00"),
            (
                r#"Timestamp(ns, "+00:00)"#,
                r#"Unterminated string at: "+00:00)"#,
            ),
            (r#"Timestamp(ns, "")"#, r#"empty strings aren't allowed"#),
            (
                r#"Timestamp(ns, "+00:00"")"#,
                r#"Parser error: Unterminated string at: ")"#,
            ),
            ("Timestamp(ns, ", "Error finding next token"),
            (
                "Float32 Float32",
                "trailing content after parsing 'Float32'",
            ),
            ("Int32, ", "trailing content after parsing 'Int32'"),
            ("Int32(3), ", "trailing content after parsing 'Int32'"),
            (
                "FixedSizeBinary(Int32), ",
                "Error finding i64 for FixedSizeBinary, got 'Int32'",
            ),
            (
                "FixedSizeBinary(3.0), ",
                "Error parsing 3.0 as integer: invalid digit found in string",
            ),
            // too large for i32
            (
                "FixedSizeBinary(4000000000), ",
                "Error converting 4000000000 into i32 for FixedSizeBinary: out of range integral type conversion attempted",
            ),
            // can't have negative precision
            (
                "Decimal32(-3, 5)",
                "Error converting -3 into u8 for Decimal32: out of range integral type conversion attempted",
            ),
            (
                "Decimal64(-3, 5)",
                "Error converting -3 into u8 for Decimal64: out of range integral type conversion attempted",
            ),
            (
                "Decimal128(-3, 5)",
                "Error converting -3 into u8 for Decimal128: out of range integral type conversion attempted",
            ),
            (
                "Decimal256(-3, 5)",
                "Error converting -3 into u8 for Decimal256: out of range integral type conversion attempted",
            ),
            (
                "Decimal32(3, 500)",
                "Error converting 500 into i8 for Decimal32: out of range integral type conversion attempted",
            ),
            (
                "Decimal64(3, 500)",
                "Error converting 500 into i8 for Decimal64: out of range integral type conversion attempted",
            ),
            (
                "Decimal128(3, 500)",
                "Error converting 500 into i8 for Decimal128: out of range integral type conversion attempted",
            ),
            (
                "Decimal256(3, 500)",
                "Error converting 500 into i8 for Decimal256: out of range integral type conversion attempted",
            ),
            ("Struct(f1 Int64)", "Error unknown token: f1"),
            ("Struct(\"f1\" Int64)", "Expected ':'"),
            (
                "Struct(\"f1\": )",
                "Error finding next type, got unexpected ')'",
            ),
        ];

        for (input, expected_fragment) in cases {
            println!("Parsing '{input}', expecting '{expected_fragment}'");

            // Parsing must fail; keep only the rendered error
            let message = match parse_data_type(input) {
                Ok(d) => panic!("Expected error while parsing '{input}', but got '{d}'"),
                Err(e) => e.to_string(),
            };

            assert!(
                message.contains(expected_fragment),
                "\n\ndid not find expected in actual.\n\nexpected: {expected_fragment}\nactual: {message}\n"
            );

            // unterminated string errors are the only ones without the help message
            if message.contains("Unterminated string") {
                continue;
            }
            assert!(message.contains(HELP), "message: {message}");
        }
    }
1472
1473
    #[test]
    fn parse_error_type() {
        // An unknown type name must surface as a `ParseError` ...
        let failure = parse_data_type("foobar").unwrap_err();
        assert!(matches!(failure, ArrowError::ParseError(_)));

        // ... whose rendered form names the input, the help text and the offending token
        let rendered = failure.to_string();
        assert_eq!(
            rendered,
            "Parser error: Unsupported type 'foobar'. Must be a supported arrow type name such as 'Int32' or 'Timestamp(ns)'. Error unknown token: foobar"
        );
    }
1482
}