Coverage Report

Created: 2025-08-26 07:03

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/Users/andrewlamb/Software/arrow-rs/arrow-schema/src/datatype_parse.rs
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
use std::{fmt::Display, iter::Peekable, str::Chars, sync::Arc};
19
20
use crate::{ArrowError, DataType, Field, Fields, IntervalUnit, TimeUnit};
21
22
0
pub(crate) fn parse_data_type(val: &str) -> ArrowResult<DataType> {
23
0
    Parser::new(val).parse()
24
0
}
25
26
type ArrowResult<T> = Result<T, ArrowError>;
27
28
0
fn make_error(val: &str, msg: &str) -> ArrowError {
29
0
    let msg = format!("Unsupported type '{val}'. Must be a supported arrow type name such as 'Int32' or 'Timestamp(Nanosecond, None)'. Error {msg}" );
30
0
    ArrowError::ParseError(msg)
31
0
}
32
33
0
fn make_error_expected(val: &str, expected: &Token, actual: &Token) -> ArrowError {
34
0
    make_error(val, &format!("Expected '{expected}', got '{actual}'"))
35
0
}
36
37
#[derive(Debug)]
38
/// Implementation of `parse_data_type`, modeled after <https://github.com/sqlparser-rs/sqlparser-rs>
39
struct Parser<'a> {
40
    val: &'a str,
41
    tokenizer: Tokenizer<'a>,
42
}
43
44
impl<'a> Parser<'a> {
45
0
    fn new(val: &'a str) -> Self {
46
0
        Self {
47
0
            val,
48
0
            tokenizer: Tokenizer::new(val),
49
0
        }
50
0
    }
51
52
0
    fn parse(mut self) -> ArrowResult<DataType> {
53
0
        let data_type = self.parse_next_type()?;
54
        // ensure that there is no trailing content
55
0
        if self.tokenizer.next().is_some() {
56
0
            Err(make_error(
57
0
                self.val,
58
0
                &format!("checking trailing content after parsing '{data_type}'"),
59
0
            ))
60
        } else {
61
0
            Ok(data_type)
62
        }
63
0
    }
64
65
    /// parses the next full DataType
66
0
    fn parse_next_type(&mut self) -> ArrowResult<DataType> {
67
0
        match self.next_token()? {
68
0
            Token::SimpleType(data_type) => Ok(data_type),
69
0
            Token::Timestamp => self.parse_timestamp(),
70
0
            Token::Time32 => self.parse_time32(),
71
0
            Token::Time64 => self.parse_time64(),
72
0
            Token::Duration => self.parse_duration(),
73
0
            Token::Interval => self.parse_interval(),
74
0
            Token::FixedSizeBinary => self.parse_fixed_size_binary(),
75
0
            Token::Decimal32 => self.parse_decimal_32(),
76
0
            Token::Decimal64 => self.parse_decimal_64(),
77
0
            Token::Decimal128 => self.parse_decimal_128(),
78
0
            Token::Decimal256 => self.parse_decimal_256(),
79
0
            Token::Dictionary => self.parse_dictionary(),
80
0
            Token::List => self.parse_list(),
81
0
            Token::LargeList => self.parse_large_list(),
82
0
            Token::FixedSizeList => self.parse_fixed_size_list(),
83
0
            Token::Struct => self.parse_struct(),
84
0
            Token::FieldName(word) => {
85
0
                Err(make_error(self.val, &format!("unrecognized word: {word}")))
86
            }
87
0
            tok => Err(make_error(
88
0
                self.val,
89
0
                &format!("finding next type, got unexpected '{tok}'"),
90
0
            )),
91
        }
92
0
    }
93
94
    /// Parses the List type
95
0
    fn parse_list(&mut self) -> ArrowResult<DataType> {
96
0
        self.expect_token(Token::LParen)?;
97
0
        let data_type = self.parse_next_type()?;
98
0
        self.expect_token(Token::RParen)?;
99
0
        Ok(DataType::List(Arc::new(Field::new_list_field(
100
0
            data_type, true,
101
0
        ))))
102
0
    }
103
104
    /// Parses the LargeList type
105
0
    fn parse_large_list(&mut self) -> ArrowResult<DataType> {
106
0
        self.expect_token(Token::LParen)?;
107
0
        let data_type = self.parse_next_type()?;
108
0
        self.expect_token(Token::RParen)?;
109
0
        Ok(DataType::LargeList(Arc::new(Field::new_list_field(
110
0
            data_type, true,
111
0
        ))))
112
0
    }
113
114
    /// Parses the FixedSizeList type
115
0
    fn parse_fixed_size_list(&mut self) -> ArrowResult<DataType> {
116
0
        self.expect_token(Token::LParen)?;
117
0
        let length = self.parse_i32("FixedSizeList")?;
118
0
        self.expect_token(Token::Comma)?;
119
0
        let data_type = self.parse_next_type()?;
120
0
        self.expect_token(Token::RParen)?;
121
0
        Ok(DataType::FixedSizeList(
122
0
            Arc::new(Field::new_list_field(data_type, true)),
123
0
            length,
124
0
        ))
125
0
    }
126
127
    /// Parses the next timeunit
128
0
    fn parse_time_unit(&mut self, context: &str) -> ArrowResult<TimeUnit> {
129
0
        match self.next_token()? {
130
0
            Token::TimeUnit(time_unit) => Ok(time_unit),
131
0
            tok => Err(make_error(
132
0
                self.val,
133
0
                &format!("finding TimeUnit for {context}, got {tok}"),
134
0
            )),
135
        }
136
0
    }
137
138
    /// Parses the next timezone
139
0
    fn parse_timezone(&mut self, context: &str) -> ArrowResult<Option<String>> {
140
0
        match self.next_token()? {
141
0
            Token::None => Ok(None),
142
            Token::Some => {
143
0
                self.expect_token(Token::LParen)?;
144
0
                let timezone = self.parse_double_quoted_string("Timezone")?;
145
0
                self.expect_token(Token::RParen)?;
146
0
                Ok(Some(timezone))
147
            }
148
0
            tok => Err(make_error(
149
0
                self.val,
150
0
                &format!("finding Timezone for {context}, got {tok}"),
151
0
            )),
152
        }
153
0
    }
154
155
    /// Parses the next double quoted string
156
0
    fn parse_double_quoted_string(&mut self, context: &str) -> ArrowResult<String> {
157
0
        match self.next_token()? {
158
0
            Token::DoubleQuotedString(s) => Ok(s),
159
0
            Token::FieldName(word) => {
160
0
                Err(make_error(self.val, &format!("unrecognized word: {word}")))
161
            }
162
0
            tok => Err(make_error(
163
0
                self.val,
164
0
                &format!("finding double quoted string for {context}, got '{tok}'"),
165
0
            )),
166
        }
167
0
    }
168
169
    /// Parses the next integer value
170
0
    fn parse_i64(&mut self, context: &str) -> ArrowResult<i64> {
171
0
        match self.next_token()? {
172
0
            Token::Integer(v) => Ok(v),
173
0
            tok => Err(make_error(
174
0
                self.val,
175
0
                &format!("finding i64 for {context}, got '{tok}'"),
176
0
            )),
177
        }
178
0
    }
179
180
    /// Parses the next i32 integer value
181
0
    fn parse_i32(&mut self, context: &str) -> ArrowResult<i32> {
182
0
        let length = self.parse_i64(context)?;
183
0
        length.try_into().map_err(|e| {
184
0
            make_error(
185
0
                self.val,
186
0
                &format!("converting {length} into i32 for {context}: {e}"),
187
            )
188
0
        })
189
0
    }
190
191
    /// Parses the next i8 integer value
192
0
    fn parse_i8(&mut self, context: &str) -> ArrowResult<i8> {
193
0
        let length = self.parse_i64(context)?;
194
0
        length.try_into().map_err(|e| {
195
0
            make_error(
196
0
                self.val,
197
0
                &format!("converting {length} into i8 for {context}: {e}"),
198
            )
199
0
        })
200
0
    }
201
202
    /// Parses the next u8 integer value
203
0
    fn parse_u8(&mut self, context: &str) -> ArrowResult<u8> {
204
0
        let length = self.parse_i64(context)?;
205
0
        length.try_into().map_err(|e| {
206
0
            make_error(
207
0
                self.val,
208
0
                &format!("converting {length} into u8 for {context}: {e}"),
209
            )
210
0
        })
211
0
    }
212
213
    /// Parses the next timestamp (called after `Timestamp` has been consumed)
214
0
    fn parse_timestamp(&mut self) -> ArrowResult<DataType> {
215
0
        self.expect_token(Token::LParen)?;
216
0
        let time_unit = self.parse_time_unit("Timestamp")?;
217
0
        self.expect_token(Token::Comma)?;
218
0
        let timezone = self.parse_timezone("Timestamp")?;
219
0
        self.expect_token(Token::RParen)?;
220
0
        Ok(DataType::Timestamp(time_unit, timezone.map(Into::into)))
221
0
    }
222
223
    /// Parses the next Time32 (called after `Time32` has been consumed)
224
0
    fn parse_time32(&mut self) -> ArrowResult<DataType> {
225
0
        self.expect_token(Token::LParen)?;
226
0
        let time_unit = self.parse_time_unit("Time32")?;
227
0
        self.expect_token(Token::RParen)?;
228
0
        Ok(DataType::Time32(time_unit))
229
0
    }
230
231
    /// Parses the next Time64 (called after `Time64` has been consumed)
232
0
    fn parse_time64(&mut self) -> ArrowResult<DataType> {
233
0
        self.expect_token(Token::LParen)?;
234
0
        let time_unit = self.parse_time_unit("Time64")?;
235
0
        self.expect_token(Token::RParen)?;
236
0
        Ok(DataType::Time64(time_unit))
237
0
    }
238
239
    /// Parses the next Duration (called after `Duration` has been consumed)
240
0
    fn parse_duration(&mut self) -> ArrowResult<DataType> {
241
0
        self.expect_token(Token::LParen)?;
242
0
        let time_unit = self.parse_time_unit("Duration")?;
243
0
        self.expect_token(Token::RParen)?;
244
0
        Ok(DataType::Duration(time_unit))
245
0
    }
246
247
    /// Parses the next Interval (called after `Interval` has been consumed)
248
0
    fn parse_interval(&mut self) -> ArrowResult<DataType> {
249
0
        self.expect_token(Token::LParen)?;
250
0
        let interval_unit = match self.next_token()? {
251
0
            Token::IntervalUnit(interval_unit) => interval_unit,
252
0
            tok => {
253
0
                return Err(make_error(
254
0
                    self.val,
255
0
                    &format!("finding IntervalUnit for Interval, got {tok}"),
256
0
                ))
257
            }
258
        };
259
0
        self.expect_token(Token::RParen)?;
260
0
        Ok(DataType::Interval(interval_unit))
261
0
    }
262
263
    /// Parses the next FixedSizeBinary (called after `FixedSizeBinary` has been consumed)
264
0
    fn parse_fixed_size_binary(&mut self) -> ArrowResult<DataType> {
265
0
        self.expect_token(Token::LParen)?;
266
0
        let length = self.parse_i32("FixedSizeBinary")?;
267
0
        self.expect_token(Token::RParen)?;
268
0
        Ok(DataType::FixedSizeBinary(length))
269
0
    }
270
271
    /// Parses the next Decimal32 (called after `Decimal32` has been consumed)
272
0
    fn parse_decimal_32(&mut self) -> ArrowResult<DataType> {
273
0
        self.expect_token(Token::LParen)?;
274
0
        let precision = self.parse_u8("Decimal32")?;
275
0
        self.expect_token(Token::Comma)?;
276
0
        let scale = self.parse_i8("Decimal32")?;
277
0
        self.expect_token(Token::RParen)?;
278
0
        Ok(DataType::Decimal32(precision, scale))
279
0
    }
280
281
    /// Parses the next Decimal64 (called after `Decimal64` has been consumed)
282
0
    fn parse_decimal_64(&mut self) -> ArrowResult<DataType> {
283
0
        self.expect_token(Token::LParen)?;
284
0
        let precision = self.parse_u8("Decimal64")?;
285
0
        self.expect_token(Token::Comma)?;
286
0
        let scale = self.parse_i8("Decimal64")?;
287
0
        self.expect_token(Token::RParen)?;
288
0
        Ok(DataType::Decimal64(precision, scale))
289
0
    }
290
291
    /// Parses the next Decimal128 (called after `Decimal128` has been consumed)
292
0
    fn parse_decimal_128(&mut self) -> ArrowResult<DataType> {
293
0
        self.expect_token(Token::LParen)?;
294
0
        let precision = self.parse_u8("Decimal128")?;
295
0
        self.expect_token(Token::Comma)?;
296
0
        let scale = self.parse_i8("Decimal128")?;
297
0
        self.expect_token(Token::RParen)?;
298
0
        Ok(DataType::Decimal128(precision, scale))
299
0
    }
300
301
    /// Parses the next Decimal256 (called after `Decimal256` has been consumed)
302
0
    fn parse_decimal_256(&mut self) -> ArrowResult<DataType> {
303
0
        self.expect_token(Token::LParen)?;
304
0
        let precision = self.parse_u8("Decimal256")?;
305
0
        self.expect_token(Token::Comma)?;
306
0
        let scale = self.parse_i8("Decimal256")?;
307
0
        self.expect_token(Token::RParen)?;
308
0
        Ok(DataType::Decimal256(precision, scale))
309
0
    }
310
311
    /// Parses the next Dictionary (called after `Dictionary` has been consumed)
312
0
    fn parse_dictionary(&mut self) -> ArrowResult<DataType> {
313
0
        self.expect_token(Token::LParen)?;
314
0
        let key_type = self.parse_next_type()?;
315
0
        self.expect_token(Token::Comma)?;
316
0
        let value_type = self.parse_next_type()?;
317
0
        self.expect_token(Token::RParen)?;
318
0
        Ok(DataType::Dictionary(
319
0
            Box::new(key_type),
320
0
            Box::new(value_type),
321
0
        ))
322
0
    }
323
0
    fn parse_struct(&mut self) -> ArrowResult<DataType> {
324
0
        self.expect_token(Token::LParen)?;
325
0
        let mut fields = Vec::new();
326
        loop {
327
0
            let field_name = match self.next_token()? {
328
                // It's valid to have a name that is a type name
329
0
                Token::SimpleType(data_type) => data_type.to_string(),
330
0
                Token::FieldName(name) => name,
331
                Token::RParen => {
332
0
                    if fields.is_empty() {
333
0
                        break;
334
                    } else {
335
0
                        return Err(make_error(
336
0
                            self.val,
337
0
                            "Unexpected token while parsing Struct fields. Expected a word for the name of Struct, but got trailing comma",
338
0
                        ));
339
                    }
340
                }
341
0
                tok => {
342
0
                    return Err(make_error(
343
0
                        self.val,
344
0
                        &format!("Expected a word for the name of Struct, but got {tok}"),
345
0
                    ))
346
                }
347
            };
348
0
            let field_type = self.parse_next_type()?;
349
0
            fields.push(Arc::new(Field::new(field_name, field_type, true)));
350
0
            match self.next_token()? {
351
0
                Token::Comma => continue,
352
0
                Token::RParen => break,
353
0
                tok => {
354
0
                    return Err(make_error(
355
0
                        self.val,
356
0
                        &format!("Unexpected token while parsing Struct fields. Expected ',' or ')', but got '{tok}'"),
357
0
                    ))
358
                }
359
            }
360
        }
361
0
        Ok(DataType::Struct(Fields::from(fields)))
362
0
    }
363
364
    /// return the next token, or an error if there are none left
365
0
    fn next_token(&mut self) -> ArrowResult<Token> {
366
0
        match self.tokenizer.next() {
367
0
            None => Err(make_error(self.val, "finding next token")),
368
0
            Some(token) => token,
369
        }
370
0
    }
371
372
    /// consume the next token, returning OK(()) if it matches tok, and Err if not
373
0
    fn expect_token(&mut self, tok: Token) -> ArrowResult<()> {
374
0
        let next_token = self.next_token()?;
375
0
        if next_token == tok {
376
0
            Ok(())
377
        } else {
378
0
            Err(make_error_expected(self.val, &tok, &next_token))
379
        }
380
0
    }
381
}
382
383
/// Returns true if `c` ends a word: `(`, `)`, `,` or a space.
fn is_separator(c: char) -> bool {
    matches!(c, '(' | ')' | ',' | ' ')
}
387
388
/// Splits a string like `Dictionary(Int32, Int64)` into tokens suitable for parsing
///
/// For example the string "Timestamp(Nanosecond, None)" would be parsed into:
///
/// * Token::Timestamp
/// * Token::LParen
/// * Token::TimeUnit(TimeUnit::Nanosecond)
/// * Token::Comma
/// * Token::None
/// * Token::RParen
#[derive(Debug)]
struct Tokenizer<'a> {
    /// the full string being tokenized
    val: &'a str,
    /// the characters of `val` that have not been consumed yet
    chars: Peekable<Chars<'a>>,
    // temporary buffer for parsing words
    word: String,
}
405
406
impl<'a> Tokenizer<'a> {
407
0
    fn new(val: &'a str) -> Self {
408
0
        Self {
409
0
            val,
410
0
            chars: val.chars().peekable(),
411
0
            word: String::new(),
412
0
        }
413
0
    }
414
415
    /// returns the next char, without consuming it
416
0
    fn peek_next_char(&mut self) -> Option<char> {
417
0
        self.chars.peek().copied()
418
0
    }
419
420
    /// returns the next char, and consuming it
421
0
    fn next_char(&mut self) -> Option<char> {
422
0
        self.chars.next()
423
0
    }
424
425
    /// parse the characters in val starting at pos, until the next
426
    /// `,`, `(`, or `)` or end of line
427
0
    fn parse_word(&mut self) -> ArrowResult<Token> {
428
        // reset temp space
429
0
        self.word.clear();
430
        loop {
431
0
            match self.peek_next_char() {
432
0
                None => break,
433
0
                Some(c) if is_separator(c) => break,
434
0
                Some(c) => {
435
0
                    self.next_char();
436
0
                    self.word.push(c);
437
0
                }
438
            }
439
        }
440
441
0
        if let Some(c) = self.word.chars().next() {
442
            // if it started with a number, try parsing it as an integer
443
0
            if c == '-' || c.is_numeric() {
444
0
                let val: i64 = self.word.parse().map_err(|e| {
445
0
                    make_error(self.val, &format!("parsing {} as integer: {e}", self.word))
446
0
                })?;
447
0
                return Ok(Token::Integer(val));
448
            }
449
            // if it started with a double quote `"`, try parsing it as a double quoted string
450
0
            else if c == '"' {
451
0
                let len = self.word.chars().count();
452
453
                // to verify it's double quoted
454
0
                if let Some(last_c) = self.word.chars().last() {
455
0
                    if last_c != '"' || len < 2 {
456
0
                        return Err(make_error(
457
0
                            self.val,
458
0
                            &format!(
459
0
                                "parsing {} as double quoted string: last char must be \"",
460
0
                                self.word
461
0
                            ),
462
0
                        ));
463
0
                    }
464
0
                }
465
466
0
                if len == 2 {
467
0
                    return Err(make_error(
468
0
                        self.val,
469
0
                        &format!(
470
0
                            "parsing {} as double quoted string: empty string isn't supported",
471
0
                            self.word
472
0
                        ),
473
0
                    ));
474
0
                }
475
476
0
                let val: String = self.word.parse().map_err(|e| {
477
0
                    make_error(
478
0
                        self.val,
479
0
                        &format!("parsing {} as double quoted string: {e}", self.word),
480
                    )
481
0
                })?;
482
483
0
                let s = val[1..len - 1].to_string();
484
0
                if s.contains('"') {
485
0
                    return Err(make_error(
486
0
                        self.val,
487
0
                        &format!("parsing {} as double quoted string: escaped double quote isn't supported", self.word),
488
0
                    ));
489
0
                }
490
491
0
                return Ok(Token::DoubleQuotedString(s));
492
0
            }
493
0
        }
494
495
        // figure out what the word was
496
0
        let token = match self.word.as_str() {
497
0
            "Null" => Token::SimpleType(DataType::Null),
498
0
            "Boolean" => Token::SimpleType(DataType::Boolean),
499
500
0
            "Int8" => Token::SimpleType(DataType::Int8),
501
0
            "Int16" => Token::SimpleType(DataType::Int16),
502
0
            "Int32" => Token::SimpleType(DataType::Int32),
503
0
            "Int64" => Token::SimpleType(DataType::Int64),
504
505
0
            "UInt8" => Token::SimpleType(DataType::UInt8),
506
0
            "UInt16" => Token::SimpleType(DataType::UInt16),
507
0
            "UInt32" => Token::SimpleType(DataType::UInt32),
508
0
            "UInt64" => Token::SimpleType(DataType::UInt64),
509
510
0
            "Utf8" => Token::SimpleType(DataType::Utf8),
511
0
            "LargeUtf8" => Token::SimpleType(DataType::LargeUtf8),
512
0
            "Utf8View" => Token::SimpleType(DataType::Utf8View),
513
0
            "Binary" => Token::SimpleType(DataType::Binary),
514
0
            "BinaryView" => Token::SimpleType(DataType::BinaryView),
515
0
            "LargeBinary" => Token::SimpleType(DataType::LargeBinary),
516
517
0
            "Float16" => Token::SimpleType(DataType::Float16),
518
0
            "Float32" => Token::SimpleType(DataType::Float32),
519
0
            "Float64" => Token::SimpleType(DataType::Float64),
520
521
0
            "Date32" => Token::SimpleType(DataType::Date32),
522
0
            "Date64" => Token::SimpleType(DataType::Date64),
523
524
0
            "List" => Token::List,
525
0
            "LargeList" => Token::LargeList,
526
0
            "FixedSizeList" => Token::FixedSizeList,
527
528
0
            "Second" => Token::TimeUnit(TimeUnit::Second),
529
0
            "Millisecond" => Token::TimeUnit(TimeUnit::Millisecond),
530
0
            "Microsecond" => Token::TimeUnit(TimeUnit::Microsecond),
531
0
            "Nanosecond" => Token::TimeUnit(TimeUnit::Nanosecond),
532
533
0
            "Timestamp" => Token::Timestamp,
534
0
            "Time32" => Token::Time32,
535
0
            "Time64" => Token::Time64,
536
0
            "Duration" => Token::Duration,
537
0
            "Interval" => Token::Interval,
538
0
            "Dictionary" => Token::Dictionary,
539
540
0
            "FixedSizeBinary" => Token::FixedSizeBinary,
541
542
0
            "Decimal32" => Token::Decimal32,
543
0
            "Decimal64" => Token::Decimal64,
544
0
            "Decimal128" => Token::Decimal128,
545
0
            "Decimal256" => Token::Decimal256,
546
547
0
            "YearMonth" => Token::IntervalUnit(IntervalUnit::YearMonth),
548
0
            "DayTime" => Token::IntervalUnit(IntervalUnit::DayTime),
549
0
            "MonthDayNano" => Token::IntervalUnit(IntervalUnit::MonthDayNano),
550
551
0
            "Some" => Token::Some,
552
0
            "None" => Token::None,
553
554
0
            "Struct" => Token::Struct,
555
            // If we don't recognize the word, treat it as a field name
556
0
            word => Token::FieldName(word.to_string()),
557
        };
558
0
        Ok(token)
559
0
    }
560
}
561
562
impl Iterator for Tokenizer<'_> {
563
    type Item = ArrowResult<Token>;
564
565
0
    fn next(&mut self) -> Option<Self::Item> {
566
        loop {
567
0
            match self.peek_next_char()? {
568
                ' ' => {
569
                    // skip whitespace
570
0
                    self.next_char();
571
0
                    continue;
572
                }
573
                '(' => {
574
0
                    self.next_char();
575
0
                    return Some(Ok(Token::LParen));
576
                }
577
                ')' => {
578
0
                    self.next_char();
579
0
                    return Some(Ok(Token::RParen));
580
                }
581
                ',' => {
582
0
                    self.next_char();
583
0
                    return Some(Ok(Token::Comma));
584
                }
585
0
                _ => return Some(self.parse_word()),
586
            }
587
        }
588
0
    }
589
}
590
591
/// Grammar is
592
///
593
#[derive(Debug, PartialEq)]
594
enum Token {
595
    // Null, or Int32
596
    SimpleType(DataType),
597
    Timestamp,
598
    Time32,
599
    Time64,
600
    Duration,
601
    Interval,
602
    FixedSizeBinary,
603
    Decimal32,
604
    Decimal64,
605
    Decimal128,
606
    Decimal256,
607
    Dictionary,
608
    TimeUnit(TimeUnit),
609
    IntervalUnit(IntervalUnit),
610
    LParen,
611
    RParen,
612
    Comma,
613
    Some,
614
    None,
615
    Integer(i64),
616
    DoubleQuotedString(String),
617
    List,
618
    LargeList,
619
    FixedSizeList,
620
    Struct,
621
    FieldName(String),
622
}
623
624
impl Display for Token {
625
0
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
626
0
        match self {
627
0
            Token::SimpleType(t) => write!(f, "{t}"),
628
0
            Token::List => write!(f, "List"),
629
0
            Token::LargeList => write!(f, "LargeList"),
630
0
            Token::FixedSizeList => write!(f, "FixedSizeList"),
631
0
            Token::Timestamp => write!(f, "Timestamp"),
632
0
            Token::Time32 => write!(f, "Time32"),
633
0
            Token::Time64 => write!(f, "Time64"),
634
0
            Token::Duration => write!(f, "Duration"),
635
0
            Token::Interval => write!(f, "Interval"),
636
0
            Token::TimeUnit(u) => write!(f, "TimeUnit({u:?})"),
637
0
            Token::IntervalUnit(u) => write!(f, "IntervalUnit({u:?})"),
638
0
            Token::LParen => write!(f, "("),
639
0
            Token::RParen => write!(f, ")"),
640
0
            Token::Comma => write!(f, ","),
641
0
            Token::Some => write!(f, "Some"),
642
0
            Token::None => write!(f, "None"),
643
0
            Token::FixedSizeBinary => write!(f, "FixedSizeBinary"),
644
0
            Token::Decimal32 => write!(f, "Decimal32"),
645
0
            Token::Decimal64 => write!(f, "Decimal64"),
646
0
            Token::Decimal128 => write!(f, "Decimal128"),
647
0
            Token::Decimal256 => write!(f, "Decimal256"),
648
0
            Token::Dictionary => write!(f, "Dictionary"),
649
0
            Token::Integer(v) => write!(f, "Integer({v})"),
650
0
            Token::DoubleQuotedString(s) => write!(f, "DoubleQuotedString({s})"),
651
0
            Token::Struct => write!(f, "Struct"),
652
0
            Token::FieldName(s) => write!(f, "FieldName({s})"),
653
        }
654
0
    }
655
}
656
657
#[cfg(test)]
658
mod test {
659
    use super::*;
660
661
    /// Checks that every type returned by `list_datatypes` can be parsed back
    /// from its string representation.
    #[test]
    fn test_parse_data_type() {
        list_datatypes().into_iter().for_each(round_trip);
    }
668
669
    /// convert data_type to a string, and then parse it as a type
670
    /// verifying it is the same
671
    fn round_trip(data_type: DataType) {
672
        let data_type_string = data_type.to_string();
673
        println!("Input '{data_type_string}' ({data_type:?})");
674
        let parsed_type = parse_data_type(&data_type_string).unwrap();
675
        assert_eq!(
676
            data_type, parsed_type,
677
            "Mismatch parsing {data_type_string}"
678
        );
679
    }
680
681
    fn list_datatypes() -> Vec<DataType> {
682
        vec![
683
            // ---------
684
            // Non Nested types
685
            // ---------
686
            DataType::Null,
687
            DataType::Boolean,
688
            DataType::Int8,
689
            DataType::Int16,
690
            DataType::Int32,
691
            DataType::Int64,
692
            DataType::UInt8,
693
            DataType::UInt16,
694
            DataType::UInt32,
695
            DataType::UInt64,
696
            DataType::Float16,
697
            DataType::Float32,
698
            DataType::Float64,
699
            DataType::Timestamp(TimeUnit::Second, None),
700
            DataType::Timestamp(TimeUnit::Millisecond, None),
701
            DataType::Timestamp(TimeUnit::Microsecond, None),
702
            DataType::Timestamp(TimeUnit::Nanosecond, None),
703
            // we can't cover all possible timezones, here we only test utc and +08:00
704
            DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".into())),
705
            DataType::Timestamp(TimeUnit::Microsecond, Some("+00:00".into())),
706
            DataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".into())),
707
            DataType::Timestamp(TimeUnit::Second, Some("+00:00".into())),
708
            DataType::Timestamp(TimeUnit::Nanosecond, Some("+08:00".into())),
709
            DataType::Timestamp(TimeUnit::Microsecond, Some("+08:00".into())),
710
            DataType::Timestamp(TimeUnit::Millisecond, Some("+08:00".into())),
711
            DataType::Timestamp(TimeUnit::Second, Some("+08:00".into())),
712
            DataType::Date32,
713
            DataType::Date64,
714
            DataType::Time32(TimeUnit::Second),
715
            DataType::Time32(TimeUnit::Millisecond),
716
            DataType::Time32(TimeUnit::Microsecond),
717
            DataType::Time32(TimeUnit::Nanosecond),
718
            DataType::Time64(TimeUnit::Second),
719
            DataType::Time64(TimeUnit::Millisecond),
720
            DataType::Time64(TimeUnit::Microsecond),
721
            DataType::Time64(TimeUnit::Nanosecond),
722
            DataType::Duration(TimeUnit::Second),
723
            DataType::Duration(TimeUnit::Millisecond),
724
            DataType::Duration(TimeUnit::Microsecond),
725
            DataType::Duration(TimeUnit::Nanosecond),
726
            DataType::Interval(IntervalUnit::YearMonth),
727
            DataType::Interval(IntervalUnit::DayTime),
728
            DataType::Interval(IntervalUnit::MonthDayNano),
729
            DataType::Binary,
730
            DataType::BinaryView,
731
            DataType::FixedSizeBinary(0),
732
            DataType::FixedSizeBinary(1234),
733
            DataType::FixedSizeBinary(-432),
734
            DataType::LargeBinary,
735
            DataType::Utf8,
736
            DataType::Utf8View,
737
            DataType::LargeUtf8,
738
            DataType::Decimal32(7, 8),
739
            DataType::Decimal64(6, 9),
740
            DataType::Decimal128(7, 12),
741
            DataType::Decimal256(6, 13),
742
            // ---------
743
            // Nested types
744
            // ---------
745
            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
746
            DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)),
747
            DataType::Dictionary(
748
                Box::new(DataType::Int8),
749
                Box::new(DataType::Timestamp(TimeUnit::Nanosecond, None)),
750
            ),
751
            DataType::Dictionary(
752
                Box::new(DataType::Int8),
753
                Box::new(DataType::FixedSizeBinary(23)),
754
            ),
755
            DataType::Dictionary(
756
                Box::new(DataType::Int8),
757
                Box::new(
758
                    // nested dictionaries are probably a bad idea but they are possible
759
                    DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)),
760
                ),
761
            ),
762
            DataType::Struct(Fields::from(vec![
763
                Field::new("f1", DataType::Int64, true),
764
                Field::new("f2", DataType::Float64, true),
765
                Field::new(
766
                    "f3",
767
                    DataType::Timestamp(TimeUnit::Second, Some("+08:00".into())),
768
                    true,
769
                ),
770
                Field::new(
771
                    "f4",
772
                    DataType::Dictionary(
773
                        Box::new(DataType::Int8),
774
                        Box::new(DataType::FixedSizeBinary(23)),
775
                    ),
776
                    true,
777
                ),
778
            ])),
779
            DataType::Struct(Fields::from(vec![
780
                Field::new("Int64", DataType::Int64, true),
781
                Field::new("Float64", DataType::Float64, true),
782
            ])),
783
            DataType::Struct(Fields::from(vec![
784
                Field::new("f1", DataType::Int64, true),
785
                Field::new(
786
                    "nested_struct",
787
                    DataType::Struct(Fields::from(vec![Field::new("n1", DataType::Int64, true)])),
788
                    true,
789
                ),
790
            ])),
791
            DataType::Struct(Fields::empty()),
792
            // TODO support more structured types (List, LargeList, Union, Map, RunEndEncoded, etc)
793
        ]
794
    }
795
796
    /// Checks that extra spaces before, between and after tokens do not change
    /// the parsed type.
    #[test]
    fn test_parse_data_type_whitespace_tolerance() {
        // Parses `input` and asserts the result equals `expected`.
        let check = |input: &str, expected: DataType| {
            println!("Parsing '{input}', expecting '{expected:?}'");
            let parsed = parse_data_type(input).unwrap();
            assert_eq!(parsed, expected);
        };

        check("Int8", DataType::Int8);

        // Differently spaced spellings that must all yield the same timestamp type
        let timestamp_spellings = [
            "Timestamp        (Nanosecond,      None)",
            "Timestamp        (Nanosecond,      None)  ",
            "          Timestamp        (Nanosecond,      None               )",
            "Timestamp        (Nanosecond,      None               )  ",
        ];
        for spelling in timestamp_spellings {
            check(spelling, DataType::Timestamp(TimeUnit::Nanosecond, None));
        }
    }
825
826
    /// Checks that malformed inputs are rejected and that each error message
    /// contains both the expected fragment and the generic help text.
    #[test]
    fn parse_data_type_errors() {
        // Each entry pairs an input that must fail to parse with a fragment
        // that has to appear in the resulting error message.
        let cases = [
            ("", "Unsupported type ''"),
            ("", "Error finding next token"),
            ("null", "Unsupported type 'null'"),
            ("Nu", "Unsupported type 'Nu'"),
            (
                r#"Timestamp(Nanosecond, Some(+00:00))"#,
                "Error unrecognized word: +00:00",
            ),
            (
                r#"Timestamp(Nanosecond, Some("+00:00))"#,
                r#"parsing "+00:00 as double quoted string: last char must be ""#,
            ),
            (
                r#"Timestamp(Nanosecond, Some(""))"#,
                r#"parsing "" as double quoted string: empty string isn't supported"#,
            ),
            (
                r#"Timestamp(Nanosecond, Some("+00:00""))"#,
                r#"parsing "+00:00"" as double quoted string: escaped double quote isn't supported"#,
            ),
            ("Timestamp(Nanosecond, ", "Error finding next token"),
            (
                "Float32 Float32",
                "trailing content after parsing 'Float32'",
            ),
            ("Int32, ", "trailing content after parsing 'Int32'"),
            ("Int32(3), ", "trailing content after parsing 'Int32'"),
            (
                "FixedSizeBinary(Int32), ",
                "Error finding i64 for FixedSizeBinary, got 'Int32'",
            ),
            (
                "FixedSizeBinary(3.0), ",
                "Error parsing 3.0 as integer: invalid digit found in string",
            ),
            // too large for i32
            (
                "FixedSizeBinary(4000000000), ",
                "Error converting 4000000000 into i32 for FixedSizeBinary: out of range integral type conversion attempted",
            ),
            // can't have negative precision
            (
                "Decimal32(-3, 5)",
                "Error converting -3 into u8 for Decimal32: out of range integral type conversion attempted",
            ),
            (
                "Decimal64(-3, 5)",
                "Error converting -3 into u8 for Decimal64: out of range integral type conversion attempted",
            ),
            (
                "Decimal128(-3, 5)",
                "Error converting -3 into u8 for Decimal128: out of range integral type conversion attempted",
            ),
            (
                "Decimal256(-3, 5)",
                "Error converting -3 into u8 for Decimal256: out of range integral type conversion attempted",
            ),
            (
                "Decimal32(3, 500)",
                "Error converting 500 into i8 for Decimal32: out of range integral type conversion attempted",
            ),
            (
                "Decimal64(3, 500)",
                "Error converting 500 into i8 for Decimal64: out of range integral type conversion attempted",
            ),
            (
                "Decimal128(3, 500)",
                "Error converting 500 into i8 for Decimal128: out of range integral type conversion attempted",
            ),
            (
                "Decimal256(3, 500)",
                "Error converting 500 into i8 for Decimal256: out of range integral type conversion attempted",
            ),
            (
                "Struct(f1, Int64)",
                "Error finding next type, got unexpected ','",
            ),
            (
                "Struct(f1 Int64,)",
                "Expected a word for the name of Struct, but got trailing comma",
            ),
            (
                "Struct(f1)",
                "Error finding next type, got unexpected ')'",
            ),
        ];

        for (input, expected_fragment) in cases {
            println!("Parsing '{input}', expecting '{expected_fragment}'");

            // A successful parse is a test failure; otherwise keep the rendered error.
            let message = match parse_data_type(input) {
                Err(error) => error.to_string(),
                Ok(parsed) => {
                    panic!("Expected error while parsing '{input}', but got '{parsed}'")
                }
            };

            assert!(
                message.contains(expected_fragment),
                "\n\ndid not find expected in actual.\n\nexpected: {expected_fragment}\nactual:{message}\n"
            );
            // every error must also carry the generic help text
            assert!(message.contains("Must be a supported arrow type name such as 'Int32' or 'Timestamp(Nanosecond, None)'"));
        }
    }
891
892
    /// Checks that an unknown type name yields an `ArrowError::ParseError`
    /// with the exact expected message.
    #[test]
    fn parse_error_type() {
        let expected = "Parser error: Unsupported type 'foobar'. Must be a supported arrow type name such as 'Int32' or 'Timestamp(Nanosecond, None)'. Error unrecognized word: foobar";

        let err = parse_data_type("foobar").unwrap_err();

        // Both the error variant and its full rendered text are part of the contract.
        assert!(matches!(err, ArrowError::ParseError(_)));
        assert_eq!(err.to_string(), expected);
    }
898
}