/Users/andrewlamb/Software/arrow-rs/arrow-schema/src/datatype_parse.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use std::{fmt::Display, iter::Peekable, str::Chars, sync::Arc}; |
19 | | |
20 | | use crate::{ArrowError, DataType, Field, Fields, IntervalUnit, TimeUnit}; |
21 | | |
22 | 0 | pub(crate) fn parse_data_type(val: &str) -> ArrowResult<DataType> { |
23 | 0 | Parser::new(val).parse() |
24 | 0 | } |
25 | | |
/// Shorthand for a `Result` whose error type is [`ArrowError`].
type ArrowResult<T> = Result<T, ArrowError>;
27 | | |
28 | 0 | fn make_error(val: &str, msg: &str) -> ArrowError { |
29 | 0 | let msg = format!("Unsupported type '{val}'. Must be a supported arrow type name such as 'Int32' or 'Timestamp(Nanosecond, None)'. Error {msg}" ); |
30 | 0 | ArrowError::ParseError(msg) |
31 | 0 | } |
32 | | |
33 | 0 | fn make_error_expected(val: &str, expected: &Token, actual: &Token) -> ArrowError { |
34 | 0 | make_error(val, &format!("Expected '{expected}', got '{actual}'")) |
35 | 0 | } |
36 | | |
#[derive(Debug)]
/// Implementation of `parse_data_type`, modeled after <https://github.com/sqlparser-rs/sqlparser-rs>
///
/// Pulls tokens from a [`Tokenizer`] and assembles them into a [`DataType`].
struct Parser<'a> {
    // the complete input string, kept so error messages can quote it
    val: &'a str,
    // source of the tokens of `val`
    tokenizer: Tokenizer<'a>,
}
43 | | |
44 | | impl<'a> Parser<'a> { |
45 | 0 | fn new(val: &'a str) -> Self { |
46 | 0 | Self { |
47 | 0 | val, |
48 | 0 | tokenizer: Tokenizer::new(val), |
49 | 0 | } |
50 | 0 | } |
51 | | |
52 | 0 | fn parse(mut self) -> ArrowResult<DataType> { |
53 | 0 | let data_type = self.parse_next_type()?; |
54 | | // ensure that there is no trailing content |
55 | 0 | if self.tokenizer.next().is_some() { |
56 | 0 | Err(make_error( |
57 | 0 | self.val, |
58 | 0 | &format!("checking trailing content after parsing '{data_type}'"), |
59 | 0 | )) |
60 | | } else { |
61 | 0 | Ok(data_type) |
62 | | } |
63 | 0 | } |
64 | | |
65 | | /// parses the next full DataType |
66 | 0 | fn parse_next_type(&mut self) -> ArrowResult<DataType> { |
67 | 0 | match self.next_token()? { |
68 | 0 | Token::SimpleType(data_type) => Ok(data_type), |
69 | 0 | Token::Timestamp => self.parse_timestamp(), |
70 | 0 | Token::Time32 => self.parse_time32(), |
71 | 0 | Token::Time64 => self.parse_time64(), |
72 | 0 | Token::Duration => self.parse_duration(), |
73 | 0 | Token::Interval => self.parse_interval(), |
74 | 0 | Token::FixedSizeBinary => self.parse_fixed_size_binary(), |
75 | 0 | Token::Decimal32 => self.parse_decimal_32(), |
76 | 0 | Token::Decimal64 => self.parse_decimal_64(), |
77 | 0 | Token::Decimal128 => self.parse_decimal_128(), |
78 | 0 | Token::Decimal256 => self.parse_decimal_256(), |
79 | 0 | Token::Dictionary => self.parse_dictionary(), |
80 | 0 | Token::List => self.parse_list(), |
81 | 0 | Token::LargeList => self.parse_large_list(), |
82 | 0 | Token::FixedSizeList => self.parse_fixed_size_list(), |
83 | 0 | Token::Struct => self.parse_struct(), |
84 | 0 | Token::FieldName(word) => { |
85 | 0 | Err(make_error(self.val, &format!("unrecognized word: {word}"))) |
86 | | } |
87 | 0 | tok => Err(make_error( |
88 | 0 | self.val, |
89 | 0 | &format!("finding next type, got unexpected '{tok}'"), |
90 | 0 | )), |
91 | | } |
92 | 0 | } |
93 | | |
94 | | /// Parses the List type |
95 | 0 | fn parse_list(&mut self) -> ArrowResult<DataType> { |
96 | 0 | self.expect_token(Token::LParen)?; |
97 | 0 | let data_type = self.parse_next_type()?; |
98 | 0 | self.expect_token(Token::RParen)?; |
99 | 0 | Ok(DataType::List(Arc::new(Field::new_list_field( |
100 | 0 | data_type, true, |
101 | 0 | )))) |
102 | 0 | } |
103 | | |
104 | | /// Parses the LargeList type |
105 | 0 | fn parse_large_list(&mut self) -> ArrowResult<DataType> { |
106 | 0 | self.expect_token(Token::LParen)?; |
107 | 0 | let data_type = self.parse_next_type()?; |
108 | 0 | self.expect_token(Token::RParen)?; |
109 | 0 | Ok(DataType::LargeList(Arc::new(Field::new_list_field( |
110 | 0 | data_type, true, |
111 | 0 | )))) |
112 | 0 | } |
113 | | |
114 | | /// Parses the FixedSizeList type |
115 | 0 | fn parse_fixed_size_list(&mut self) -> ArrowResult<DataType> { |
116 | 0 | self.expect_token(Token::LParen)?; |
117 | 0 | let length = self.parse_i32("FixedSizeList")?; |
118 | 0 | self.expect_token(Token::Comma)?; |
119 | 0 | let data_type = self.parse_next_type()?; |
120 | 0 | self.expect_token(Token::RParen)?; |
121 | 0 | Ok(DataType::FixedSizeList( |
122 | 0 | Arc::new(Field::new_list_field(data_type, true)), |
123 | 0 | length, |
124 | 0 | )) |
125 | 0 | } |
126 | | |
127 | | /// Parses the next timeunit |
128 | 0 | fn parse_time_unit(&mut self, context: &str) -> ArrowResult<TimeUnit> { |
129 | 0 | match self.next_token()? { |
130 | 0 | Token::TimeUnit(time_unit) => Ok(time_unit), |
131 | 0 | tok => Err(make_error( |
132 | 0 | self.val, |
133 | 0 | &format!("finding TimeUnit for {context}, got {tok}"), |
134 | 0 | )), |
135 | | } |
136 | 0 | } |
137 | | |
138 | | /// Parses the next timezone |
139 | 0 | fn parse_timezone(&mut self, context: &str) -> ArrowResult<Option<String>> { |
140 | 0 | match self.next_token()? { |
141 | 0 | Token::None => Ok(None), |
142 | | Token::Some => { |
143 | 0 | self.expect_token(Token::LParen)?; |
144 | 0 | let timezone = self.parse_double_quoted_string("Timezone")?; |
145 | 0 | self.expect_token(Token::RParen)?; |
146 | 0 | Ok(Some(timezone)) |
147 | | } |
148 | 0 | tok => Err(make_error( |
149 | 0 | self.val, |
150 | 0 | &format!("finding Timezone for {context}, got {tok}"), |
151 | 0 | )), |
152 | | } |
153 | 0 | } |
154 | | |
155 | | /// Parses the next double quoted string |
156 | 0 | fn parse_double_quoted_string(&mut self, context: &str) -> ArrowResult<String> { |
157 | 0 | match self.next_token()? { |
158 | 0 | Token::DoubleQuotedString(s) => Ok(s), |
159 | 0 | Token::FieldName(word) => { |
160 | 0 | Err(make_error(self.val, &format!("unrecognized word: {word}"))) |
161 | | } |
162 | 0 | tok => Err(make_error( |
163 | 0 | self.val, |
164 | 0 | &format!("finding double quoted string for {context}, got '{tok}'"), |
165 | 0 | )), |
166 | | } |
167 | 0 | } |
168 | | |
169 | | /// Parses the next integer value |
170 | 0 | fn parse_i64(&mut self, context: &str) -> ArrowResult<i64> { |
171 | 0 | match self.next_token()? { |
172 | 0 | Token::Integer(v) => Ok(v), |
173 | 0 | tok => Err(make_error( |
174 | 0 | self.val, |
175 | 0 | &format!("finding i64 for {context}, got '{tok}'"), |
176 | 0 | )), |
177 | | } |
178 | 0 | } |
179 | | |
180 | | /// Parses the next i32 integer value |
181 | 0 | fn parse_i32(&mut self, context: &str) -> ArrowResult<i32> { |
182 | 0 | let length = self.parse_i64(context)?; |
183 | 0 | length.try_into().map_err(|e| { |
184 | 0 | make_error( |
185 | 0 | self.val, |
186 | 0 | &format!("converting {length} into i32 for {context}: {e}"), |
187 | | ) |
188 | 0 | }) |
189 | 0 | } |
190 | | |
191 | | /// Parses the next i8 integer value |
192 | 0 | fn parse_i8(&mut self, context: &str) -> ArrowResult<i8> { |
193 | 0 | let length = self.parse_i64(context)?; |
194 | 0 | length.try_into().map_err(|e| { |
195 | 0 | make_error( |
196 | 0 | self.val, |
197 | 0 | &format!("converting {length} into i8 for {context}: {e}"), |
198 | | ) |
199 | 0 | }) |
200 | 0 | } |
201 | | |
202 | | /// Parses the next u8 integer value |
203 | 0 | fn parse_u8(&mut self, context: &str) -> ArrowResult<u8> { |
204 | 0 | let length = self.parse_i64(context)?; |
205 | 0 | length.try_into().map_err(|e| { |
206 | 0 | make_error( |
207 | 0 | self.val, |
208 | 0 | &format!("converting {length} into u8 for {context}: {e}"), |
209 | | ) |
210 | 0 | }) |
211 | 0 | } |
212 | | |
213 | | /// Parses the next timestamp (called after `Timestamp` has been consumed) |
214 | 0 | fn parse_timestamp(&mut self) -> ArrowResult<DataType> { |
215 | 0 | self.expect_token(Token::LParen)?; |
216 | 0 | let time_unit = self.parse_time_unit("Timestamp")?; |
217 | 0 | self.expect_token(Token::Comma)?; |
218 | 0 | let timezone = self.parse_timezone("Timestamp")?; |
219 | 0 | self.expect_token(Token::RParen)?; |
220 | 0 | Ok(DataType::Timestamp(time_unit, timezone.map(Into::into))) |
221 | 0 | } |
222 | | |
223 | | /// Parses the next Time32 (called after `Time32` has been consumed) |
224 | 0 | fn parse_time32(&mut self) -> ArrowResult<DataType> { |
225 | 0 | self.expect_token(Token::LParen)?; |
226 | 0 | let time_unit = self.parse_time_unit("Time32")?; |
227 | 0 | self.expect_token(Token::RParen)?; |
228 | 0 | Ok(DataType::Time32(time_unit)) |
229 | 0 | } |
230 | | |
231 | | /// Parses the next Time64 (called after `Time64` has been consumed) |
232 | 0 | fn parse_time64(&mut self) -> ArrowResult<DataType> { |
233 | 0 | self.expect_token(Token::LParen)?; |
234 | 0 | let time_unit = self.parse_time_unit("Time64")?; |
235 | 0 | self.expect_token(Token::RParen)?; |
236 | 0 | Ok(DataType::Time64(time_unit)) |
237 | 0 | } |
238 | | |
239 | | /// Parses the next Duration (called after `Duration` has been consumed) |
240 | 0 | fn parse_duration(&mut self) -> ArrowResult<DataType> { |
241 | 0 | self.expect_token(Token::LParen)?; |
242 | 0 | let time_unit = self.parse_time_unit("Duration")?; |
243 | 0 | self.expect_token(Token::RParen)?; |
244 | 0 | Ok(DataType::Duration(time_unit)) |
245 | 0 | } |
246 | | |
247 | | /// Parses the next Interval (called after `Interval` has been consumed) |
248 | 0 | fn parse_interval(&mut self) -> ArrowResult<DataType> { |
249 | 0 | self.expect_token(Token::LParen)?; |
250 | 0 | let interval_unit = match self.next_token()? { |
251 | 0 | Token::IntervalUnit(interval_unit) => interval_unit, |
252 | 0 | tok => { |
253 | 0 | return Err(make_error( |
254 | 0 | self.val, |
255 | 0 | &format!("finding IntervalUnit for Interval, got {tok}"), |
256 | 0 | )) |
257 | | } |
258 | | }; |
259 | 0 | self.expect_token(Token::RParen)?; |
260 | 0 | Ok(DataType::Interval(interval_unit)) |
261 | 0 | } |
262 | | |
263 | | /// Parses the next FixedSizeBinary (called after `FixedSizeBinary` has been consumed) |
264 | 0 | fn parse_fixed_size_binary(&mut self) -> ArrowResult<DataType> { |
265 | 0 | self.expect_token(Token::LParen)?; |
266 | 0 | let length = self.parse_i32("FixedSizeBinary")?; |
267 | 0 | self.expect_token(Token::RParen)?; |
268 | 0 | Ok(DataType::FixedSizeBinary(length)) |
269 | 0 | } |
270 | | |
271 | | /// Parses the next Decimal32 (called after `Decimal32` has been consumed) |
272 | 0 | fn parse_decimal_32(&mut self) -> ArrowResult<DataType> { |
273 | 0 | self.expect_token(Token::LParen)?; |
274 | 0 | let precision = self.parse_u8("Decimal32")?; |
275 | 0 | self.expect_token(Token::Comma)?; |
276 | 0 | let scale = self.parse_i8("Decimal32")?; |
277 | 0 | self.expect_token(Token::RParen)?; |
278 | 0 | Ok(DataType::Decimal32(precision, scale)) |
279 | 0 | } |
280 | | |
281 | | /// Parses the next Decimal64 (called after `Decimal64` has been consumed) |
282 | 0 | fn parse_decimal_64(&mut self) -> ArrowResult<DataType> { |
283 | 0 | self.expect_token(Token::LParen)?; |
284 | 0 | let precision = self.parse_u8("Decimal64")?; |
285 | 0 | self.expect_token(Token::Comma)?; |
286 | 0 | let scale = self.parse_i8("Decimal64")?; |
287 | 0 | self.expect_token(Token::RParen)?; |
288 | 0 | Ok(DataType::Decimal64(precision, scale)) |
289 | 0 | } |
290 | | |
291 | | /// Parses the next Decimal128 (called after `Decimal128` has been consumed) |
292 | 0 | fn parse_decimal_128(&mut self) -> ArrowResult<DataType> { |
293 | 0 | self.expect_token(Token::LParen)?; |
294 | 0 | let precision = self.parse_u8("Decimal128")?; |
295 | 0 | self.expect_token(Token::Comma)?; |
296 | 0 | let scale = self.parse_i8("Decimal128")?; |
297 | 0 | self.expect_token(Token::RParen)?; |
298 | 0 | Ok(DataType::Decimal128(precision, scale)) |
299 | 0 | } |
300 | | |
301 | | /// Parses the next Decimal256 (called after `Decimal256` has been consumed) |
302 | 0 | fn parse_decimal_256(&mut self) -> ArrowResult<DataType> { |
303 | 0 | self.expect_token(Token::LParen)?; |
304 | 0 | let precision = self.parse_u8("Decimal256")?; |
305 | 0 | self.expect_token(Token::Comma)?; |
306 | 0 | let scale = self.parse_i8("Decimal256")?; |
307 | 0 | self.expect_token(Token::RParen)?; |
308 | 0 | Ok(DataType::Decimal256(precision, scale)) |
309 | 0 | } |
310 | | |
311 | | /// Parses the next Dictionary (called after `Dictionary` has been consumed) |
312 | 0 | fn parse_dictionary(&mut self) -> ArrowResult<DataType> { |
313 | 0 | self.expect_token(Token::LParen)?; |
314 | 0 | let key_type = self.parse_next_type()?; |
315 | 0 | self.expect_token(Token::Comma)?; |
316 | 0 | let value_type = self.parse_next_type()?; |
317 | 0 | self.expect_token(Token::RParen)?; |
318 | 0 | Ok(DataType::Dictionary( |
319 | 0 | Box::new(key_type), |
320 | 0 | Box::new(value_type), |
321 | 0 | )) |
322 | 0 | } |
323 | 0 | fn parse_struct(&mut self) -> ArrowResult<DataType> { |
324 | 0 | self.expect_token(Token::LParen)?; |
325 | 0 | let mut fields = Vec::new(); |
326 | | loop { |
327 | 0 | let field_name = match self.next_token()? { |
328 | | // It's valid to have a name that is a type name |
329 | 0 | Token::SimpleType(data_type) => data_type.to_string(), |
330 | 0 | Token::FieldName(name) => name, |
331 | | Token::RParen => { |
332 | 0 | if fields.is_empty() { |
333 | 0 | break; |
334 | | } else { |
335 | 0 | return Err(make_error( |
336 | 0 | self.val, |
337 | 0 | "Unexpected token while parsing Struct fields. Expected a word for the name of Struct, but got trailing comma", |
338 | 0 | )); |
339 | | } |
340 | | } |
341 | 0 | tok => { |
342 | 0 | return Err(make_error( |
343 | 0 | self.val, |
344 | 0 | &format!("Expected a word for the name of Struct, but got {tok}"), |
345 | 0 | )) |
346 | | } |
347 | | }; |
348 | 0 | let field_type = self.parse_next_type()?; |
349 | 0 | fields.push(Arc::new(Field::new(field_name, field_type, true))); |
350 | 0 | match self.next_token()? { |
351 | 0 | Token::Comma => continue, |
352 | 0 | Token::RParen => break, |
353 | 0 | tok => { |
354 | 0 | return Err(make_error( |
355 | 0 | self.val, |
356 | 0 | &format!("Unexpected token while parsing Struct fields. Expected ',' or ')', but got '{tok}'"), |
357 | 0 | )) |
358 | | } |
359 | | } |
360 | | } |
361 | 0 | Ok(DataType::Struct(Fields::from(fields))) |
362 | 0 | } |
363 | | |
364 | | /// return the next token, or an error if there are none left |
365 | 0 | fn next_token(&mut self) -> ArrowResult<Token> { |
366 | 0 | match self.tokenizer.next() { |
367 | 0 | None => Err(make_error(self.val, "finding next token")), |
368 | 0 | Some(token) => token, |
369 | | } |
370 | 0 | } |
371 | | |
372 | | /// consume the next token, returning OK(()) if it matches tok, and Err if not |
373 | 0 | fn expect_token(&mut self, tok: Token) -> ArrowResult<()> { |
374 | 0 | let next_token = self.next_token()?; |
375 | 0 | if next_token == tok { |
376 | 0 | Ok(()) |
377 | | } else { |
378 | 0 | Err(make_error_expected(self.val, &tok, &next_token)) |
379 | | } |
380 | 0 | } |
381 | | } |
382 | | |
/// Returns true if `c` ends a word: `(`, `)`, `,` or a single space
fn is_separator(c: char) -> bool {
    matches!(c, '(' | ')' | ',' | ' ')
}
387 | | |
#[derive(Debug)]
/// Splits a string like `Dictionary(Int32, Int64)` into tokens suitable for parsing
///
/// For example the string "Timestamp(Nanosecond, None)" would be parsed into:
///
/// * Token::Timestamp
/// * Token::LParen
/// * Token::TimeUnit(TimeUnit::Nanosecond)
/// * Token::Comma
/// * Token::None
/// * Token::RParen
struct Tokenizer<'a> {
    // the complete input string, kept so error messages can quote it
    val: &'a str,
    // the characters of `val` that have not been consumed yet
    chars: Peekable<Chars<'a>>,
    // temporary buffer for parsing words
    word: String,
}
405 | | |
406 | | impl<'a> Tokenizer<'a> { |
407 | 0 | fn new(val: &'a str) -> Self { |
408 | 0 | Self { |
409 | 0 | val, |
410 | 0 | chars: val.chars().peekable(), |
411 | 0 | word: String::new(), |
412 | 0 | } |
413 | 0 | } |
414 | | |
415 | | /// returns the next char, without consuming it |
416 | 0 | fn peek_next_char(&mut self) -> Option<char> { |
417 | 0 | self.chars.peek().copied() |
418 | 0 | } |
419 | | |
420 | | /// returns the next char, and consuming it |
421 | 0 | fn next_char(&mut self) -> Option<char> { |
422 | 0 | self.chars.next() |
423 | 0 | } |
424 | | |
425 | | /// parse the characters in val starting at pos, until the next |
426 | | /// `,`, `(`, or `)` or end of line |
427 | 0 | fn parse_word(&mut self) -> ArrowResult<Token> { |
428 | | // reset temp space |
429 | 0 | self.word.clear(); |
430 | | loop { |
431 | 0 | match self.peek_next_char() { |
432 | 0 | None => break, |
433 | 0 | Some(c) if is_separator(c) => break, |
434 | 0 | Some(c) => { |
435 | 0 | self.next_char(); |
436 | 0 | self.word.push(c); |
437 | 0 | } |
438 | | } |
439 | | } |
440 | | |
441 | 0 | if let Some(c) = self.word.chars().next() { |
442 | | // if it started with a number, try parsing it as an integer |
443 | 0 | if c == '-' || c.is_numeric() { |
444 | 0 | let val: i64 = self.word.parse().map_err(|e| { |
445 | 0 | make_error(self.val, &format!("parsing {} as integer: {e}", self.word)) |
446 | 0 | })?; |
447 | 0 | return Ok(Token::Integer(val)); |
448 | | } |
449 | | // if it started with a double quote `"`, try parsing it as a double quoted string |
450 | 0 | else if c == '"' { |
451 | 0 | let len = self.word.chars().count(); |
452 | | |
453 | | // to verify it's double quoted |
454 | 0 | if let Some(last_c) = self.word.chars().last() { |
455 | 0 | if last_c != '"' || len < 2 { |
456 | 0 | return Err(make_error( |
457 | 0 | self.val, |
458 | 0 | &format!( |
459 | 0 | "parsing {} as double quoted string: last char must be \"", |
460 | 0 | self.word |
461 | 0 | ), |
462 | 0 | )); |
463 | 0 | } |
464 | 0 | } |
465 | | |
466 | 0 | if len == 2 { |
467 | 0 | return Err(make_error( |
468 | 0 | self.val, |
469 | 0 | &format!( |
470 | 0 | "parsing {} as double quoted string: empty string isn't supported", |
471 | 0 | self.word |
472 | 0 | ), |
473 | 0 | )); |
474 | 0 | } |
475 | | |
476 | 0 | let val: String = self.word.parse().map_err(|e| { |
477 | 0 | make_error( |
478 | 0 | self.val, |
479 | 0 | &format!("parsing {} as double quoted string: {e}", self.word), |
480 | | ) |
481 | 0 | })?; |
482 | | |
483 | 0 | let s = val[1..len - 1].to_string(); |
484 | 0 | if s.contains('"') { |
485 | 0 | return Err(make_error( |
486 | 0 | self.val, |
487 | 0 | &format!("parsing {} as double quoted string: escaped double quote isn't supported", self.word), |
488 | 0 | )); |
489 | 0 | } |
490 | | |
491 | 0 | return Ok(Token::DoubleQuotedString(s)); |
492 | 0 | } |
493 | 0 | } |
494 | | |
495 | | // figure out what the word was |
496 | 0 | let token = match self.word.as_str() { |
497 | 0 | "Null" => Token::SimpleType(DataType::Null), |
498 | 0 | "Boolean" => Token::SimpleType(DataType::Boolean), |
499 | | |
500 | 0 | "Int8" => Token::SimpleType(DataType::Int8), |
501 | 0 | "Int16" => Token::SimpleType(DataType::Int16), |
502 | 0 | "Int32" => Token::SimpleType(DataType::Int32), |
503 | 0 | "Int64" => Token::SimpleType(DataType::Int64), |
504 | | |
505 | 0 | "UInt8" => Token::SimpleType(DataType::UInt8), |
506 | 0 | "UInt16" => Token::SimpleType(DataType::UInt16), |
507 | 0 | "UInt32" => Token::SimpleType(DataType::UInt32), |
508 | 0 | "UInt64" => Token::SimpleType(DataType::UInt64), |
509 | | |
510 | 0 | "Utf8" => Token::SimpleType(DataType::Utf8), |
511 | 0 | "LargeUtf8" => Token::SimpleType(DataType::LargeUtf8), |
512 | 0 | "Utf8View" => Token::SimpleType(DataType::Utf8View), |
513 | 0 | "Binary" => Token::SimpleType(DataType::Binary), |
514 | 0 | "BinaryView" => Token::SimpleType(DataType::BinaryView), |
515 | 0 | "LargeBinary" => Token::SimpleType(DataType::LargeBinary), |
516 | | |
517 | 0 | "Float16" => Token::SimpleType(DataType::Float16), |
518 | 0 | "Float32" => Token::SimpleType(DataType::Float32), |
519 | 0 | "Float64" => Token::SimpleType(DataType::Float64), |
520 | | |
521 | 0 | "Date32" => Token::SimpleType(DataType::Date32), |
522 | 0 | "Date64" => Token::SimpleType(DataType::Date64), |
523 | | |
524 | 0 | "List" => Token::List, |
525 | 0 | "LargeList" => Token::LargeList, |
526 | 0 | "FixedSizeList" => Token::FixedSizeList, |
527 | | |
528 | 0 | "Second" => Token::TimeUnit(TimeUnit::Second), |
529 | 0 | "Millisecond" => Token::TimeUnit(TimeUnit::Millisecond), |
530 | 0 | "Microsecond" => Token::TimeUnit(TimeUnit::Microsecond), |
531 | 0 | "Nanosecond" => Token::TimeUnit(TimeUnit::Nanosecond), |
532 | | |
533 | 0 | "Timestamp" => Token::Timestamp, |
534 | 0 | "Time32" => Token::Time32, |
535 | 0 | "Time64" => Token::Time64, |
536 | 0 | "Duration" => Token::Duration, |
537 | 0 | "Interval" => Token::Interval, |
538 | 0 | "Dictionary" => Token::Dictionary, |
539 | | |
540 | 0 | "FixedSizeBinary" => Token::FixedSizeBinary, |
541 | | |
542 | 0 | "Decimal32" => Token::Decimal32, |
543 | 0 | "Decimal64" => Token::Decimal64, |
544 | 0 | "Decimal128" => Token::Decimal128, |
545 | 0 | "Decimal256" => Token::Decimal256, |
546 | | |
547 | 0 | "YearMonth" => Token::IntervalUnit(IntervalUnit::YearMonth), |
548 | 0 | "DayTime" => Token::IntervalUnit(IntervalUnit::DayTime), |
549 | 0 | "MonthDayNano" => Token::IntervalUnit(IntervalUnit::MonthDayNano), |
550 | | |
551 | 0 | "Some" => Token::Some, |
552 | 0 | "None" => Token::None, |
553 | | |
554 | 0 | "Struct" => Token::Struct, |
555 | | // If we don't recognize the word, treat it as a field name |
556 | 0 | word => Token::FieldName(word.to_string()), |
557 | | }; |
558 | 0 | Ok(token) |
559 | 0 | } |
560 | | } |
561 | | |
562 | | impl Iterator for Tokenizer<'_> { |
563 | | type Item = ArrowResult<Token>; |
564 | | |
565 | 0 | fn next(&mut self) -> Option<Self::Item> { |
566 | | loop { |
567 | 0 | match self.peek_next_char()? { |
568 | | ' ' => { |
569 | | // skip whitespace |
570 | 0 | self.next_char(); |
571 | 0 | continue; |
572 | | } |
573 | | '(' => { |
574 | 0 | self.next_char(); |
575 | 0 | return Some(Ok(Token::LParen)); |
576 | | } |
577 | | ')' => { |
578 | 0 | self.next_char(); |
579 | 0 | return Some(Ok(Token::RParen)); |
580 | | } |
581 | | ',' => { |
582 | 0 | self.next_char(); |
583 | 0 | return Some(Ok(Token::Comma)); |
584 | | } |
585 | 0 | _ => return Some(self.parse_word()), |
586 | | } |
587 | | } |
588 | 0 | } |
589 | | } |
590 | | |
/// The tokens produced by `Tokenizer` and consumed by `Parser`.
///
/// Keywords of parameterized types (such as `Timestamp` or `List`) get a
/// token of their own; their parenthesized arguments follow as separate tokens.
#[derive(Debug, PartialEq)]
enum Token {
    // Null, or Int32
    SimpleType(DataType),
    Timestamp,
    Time32,
    Time64,
    Duration,
    Interval,
    FixedSizeBinary,
    Decimal32,
    Decimal64,
    Decimal128,
    Decimal256,
    Dictionary,
    // Second, Millisecond, Microsecond or Nanosecond
    TimeUnit(TimeUnit),
    // YearMonth, DayTime or MonthDayNano
    IntervalUnit(IntervalUnit),
    // `(`
    LParen,
    // `)`
    RParen,
    // `,`
    Comma,
    // the words `Some` and `None`, used for the optional timezone
    Some,
    None,
    // a word starting with `-` or a numeric character, parsed as i64
    Integer(i64),
    // the text between a pair of double quotes
    DoubleQuotedString(String),
    List,
    LargeList,
    FixedSizeList,
    Struct,
    // any word that is not recognized as something else
    FieldName(String),
}
623 | | |
624 | | impl Display for Token { |
625 | 0 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { |
626 | 0 | match self { |
627 | 0 | Token::SimpleType(t) => write!(f, "{t}"), |
628 | 0 | Token::List => write!(f, "List"), |
629 | 0 | Token::LargeList => write!(f, "LargeList"), |
630 | 0 | Token::FixedSizeList => write!(f, "FixedSizeList"), |
631 | 0 | Token::Timestamp => write!(f, "Timestamp"), |
632 | 0 | Token::Time32 => write!(f, "Time32"), |
633 | 0 | Token::Time64 => write!(f, "Time64"), |
634 | 0 | Token::Duration => write!(f, "Duration"), |
635 | 0 | Token::Interval => write!(f, "Interval"), |
636 | 0 | Token::TimeUnit(u) => write!(f, "TimeUnit({u:?})"), |
637 | 0 | Token::IntervalUnit(u) => write!(f, "IntervalUnit({u:?})"), |
638 | 0 | Token::LParen => write!(f, "("), |
639 | 0 | Token::RParen => write!(f, ")"), |
640 | 0 | Token::Comma => write!(f, ","), |
641 | 0 | Token::Some => write!(f, "Some"), |
642 | 0 | Token::None => write!(f, "None"), |
643 | 0 | Token::FixedSizeBinary => write!(f, "FixedSizeBinary"), |
644 | 0 | Token::Decimal32 => write!(f, "Decimal32"), |
645 | 0 | Token::Decimal64 => write!(f, "Decimal64"), |
646 | 0 | Token::Decimal128 => write!(f, "Decimal128"), |
647 | 0 | Token::Decimal256 => write!(f, "Decimal256"), |
648 | 0 | Token::Dictionary => write!(f, "Dictionary"), |
649 | 0 | Token::Integer(v) => write!(f, "Integer({v})"), |
650 | 0 | Token::DoubleQuotedString(s) => write!(f, "DoubleQuotedString({s})"), |
651 | 0 | Token::Struct => write!(f, "Struct"), |
652 | 0 | Token::FieldName(s) => write!(f, "FieldName({s})"), |
653 | | } |
654 | 0 | } |
655 | | } |
656 | | |
657 | | #[cfg(test)] |
658 | | mod test { |
659 | | use super::*; |
660 | | |
661 | | #[test] |
662 | | fn test_parse_data_type() { |
663 | | // this ensures types can be parsed correctly from their string representations |
664 | | for dt in list_datatypes() { |
665 | | round_trip(dt) |
666 | | } |
667 | | } |
668 | | |
669 | | /// convert data_type to a string, and then parse it as a type |
670 | | /// verifying it is the same |
671 | | fn round_trip(data_type: DataType) { |
672 | | let data_type_string = data_type.to_string(); |
673 | | println!("Input '{data_type_string}' ({data_type:?})"); |
674 | | let parsed_type = parse_data_type(&data_type_string).unwrap(); |
675 | | assert_eq!( |
676 | | data_type, parsed_type, |
677 | | "Mismatch parsing {data_type_string}" |
678 | | ); |
679 | | } |
680 | | |
    /// Returns the data types exercised by `test_parse_data_type`: the
    /// non-nested types first, followed by Dictionary and Struct types
    /// (including nested and empty ones).
    fn list_datatypes() -> Vec<DataType> {
        vec![
            // ---------
            // Non Nested types
            // ---------
            DataType::Null,
            DataType::Boolean,
            DataType::Int8,
            DataType::Int16,
            DataType::Int32,
            DataType::Int64,
            DataType::UInt8,
            DataType::UInt16,
            DataType::UInt32,
            DataType::UInt64,
            DataType::Float16,
            DataType::Float32,
            DataType::Float64,
            DataType::Timestamp(TimeUnit::Second, None),
            DataType::Timestamp(TimeUnit::Millisecond, None),
            DataType::Timestamp(TimeUnit::Microsecond, None),
            DataType::Timestamp(TimeUnit::Nanosecond, None),
            // we can't cover all possible timezones, here we only test utc and +08:00
            DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".into())),
            DataType::Timestamp(TimeUnit::Microsecond, Some("+00:00".into())),
            DataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".into())),
            DataType::Timestamp(TimeUnit::Second, Some("+00:00".into())),
            DataType::Timestamp(TimeUnit::Nanosecond, Some("+08:00".into())),
            DataType::Timestamp(TimeUnit::Microsecond, Some("+08:00".into())),
            DataType::Timestamp(TimeUnit::Millisecond, Some("+08:00".into())),
            DataType::Timestamp(TimeUnit::Second, Some("+08:00".into())),
            DataType::Date32,
            DataType::Date64,
            DataType::Time32(TimeUnit::Second),
            DataType::Time32(TimeUnit::Millisecond),
            DataType::Time32(TimeUnit::Microsecond),
            DataType::Time32(TimeUnit::Nanosecond),
            DataType::Time64(TimeUnit::Second),
            DataType::Time64(TimeUnit::Millisecond),
            DataType::Time64(TimeUnit::Microsecond),
            DataType::Time64(TimeUnit::Nanosecond),
            DataType::Duration(TimeUnit::Second),
            DataType::Duration(TimeUnit::Millisecond),
            DataType::Duration(TimeUnit::Microsecond),
            DataType::Duration(TimeUnit::Nanosecond),
            DataType::Interval(IntervalUnit::YearMonth),
            DataType::Interval(IntervalUnit::DayTime),
            DataType::Interval(IntervalUnit::MonthDayNano),
            DataType::Binary,
            DataType::BinaryView,
            DataType::FixedSizeBinary(0),
            DataType::FixedSizeBinary(1234),
            DataType::FixedSizeBinary(-432),
            DataType::LargeBinary,
            DataType::Utf8,
            DataType::Utf8View,
            DataType::LargeUtf8,
            DataType::Decimal32(7, 8),
            DataType::Decimal64(6, 9),
            DataType::Decimal128(7, 12),
            DataType::Decimal256(6, 13),
            // ---------
            // Nested types
            // ---------
            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
            DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)),
            DataType::Dictionary(
                Box::new(DataType::Int8),
                Box::new(DataType::Timestamp(TimeUnit::Nanosecond, None)),
            ),
            DataType::Dictionary(
                Box::new(DataType::Int8),
                Box::new(DataType::FixedSizeBinary(23)),
            ),
            DataType::Dictionary(
                Box::new(DataType::Int8),
                Box::new(
                    // nested dictionaries are probably a bad idea but they are possible
                    DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)),
                ),
            ),
            DataType::Struct(Fields::from(vec![
                Field::new("f1", DataType::Int64, true),
                Field::new("f2", DataType::Float64, true),
                Field::new(
                    "f3",
                    DataType::Timestamp(TimeUnit::Second, Some("+08:00".into())),
                    true,
                ),
                Field::new(
                    "f4",
                    DataType::Dictionary(
                        Box::new(DataType::Int8),
                        Box::new(DataType::FixedSizeBinary(23)),
                    ),
                    true,
                ),
            ])),
            // field names that are themselves type names
            DataType::Struct(Fields::from(vec![
                Field::new("Int64", DataType::Int64, true),
                Field::new("Float64", DataType::Float64, true),
            ])),
            DataType::Struct(Fields::from(vec![
                Field::new("f1", DataType::Int64, true),
                Field::new(
                    "nested_struct",
                    DataType::Struct(Fields::from(vec![Field::new("n1", DataType::Int64, true)])),
                    true,
                ),
            ])),
            DataType::Struct(Fields::empty()),
            // TODO support more structured types (List, LargeList, Union, Map, RunEndEncoded, etc)
        ]
    }
795 | | |
/// Checks that `parse_data_type` accepts extra spaces before, inside, and
/// after a type string and still produces the expected `DataType`.
#[test]
fn test_parse_data_type_whitespace_tolerance() {
    // All of the `Timestamp` spellings below must parse to this one type.
    let timestamp_nanos = || DataType::Timestamp(TimeUnit::Nanosecond, None);

    // Each entry pairs the text handed to the parser with the type it must yield.
    let cases = [
        ("Int8", DataType::Int8),
        ("Timestamp (Nanosecond, None)", timestamp_nanos()),
        ("Timestamp (Nanosecond, None) ", timestamp_nanos()),
        (" Timestamp (Nanosecond, None )", timestamp_nanos()),
        ("Timestamp (Nanosecond, None ) ", timestamp_nanos()),
    ];

    cases
        .into_iter()
        .for_each(|(data_type_string, expected_data_type)| {
            println!("Parsing '{data_type_string}', expecting '{expected_data_type:?}'");
            // A parse failure here is itself a test failure, hence the unwrap.
            let parsed_data_type = parse_data_type(data_type_string).unwrap();
            assert_eq!(parsed_data_type, expected_data_type);
        });
}
825 | | |
/// Feeds malformed type strings to `parse_data_type` and checks that each one
/// is rejected with an error containing both the case-specific fragment and
/// the generic help text.
#[test]
fn parse_data_type_errors() {
    // Each entry pairs the text handed to the parser with a fragment that the
    // resulting error message must contain.
    #[rustfmt::skip]
    let cases = [
        ("", "Unsupported type ''"),
        ("", "Error finding next token"),
        ("null", "Unsupported type 'null'"),
        ("Nu", "Unsupported type 'Nu'"),
        (r#"Timestamp(Nanosecond, Some(+00:00))"#, "Error unrecognized word: +00:00"),
        (r#"Timestamp(Nanosecond, Some("+00:00))"#, r#"parsing "+00:00 as double quoted string: last char must be ""#),
        (r#"Timestamp(Nanosecond, Some(""))"#, r#"parsing "" as double quoted string: empty string isn't supported"#),
        (r#"Timestamp(Nanosecond, Some("+00:00""))"#, r#"parsing "+00:00"" as double quoted string: escaped double quote isn't supported"#),
        ("Timestamp(Nanosecond, ", "Error finding next token"),
        ("Float32 Float32", "trailing content after parsing 'Float32'"),
        ("Int32, ", "trailing content after parsing 'Int32'"),
        ("Int32(3), ", "trailing content after parsing 'Int32'"),
        ("FixedSizeBinary(Int32), ", "Error finding i64 for FixedSizeBinary, got 'Int32'"),
        ("FixedSizeBinary(3.0), ", "Error parsing 3.0 as integer: invalid digit found in string"),
        // too large for i32
        ("FixedSizeBinary(4000000000), ", "Error converting 4000000000 into i32 for FixedSizeBinary: out of range integral type conversion attempted"),
        // can't have negative precision
        ("Decimal32(-3, 5)", "Error converting -3 into u8 for Decimal32: out of range integral type conversion attempted"),
        ("Decimal64(-3, 5)", "Error converting -3 into u8 for Decimal64: out of range integral type conversion attempted"),
        ("Decimal128(-3, 5)", "Error converting -3 into u8 for Decimal128: out of range integral type conversion attempted"),
        ("Decimal256(-3, 5)", "Error converting -3 into u8 for Decimal256: out of range integral type conversion attempted"),
        ("Decimal32(3, 500)", "Error converting 500 into i8 for Decimal32: out of range integral type conversion attempted"),
        ("Decimal64(3, 500)", "Error converting 500 into i8 for Decimal64: out of range integral type conversion attempted"),
        ("Decimal128(3, 500)", "Error converting 500 into i8 for Decimal128: out of range integral type conversion attempted"),
        ("Decimal256(3, 500)", "Error converting 500 into i8 for Decimal256: out of range integral type conversion attempted"),
        ("Struct(f1, Int64)", "Error finding next type, got unexpected ','"),
        ("Struct(f1 Int64,)", "Expected a word for the name of Struct, but got trailing comma"),
        ("Struct(f1)", "Error finding next type, got unexpected ')'"),
    ];

    for (data_type_string, expected_message) in cases {
        println!("Parsing '{data_type_string}', expecting '{expected_message}'");

        // A successful parse means the input was wrongly accepted.
        let message = match parse_data_type(data_type_string) {
            Err(e) => e.to_string(),
            Ok(d) => panic!("Expected error while parsing '{data_type_string}', but got '{d}'"),
        };

        assert!(
            message.contains(expected_message),
            "\n\ndid not find expected in actual.\n\nexpected: {expected_message}\nactual:{message}\n"
        );
        // errors should also contain a help message
        assert!(message.contains("Must be a supported arrow type name such as 'Int32' or 'Timestamp(Nanosecond, None)'"));
    }
}
891 | | |
/// Checks that an unknown type name produces an `ArrowError::ParseError` and
/// pins the complete rendered error text.
#[test]
fn parse_error_type() {
    // Full text expected from the error's `to_string()`.
    let expected = "Parser error: Unsupported type 'foobar'. Must be a supported arrow type name such as 'Int32' or 'Timestamp(Nanosecond, None)'. Error unrecognized word: foobar";

    let err = parse_data_type("foobar").unwrap_err();

    // Check the variant first, then the rendered message.
    assert!(matches!(err, ArrowError::ParseError(_)));
    assert_eq!(err.to_string(), expected);
}
898 | | } |