/Users/andrewlamb/Software/arrow-rs/arrow-schema/src/datatype_parse.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use std::{fmt::Display, iter::Peekable, str::Chars, sync::Arc}; |
19 | | |
20 | | use crate::{ArrowError, DataType, Field, Fields, IntervalUnit, TimeUnit, UnionFields, UnionMode}; |
21 | | |
22 | | /// Parses a DataType from a string representation |
23 | | /// |
24 | | /// For example, the string "Int32" would be parsed into [`DataType::Int32`] |
25 | 0 | pub(crate) fn parse_data_type(val: &str) -> ArrowResult<DataType> { |
26 | 0 | Parser::new(val).parse() |
27 | 0 | } |
28 | | |
29 | | type ArrowResult<T> = Result<T, ArrowError>; |
30 | | |
31 | 0 | fn make_error(val: &str, msg: &str) -> ArrowError { |
32 | 0 | let msg = format!( |
33 | 0 | "Unsupported type '{val}'. Must be a supported arrow type name such as 'Int32' or 'Timestamp(ns)'. Error {msg}" |
34 | | ); |
35 | 0 | ArrowError::ParseError(msg) |
36 | 0 | } |
37 | | |
38 | 0 | fn make_error_expected(val: &str, expected: &Token, actual: &Token) -> ArrowError { |
39 | 0 | make_error(val, &format!("Expected '{expected}', got '{actual}'")) |
40 | 0 | } |
41 | | |
42 | | /// Implementation of `parse_data_type`, modeled after <https://github.com/sqlparser-rs/sqlparser-rs> |
43 | | #[derive(Debug)] |
44 | | struct Parser<'a> { |
45 | | val: &'a str, |
46 | | tokenizer: Peekable<Tokenizer<'a>>, |
47 | | } |
48 | | |
49 | | impl<'a> Parser<'a> { |
50 | 0 | fn new(val: &'a str) -> Self { |
51 | 0 | Self { |
52 | 0 | val, |
53 | 0 | tokenizer: Tokenizer::new(val).peekable(), |
54 | 0 | } |
55 | 0 | } |
56 | | |
57 | 0 | fn parse(mut self) -> ArrowResult<DataType> { |
58 | 0 | let data_type = self.parse_next_type()?; |
59 | | // ensure that there is no trailing content |
60 | 0 | if self.tokenizer.next().is_some() { |
61 | 0 | Err(make_error( |
62 | 0 | self.val, |
63 | 0 | &format!("checking trailing content after parsing '{data_type}'"), |
64 | 0 | )) |
65 | | } else { |
66 | 0 | Ok(data_type) |
67 | | } |
68 | 0 | } |
69 | | |
70 | | /// parses the next full DataType |
71 | 0 | fn parse_next_type(&mut self) -> ArrowResult<DataType> { |
72 | 0 | match self.next_token()? { |
73 | 0 | Token::SimpleType(data_type) => Ok(data_type), |
74 | 0 | Token::Timestamp => self.parse_timestamp(), |
75 | 0 | Token::Time32 => self.parse_time32(), |
76 | 0 | Token::Time64 => self.parse_time64(), |
77 | 0 | Token::Duration => self.parse_duration(), |
78 | 0 | Token::Interval => self.parse_interval(), |
79 | 0 | Token::FixedSizeBinary => self.parse_fixed_size_binary(), |
80 | 0 | Token::Decimal32 => self.parse_decimal_32(), |
81 | 0 | Token::Decimal64 => self.parse_decimal_64(), |
82 | 0 | Token::Decimal128 => self.parse_decimal_128(), |
83 | 0 | Token::Decimal256 => self.parse_decimal_256(), |
84 | 0 | Token::Dictionary => self.parse_dictionary(), |
85 | 0 | Token::List => self.parse_list(), |
86 | 0 | Token::ListView => self.parse_list_view(), |
87 | 0 | Token::LargeList => self.parse_large_list(), |
88 | 0 | Token::LargeListView => self.parse_large_list_view(), |
89 | 0 | Token::FixedSizeList => self.parse_fixed_size_list(), |
90 | 0 | Token::Struct => self.parse_struct(), |
91 | 0 | Token::Union => self.parse_union(), |
92 | 0 | Token::Map => self.parse_map(), |
93 | 0 | Token::RunEndEncoded => self.parse_run_end_encoded(), |
94 | 0 | tok => Err(make_error( |
95 | 0 | self.val, |
96 | 0 | &format!("finding next type, got unexpected '{tok}'"), |
97 | 0 | )), |
98 | | } |
99 | 0 | } |
100 | | |
101 | | /// parses Field, this is the inversion of `format_field` in `datatype_display.rs`. |
102 | | /// E.g: "a": nullable Int64 |
103 | | /// |
104 | | /// TODO: support metadata: `"a": nullable Int64 metadata: {"foo": "value"}` |
105 | 0 | fn parse_field(&mut self) -> ArrowResult<Field> { |
106 | 0 | let name = self.parse_double_quoted_string("Field")?; |
107 | 0 | self.expect_token(Token::Colon)?; |
108 | 0 | let nullable = self.parse_opt_nullable(); |
109 | 0 | let data_type = self.parse_next_type()?; |
110 | 0 | Ok(Field::new(name, data_type, nullable)) |
111 | 0 | } |
112 | | |
113 | | /// Parses field inside a list. Use `Field::LIST_FIELD_DEFAULT_NAME` |
114 | | /// if no field name is specified. |
115 | | /// E.g: `nullable Int64, field: 'foo'` or `nullable Int64` |
116 | | /// |
117 | | /// TODO: support metadata: `nullable Int64, metadata: {"foo2": "value"}` |
118 | 0 | fn parse_list_field(&mut self, context: &str) -> ArrowResult<Field> { |
119 | 0 | let nullable = self.parse_opt_nullable(); |
120 | 0 | let data_type = self.parse_next_type()?; |
121 | | |
122 | | // the field name (if exists) must be after a comma |
123 | 0 | let field_name = if self |
124 | 0 | .tokenizer |
125 | 0 | .next_if(|next| matches!(next, Ok(Token::Comma))) |
126 | 0 | .is_none() |
127 | | { |
128 | 0 | Field::LIST_FIELD_DEFAULT_NAME.into() |
129 | | } else { |
130 | | // expects: `field: 'field_name'`. |
131 | 0 | self.expect_token(Token::Field)?; |
132 | 0 | self.expect_token(Token::Colon)?; |
133 | 0 | self.parse_single_quoted_string(context)? |
134 | | }; |
135 | | |
136 | 0 | Ok(Field::new(field_name, data_type, nullable)) |
137 | 0 | } |
138 | | |
139 | | /// Parses the List type (called after `List` has been consumed) |
140 | | /// E.g: List(nullable Int64, field: 'foo') |
141 | 0 | fn parse_list(&mut self) -> ArrowResult<DataType> { |
142 | 0 | self.expect_token(Token::LParen)?; |
143 | 0 | let field = self.parse_list_field("List")?; |
144 | 0 | self.expect_token(Token::RParen)?; |
145 | 0 | Ok(DataType::List(Arc::new(field))) |
146 | 0 | } |
147 | | |
148 | | /// Parses the ListView type (called after `ListView` has been consumed) |
149 | | /// E.g: ListView(nullable Int64, field: 'foo') |
150 | 0 | fn parse_list_view(&mut self) -> ArrowResult<DataType> { |
151 | 0 | self.expect_token(Token::LParen)?; |
152 | 0 | let field = self.parse_list_field("ListView")?; |
153 | 0 | self.expect_token(Token::RParen)?; |
154 | 0 | Ok(DataType::ListView(Arc::new(field))) |
155 | 0 | } |
156 | | |
157 | | /// Parses the LargeList type (called after `LargeList` has been consumed) |
158 | | /// E.g: LargeList(nullable Int64, field: 'foo') |
159 | 0 | fn parse_large_list(&mut self) -> ArrowResult<DataType> { |
160 | 0 | self.expect_token(Token::LParen)?; |
161 | 0 | let field = self.parse_list_field("LargeList")?; |
162 | 0 | self.expect_token(Token::RParen)?; |
163 | 0 | Ok(DataType::LargeList(Arc::new(field))) |
164 | 0 | } |
165 | | |
166 | | /// Parses the LargeListView type (called after `LargeListView` has been consumed) |
167 | | /// E.g: LargeListView(nullable Int64, field: 'foo') |
168 | 0 | fn parse_large_list_view(&mut self) -> ArrowResult<DataType> { |
169 | 0 | self.expect_token(Token::LParen)?; |
170 | 0 | let field = self.parse_list_field("LargeListView")?; |
171 | 0 | self.expect_token(Token::RParen)?; |
172 | 0 | Ok(DataType::LargeListView(Arc::new(field))) |
173 | 0 | } |
174 | | |
175 | | /// Parses the FixedSizeList type (called after `FixedSizeList` has been consumed) |
176 | | /// E.g: FixedSizeList(5 x nullable Int64, field: 'foo') |
177 | 0 | fn parse_fixed_size_list(&mut self) -> ArrowResult<DataType> { |
178 | 0 | self.expect_token(Token::LParen)?; |
179 | 0 | let length = self.parse_i32("FixedSizeList")?; |
180 | 0 | self.expect_token(Token::X)?; |
181 | 0 | let field = self.parse_list_field("FixedSizeList")?; |
182 | 0 | self.expect_token(Token::RParen)?; |
183 | 0 | Ok(DataType::FixedSizeList(Arc::new(field), length)) |
184 | 0 | } |
185 | | |
186 | | /// Parses the next timeunit |
187 | 0 | fn parse_time_unit(&mut self, context: &str) -> ArrowResult<TimeUnit> { |
188 | 0 | match self.next_token()? { |
189 | 0 | Token::TimeUnit(time_unit) => Ok(time_unit), |
190 | 0 | tok => Err(make_error( |
191 | 0 | self.val, |
192 | 0 | &format!("finding TimeUnit for {context}, got {tok}"), |
193 | 0 | )), |
194 | | } |
195 | 0 | } |
196 | | |
197 | | /// Parses the next double quoted string |
198 | 0 | fn parse_double_quoted_string(&mut self, context: &str) -> ArrowResult<String> { |
199 | 0 | let token = self.next_token()?; |
200 | 0 | if let Token::DoubleQuotedString(string) = token { |
201 | 0 | Ok(string) |
202 | | } else { |
203 | 0 | Err(make_error( |
204 | 0 | self.val, |
205 | 0 | &format!("expected double quoted string for {context}, got '{token}'"), |
206 | 0 | )) |
207 | | } |
208 | 0 | } |
209 | | |
210 | | /// Parses the next single quoted string |
211 | 0 | fn parse_single_quoted_string(&mut self, context: &str) -> ArrowResult<String> { |
212 | 0 | let token = self.next_token()?; |
213 | 0 | if let Token::SingleQuotedString(string) = token { |
214 | 0 | Ok(string) |
215 | | } else { |
216 | 0 | Err(make_error( |
217 | 0 | self.val, |
218 | 0 | &format!("expected single quoted string for {context}, got '{token}'"), |
219 | 0 | )) |
220 | | } |
221 | 0 | } |
222 | | |
223 | | /// Parses the next integer value |
224 | 0 | fn parse_i64(&mut self, context: &str) -> ArrowResult<i64> { |
225 | 0 | match self.next_token()? { |
226 | 0 | Token::Integer(v) => Ok(v), |
227 | 0 | tok => Err(make_error( |
228 | 0 | self.val, |
229 | 0 | &format!("finding i64 for {context}, got '{tok}'"), |
230 | 0 | )), |
231 | | } |
232 | 0 | } |
233 | | |
234 | | /// Parses the next i32 integer value |
235 | 0 | fn parse_i32(&mut self, context: &str) -> ArrowResult<i32> { |
236 | 0 | let length = self.parse_i64(context)?; |
237 | 0 | length.try_into().map_err(|e| { |
238 | 0 | make_error( |
239 | 0 | self.val, |
240 | 0 | &format!("converting {length} into i32 for {context}: {e}"), |
241 | | ) |
242 | 0 | }) |
243 | 0 | } |
244 | | |
245 | | /// Parses the next i8 integer value |
246 | 0 | fn parse_i8(&mut self, context: &str) -> ArrowResult<i8> { |
247 | 0 | let length = self.parse_i64(context)?; |
248 | 0 | length.try_into().map_err(|e| { |
249 | 0 | make_error( |
250 | 0 | self.val, |
251 | 0 | &format!("converting {length} into i8 for {context}: {e}"), |
252 | | ) |
253 | 0 | }) |
254 | 0 | } |
255 | | |
256 | | /// Parses the next u8 integer value |
257 | 0 | fn parse_u8(&mut self, context: &str) -> ArrowResult<u8> { |
258 | 0 | let length = self.parse_i64(context)?; |
259 | 0 | length.try_into().map_err(|e| { |
260 | 0 | make_error( |
261 | 0 | self.val, |
262 | 0 | &format!("converting {length} into u8 for {context}: {e}"), |
263 | | ) |
264 | 0 | }) |
265 | 0 | } |
266 | | |
267 | | /// Parses the next timestamp (called after `Timestamp` has been consumed) |
268 | 0 | fn parse_timestamp(&mut self) -> ArrowResult<DataType> { |
269 | 0 | self.expect_token(Token::LParen)?; |
270 | 0 | let time_unit = self.parse_time_unit("Timestamp")?; |
271 | | |
272 | | let timezone; |
273 | 0 | match self.next_token()? { |
274 | | Token::Comma => { |
275 | 0 | match self.next_token()? { |
276 | | // Support old style `Timestamp(Nanosecond, None)` |
277 | 0 | Token::None => { |
278 | 0 | timezone = None; |
279 | 0 | } |
280 | | // Support old style `Timestamp(Nanosecond, Some("Timezone"))` |
281 | | Token::Some => { |
282 | 0 | self.expect_token(Token::LParen)?; |
283 | 0 | timezone = Some(self.parse_double_quoted_string("Timezone")?); |
284 | 0 | self.expect_token(Token::RParen)?; |
285 | | } |
286 | 0 | Token::DoubleQuotedString(tz) => { |
287 | 0 | // Support new style `Timestamp(Nanosecond, "Timezone")` |
288 | 0 | timezone = Some(tz); |
289 | 0 | } |
290 | 0 | tok => { |
291 | 0 | return Err(make_error( |
292 | 0 | self.val, |
293 | 0 | &format!("Expected None, Some, or a timezone string, got {tok:?}"), |
294 | 0 | )); |
295 | | } |
296 | | }; |
297 | 0 | self.expect_token(Token::RParen)?; |
298 | | } |
299 | | // No timezone (e.g `Timestamp(ns)`) |
300 | 0 | Token::RParen => { |
301 | 0 | timezone = None; |
302 | 0 | } |
303 | 0 | next_token => { |
304 | 0 | return Err(make_error( |
305 | 0 | self.val, |
306 | 0 | &format!("Expected comma followed by a timezone, or an ), got {next_token:?}"), |
307 | 0 | )); |
308 | | } |
309 | | } |
310 | 0 | Ok(DataType::Timestamp(time_unit, timezone.map(Into::into))) |
311 | 0 | } |
312 | | |
313 | | /// Parses the next Time32 (called after `Time32` has been consumed) |
314 | 0 | fn parse_time32(&mut self) -> ArrowResult<DataType> { |
315 | 0 | self.expect_token(Token::LParen)?; |
316 | 0 | let time_unit = self.parse_time_unit("Time32")?; |
317 | 0 | self.expect_token(Token::RParen)?; |
318 | 0 | Ok(DataType::Time32(time_unit)) |
319 | 0 | } |
320 | | |
321 | | /// Parses the next Time64 (called after `Time64` has been consumed) |
322 | 0 | fn parse_time64(&mut self) -> ArrowResult<DataType> { |
323 | 0 | self.expect_token(Token::LParen)?; |
324 | 0 | let time_unit = self.parse_time_unit("Time64")?; |
325 | 0 | self.expect_token(Token::RParen)?; |
326 | 0 | Ok(DataType::Time64(time_unit)) |
327 | 0 | } |
328 | | |
329 | | /// Parses the next Duration (called after `Duration` has been consumed) |
330 | 0 | fn parse_duration(&mut self) -> ArrowResult<DataType> { |
331 | 0 | self.expect_token(Token::LParen)?; |
332 | 0 | let time_unit = self.parse_time_unit("Duration")?; |
333 | 0 | self.expect_token(Token::RParen)?; |
334 | 0 | Ok(DataType::Duration(time_unit)) |
335 | 0 | } |
336 | | |
337 | | /// Parses the next Interval (called after `Interval` has been consumed) |
338 | 0 | fn parse_interval(&mut self) -> ArrowResult<DataType> { |
339 | 0 | self.expect_token(Token::LParen)?; |
340 | 0 | let interval_unit = match self.next_token()? { |
341 | 0 | Token::IntervalUnit(interval_unit) => interval_unit, |
342 | 0 | tok => { |
343 | 0 | return Err(make_error( |
344 | 0 | self.val, |
345 | 0 | &format!("finding IntervalUnit for Interval, got {tok}"), |
346 | 0 | )); |
347 | | } |
348 | | }; |
349 | 0 | self.expect_token(Token::RParen)?; |
350 | 0 | Ok(DataType::Interval(interval_unit)) |
351 | 0 | } |
352 | | |
353 | | /// Parses the next FixedSizeBinary (called after `FixedSizeBinary` has been consumed) |
354 | 0 | fn parse_fixed_size_binary(&mut self) -> ArrowResult<DataType> { |
355 | 0 | self.expect_token(Token::LParen)?; |
356 | 0 | let length = self.parse_i32("FixedSizeBinary")?; |
357 | 0 | self.expect_token(Token::RParen)?; |
358 | 0 | Ok(DataType::FixedSizeBinary(length)) |
359 | 0 | } |
360 | | |
361 | | /// Parses the next Decimal32 (called after `Decimal32` has been consumed) |
362 | 0 | fn parse_decimal_32(&mut self) -> ArrowResult<DataType> { |
363 | 0 | self.expect_token(Token::LParen)?; |
364 | 0 | let precision = self.parse_u8("Decimal32")?; |
365 | 0 | self.expect_token(Token::Comma)?; |
366 | 0 | let scale = self.parse_i8("Decimal32")?; |
367 | 0 | self.expect_token(Token::RParen)?; |
368 | 0 | Ok(DataType::Decimal32(precision, scale)) |
369 | 0 | } |
370 | | |
371 | | /// Parses the next Decimal64 (called after `Decimal64` has been consumed) |
372 | 0 | fn parse_decimal_64(&mut self) -> ArrowResult<DataType> { |
373 | 0 | self.expect_token(Token::LParen)?; |
374 | 0 | let precision = self.parse_u8("Decimal64")?; |
375 | 0 | self.expect_token(Token::Comma)?; |
376 | 0 | let scale = self.parse_i8("Decimal64")?; |
377 | 0 | self.expect_token(Token::RParen)?; |
378 | 0 | Ok(DataType::Decimal64(precision, scale)) |
379 | 0 | } |
380 | | |
381 | | /// Parses the next Decimal128 (called after `Decimal128` has been consumed) |
382 | 0 | fn parse_decimal_128(&mut self) -> ArrowResult<DataType> { |
383 | 0 | self.expect_token(Token::LParen)?; |
384 | 0 | let precision = self.parse_u8("Decimal128")?; |
385 | 0 | self.expect_token(Token::Comma)?; |
386 | 0 | let scale = self.parse_i8("Decimal128")?; |
387 | 0 | self.expect_token(Token::RParen)?; |
388 | 0 | Ok(DataType::Decimal128(precision, scale)) |
389 | 0 | } |
390 | | |
391 | | /// Parses the next Decimal256 (called after `Decimal256` has been consumed) |
392 | 0 | fn parse_decimal_256(&mut self) -> ArrowResult<DataType> { |
393 | 0 | self.expect_token(Token::LParen)?; |
394 | 0 | let precision = self.parse_u8("Decimal256")?; |
395 | 0 | self.expect_token(Token::Comma)?; |
396 | 0 | let scale = self.parse_i8("Decimal256")?; |
397 | 0 | self.expect_token(Token::RParen)?; |
398 | 0 | Ok(DataType::Decimal256(precision, scale)) |
399 | 0 | } |
400 | | |
401 | | /// Parses the next Dictionary (called after `Dictionary` has been consumed) |
402 | 0 | fn parse_dictionary(&mut self) -> ArrowResult<DataType> { |
403 | 0 | self.expect_token(Token::LParen)?; |
404 | 0 | let key_type = self.parse_next_type()?; |
405 | 0 | self.expect_token(Token::Comma)?; |
406 | 0 | let value_type = self.parse_next_type()?; |
407 | 0 | self.expect_token(Token::RParen)?; |
408 | 0 | Ok(DataType::Dictionary( |
409 | 0 | Box::new(key_type), |
410 | 0 | Box::new(value_type), |
411 | 0 | )) |
412 | 0 | } |
413 | | |
414 | | /// Parses the next Struct (called after `Struct` has been consumed) |
415 | 0 | fn parse_struct(&mut self) -> ArrowResult<DataType> { |
416 | 0 | self.expect_token(Token::LParen)?; |
417 | 0 | let mut fields = Vec::new(); |
418 | | loop { |
419 | 0 | if self |
420 | 0 | .tokenizer |
421 | 0 | .next_if(|next| matches!(next, Ok(Token::RParen))) |
422 | 0 | .is_some() |
423 | | { |
424 | 0 | break; |
425 | 0 | } |
426 | | |
427 | 0 | let field = self.parse_field()?; |
428 | 0 | fields.push(Arc::new(field)); |
429 | 0 | match self.next_token()? { |
430 | 0 | Token::Comma => continue, |
431 | 0 | Token::RParen => break, |
432 | 0 | tok => { |
433 | 0 | return Err(make_error( |
434 | 0 | self.val, |
435 | 0 | &format!( |
436 | 0 | "Unexpected token while parsing Struct fields. Expected ',' or ')', but got '{tok}'" |
437 | 0 | ), |
438 | 0 | )); |
439 | | } |
440 | | } |
441 | | } |
442 | 0 | Ok(DataType::Struct(Fields::from(fields))) |
443 | 0 | } |
444 | | |
445 | | /// Parses the next Union (called after `Union` has been consumed) |
446 | | /// E.g: Union(Sparse, 0: ("a": Int32), 1: ("b": nullable Utf8)) |
447 | 0 | fn parse_union(&mut self) -> ArrowResult<DataType> { |
448 | 0 | self.expect_token(Token::LParen)?; |
449 | 0 | let union_mode = self.parse_union_mode()?; |
450 | 0 | let mut type_ids = vec![]; |
451 | 0 | let mut fields = vec![]; |
452 | | loop { |
453 | 0 | if self |
454 | 0 | .tokenizer |
455 | 0 | .next_if(|next| matches!(next, Ok(Token::RParen))) |
456 | 0 | .is_some() |
457 | | { |
458 | 0 | break; |
459 | 0 | } |
460 | 0 | self.expect_token(Token::Comma)?; |
461 | 0 | let (type_id, field) = self.parse_union_field()?; |
462 | 0 | type_ids.push(type_id); |
463 | 0 | fields.push(field); |
464 | | } |
465 | 0 | Ok(DataType::Union( |
466 | 0 | UnionFields::new(type_ids, fields), |
467 | 0 | union_mode, |
468 | 0 | )) |
469 | 0 | } |
470 | | |
471 | | /// Parses the next UnionMode |
472 | 0 | fn parse_union_mode(&mut self) -> ArrowResult<UnionMode> { |
473 | 0 | match self.next_token()? { |
474 | 0 | Token::UnionMode(union_mode) => Ok(union_mode), |
475 | 0 | tok => Err(make_error( |
476 | 0 | self.val, |
477 | 0 | &format!("finding UnionMode for Union, got {tok}"), |
478 | 0 | )), |
479 | | } |
480 | 0 | } |
481 | | |
482 | | /// Parses the next UnionField |
483 | | /// 0: ("a": nullable Int32) |
484 | 0 | fn parse_union_field(&mut self) -> ArrowResult<(i8, Field)> { |
485 | 0 | let type_id = self.parse_i8("UnionField")?; |
486 | 0 | self.expect_token(Token::Colon)?; |
487 | 0 | self.expect_token(Token::LParen)?; |
488 | 0 | let field = self.parse_field()?; |
489 | 0 | self.expect_token(Token::RParen)?; |
490 | 0 | Ok((type_id, field)) |
491 | 0 | } |
492 | | |
493 | | /// Parses the next Map (called after `Map` has been consumed) |
494 | | /// E.g: Map("entries": Struct("key": Utf8, "value": nullable Int32), sorted) |
495 | 0 | fn parse_map(&mut self) -> ArrowResult<DataType> { |
496 | 0 | self.expect_token(Token::LParen)?; |
497 | 0 | let field = self.parse_field()?; |
498 | 0 | self.expect_token(Token::Comma)?; |
499 | 0 | let sorted = self.parse_map_sorted()?; |
500 | 0 | self.expect_token(Token::RParen)?; |
501 | 0 | Ok(DataType::Map(Arc::new(field), sorted)) |
502 | 0 | } |
503 | | |
504 | | /// Parses map's sorted |
505 | 0 | fn parse_map_sorted(&mut self) -> ArrowResult<bool> { |
506 | 0 | match self.next_token()? { |
507 | 0 | Token::MapSorted(sorted) => Ok(sorted), |
508 | 0 | tok => Err(make_error( |
509 | 0 | self.val, |
510 | 0 | &format!("Expected sorted or unsorted for a map; got {tok:?}"), |
511 | 0 | )), |
512 | | } |
513 | 0 | } |
514 | | |
515 | | /// Parses the next RunEndEncoded (called after `RunEndEncoded` has been consumed) |
516 | | /// E.g: RunEndEncoded("run_ends": UInt32, "values": nullable Int32) |
517 | 0 | fn parse_run_end_encoded(&mut self) -> ArrowResult<DataType> { |
518 | 0 | self.expect_token(Token::LParen)?; |
519 | 0 | let run_ends = self.parse_field()?; |
520 | 0 | self.expect_token(Token::Comma)?; |
521 | 0 | let values = self.parse_field()?; |
522 | 0 | self.expect_token(Token::RParen)?; |
523 | 0 | Ok(DataType::RunEndEncoded( |
524 | 0 | Arc::new(run_ends), |
525 | 0 | Arc::new(values), |
526 | 0 | )) |
527 | 0 | } |
528 | | |
529 | | /// return and consume if the next token is `Token::Nullable` |
530 | 0 | fn parse_opt_nullable(&mut self) -> bool { |
531 | 0 | self.tokenizer |
532 | 0 | .next_if(|next| matches!(next, Ok(Token::Nullable))) |
533 | 0 | .is_some() |
534 | 0 | } |
535 | | |
536 | | /// return the next token, or an error if there are none left |
537 | 0 | fn next_token(&mut self) -> ArrowResult<Token> { |
538 | 0 | match self.tokenizer.next() { |
539 | 0 | None => Err(make_error(self.val, "finding next token")), |
540 | 0 | Some(token) => token, |
541 | | } |
542 | 0 | } |
543 | | |
544 | | /// consume the next token, returning OK(()) if it matches tok, and Err if not |
545 | 0 | fn expect_token(&mut self, tok: Token) -> ArrowResult<()> { |
546 | 0 | let next_token = self.next_token()?; |
547 | 0 | if next_token == tok { |
548 | 0 | Ok(()) |
549 | | } else { |
550 | 0 | Err(make_error_expected(self.val, &tok, &next_token)) |
551 | | } |
552 | 0 | } |
553 | | } |
554 | | |
/// Returns true if `c` ends a bare word: one of `(`, `)`, `,`, `:` or a
/// space.
fn is_separator(c: char) -> bool {
    matches!(c, '(' | ')' | ',' | ':' | ' ')
}
559 | | |
/// The kind of quote character that delimits a quoted string token.
enum QuoteType {
    /// Delimited by `"`
    Double,
    /// Delimited by `'`
    Single,
}
564 | | |
/// Splits a string like `Dictionary(Int32, Int64)` into tokens suitable for parsing
///
/// For example the string "Timestamp(ns)" would be parsed into:
///
/// * Token::Timestamp
/// * Token::LParen
/// * Token::TimeUnit(TimeUnit::Nanosecond)
/// * Token::RParen
#[derive(Debug)]
struct Tokenizer<'a> {
    /// The complete input string, kept so error messages can quote it.
    val: &'a str,
    /// Remaining characters of `val` that have not been tokenized yet.
    chars: Peekable<Chars<'a>>,
    // temporary buffer for parsing words
    word: String,
}
580 | | |
581 | | impl<'a> Tokenizer<'a> { |
582 | 0 | fn new(val: &'a str) -> Self { |
583 | 0 | Self { |
584 | 0 | val, |
585 | 0 | chars: val.chars().peekable(), |
586 | 0 | word: String::new(), |
587 | 0 | } |
588 | 0 | } |
589 | | |
590 | | /// returns the next char, without consuming it |
591 | 0 | fn peek_next_char(&mut self) -> Option<char> { |
592 | 0 | self.chars.peek().copied() |
593 | 0 | } |
594 | | |
595 | | /// returns the next char, and consuming it |
596 | 0 | fn next_char(&mut self) -> Option<char> { |
597 | 0 | self.chars.next() |
598 | 0 | } |
599 | | |
600 | | /// parse the characters in val starting at pos, until the next |
601 | | /// `,`, `(`, or `)` or end of line |
602 | 0 | fn parse_word(&mut self) -> ArrowResult<Token> { |
603 | | // reset temp space |
604 | 0 | self.word.clear(); |
605 | | loop { |
606 | 0 | match self.peek_next_char() { |
607 | 0 | None => break, |
608 | 0 | Some(c) if is_separator(c) => break, |
609 | 0 | Some(c) => { |
610 | 0 | self.next_char(); |
611 | 0 | self.word.push(c); |
612 | 0 | } |
613 | | } |
614 | | } |
615 | | |
616 | 0 | if let Some(c) = self.word.chars().next() { |
617 | | // if it started with a number, try parsing it as an integer |
618 | 0 | if c == '-' || c.is_numeric() { |
619 | 0 | let val: i64 = self.word.parse().map_err(|e| { |
620 | 0 | make_error(self.val, &format!("parsing {} as integer: {e}", self.word)) |
621 | 0 | })?; |
622 | 0 | return Ok(Token::Integer(val)); |
623 | 0 | } |
624 | 0 | } |
625 | | |
626 | | // figure out what the word was |
627 | 0 | let token = match self.word.as_str() { |
628 | 0 | "Null" => Token::SimpleType(DataType::Null), |
629 | 0 | "Boolean" => Token::SimpleType(DataType::Boolean), |
630 | | |
631 | 0 | "Int8" => Token::SimpleType(DataType::Int8), |
632 | 0 | "Int16" => Token::SimpleType(DataType::Int16), |
633 | 0 | "Int32" => Token::SimpleType(DataType::Int32), |
634 | 0 | "Int64" => Token::SimpleType(DataType::Int64), |
635 | | |
636 | 0 | "UInt8" => Token::SimpleType(DataType::UInt8), |
637 | 0 | "UInt16" => Token::SimpleType(DataType::UInt16), |
638 | 0 | "UInt32" => Token::SimpleType(DataType::UInt32), |
639 | 0 | "UInt64" => Token::SimpleType(DataType::UInt64), |
640 | | |
641 | 0 | "Utf8" => Token::SimpleType(DataType::Utf8), |
642 | 0 | "LargeUtf8" => Token::SimpleType(DataType::LargeUtf8), |
643 | 0 | "Utf8View" => Token::SimpleType(DataType::Utf8View), |
644 | 0 | "Binary" => Token::SimpleType(DataType::Binary), |
645 | 0 | "BinaryView" => Token::SimpleType(DataType::BinaryView), |
646 | 0 | "LargeBinary" => Token::SimpleType(DataType::LargeBinary), |
647 | | |
648 | 0 | "Float16" => Token::SimpleType(DataType::Float16), |
649 | 0 | "Float32" => Token::SimpleType(DataType::Float32), |
650 | 0 | "Float64" => Token::SimpleType(DataType::Float64), |
651 | | |
652 | 0 | "Date32" => Token::SimpleType(DataType::Date32), |
653 | 0 | "Date64" => Token::SimpleType(DataType::Date64), |
654 | | |
655 | 0 | "List" => Token::List, |
656 | 0 | "ListView" => Token::ListView, |
657 | 0 | "LargeList" => Token::LargeList, |
658 | 0 | "LargeListView" => Token::LargeListView, |
659 | 0 | "FixedSizeList" => Token::FixedSizeList, |
660 | | |
661 | 0 | "s" | "Second" => Token::TimeUnit(TimeUnit::Second), |
662 | 0 | "ms" | "Millisecond" => Token::TimeUnit(TimeUnit::Millisecond), |
663 | 0 | "µs" | "us" | "Microsecond" => Token::TimeUnit(TimeUnit::Microsecond), |
664 | 0 | "ns" | "Nanosecond" => Token::TimeUnit(TimeUnit::Nanosecond), |
665 | | |
666 | 0 | "Timestamp" => Token::Timestamp, |
667 | 0 | "Time32" => Token::Time32, |
668 | 0 | "Time64" => Token::Time64, |
669 | 0 | "Duration" => Token::Duration, |
670 | 0 | "Interval" => Token::Interval, |
671 | 0 | "Dictionary" => Token::Dictionary, |
672 | | |
673 | 0 | "FixedSizeBinary" => Token::FixedSizeBinary, |
674 | | |
675 | 0 | "Decimal32" => Token::Decimal32, |
676 | 0 | "Decimal64" => Token::Decimal64, |
677 | 0 | "Decimal128" => Token::Decimal128, |
678 | 0 | "Decimal256" => Token::Decimal256, |
679 | | |
680 | 0 | "YearMonth" => Token::IntervalUnit(IntervalUnit::YearMonth), |
681 | 0 | "DayTime" => Token::IntervalUnit(IntervalUnit::DayTime), |
682 | 0 | "MonthDayNano" => Token::IntervalUnit(IntervalUnit::MonthDayNano), |
683 | | |
684 | 0 | "Some" => Token::Some, |
685 | 0 | "None" => Token::None, |
686 | | |
687 | 0 | "nullable" => Token::Nullable, |
688 | 0 | "field" => Token::Field, |
689 | 0 | "x" => Token::X, |
690 | | |
691 | 0 | "Struct" => Token::Struct, |
692 | | |
693 | 0 | "Union" => Token::Union, |
694 | 0 | "Sparse" => Token::UnionMode(UnionMode::Sparse), |
695 | 0 | "Dense" => Token::UnionMode(UnionMode::Dense), |
696 | | |
697 | 0 | "Map" => Token::Map, |
698 | 0 | "sorted" => Token::MapSorted(true), |
699 | 0 | "unsorted" => Token::MapSorted(false), |
700 | | |
701 | 0 | "RunEndEncoded" => Token::RunEndEncoded, |
702 | | |
703 | 0 | token => { |
704 | 0 | return Err(make_error(self.val, &format!("unknown token: {token}"))); |
705 | | } |
706 | | }; |
707 | 0 | Ok(token) |
708 | 0 | } |
709 | | |
710 | | /// Parses e.g. `"foo bar"`, `'foo bar'` |
711 | 0 | fn parse_quoted_string(&mut self, quote_type: QuoteType) -> ArrowResult<Token> { |
712 | 0 | let quote = match quote_type { |
713 | 0 | QuoteType::Double => '\"', |
714 | 0 | QuoteType::Single => '\'', |
715 | | }; |
716 | | |
717 | 0 | if self.next_char() != Some(quote) { |
718 | 0 | return Err(make_error(self.val, "Expected \"")); |
719 | 0 | } |
720 | | |
721 | | // reset temp space |
722 | 0 | self.word.clear(); |
723 | | |
724 | 0 | let mut is_escaped = false; |
725 | | |
726 | | loop { |
727 | 0 | match self.next_char() { |
728 | | None => { |
729 | 0 | return Err(ArrowError::ParseError(format!( |
730 | 0 | "Unterminated string at: \"{}", |
731 | 0 | self.word |
732 | 0 | ))); |
733 | | } |
734 | 0 | Some(c) => match c { |
735 | 0 | '\\' => { |
736 | 0 | is_escaped = true; |
737 | 0 | self.word.push(c); |
738 | 0 | } |
739 | 0 | c if c == quote => { |
740 | 0 | if is_escaped { |
741 | 0 | self.word.push(c); |
742 | 0 | is_escaped = false; |
743 | 0 | } else { |
744 | 0 | break; |
745 | | } |
746 | | } |
747 | 0 | c => { |
748 | 0 | self.word.push(c); |
749 | 0 | } |
750 | | }, |
751 | | } |
752 | | } |
753 | | |
754 | 0 | let val: String = self.word.parse().map_err(|err| { |
755 | 0 | ArrowError::ParseError(format!("Failed to parse string: \"{}\": {err}", self.word)) |
756 | 0 | })?; |
757 | | |
758 | 0 | if val.is_empty() { |
759 | | // Using empty strings as field names is just asking for trouble |
760 | 0 | return Err(make_error(self.val, "empty strings aren't allowed")); |
761 | 0 | } |
762 | | |
763 | 0 | match quote_type { |
764 | 0 | QuoteType::Double => Ok(Token::DoubleQuotedString(val)), |
765 | 0 | QuoteType::Single => Ok(Token::SingleQuotedString(val)), |
766 | | } |
767 | 0 | } |
768 | | } |
769 | | |
770 | | impl Iterator for Tokenizer<'_> { |
771 | | type Item = ArrowResult<Token>; |
772 | | |
773 | 0 | fn next(&mut self) -> Option<Self::Item> { |
774 | | loop { |
775 | 0 | match self.peek_next_char()? { |
776 | | ' ' => { |
777 | | // skip whitespace |
778 | 0 | self.next_char(); |
779 | 0 | continue; |
780 | | } |
781 | | '"' => { |
782 | 0 | return Some(self.parse_quoted_string(QuoteType::Double)); |
783 | | } |
784 | | '\'' => { |
785 | 0 | return Some(self.parse_quoted_string(QuoteType::Single)); |
786 | | } |
787 | | '(' => { |
788 | 0 | self.next_char(); |
789 | 0 | return Some(Ok(Token::LParen)); |
790 | | } |
791 | | ')' => { |
792 | 0 | self.next_char(); |
793 | 0 | return Some(Ok(Token::RParen)); |
794 | | } |
795 | | ',' => { |
796 | 0 | self.next_char(); |
797 | 0 | return Some(Ok(Token::Comma)); |
798 | | } |
799 | | ':' => { |
800 | 0 | self.next_char(); |
801 | 0 | return Some(Ok(Token::Colon)); |
802 | | } |
803 | 0 | _ => return Some(self.parse_word()), |
804 | | } |
805 | | } |
806 | 0 | } |
807 | | } |
808 | | |
809 | | /// Grammar is |
810 | | /// |
811 | | #[derive(Debug, PartialEq)] |
812 | | enum Token { |
813 | | // Null, or Int32 |
814 | | SimpleType(DataType), |
815 | | Timestamp, |
816 | | Time32, |
817 | | Time64, |
818 | | Duration, |
819 | | Interval, |
820 | | FixedSizeBinary, |
821 | | Decimal32, |
822 | | Decimal64, |
823 | | Decimal128, |
824 | | Decimal256, |
825 | | Dictionary, |
826 | | TimeUnit(TimeUnit), |
827 | | IntervalUnit(IntervalUnit), |
828 | | LParen, |
829 | | RParen, |
830 | | Comma, |
831 | | Colon, |
832 | | Some, |
833 | | None, |
834 | | Integer(i64), |
835 | | DoubleQuotedString(String), |
836 | | SingleQuotedString(String), |
837 | | List, |
838 | | ListView, |
839 | | LargeList, |
840 | | LargeListView, |
841 | | FixedSizeList, |
842 | | Struct, |
843 | | Union, |
844 | | UnionMode(UnionMode), |
845 | | Map, |
846 | | MapSorted(bool), |
847 | | RunEndEncoded, |
848 | | Nullable, |
849 | | Field, |
850 | | X, |
851 | | } |
852 | | |
853 | | impl Display for Token { |
854 | 0 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { |
855 | 0 | match self { |
856 | 0 | Token::SimpleType(t) => write!(f, "{t}"), |
857 | 0 | Token::List => write!(f, "List"), |
858 | 0 | Token::ListView => write!(f, "ListView"), |
859 | 0 | Token::LargeList => write!(f, "LargeList"), |
860 | 0 | Token::LargeListView => write!(f, "LargeListView"), |
861 | 0 | Token::FixedSizeList => write!(f, "FixedSizeList"), |
862 | 0 | Token::Timestamp => write!(f, "Timestamp"), |
863 | 0 | Token::Time32 => write!(f, "Time32"), |
864 | 0 | Token::Time64 => write!(f, "Time64"), |
865 | 0 | Token::Duration => write!(f, "Duration"), |
866 | 0 | Token::Interval => write!(f, "Interval"), |
867 | 0 | Token::TimeUnit(u) => write!(f, "TimeUnit({u:?})"), |
868 | 0 | Token::IntervalUnit(u) => write!(f, "IntervalUnit({u:?})"), |
869 | 0 | Token::LParen => write!(f, "("), |
870 | 0 | Token::RParen => write!(f, ")"), |
871 | 0 | Token::Comma => write!(f, ","), |
872 | 0 | Token::Colon => write!(f, ":"), |
873 | 0 | Token::Some => write!(f, "Some"), |
874 | 0 | Token::None => write!(f, "None"), |
875 | 0 | Token::FixedSizeBinary => write!(f, "FixedSizeBinary"), |
876 | 0 | Token::Decimal32 => write!(f, "Decimal32"), |
877 | 0 | Token::Decimal64 => write!(f, "Decimal64"), |
878 | 0 | Token::Decimal128 => write!(f, "Decimal128"), |
879 | 0 | Token::Decimal256 => write!(f, "Decimal256"), |
880 | 0 | Token::Dictionary => write!(f, "Dictionary"), |
881 | 0 | Token::Integer(v) => write!(f, "Integer({v})"), |
882 | 0 | Token::DoubleQuotedString(s) => write!(f, "DoubleQuotedString({s})"), |
883 | 0 | Token::SingleQuotedString(s) => write!(f, "SingleQuotedString({s})"), |
884 | 0 | Token::Struct => write!(f, "Struct"), |
885 | 0 | Token::Union => write!(f, "Union"), |
886 | 0 | Token::UnionMode(m) => write!(f, "{m:?}"), |
887 | 0 | Token::Map => write!(f, "Map"), |
888 | 0 | Token::MapSorted(sorted) => { |
889 | 0 | write!(f, "{}", if *sorted { "sorted" } else { "unsorted" }) |
890 | | } |
891 | 0 | Token::RunEndEncoded => write!(f, "RunEndEncoded"), |
892 | 0 | Token::Nullable => write!(f, "nullable"), |
893 | 0 | Token::Field => write!(f, "field"), |
894 | 0 | Token::X => write!(f, "x"), |
895 | | } |
896 | 0 | } |
897 | | } |
898 | | |
899 | | #[cfg(test)] |
900 | | mod test { |
901 | | use super::*; |
902 | | |
903 | | #[test] |
904 | | fn test_parse_data_type() { |
905 | | // this ensures types can be parsed correctly from their string representations |
906 | | for dt in list_datatypes() { |
907 | | round_trip(dt) |
908 | | } |
909 | | } |
910 | | |
911 | | /// Ensure we converting data_type to a string, and then parse it as a type |
912 | | /// verifying it is the same |
913 | | fn round_trip(data_type: DataType) { |
914 | | let data_type_string = data_type.to_string(); |
915 | | println!("Input '{data_type_string}' ({data_type:?})"); |
916 | | let parsed_type = parse_data_type(&data_type_string).unwrap(); |
917 | | assert_eq!( |
918 | | data_type, parsed_type, |
919 | | "Mismatch parsing {data_type_string}" |
920 | | ); |
921 | | } |
922 | | |
    /// Returns the [`DataType`]s that `test_parse_data_type` feeds through
    /// `round_trip`, i.e. formats as a string and parses back.
    ///
    /// Covers types without children first, followed by nested types
    /// (dictionaries, structs, the list family, unions, maps and run end
    /// encoded types), including nested-in-nested and empty variants.
    fn list_datatypes() -> Vec<DataType> {
        vec![
            // ---------
            // Non Nested types
            // ---------
            DataType::Null,
            DataType::Boolean,
            DataType::Int8,
            DataType::Int16,
            DataType::Int32,
            DataType::Int64,
            DataType::UInt8,
            DataType::UInt16,
            DataType::UInt32,
            DataType::UInt64,
            DataType::Float16,
            DataType::Float32,
            DataType::Float64,
            DataType::Timestamp(TimeUnit::Second, None),
            DataType::Timestamp(TimeUnit::Millisecond, None),
            DataType::Timestamp(TimeUnit::Microsecond, None),
            DataType::Timestamp(TimeUnit::Nanosecond, None),
            // we can't cover all possible timezones, here we only test utc and +08:00
            DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".into())),
            DataType::Timestamp(TimeUnit::Microsecond, Some("+00:00".into())),
            DataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".into())),
            DataType::Timestamp(TimeUnit::Second, Some("+00:00".into())),
            DataType::Timestamp(TimeUnit::Nanosecond, Some("+08:00".into())),
            DataType::Timestamp(TimeUnit::Microsecond, Some("+08:00".into())),
            DataType::Timestamp(TimeUnit::Millisecond, Some("+08:00".into())),
            DataType::Timestamp(TimeUnit::Second, Some("+08:00".into())),
            DataType::Date32,
            DataType::Date64,
            DataType::Time32(TimeUnit::Second),
            DataType::Time32(TimeUnit::Millisecond),
            DataType::Time32(TimeUnit::Microsecond),
            DataType::Time32(TimeUnit::Nanosecond),
            DataType::Time64(TimeUnit::Second),
            DataType::Time64(TimeUnit::Millisecond),
            DataType::Time64(TimeUnit::Microsecond),
            DataType::Time64(TimeUnit::Nanosecond),
            DataType::Duration(TimeUnit::Second),
            DataType::Duration(TimeUnit::Millisecond),
            DataType::Duration(TimeUnit::Microsecond),
            DataType::Duration(TimeUnit::Nanosecond),
            DataType::Interval(IntervalUnit::YearMonth),
            DataType::Interval(IntervalUnit::DayTime),
            DataType::Interval(IntervalUnit::MonthDayNano),
            DataType::Binary,
            DataType::BinaryView,
            // zero, positive and negative sizes are all expected to round trip
            DataType::FixedSizeBinary(0),
            DataType::FixedSizeBinary(1234),
            DataType::FixedSizeBinary(-432),
            DataType::LargeBinary,
            DataType::Utf8,
            DataType::Utf8View,
            DataType::LargeUtf8,
            DataType::Decimal32(7, 8),
            DataType::Decimal64(6, 9),
            DataType::Decimal128(7, 12),
            DataType::Decimal256(6, 13),
            // ---------
            // Nested types
            // ---------
            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
            DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)),
            DataType::Dictionary(
                Box::new(DataType::Int8),
                Box::new(DataType::Timestamp(TimeUnit::Nanosecond, None)),
            ),
            DataType::Dictionary(
                Box::new(DataType::Int8),
                Box::new(DataType::FixedSizeBinary(23)),
            ),
            DataType::Dictionary(
                Box::new(DataType::Int8),
                Box::new(
                    // nested dictionaries are probably a bad idea but they are possible
                    DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)),
                ),
            ),
            DataType::Struct(Fields::from(vec![
                Field::new("f1", DataType::Int64, true),
                Field::new("f2", DataType::Float64, true),
                Field::new(
                    "f3",
                    DataType::Timestamp(TimeUnit::Second, Some("+08:00".into())),
                    true,
                ),
                Field::new(
                    "f4",
                    DataType::Dictionary(
                        Box::new(DataType::Int8),
                        Box::new(DataType::FixedSizeBinary(23)),
                    ),
                    true,
                ),
            ])),
            // field names that are themselves spelled like type names
            DataType::Struct(Fields::from(vec![
                Field::new("Int64", DataType::Int64, true),
                Field::new("Float64", DataType::Float64, true),
            ])),
            DataType::Struct(Fields::from(vec![
                Field::new("f1", DataType::Int64, true),
                Field::new(
                    "nested_struct",
                    DataType::Struct(Fields::from(vec![Field::new("n1", DataType::Int64, true)])),
                    true,
                ),
            ])),
            DataType::Struct(Fields::from(vec![Field::new("f1", DataType::Int64, true)])),
            DataType::Struct(Fields::empty()),
            // list family: default and custom field names, nullable and not, nested
            DataType::List(Arc::new(Field::new_list_field(DataType::Int64, true))),
            DataType::List(Arc::new(Field::new_list_field(DataType::Int64, false))),
            DataType::List(Arc::new(Field::new("Int64", DataType::Int64, true))),
            DataType::List(Arc::new(Field::new("Int64", DataType::Int64, false))),
            DataType::List(Arc::new(Field::new(
                "nested_list",
                DataType::List(Arc::new(Field::new("Int64", DataType::Int64, true))),
                true,
            ))),
            DataType::ListView(Arc::new(Field::new_list_field(DataType::Int64, true))),
            DataType::ListView(Arc::new(Field::new_list_field(DataType::Int64, false))),
            DataType::ListView(Arc::new(Field::new("Int64", DataType::Int64, true))),
            DataType::ListView(Arc::new(Field::new("Int64", DataType::Int64, false))),
            DataType::ListView(Arc::new(Field::new(
                "nested_list_view",
                DataType::ListView(Arc::new(Field::new("Int64", DataType::Int64, true))),
                true,
            ))),
            DataType::LargeList(Arc::new(Field::new_list_field(DataType::Int64, true))),
            DataType::LargeList(Arc::new(Field::new_list_field(DataType::Int64, false))),
            DataType::LargeList(Arc::new(Field::new("Int64", DataType::Int64, true))),
            DataType::LargeList(Arc::new(Field::new("Int64", DataType::Int64, false))),
            DataType::LargeList(Arc::new(Field::new(
                "nested_large_list",
                DataType::LargeList(Arc::new(Field::new("Int64", DataType::Int64, true))),
                true,
            ))),
            DataType::LargeListView(Arc::new(Field::new_list_field(DataType::Int64, true))),
            DataType::LargeListView(Arc::new(Field::new_list_field(DataType::Int64, false))),
            DataType::LargeListView(Arc::new(Field::new("Int64", DataType::Int64, true))),
            DataType::LargeListView(Arc::new(Field::new("Int64", DataType::Int64, false))),
            DataType::LargeListView(Arc::new(Field::new(
                "nested_large_list_view",
                DataType::LargeListView(Arc::new(Field::new("Int64", DataType::Int64, true))),
                true,
            ))),
            DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int64, true)), 2),
            DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int64, false)), 2),
            DataType::FixedSizeList(Arc::new(Field::new("Int64", DataType::Int64, true)), 2),
            DataType::FixedSizeList(Arc::new(Field::new("Int64", DataType::Int64, false)), 2),
            DataType::FixedSizeList(
                Arc::new(Field::new(
                    "nested_fixed_size_list",
                    DataType::FixedSizeList(
                        Arc::new(Field::new("Int64", DataType::Int64, true)),
                        2,
                    ),
                    true,
                )),
                2,
            ),
            // unions: both modes, nested, single member and empty
            DataType::Union(
                UnionFields::new(
                    vec![0, 1],
                    vec![
                        Field::new("Int32", DataType::Int32, false),
                        Field::new("Utf8", DataType::Utf8, true),
                    ],
                ),
                UnionMode::Sparse,
            ),
            DataType::Union(
                UnionFields::new(
                    vec![0, 1],
                    vec![
                        Field::new("Int32", DataType::Int32, false),
                        Field::new("Utf8", DataType::Utf8, true),
                    ],
                ),
                UnionMode::Dense,
            ),
            DataType::Union(
                UnionFields::new(
                    vec![0, 1],
                    vec![
                        Field::new_union(
                            "nested_union",
                            vec![0, 1],
                            vec![
                                Field::new("Int32", DataType::Int32, false),
                                Field::new("Utf8", DataType::Utf8, true),
                            ],
                            UnionMode::Dense,
                        ),
                        Field::new("Utf8", DataType::Utf8, true),
                    ],
                ),
                UnionMode::Sparse,
            ),
            DataType::Union(
                UnionFields::new(vec![0], vec![Field::new("Int32", DataType::Int32, false)]),
                UnionMode::Dense,
            ),
            DataType::Union(
                UnionFields::new(Vec::<i8>::new(), Vec::<Field>::new()),
                UnionMode::Sparse,
            ),
            // maps: sorted and unsorted, then a map nested inside a map
            DataType::Map(Arc::new(Field::new("Int64", DataType::Int64, true)), true),
            DataType::Map(Arc::new(Field::new("Int64", DataType::Int64, true)), false),
            DataType::Map(
                Arc::new(Field::new_map(
                    "nested_map",
                    "entries",
                    Field::new("key", DataType::Utf8, false),
                    Field::new("value", DataType::Int32, true),
                    false,
                    true,
                )),
                true,
            ),
            DataType::RunEndEncoded(
                Arc::new(Field::new("run_ends", DataType::UInt32, false)),
                Arc::new(Field::new("values", DataType::Int32, true)),
            ),
            DataType::RunEndEncoded(
                Arc::new(Field::new(
                    "nested_run_end_encoded",
                    DataType::RunEndEncoded(
                        Arc::new(Field::new("run_ends", DataType::UInt32, false)),
                        Arc::new(Field::new("values", DataType::Int32, true)),
                    ),
                    true,
                )),
                Arc::new(Field::new("values", DataType::Int32, true)),
            ),
        ]
    }
1162 | | |
1163 | | #[test] |
1164 | | fn test_parse_data_type_whitespace_tolerance() { |
1165 | | // (string to parse, expected DataType) |
1166 | | let cases = [ |
1167 | | ("Int8", DataType::Int8), |
1168 | | ( |
1169 | | "Timestamp (ns)", |
1170 | | DataType::Timestamp(TimeUnit::Nanosecond, None), |
1171 | | ), |
1172 | | ( |
1173 | | "Timestamp (ns) ", |
1174 | | DataType::Timestamp(TimeUnit::Nanosecond, None), |
1175 | | ), |
1176 | | ( |
1177 | | " Timestamp (ns )", |
1178 | | DataType::Timestamp(TimeUnit::Nanosecond, None), |
1179 | | ), |
1180 | | ( |
1181 | | "Timestamp (ns ) ", |
1182 | | DataType::Timestamp(TimeUnit::Nanosecond, None), |
1183 | | ), |
1184 | | ]; |
1185 | | |
1186 | | for (data_type_string, expected_data_type) in cases { |
1187 | | let parsed_data_type = parse_data_type(data_type_string).unwrap(); |
1188 | | assert_eq!( |
1189 | | parsed_data_type, expected_data_type, |
1190 | | "Parsing '{data_type_string}', expecting '{expected_data_type}'" |
1191 | | ); |
1192 | | } |
1193 | | } |
1194 | | |
    /// Ensure that old style types can still be parsed
    ///
    /// Each case pairs a literal string with the `DataType` that
    /// `parse_data_type` must return for it. Unlike the round trip test, the
    /// strings are spelled out here rather than generated with `to_string`.
    #[test]
    fn test_parse_data_type_backwards_compatibility() {
        // Glob imports keep the expected values in the table short
        use DataType::*;
        use IntervalUnit::*;
        use TimeUnit::*;
        // List below created with:
        // for t in list_datatypes() {
        //     println!(r#"("{t}", {t:?}),"#)
        // }
        // (string to parse, expected DataType)
        let cases = [
            // Old style: time unit spelled out, timezone as `None` / `Some("...")`
            ("Timestamp(Nanosecond, None)", Timestamp(Nanosecond, None)),
            ("Timestamp(Microsecond, None)", Timestamp(Microsecond, None)),
            ("Timestamp(Millisecond, None)", Timestamp(Millisecond, None)),
            ("Timestamp(Second, None)", Timestamp(Second, None)),
            // NOTE(review): duplicates the first case in this table
            ("Timestamp(Nanosecond, None)", Timestamp(Nanosecond, None)),
            // Timezones
            (
                r#"Timestamp(Nanosecond, Some("+00:00"))"#,
                Timestamp(Nanosecond, Some("+00:00".into())),
            ),
            (
                r#"Timestamp(Microsecond, Some("+00:00"))"#,
                Timestamp(Microsecond, Some("+00:00".into())),
            ),
            (
                r#"Timestamp(Millisecond, Some("+00:00"))"#,
                Timestamp(Millisecond, Some("+00:00".into())),
            ),
            (
                r#"Timestamp(Second, Some("+00:00"))"#,
                Timestamp(Second, Some("+00:00".into())),
            ),
            ("Null", Null),
            ("Boolean", Boolean),
            ("Int8", Int8),
            ("Int16", Int16),
            ("Int32", Int32),
            ("Int64", Int64),
            ("UInt8", UInt8),
            ("UInt16", UInt16),
            ("UInt32", UInt32),
            ("UInt64", UInt64),
            ("Float16", Float16),
            ("Float32", Float32),
            ("Float64", Float64),
            // Abbreviated time units, timezone as a bare quoted string
            ("Timestamp(s)", Timestamp(Second, None)),
            ("Timestamp(ms)", Timestamp(Millisecond, None)),
            ("Timestamp(µs)", Timestamp(Microsecond, None)),
            ("Timestamp(ns)", Timestamp(Nanosecond, None)),
            (
                r#"Timestamp(ns, "+00:00")"#,
                Timestamp(Nanosecond, Some("+00:00".into())),
            ),
            (
                r#"Timestamp(µs, "+00:00")"#,
                Timestamp(Microsecond, Some("+00:00".into())),
            ),
            (
                r#"Timestamp(ms, "+00:00")"#,
                Timestamp(Millisecond, Some("+00:00".into())),
            ),
            (
                r#"Timestamp(s, "+00:00")"#,
                Timestamp(Second, Some("+00:00".into())),
            ),
            (
                r#"Timestamp(ns, "+08:00")"#,
                Timestamp(Nanosecond, Some("+08:00".into())),
            ),
            (
                r#"Timestamp(µs, "+08:00")"#,
                Timestamp(Microsecond, Some("+08:00".into())),
            ),
            (
                r#"Timestamp(ms, "+08:00")"#,
                Timestamp(Millisecond, Some("+08:00".into())),
            ),
            (
                r#"Timestamp(s, "+08:00")"#,
                Timestamp(Second, Some("+08:00".into())),
            ),
            ("Date32", Date32),
            ("Date64", Date64),
            ("Time32(s)", Time32(Second)),
            ("Time32(ms)", Time32(Millisecond)),
            ("Time32(µs)", Time32(Microsecond)),
            ("Time32(ns)", Time32(Nanosecond)),
            ("Time64(s)", Time64(Second)),
            ("Time64(ms)", Time64(Millisecond)),
            ("Time64(µs)", Time64(Microsecond)),
            ("Time64(ns)", Time64(Nanosecond)),
            ("Duration(s)", Duration(Second)),
            ("Duration(ms)", Duration(Millisecond)),
            ("Duration(µs)", Duration(Microsecond)),
            ("Duration(ns)", Duration(Nanosecond)),
            ("Interval(YearMonth)", Interval(YearMonth)),
            ("Interval(DayTime)", Interval(DayTime)),
            ("Interval(MonthDayNano)", Interval(MonthDayNano)),
            ("Binary", Binary),
            ("BinaryView", BinaryView),
            ("FixedSizeBinary(0)", FixedSizeBinary(0)),
            ("FixedSizeBinary(1234)", FixedSizeBinary(1234)),
            ("FixedSizeBinary(-432)", FixedSizeBinary(-432)),
            ("LargeBinary", LargeBinary),
            ("Utf8", Utf8),
            ("Utf8View", Utf8View),
            ("LargeUtf8", LargeUtf8),
            ("Decimal32(7, 8)", Decimal32(7, 8)),
            ("Decimal64(6, 9)", Decimal64(6, 9)),
            ("Decimal128(7, 12)", Decimal128(7, 12)),
            ("Decimal256(6, 13)", Decimal256(6, 13)),
            (
                "Dictionary(Int32, Utf8)",
                Dictionary(Box::new(Int32), Box::new(Utf8)),
            ),
            (
                "Dictionary(Int8, Utf8)",
                Dictionary(Box::new(Int8), Box::new(Utf8)),
            ),
            (
                "Dictionary(Int8, Timestamp(ns))",
                Dictionary(Box::new(Int8), Box::new(Timestamp(Nanosecond, None))),
            ),
            (
                "Dictionary(Int8, FixedSizeBinary(23))",
                Dictionary(Box::new(Int8), Box::new(FixedSizeBinary(23))),
            ),
            (
                "Dictionary(Int8, Dictionary(Int8, Utf8))",
                Dictionary(
                    Box::new(Int8),
                    Box::new(Dictionary(Box::new(Int8), Box::new(Utf8))),
                ),
            ),
            (
                r#"Struct("f1": nullable Int64, "f2": nullable Float64, "f3": nullable Timestamp(s, "+08:00"), "f4": nullable Dictionary(Int8, FixedSizeBinary(23)))"#,
                Struct(Fields::from(vec![
                    Field::new("f1", Int64, true),
                    Field::new("f2", Float64, true),
                    Field::new("f3", Timestamp(Second, Some("+08:00".into())), true),
                    Field::new(
                        "f4",
                        Dictionary(Box::new(Int8), Box::new(FixedSizeBinary(23))),
                        true,
                    ),
                ])),
            ),
            (
                r#"Struct("Int64": nullable Int64, "Float64": nullable Float64)"#,
                Struct(Fields::from(vec![
                    Field::new("Int64", Int64, true),
                    Field::new("Float64", Float64, true),
                ])),
            ),
            (
                r#"Struct("f1": nullable Int64, "nested_struct": nullable Struct("n1": nullable Int64))"#,
                Struct(Fields::from(vec![
                    Field::new("f1", Int64, true),
                    Field::new(
                        "nested_struct",
                        Struct(Fields::from(vec![Field::new("n1", Int64, true)])),
                        true,
                    ),
                ])),
            ),
            (r#"Struct()"#, Struct(Fields::empty())),
        ];

        // Every string must parse successfully and yield exactly the paired type
        for (data_type_string, expected_data_type) in cases {
            let parsed_data_type = parse_data_type(data_type_string).unwrap();
            assert_eq!(
                parsed_data_type, expected_data_type,
                "Parsing '{data_type_string}', expecting '{expected_data_type}'"
            );
        }
    }
1373 | | |
1374 | | #[test] |
1375 | | fn parse_data_type_errors() { |
1376 | | // (string to parse, expected error message) |
1377 | | let cases = [ |
1378 | | ("", "Unsupported type ''"), |
1379 | | ("", "Error finding next token"), |
1380 | | ("null", "Unsupported type 'null'"), |
1381 | | ("Nu", "Unsupported type 'Nu'"), |
1382 | | (r#"Timestamp(ns, +00:00)"#, "Error unknown token: +00"), |
1383 | | ( |
1384 | | r#"Timestamp(ns, "+00:00)"#, |
1385 | | r#"Unterminated string at: "+00:00)"#, |
1386 | | ), |
1387 | | (r#"Timestamp(ns, "")"#, r#"empty strings aren't allowed"#), |
1388 | | ( |
1389 | | r#"Timestamp(ns, "+00:00"")"#, |
1390 | | r#"Parser error: Unterminated string at: ")"#, |
1391 | | ), |
1392 | | ("Timestamp(ns, ", "Error finding next token"), |
1393 | | ( |
1394 | | "Float32 Float32", |
1395 | | "trailing content after parsing 'Float32'", |
1396 | | ), |
1397 | | ("Int32, ", "trailing content after parsing 'Int32'"), |
1398 | | ("Int32(3), ", "trailing content after parsing 'Int32'"), |
1399 | | ( |
1400 | | "FixedSizeBinary(Int32), ", |
1401 | | "Error finding i64 for FixedSizeBinary, got 'Int32'", |
1402 | | ), |
1403 | | ( |
1404 | | "FixedSizeBinary(3.0), ", |
1405 | | "Error parsing 3.0 as integer: invalid digit found in string", |
1406 | | ), |
1407 | | // too large for i32 |
1408 | | ( |
1409 | | "FixedSizeBinary(4000000000), ", |
1410 | | "Error converting 4000000000 into i32 for FixedSizeBinary: out of range integral type conversion attempted", |
1411 | | ), |
1412 | | // can't have negative precision |
1413 | | ( |
1414 | | "Decimal32(-3, 5)", |
1415 | | "Error converting -3 into u8 for Decimal32: out of range integral type conversion attempted", |
1416 | | ), |
1417 | | ( |
1418 | | "Decimal64(-3, 5)", |
1419 | | "Error converting -3 into u8 for Decimal64: out of range integral type conversion attempted", |
1420 | | ), |
1421 | | ( |
1422 | | "Decimal128(-3, 5)", |
1423 | | "Error converting -3 into u8 for Decimal128: out of range integral type conversion attempted", |
1424 | | ), |
1425 | | ( |
1426 | | "Decimal256(-3, 5)", |
1427 | | "Error converting -3 into u8 for Decimal256: out of range integral type conversion attempted", |
1428 | | ), |
1429 | | ( |
1430 | | "Decimal32(3, 500)", |
1431 | | "Error converting 500 into i8 for Decimal32: out of range integral type conversion attempted", |
1432 | | ), |
1433 | | ( |
1434 | | "Decimal64(3, 500)", |
1435 | | "Error converting 500 into i8 for Decimal64: out of range integral type conversion attempted", |
1436 | | ), |
1437 | | ( |
1438 | | "Decimal128(3, 500)", |
1439 | | "Error converting 500 into i8 for Decimal128: out of range integral type conversion attempted", |
1440 | | ), |
1441 | | ( |
1442 | | "Decimal256(3, 500)", |
1443 | | "Error converting 500 into i8 for Decimal256: out of range integral type conversion attempted", |
1444 | | ), |
1445 | | ("Struct(f1 Int64)", "Error unknown token: f1"), |
1446 | | ("Struct(\"f1\" Int64)", "Expected ':'"), |
1447 | | ( |
1448 | | "Struct(\"f1\": )", |
1449 | | "Error finding next type, got unexpected ')'", |
1450 | | ), |
1451 | | ]; |
1452 | | |
1453 | | for (data_type_string, expected_message) in cases { |
1454 | | println!("Parsing '{data_type_string}', expecting '{expected_message}'"); |
1455 | | match parse_data_type(data_type_string) { |
1456 | | Ok(d) => panic!("Expected error while parsing '{data_type_string}', but got '{d}'"), |
1457 | | Err(e) => { |
1458 | | let message = e.to_string(); |
1459 | | assert!( |
1460 | | message.contains(expected_message), |
1461 | | "\n\ndid not find expected in actual.\n\nexpected: {expected_message}\nactual: {message}\n" |
1462 | | ); |
1463 | | |
1464 | | if !message.contains("Unterminated string") { |
1465 | | // errors should also contain a help message |
1466 | | assert!(message.contains("Must be a supported arrow type name such as 'Int32' or 'Timestamp(ns)'"), "message: {message}"); |
1467 | | } |
1468 | | } |
1469 | | } |
1470 | | } |
1471 | | } |
1472 | | |
1473 | | #[test] |
1474 | | fn parse_error_type() { |
1475 | | let err = parse_data_type("foobar").unwrap_err(); |
1476 | | assert!(matches!(err, ArrowError::ParseError(_))); |
1477 | | assert_eq!( |
1478 | | err.to_string(), |
1479 | | "Parser error: Unsupported type 'foobar'. Must be a supported arrow type name such as 'Int32' or 'Timestamp(ns)'. Error unknown token: foobar" |
1480 | | ); |
1481 | | } |
1482 | | } |