/Users/andrewlamb/Software/arrow-rs/arrow-cast/src/cast/string.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use crate::cast::*; |
19 | | use arrow_buffer::NullBuffer; |
20 | | |
21 | 0 | pub(crate) fn value_to_string<O: OffsetSizeTrait>( |
22 | 0 | array: &dyn Array, |
23 | 0 | options: &CastOptions, |
24 | 0 | ) -> Result<ArrayRef, ArrowError> { |
25 | 0 | let mut builder = GenericStringBuilder::<O>::new(); |
26 | 0 | let formatter = ArrayFormatter::try_new(array, &options.format_options)?; |
27 | 0 | let nulls = array.nulls(); |
28 | 0 | for i in 0..array.len() { |
29 | 0 | match nulls.map(|x| x.is_null(i)).unwrap_or_default() { |
30 | 0 | true => builder.append_null(), |
31 | | false => { |
32 | 0 | formatter.value(i).write(&mut builder)?; |
33 | | // tell the builder the row is finished |
34 | 0 | builder.append_value(""); |
35 | | } |
36 | | } |
37 | | } |
38 | 0 | Ok(Arc::new(builder.finish())) |
39 | 0 | } |
40 | | |
41 | 0 | pub(crate) fn value_to_string_view( |
42 | 0 | array: &dyn Array, |
43 | 0 | options: &CastOptions, |
44 | 0 | ) -> Result<ArrayRef, ArrowError> { |
45 | 0 | let mut builder = StringViewBuilder::with_capacity(array.len()); |
46 | 0 | let formatter = ArrayFormatter::try_new(array, &options.format_options)?; |
47 | 0 | let nulls = array.nulls(); |
48 | | // buffer to avoid reallocating on each value |
49 | | // TODO: replace with write to builder after https://github.com/apache/arrow-rs/issues/6373 |
50 | 0 | let mut buffer = String::new(); |
51 | 0 | for i in 0..array.len() { |
52 | 0 | match nulls.map(|x| x.is_null(i)).unwrap_or_default() { |
53 | 0 | true => builder.append_null(), |
54 | | false => { |
55 | | // write to buffer first and then copy into target array |
56 | 0 | buffer.clear(); |
57 | 0 | formatter.value(i).write(&mut buffer)?; |
58 | 0 | builder.append_value(&buffer) |
59 | | } |
60 | | } |
61 | | } |
62 | 0 | Ok(Arc::new(builder.finish())) |
63 | 0 | } |
64 | | |
65 | | /// Parse UTF-8 |
66 | 0 | pub(crate) fn parse_string<P: Parser, O: OffsetSizeTrait>( |
67 | 0 | array: &dyn Array, |
68 | 0 | cast_options: &CastOptions, |
69 | 0 | ) -> Result<ArrayRef, ArrowError> { |
70 | 0 | let string_array = array.as_string::<O>(); |
71 | 0 | parse_string_iter::<P, _, _>(string_array.iter(), cast_options, || { |
72 | 0 | string_array.nulls().cloned() |
73 | 0 | }) |
74 | 0 | } |
75 | | |
76 | | /// Parse UTF-8 View |
77 | 0 | pub(crate) fn parse_string_view<P: Parser>( |
78 | 0 | array: &dyn Array, |
79 | 0 | cast_options: &CastOptions, |
80 | 0 | ) -> Result<ArrayRef, ArrowError> { |
81 | 0 | let string_view_array = array.as_string_view(); |
82 | 0 | parse_string_iter::<P, _, _>(string_view_array.iter(), cast_options, || { |
83 | 0 | string_view_array.nulls().cloned() |
84 | 0 | }) |
85 | 0 | } |
86 | | |
87 | 0 | fn parse_string_iter< |
88 | 0 | 'a, |
89 | 0 | P: Parser, |
90 | 0 | I: Iterator<Item = Option<&'a str>>, |
91 | 0 | F: FnOnce() -> Option<NullBuffer>, |
92 | 0 | >( |
93 | 0 | iter: I, |
94 | 0 | cast_options: &CastOptions, |
95 | 0 | nulls: F, |
96 | 0 | ) -> Result<ArrayRef, ArrowError> { |
97 | 0 | let array = if cast_options.safe { |
98 | 0 | let iter = iter.map(|x| x.and_then(P::parse)); |
99 | | |
100 | | // Benefit: |
101 | | // 20% performance improvement |
102 | | // Soundness: |
103 | | // The iterator is trustedLen because it comes from an `StringArray`. |
104 | 0 | unsafe { PrimitiveArray::<P>::from_trusted_len_iter(iter) } |
105 | | } else { |
106 | 0 | let v = iter |
107 | 0 | .map(|x| match x { |
108 | 0 | Some(v) => P::parse(v).ok_or_else(|| { |
109 | 0 | ArrowError::CastError(format!( |
110 | 0 | "Cannot cast string '{}' to value of {:?} type", |
111 | 0 | v, |
112 | 0 | P::DATA_TYPE |
113 | 0 | )) |
114 | 0 | }), |
115 | 0 | None => Ok(P::Native::default()), |
116 | 0 | }) |
117 | 0 | .collect::<Result<Vec<_>, ArrowError>>()?; |
118 | 0 | PrimitiveArray::new(v.into(), nulls()) |
119 | | }; |
120 | | |
121 | 0 | Ok(Arc::new(array) as ArrayRef) |
122 | 0 | } |
123 | | |
124 | | /// Casts generic string arrays to an ArrowTimestampType (TimeStampNanosecondArray, etc.) |
125 | 0 | pub(crate) fn cast_string_to_timestamp<O: OffsetSizeTrait, T: ArrowTimestampType>( |
126 | 0 | array: &dyn Array, |
127 | 0 | to_tz: &Option<Arc<str>>, |
128 | 0 | cast_options: &CastOptions, |
129 | 0 | ) -> Result<ArrayRef, ArrowError> { |
130 | 0 | let array = array.as_string::<O>(); |
131 | 0 | let out: PrimitiveArray<T> = match to_tz { |
132 | 0 | Some(tz) => { |
133 | 0 | let tz: Tz = tz.as_ref().parse()?; |
134 | 0 | cast_string_to_timestamp_impl(array.iter(), &tz, cast_options)? |
135 | | } |
136 | 0 | None => cast_string_to_timestamp_impl(array.iter(), &Utc, cast_options)?, |
137 | | }; |
138 | 0 | Ok(Arc::new(out.with_timezone_opt(to_tz.clone()))) |
139 | 0 | } |
140 | | |
141 | | /// Casts string view arrays to an ArrowTimestampType (TimeStampNanosecondArray, etc.) |
142 | 0 | pub(crate) fn cast_view_to_timestamp<T: ArrowTimestampType>( |
143 | 0 | array: &dyn Array, |
144 | 0 | to_tz: &Option<Arc<str>>, |
145 | 0 | cast_options: &CastOptions, |
146 | 0 | ) -> Result<ArrayRef, ArrowError> { |
147 | 0 | let array = array.as_string_view(); |
148 | 0 | let out: PrimitiveArray<T> = match to_tz { |
149 | 0 | Some(tz) => { |
150 | 0 | let tz: Tz = tz.as_ref().parse()?; |
151 | 0 | cast_string_to_timestamp_impl(array.iter(), &tz, cast_options)? |
152 | | } |
153 | 0 | None => cast_string_to_timestamp_impl(array.iter(), &Utc, cast_options)?, |
154 | | }; |
155 | 0 | Ok(Arc::new(out.with_timezone_opt(to_tz.clone()))) |
156 | 0 | } |
157 | | |
158 | 0 | fn cast_string_to_timestamp_impl< |
159 | 0 | 'a, |
160 | 0 | I: Iterator<Item = Option<&'a str>>, |
161 | 0 | T: ArrowTimestampType, |
162 | 0 | Tz: TimeZone, |
163 | 0 | >( |
164 | 0 | iter: I, |
165 | 0 | tz: &Tz, |
166 | 0 | cast_options: &CastOptions, |
167 | 0 | ) -> Result<PrimitiveArray<T>, ArrowError> { |
168 | 0 | if cast_options.safe { |
169 | 0 | let iter = iter.map(|v| { |
170 | 0 | v.and_then(|v| { |
171 | 0 | let naive = string_to_datetime(tz, v).ok()?.naive_utc(); |
172 | 0 | T::make_value(naive) |
173 | 0 | }) |
174 | 0 | }); |
175 | | // Benefit: |
176 | | // 20% performance improvement |
177 | | // Soundness: |
178 | | // The iterator is trustedLen because it comes from an `StringArray`. |
179 | | |
180 | 0 | Ok(unsafe { PrimitiveArray::from_trusted_len_iter(iter) }) |
181 | | } else { |
182 | 0 | let vec = iter |
183 | 0 | .map(|v| { |
184 | 0 | v.map(|v| { |
185 | 0 | let naive = string_to_datetime(tz, v)?.naive_utc(); |
186 | 0 | T::make_value(naive).ok_or_else(|| match T::UNIT { |
187 | 0 | TimeUnit::Nanosecond => ArrowError::CastError(format!( |
188 | 0 | "Overflow converting {naive} to Nanosecond. The dates that can be represented as nanoseconds have to be between 1677-09-21T00:12:44.0 and 2262-04-11T23:47:16.854775804" |
189 | 0 | )), |
190 | 0 | _ => ArrowError::CastError(format!( |
191 | 0 | "Overflow converting {naive} to {:?}", |
192 | 0 | T::UNIT |
193 | 0 | )) |
194 | 0 | }) |
195 | 0 | }) |
196 | 0 | .transpose() |
197 | 0 | }) |
198 | 0 | .collect::<Result<Vec<Option<i64>>, _>>()?; |
199 | | |
200 | | // Benefit: |
201 | | // 20% performance improvement |
202 | | // Soundness: |
203 | | // The iterator is trustedLen because it comes from an `StringArray`. |
204 | 0 | Ok(unsafe { PrimitiveArray::from_trusted_len_iter(vec.iter()) }) |
205 | | } |
206 | 0 | } |
207 | | |
208 | 0 | pub(crate) fn cast_string_to_interval<Offset, F, ArrowType>( |
209 | 0 | array: &dyn Array, |
210 | 0 | cast_options: &CastOptions, |
211 | 0 | parse_function: F, |
212 | 0 | ) -> Result<ArrayRef, ArrowError> |
213 | 0 | where |
214 | 0 | Offset: OffsetSizeTrait, |
215 | 0 | ArrowType: ArrowPrimitiveType, |
216 | 0 | F: Fn(&str) -> Result<ArrowType::Native, ArrowError> + Copy, |
217 | | { |
218 | 0 | let string_array = array |
219 | 0 | .as_any() |
220 | 0 | .downcast_ref::<GenericStringArray<Offset>>() |
221 | 0 | .unwrap(); |
222 | 0 | cast_string_to_interval_impl::<_, ArrowType, F>( |
223 | 0 | string_array.iter(), |
224 | 0 | cast_options, |
225 | 0 | parse_function, |
226 | | ) |
227 | 0 | } |
228 | | |
229 | 0 | pub(crate) fn cast_string_to_year_month_interval<Offset: OffsetSizeTrait>( |
230 | 0 | array: &dyn Array, |
231 | 0 | cast_options: &CastOptions, |
232 | 0 | ) -> Result<ArrayRef, ArrowError> { |
233 | 0 | cast_string_to_interval::<Offset, _, IntervalYearMonthType>( |
234 | 0 | array, |
235 | 0 | cast_options, |
236 | | parse_interval_year_month, |
237 | | ) |
238 | 0 | } |
239 | | |
240 | 0 | pub(crate) fn cast_string_to_day_time_interval<Offset: OffsetSizeTrait>( |
241 | 0 | array: &dyn Array, |
242 | 0 | cast_options: &CastOptions, |
243 | 0 | ) -> Result<ArrayRef, ArrowError> { |
244 | 0 | cast_string_to_interval::<Offset, _, IntervalDayTimeType>( |
245 | 0 | array, |
246 | 0 | cast_options, |
247 | | parse_interval_day_time, |
248 | | ) |
249 | 0 | } |
250 | | |
251 | 0 | pub(crate) fn cast_string_to_month_day_nano_interval<Offset: OffsetSizeTrait>( |
252 | 0 | array: &dyn Array, |
253 | 0 | cast_options: &CastOptions, |
254 | 0 | ) -> Result<ArrayRef, ArrowError> { |
255 | 0 | cast_string_to_interval::<Offset, _, IntervalMonthDayNanoType>( |
256 | 0 | array, |
257 | 0 | cast_options, |
258 | | parse_interval_month_day_nano, |
259 | | ) |
260 | 0 | } |
261 | | |
262 | 0 | pub(crate) fn cast_view_to_interval<F, ArrowType>( |
263 | 0 | array: &dyn Array, |
264 | 0 | cast_options: &CastOptions, |
265 | 0 | parse_function: F, |
266 | 0 | ) -> Result<ArrayRef, ArrowError> |
267 | 0 | where |
268 | 0 | ArrowType: ArrowPrimitiveType, |
269 | 0 | F: Fn(&str) -> Result<ArrowType::Native, ArrowError> + Copy, |
270 | | { |
271 | 0 | let string_view_array = array.as_any().downcast_ref::<StringViewArray>().unwrap(); |
272 | 0 | cast_string_to_interval_impl::<_, ArrowType, F>( |
273 | 0 | string_view_array.iter(), |
274 | 0 | cast_options, |
275 | 0 | parse_function, |
276 | | ) |
277 | 0 | } |
278 | | |
279 | 0 | pub(crate) fn cast_view_to_year_month_interval( |
280 | 0 | array: &dyn Array, |
281 | 0 | cast_options: &CastOptions, |
282 | 0 | ) -> Result<ArrayRef, ArrowError> { |
283 | 0 | cast_view_to_interval::<_, IntervalYearMonthType>( |
284 | 0 | array, |
285 | 0 | cast_options, |
286 | | parse_interval_year_month, |
287 | | ) |
288 | 0 | } |
289 | | |
290 | 0 | pub(crate) fn cast_view_to_day_time_interval( |
291 | 0 | array: &dyn Array, |
292 | 0 | cast_options: &CastOptions, |
293 | 0 | ) -> Result<ArrayRef, ArrowError> { |
294 | 0 | cast_view_to_interval::<_, IntervalDayTimeType>(array, cast_options, parse_interval_day_time) |
295 | 0 | } |
296 | | |
297 | 0 | pub(crate) fn cast_view_to_month_day_nano_interval( |
298 | 0 | array: &dyn Array, |
299 | 0 | cast_options: &CastOptions, |
300 | 0 | ) -> Result<ArrayRef, ArrowError> { |
301 | 0 | cast_view_to_interval::<_, IntervalMonthDayNanoType>( |
302 | 0 | array, |
303 | 0 | cast_options, |
304 | | parse_interval_month_day_nano, |
305 | | ) |
306 | 0 | } |
307 | | |
308 | 0 | fn cast_string_to_interval_impl<'a, I, ArrowType, F>( |
309 | 0 | iter: I, |
310 | 0 | cast_options: &CastOptions, |
311 | 0 | parse_function: F, |
312 | 0 | ) -> Result<ArrayRef, ArrowError> |
313 | 0 | where |
314 | 0 | I: Iterator<Item = Option<&'a str>>, |
315 | 0 | ArrowType: ArrowPrimitiveType, |
316 | 0 | F: Fn(&str) -> Result<ArrowType::Native, ArrowError> + Copy, |
317 | | { |
318 | 0 | let interval_array = if cast_options.safe { |
319 | 0 | let iter = iter.map(|v| v.and_then(|v| parse_function(v).ok())); |
320 | | |
321 | | // Benefit: |
322 | | // 20% performance improvement |
323 | | // Soundness: |
324 | | // The iterator is trustedLen because it comes from an `StringArray`. |
325 | 0 | unsafe { PrimitiveArray::<ArrowType>::from_trusted_len_iter(iter) } |
326 | | } else { |
327 | 0 | let vec = iter |
328 | 0 | .map(|v| v.map(parse_function).transpose()) |
329 | 0 | .collect::<Result<Vec<_>, ArrowError>>()?; |
330 | | |
331 | | // Benefit: |
332 | | // 20% performance improvement |
333 | | // Soundness: |
334 | | // The iterator is trustedLen because it comes from an `StringArray`. |
335 | 0 | unsafe { PrimitiveArray::<ArrowType>::from_trusted_len_iter(vec) } |
336 | | }; |
337 | 0 | Ok(Arc::new(interval_array) as ArrayRef) |
338 | 0 | } |
339 | | |
340 | | /// A specified helper to cast from `GenericBinaryArray` to `GenericStringArray` when they have same |
341 | | /// offset size so re-encoding offset is unnecessary. |
342 | 0 | pub(crate) fn cast_binary_to_string<O: OffsetSizeTrait>( |
343 | 0 | array: &dyn Array, |
344 | 0 | cast_options: &CastOptions, |
345 | 0 | ) -> Result<ArrayRef, ArrowError> { |
346 | 0 | let array = array |
347 | 0 | .as_any() |
348 | 0 | .downcast_ref::<GenericByteArray<GenericBinaryType<O>>>() |
349 | 0 | .unwrap(); |
350 | | |
351 | 0 | match GenericStringArray::<O>::try_from_binary(array.clone()) { |
352 | 0 | Ok(a) => Ok(Arc::new(a)), |
353 | 0 | Err(e) => match cast_options.safe { |
354 | | true => { |
355 | | // Fallback to slow method to convert invalid sequences to nulls |
356 | 0 | let mut builder = |
357 | 0 | GenericStringBuilder::<O>::with_capacity(array.len(), array.value_data().len()); |
358 | | |
359 | 0 | let iter = array |
360 | 0 | .iter() |
361 | 0 | .map(|v| v.and_then(|v| std::str::from_utf8(v).ok())); |
362 | | |
363 | 0 | builder.extend(iter); |
364 | 0 | Ok(Arc::new(builder.finish())) |
365 | | } |
366 | 0 | false => Err(e), |
367 | | }, |
368 | | } |
369 | 0 | } |
370 | | |
371 | | /// Casts string to boolean |
372 | 0 | fn cast_string_to_boolean<'a, StrArray>( |
373 | 0 | array: &StrArray, |
374 | 0 | cast_options: &CastOptions, |
375 | 0 | ) -> Result<ArrayRef, ArrowError> |
376 | 0 | where |
377 | 0 | StrArray: StringArrayType<'a>, |
378 | | { |
379 | 0 | let output_array = array |
380 | 0 | .iter() |
381 | 0 | .map(|value| match value { |
382 | 0 | Some(value) => match value.to_ascii_lowercase().trim() { |
383 | 0 | "t" | "tr" | "tru" | "true" | "y" | "ye" | "yes" | "on" | "1" => Ok(Some(true)), |
384 | 0 | "f" | "fa" | "fal" | "fals" | "false" | "n" | "no" | "of" | "off" | "0" => { |
385 | 0 | Ok(Some(false)) |
386 | | } |
387 | 0 | invalid_value => match cast_options.safe { |
388 | 0 | true => Ok(None), |
389 | 0 | false => Err(ArrowError::CastError(format!( |
390 | 0 | "Cannot cast value '{invalid_value}' to value of Boolean type", |
391 | 0 | ))), |
392 | | }, |
393 | | }, |
394 | 0 | None => Ok(None), |
395 | 0 | }) |
396 | 0 | .collect::<Result<BooleanArray, _>>()?; |
397 | | |
398 | 0 | Ok(Arc::new(output_array)) |
399 | 0 | } |
400 | | |
401 | 0 | pub(crate) fn cast_utf8_to_boolean<OffsetSize>( |
402 | 0 | from: &dyn Array, |
403 | 0 | cast_options: &CastOptions, |
404 | 0 | ) -> Result<ArrayRef, ArrowError> |
405 | 0 | where |
406 | 0 | OffsetSize: OffsetSizeTrait, |
407 | | { |
408 | 0 | let array = from |
409 | 0 | .as_any() |
410 | 0 | .downcast_ref::<GenericStringArray<OffsetSize>>() |
411 | 0 | .unwrap(); |
412 | | |
413 | 0 | cast_string_to_boolean(&array, cast_options) |
414 | 0 | } |
415 | | |
416 | 0 | pub(crate) fn cast_utf8view_to_boolean( |
417 | 0 | from: &dyn Array, |
418 | 0 | cast_options: &CastOptions, |
419 | 0 | ) -> Result<ArrayRef, ArrowError> { |
420 | 0 | let array = from.as_any().downcast_ref::<StringViewArray>().unwrap(); |
421 | | |
422 | 0 | cast_string_to_boolean(&array, cast_options) |
423 | 0 | } |