Coverage Report

Created: 2025-08-26 07:03

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/Users/andrewlamb/Software/arrow-rs/arrow-cast/src/cast/string.rs
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
use crate::cast::*;
19
use arrow_buffer::NullBuffer;
20
21
0
pub(crate) fn value_to_string<O: OffsetSizeTrait>(
22
0
    array: &dyn Array,
23
0
    options: &CastOptions,
24
0
) -> Result<ArrayRef, ArrowError> {
25
0
    let mut builder = GenericStringBuilder::<O>::new();
26
0
    let formatter = ArrayFormatter::try_new(array, &options.format_options)?;
27
0
    let nulls = array.nulls();
28
0
    for i in 0..array.len() {
29
0
        match nulls.map(|x| x.is_null(i)).unwrap_or_default() {
30
0
            true => builder.append_null(),
31
            false => {
32
0
                formatter.value(i).write(&mut builder)?;
33
                // tell the builder the row is finished
34
0
                builder.append_value("");
35
            }
36
        }
37
    }
38
0
    Ok(Arc::new(builder.finish()))
39
0
}
40
41
0
pub(crate) fn value_to_string_view(
42
0
    array: &dyn Array,
43
0
    options: &CastOptions,
44
0
) -> Result<ArrayRef, ArrowError> {
45
0
    let mut builder = StringViewBuilder::with_capacity(array.len());
46
0
    let formatter = ArrayFormatter::try_new(array, &options.format_options)?;
47
0
    let nulls = array.nulls();
48
    // buffer to avoid reallocating on each value
49
    // TODO: replace with write to builder after https://github.com/apache/arrow-rs/issues/6373
50
0
    let mut buffer = String::new();
51
0
    for i in 0..array.len() {
52
0
        match nulls.map(|x| x.is_null(i)).unwrap_or_default() {
53
0
            true => builder.append_null(),
54
            false => {
55
                // write to buffer first and then copy into target array
56
0
                buffer.clear();
57
0
                formatter.value(i).write(&mut buffer)?;
58
0
                builder.append_value(&buffer)
59
            }
60
        }
61
    }
62
0
    Ok(Arc::new(builder.finish()))
63
0
}
64
65
/// Parse UTF-8
66
0
pub(crate) fn parse_string<P: Parser, O: OffsetSizeTrait>(
67
0
    array: &dyn Array,
68
0
    cast_options: &CastOptions,
69
0
) -> Result<ArrayRef, ArrowError> {
70
0
    let string_array = array.as_string::<O>();
71
0
    parse_string_iter::<P, _, _>(string_array.iter(), cast_options, || {
72
0
        string_array.nulls().cloned()
73
0
    })
74
0
}
75
76
/// Parse UTF-8 View
77
0
pub(crate) fn parse_string_view<P: Parser>(
78
0
    array: &dyn Array,
79
0
    cast_options: &CastOptions,
80
0
) -> Result<ArrayRef, ArrowError> {
81
0
    let string_view_array = array.as_string_view();
82
0
    parse_string_iter::<P, _, _>(string_view_array.iter(), cast_options, || {
83
0
        string_view_array.nulls().cloned()
84
0
    })
85
0
}
86
87
0
fn parse_string_iter<
88
0
    'a,
89
0
    P: Parser,
90
0
    I: Iterator<Item = Option<&'a str>>,
91
0
    F: FnOnce() -> Option<NullBuffer>,
92
0
>(
93
0
    iter: I,
94
0
    cast_options: &CastOptions,
95
0
    nulls: F,
96
0
) -> Result<ArrayRef, ArrowError> {
97
0
    let array = if cast_options.safe {
98
0
        let iter = iter.map(|x| x.and_then(P::parse));
99
100
        // Benefit:
101
        //     20% performance improvement
102
        // Soundness:
103
        //     The iterator is trustedLen because it comes from an `StringArray`.
104
0
        unsafe { PrimitiveArray::<P>::from_trusted_len_iter(iter) }
105
    } else {
106
0
        let v = iter
107
0
            .map(|x| match x {
108
0
                Some(v) => P::parse(v).ok_or_else(|| {
109
0
                    ArrowError::CastError(format!(
110
0
                        "Cannot cast string '{}' to value of {:?} type",
111
0
                        v,
112
0
                        P::DATA_TYPE
113
0
                    ))
114
0
                }),
115
0
                None => Ok(P::Native::default()),
116
0
            })
117
0
            .collect::<Result<Vec<_>, ArrowError>>()?;
118
0
        PrimitiveArray::new(v.into(), nulls())
119
    };
120
121
0
    Ok(Arc::new(array) as ArrayRef)
122
0
}
123
124
/// Casts generic string arrays to an ArrowTimestampType (TimeStampNanosecondArray, etc.)
125
0
pub(crate) fn cast_string_to_timestamp<O: OffsetSizeTrait, T: ArrowTimestampType>(
126
0
    array: &dyn Array,
127
0
    to_tz: &Option<Arc<str>>,
128
0
    cast_options: &CastOptions,
129
0
) -> Result<ArrayRef, ArrowError> {
130
0
    let array = array.as_string::<O>();
131
0
    let out: PrimitiveArray<T> = match to_tz {
132
0
        Some(tz) => {
133
0
            let tz: Tz = tz.as_ref().parse()?;
134
0
            cast_string_to_timestamp_impl(array.iter(), &tz, cast_options)?
135
        }
136
0
        None => cast_string_to_timestamp_impl(array.iter(), &Utc, cast_options)?,
137
    };
138
0
    Ok(Arc::new(out.with_timezone_opt(to_tz.clone())))
139
0
}
140
141
/// Casts string view arrays to an ArrowTimestampType (TimeStampNanosecondArray, etc.)
142
0
pub(crate) fn cast_view_to_timestamp<T: ArrowTimestampType>(
143
0
    array: &dyn Array,
144
0
    to_tz: &Option<Arc<str>>,
145
0
    cast_options: &CastOptions,
146
0
) -> Result<ArrayRef, ArrowError> {
147
0
    let array = array.as_string_view();
148
0
    let out: PrimitiveArray<T> = match to_tz {
149
0
        Some(tz) => {
150
0
            let tz: Tz = tz.as_ref().parse()?;
151
0
            cast_string_to_timestamp_impl(array.iter(), &tz, cast_options)?
152
        }
153
0
        None => cast_string_to_timestamp_impl(array.iter(), &Utc, cast_options)?,
154
    };
155
0
    Ok(Arc::new(out.with_timezone_opt(to_tz.clone())))
156
0
}
157
158
0
fn cast_string_to_timestamp_impl<
159
0
    'a,
160
0
    I: Iterator<Item = Option<&'a str>>,
161
0
    T: ArrowTimestampType,
162
0
    Tz: TimeZone,
163
0
>(
164
0
    iter: I,
165
0
    tz: &Tz,
166
0
    cast_options: &CastOptions,
167
0
) -> Result<PrimitiveArray<T>, ArrowError> {
168
0
    if cast_options.safe {
169
0
        let iter = iter.map(|v| {
170
0
            v.and_then(|v| {
171
0
                let naive = string_to_datetime(tz, v).ok()?.naive_utc();
172
0
                T::make_value(naive)
173
0
            })
174
0
        });
175
        // Benefit:
176
        //     20% performance improvement
177
        // Soundness:
178
        //     The iterator is trustedLen because it comes from an `StringArray`.
179
180
0
        Ok(unsafe { PrimitiveArray::from_trusted_len_iter(iter) })
181
    } else {
182
0
        let vec = iter
183
0
            .map(|v| {
184
0
                v.map(|v| {
185
0
                    let naive = string_to_datetime(tz, v)?.naive_utc();
186
0
                    T::make_value(naive).ok_or_else(|| match T::UNIT {
187
0
                        TimeUnit::Nanosecond => ArrowError::CastError(format!(
188
0
                            "Overflow converting {naive} to Nanosecond. The dates that can be represented as nanoseconds have to be between 1677-09-21T00:12:44.0 and 2262-04-11T23:47:16.854775804"
189
0
                        )),
190
0
                        _ => ArrowError::CastError(format!(
191
0
                            "Overflow converting {naive} to {:?}",
192
0
                            T::UNIT
193
0
                        ))
194
0
                    })
195
0
                })
196
0
                    .transpose()
197
0
            })
198
0
            .collect::<Result<Vec<Option<i64>>, _>>()?;
199
200
        // Benefit:
201
        //     20% performance improvement
202
        // Soundness:
203
        //     The iterator is trustedLen because it comes from an `StringArray`.
204
0
        Ok(unsafe { PrimitiveArray::from_trusted_len_iter(vec.iter()) })
205
    }
206
0
}
207
208
0
pub(crate) fn cast_string_to_interval<Offset, F, ArrowType>(
209
0
    array: &dyn Array,
210
0
    cast_options: &CastOptions,
211
0
    parse_function: F,
212
0
) -> Result<ArrayRef, ArrowError>
213
0
where
214
0
    Offset: OffsetSizeTrait,
215
0
    ArrowType: ArrowPrimitiveType,
216
0
    F: Fn(&str) -> Result<ArrowType::Native, ArrowError> + Copy,
217
{
218
0
    let string_array = array
219
0
        .as_any()
220
0
        .downcast_ref::<GenericStringArray<Offset>>()
221
0
        .unwrap();
222
0
    cast_string_to_interval_impl::<_, ArrowType, F>(
223
0
        string_array.iter(),
224
0
        cast_options,
225
0
        parse_function,
226
    )
227
0
}
228
229
0
pub(crate) fn cast_string_to_year_month_interval<Offset: OffsetSizeTrait>(
230
0
    array: &dyn Array,
231
0
    cast_options: &CastOptions,
232
0
) -> Result<ArrayRef, ArrowError> {
233
0
    cast_string_to_interval::<Offset, _, IntervalYearMonthType>(
234
0
        array,
235
0
        cast_options,
236
        parse_interval_year_month,
237
    )
238
0
}
239
240
0
pub(crate) fn cast_string_to_day_time_interval<Offset: OffsetSizeTrait>(
241
0
    array: &dyn Array,
242
0
    cast_options: &CastOptions,
243
0
) -> Result<ArrayRef, ArrowError> {
244
0
    cast_string_to_interval::<Offset, _, IntervalDayTimeType>(
245
0
        array,
246
0
        cast_options,
247
        parse_interval_day_time,
248
    )
249
0
}
250
251
0
pub(crate) fn cast_string_to_month_day_nano_interval<Offset: OffsetSizeTrait>(
252
0
    array: &dyn Array,
253
0
    cast_options: &CastOptions,
254
0
) -> Result<ArrayRef, ArrowError> {
255
0
    cast_string_to_interval::<Offset, _, IntervalMonthDayNanoType>(
256
0
        array,
257
0
        cast_options,
258
        parse_interval_month_day_nano,
259
    )
260
0
}
261
262
0
pub(crate) fn cast_view_to_interval<F, ArrowType>(
263
0
    array: &dyn Array,
264
0
    cast_options: &CastOptions,
265
0
    parse_function: F,
266
0
) -> Result<ArrayRef, ArrowError>
267
0
where
268
0
    ArrowType: ArrowPrimitiveType,
269
0
    F: Fn(&str) -> Result<ArrowType::Native, ArrowError> + Copy,
270
{
271
0
    let string_view_array = array.as_any().downcast_ref::<StringViewArray>().unwrap();
272
0
    cast_string_to_interval_impl::<_, ArrowType, F>(
273
0
        string_view_array.iter(),
274
0
        cast_options,
275
0
        parse_function,
276
    )
277
0
}
278
279
0
pub(crate) fn cast_view_to_year_month_interval(
280
0
    array: &dyn Array,
281
0
    cast_options: &CastOptions,
282
0
) -> Result<ArrayRef, ArrowError> {
283
0
    cast_view_to_interval::<_, IntervalYearMonthType>(
284
0
        array,
285
0
        cast_options,
286
        parse_interval_year_month,
287
    )
288
0
}
289
290
0
pub(crate) fn cast_view_to_day_time_interval(
291
0
    array: &dyn Array,
292
0
    cast_options: &CastOptions,
293
0
) -> Result<ArrayRef, ArrowError> {
294
0
    cast_view_to_interval::<_, IntervalDayTimeType>(array, cast_options, parse_interval_day_time)
295
0
}
296
297
0
pub(crate) fn cast_view_to_month_day_nano_interval(
298
0
    array: &dyn Array,
299
0
    cast_options: &CastOptions,
300
0
) -> Result<ArrayRef, ArrowError> {
301
0
    cast_view_to_interval::<_, IntervalMonthDayNanoType>(
302
0
        array,
303
0
        cast_options,
304
        parse_interval_month_day_nano,
305
    )
306
0
}
307
308
0
fn cast_string_to_interval_impl<'a, I, ArrowType, F>(
309
0
    iter: I,
310
0
    cast_options: &CastOptions,
311
0
    parse_function: F,
312
0
) -> Result<ArrayRef, ArrowError>
313
0
where
314
0
    I: Iterator<Item = Option<&'a str>>,
315
0
    ArrowType: ArrowPrimitiveType,
316
0
    F: Fn(&str) -> Result<ArrowType::Native, ArrowError> + Copy,
317
{
318
0
    let interval_array = if cast_options.safe {
319
0
        let iter = iter.map(|v| v.and_then(|v| parse_function(v).ok()));
320
321
        // Benefit:
322
        //     20% performance improvement
323
        // Soundness:
324
        //     The iterator is trustedLen because it comes from an `StringArray`.
325
0
        unsafe { PrimitiveArray::<ArrowType>::from_trusted_len_iter(iter) }
326
    } else {
327
0
        let vec = iter
328
0
            .map(|v| v.map(parse_function).transpose())
329
0
            .collect::<Result<Vec<_>, ArrowError>>()?;
330
331
        // Benefit:
332
        //     20% performance improvement
333
        // Soundness:
334
        //     The iterator is trustedLen because it comes from an `StringArray`.
335
0
        unsafe { PrimitiveArray::<ArrowType>::from_trusted_len_iter(vec) }
336
    };
337
0
    Ok(Arc::new(interval_array) as ArrayRef)
338
0
}
339
340
/// A specified helper to cast from `GenericBinaryArray` to `GenericStringArray` when they have same
341
/// offset size so re-encoding offset is unnecessary.
342
0
pub(crate) fn cast_binary_to_string<O: OffsetSizeTrait>(
343
0
    array: &dyn Array,
344
0
    cast_options: &CastOptions,
345
0
) -> Result<ArrayRef, ArrowError> {
346
0
    let array = array
347
0
        .as_any()
348
0
        .downcast_ref::<GenericByteArray<GenericBinaryType<O>>>()
349
0
        .unwrap();
350
351
0
    match GenericStringArray::<O>::try_from_binary(array.clone()) {
352
0
        Ok(a) => Ok(Arc::new(a)),
353
0
        Err(e) => match cast_options.safe {
354
            true => {
355
                // Fallback to slow method to convert invalid sequences to nulls
356
0
                let mut builder =
357
0
                    GenericStringBuilder::<O>::with_capacity(array.len(), array.value_data().len());
358
359
0
                let iter = array
360
0
                    .iter()
361
0
                    .map(|v| v.and_then(|v| std::str::from_utf8(v).ok()));
362
363
0
                builder.extend(iter);
364
0
                Ok(Arc::new(builder.finish()))
365
            }
366
0
            false => Err(e),
367
        },
368
    }
369
0
}
370
371
/// Casts string to boolean
372
0
fn cast_string_to_boolean<'a, StrArray>(
373
0
    array: &StrArray,
374
0
    cast_options: &CastOptions,
375
0
) -> Result<ArrayRef, ArrowError>
376
0
where
377
0
    StrArray: StringArrayType<'a>,
378
{
379
0
    let output_array = array
380
0
        .iter()
381
0
        .map(|value| match value {
382
0
            Some(value) => match value.to_ascii_lowercase().trim() {
383
0
                "t" | "tr" | "tru" | "true" | "y" | "ye" | "yes" | "on" | "1" => Ok(Some(true)),
384
0
                "f" | "fa" | "fal" | "fals" | "false" | "n" | "no" | "of" | "off" | "0" => {
385
0
                    Ok(Some(false))
386
                }
387
0
                invalid_value => match cast_options.safe {
388
0
                    true => Ok(None),
389
0
                    false => Err(ArrowError::CastError(format!(
390
0
                        "Cannot cast value '{invalid_value}' to value of Boolean type",
391
0
                    ))),
392
                },
393
            },
394
0
            None => Ok(None),
395
0
        })
396
0
        .collect::<Result<BooleanArray, _>>()?;
397
398
0
    Ok(Arc::new(output_array))
399
0
}
400
401
0
pub(crate) fn cast_utf8_to_boolean<OffsetSize>(
402
0
    from: &dyn Array,
403
0
    cast_options: &CastOptions,
404
0
) -> Result<ArrayRef, ArrowError>
405
0
where
406
0
    OffsetSize: OffsetSizeTrait,
407
{
408
0
    let array = from
409
0
        .as_any()
410
0
        .downcast_ref::<GenericStringArray<OffsetSize>>()
411
0
        .unwrap();
412
413
0
    cast_string_to_boolean(&array, cast_options)
414
0
}
415
416
0
pub(crate) fn cast_utf8view_to_boolean(
417
0
    from: &dyn Array,
418
0
    cast_options: &CastOptions,
419
0
) -> Result<ArrayRef, ArrowError> {
420
0
    let array = from.as_any().downcast_ref::<StringViewArray>().unwrap();
421
422
0
    cast_string_to_boolean(&array, cast_options)
423
0
}