Coverage Report

Created: 2025-11-17 14:14

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/Users/andrewlamb/Software/arrow-rs/arrow-schema/src/fields.rs
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
use std::ops::Deref;
19
use std::sync::Arc;
20
21
use crate::{ArrowError, DataType, Field, FieldRef};
22
23
/// A cheaply cloneable, owned slice of [`FieldRef`]
24
///
25
/// Similar to `Arc<Vec<FieldRef>>` or `Arc<[FieldRef]>`
26
///
27
/// Can be constructed in a number of ways
28
///
29
/// ```
30
/// # use std::sync::Arc;
31
/// # use arrow_schema::{DataType, Field, Fields, SchemaBuilder};
32
/// // Can be constructed from Vec<Field>
33
/// Fields::from(vec![Field::new("a", DataType::Boolean, false)]);
34
/// // Can be constructed from Vec<FieldRef>
35
/// Fields::from(vec![Arc::new(Field::new("a", DataType::Boolean, false))]);
36
/// // Can be constructed from an iterator of Field
37
/// std::iter::once(Field::new("a", DataType::Boolean, false)).collect::<Fields>();
38
/// // Can be constructed from an iterator of FieldRef
39
/// std::iter::once(Arc::new(Field::new("a", DataType::Boolean, false))).collect::<Fields>();
40
/// ```
41
///
42
/// See [`SchemaBuilder`] for mutating or updating [`Fields`]
43
///
44
/// ```
45
/// # use arrow_schema::{DataType, Field, SchemaBuilder};
46
/// let mut builder = SchemaBuilder::new();
47
/// builder.push(Field::new("a", DataType::Boolean, false));
48
/// builder.push(Field::new("b", DataType::Boolean, false));
49
/// let fields = builder.finish().fields;
50
///
51
/// let mut builder = SchemaBuilder::from(&fields);
52
/// builder.remove(0);
53
/// let new = builder.finish().fields;
54
/// ```
55
///
56
/// [`SchemaBuilder`]: crate::SchemaBuilder
57
#[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
58
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
59
#[cfg_attr(feature = "serde", serde(transparent))]
60
pub struct Fields(Arc<[FieldRef]>);
61
62
impl std::fmt::Debug for Fields {
63
4
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
64
4
        self.0.as_ref().fmt(f)
65
4
    }
66
}
67
68
impl Fields {
69
    /// Returns a new empty [`Fields`]
70
14
    pub fn empty() -> Self {
71
14
        Self(Arc::new([]))
72
14
    }
73
74
    /// Return size of this instance in bytes.
75
0
    pub fn size(&self) -> usize {
76
0
        self.iter()
77
0
            .map(|field| field.size() + std::mem::size_of::<FieldRef>())
78
0
            .sum()
79
0
    }
80
81
    /// Searches for a field by name, returning it along with its index if found
82
13
    pub fn find(&self, name: &str) -> Option<(usize, &FieldRef)> {
83
13
        self.0.iter().enumerate().find(|(_, b)| b.name() == name)
84
13
    }
85
86
    /// Check to see if `self` is a superset of `other`
87
    ///
88
    /// In particular returns true if both have the same number of fields, and [`Field::contains`]
89
    /// for each field across self and other
90
    ///
91
    /// In other words, any record that conforms to `other` should also conform to `self`
92
0
    pub fn contains(&self, other: &Fields) -> bool {
93
0
        if Arc::ptr_eq(&self.0, &other.0) {
94
0
            return true;
95
0
        }
96
0
        self.len() == other.len()
97
0
            && self
98
0
                .iter()
99
0
                .zip(other.iter())
100
0
                .all(|(a, b)| Arc::ptr_eq(a, b) || a.contains(b))
101
0
    }
102
103
    /// Returns a copy of this [`Fields`] containing only those [`FieldRef`] passing a predicate
104
    ///
105
    /// Performs a depth-first scan of [`Fields`] invoking `filter` for each [`FieldRef`]
106
    /// containing no child [`FieldRef`], a leaf field, along with a count of the number
107
    /// of such leaves encountered so far. Only [`FieldRef`] for which `filter`
108
    /// returned `true` will be included in the result.
109
    ///
110
    /// This can therefore be used to select a subset of fields from nested types
111
    /// such as [`DataType::Struct`] or [`DataType::List`].
112
    ///
113
    /// ```
114
    /// # use arrow_schema::{DataType, Field, Fields};
115
    /// let fields = Fields::from(vec![
116
    ///     Field::new("a", DataType::Int32, true), // Leaf 0
117
    ///     Field::new("b", DataType::Struct(Fields::from(vec![
118
    ///         Field::new("c", DataType::Float32, false), // Leaf 1
119
    ///         Field::new("d", DataType::Float64, false), // Leaf 2
120
    ///         Field::new("e", DataType::Struct(Fields::from(vec![
121
    ///             Field::new("f", DataType::Int32, false),   // Leaf 3
122
    ///             Field::new("g", DataType::Float16, false), // Leaf 4
123
    ///         ])), true),
124
    ///     ])), false)
125
    /// ]);
126
    /// let filtered = fields.filter_leaves(|idx, _| [0, 2, 3, 4].contains(&idx));
127
    /// let expected = Fields::from(vec![
128
    ///     Field::new("a", DataType::Int32, true),
129
    ///     Field::new("b", DataType::Struct(Fields::from(vec![
130
    ///         Field::new("d", DataType::Float64, false),
131
    ///         Field::new("e", DataType::Struct(Fields::from(vec![
132
    ///             Field::new("f", DataType::Int32, false),
133
    ///             Field::new("g", DataType::Float16, false),
134
    ///         ])), true),
135
    ///     ])), false)
136
    /// ]);
137
    /// assert_eq!(filtered, expected);
138
    /// ```
139
0
    pub fn filter_leaves<F: FnMut(usize, &FieldRef) -> bool>(&self, mut filter: F) -> Self {
140
0
        self.try_filter_leaves(|idx, field| Ok(filter(idx, field)))
141
0
            .unwrap()
142
0
    }
143
144
    /// Returns a copy of this [`Fields`] containing only those [`FieldRef`] passing a predicate
145
    /// or an error if the predicate fails.
146
    ///
147
    /// See [`Fields::filter_leaves`] for more information.
148
0
    pub fn try_filter_leaves<F: FnMut(usize, &FieldRef) -> Result<bool, ArrowError>>(
149
0
        &self,
150
0
        mut filter: F,
151
0
    ) -> Result<Self, ArrowError> {
152
0
        fn filter_field<F: FnMut(&FieldRef) -> Result<bool, ArrowError>>(
153
0
            f: &FieldRef,
154
0
            filter: &mut F,
155
0
        ) -> Result<Option<FieldRef>, ArrowError> {
156
            use DataType::*;
157
158
0
            let v = match f.data_type() {
159
0
                Dictionary(_, v) => v.as_ref(),       // Key must be integer
160
0
                RunEndEncoded(_, v) => v.data_type(), // Run-ends must be integer
161
0
                d => d,
162
            };
163
0
            let d = match v {
164
0
                List(child) => {
165
0
                    let fields = filter_field(child, filter)?;
166
0
                    if let Some(fields) = fields {
167
0
                        List(fields)
168
                    } else {
169
0
                        return Ok(None);
170
                    }
171
                }
172
0
                LargeList(child) => {
173
0
                    let fields = filter_field(child, filter)?;
174
0
                    if let Some(fields) = fields {
175
0
                        LargeList(fields)
176
                    } else {
177
0
                        return Ok(None);
178
                    }
179
                }
180
0
                Map(child, ordered) => {
181
0
                    let fields = filter_field(child, filter)?;
182
0
                    if let Some(fields) = fields {
183
0
                        Map(fields, *ordered)
184
                    } else {
185
0
                        return Ok(None);
186
                    }
187
                }
188
0
                FixedSizeList(child, size) => {
189
0
                    let fields = filter_field(child, filter)?;
190
0
                    if let Some(fields) = fields {
191
0
                        FixedSizeList(fields, *size)
192
                    } else {
193
0
                        return Ok(None);
194
                    }
195
                }
196
0
                Struct(fields) => {
197
0
                    let filtered: Result<Vec<_>, _> =
198
0
                        fields.iter().map(|f| filter_field(f, filter)).collect();
199
0
                    let filtered: Fields = filtered?
200
0
                        .iter()
201
0
                        .filter_map(|f| f.as_ref().cloned())
202
0
                        .collect();
203
204
0
                    if filtered.is_empty() {
205
0
                        return Ok(None);
206
0
                    }
207
208
0
                    Struct(filtered)
209
                }
210
0
                Union(fields, mode) => {
211
0
                    let filtered: Result<Vec<_>, _> = fields
212
0
                        .iter()
213
0
                        .map(|(id, f)| filter_field(f, filter).map(|f| f.map(|f| (id, f))))
214
0
                        .collect();
215
0
                    let filtered: UnionFields = filtered?
216
0
                        .iter()
217
0
                        .filter_map(|f| f.as_ref().cloned())
218
0
                        .collect();
219
220
0
                    if filtered.is_empty() {
221
0
                        return Ok(None);
222
0
                    }
223
224
0
                    Union(filtered, *mode)
225
                }
226
                _ => {
227
0
                    let filtered = filter(f)?;
228
0
                    return Ok(filtered.then(|| f.clone()));
229
                }
230
            };
231
0
            let d = match f.data_type() {
232
0
                Dictionary(k, _) => Dictionary(k.clone(), Box::new(d)),
233
0
                RunEndEncoded(v, f) => {
234
0
                    RunEndEncoded(v.clone(), Arc::new(f.as_ref().clone().with_data_type(d)))
235
                }
236
0
                _ => d,
237
            };
238
0
            Ok(Some(Arc::new(f.as_ref().clone().with_data_type(d))))
239
0
        }
240
241
0
        let mut leaf_idx = 0;
242
0
        let mut filter = |f: &FieldRef| {
243
0
            let t = filter(leaf_idx, f)?;
244
0
            leaf_idx += 1;
245
0
            Ok(t)
246
0
        };
247
248
0
        let filtered: Result<Vec<_>, _> = self
249
0
            .0
250
0
            .iter()
251
0
            .map(|f| filter_field(f, &mut filter))
252
0
            .collect();
253
0
        let filtered = filtered?
254
0
            .iter()
255
0
            .filter_map(|f| f.as_ref().cloned())
256
0
            .collect();
257
0
        Ok(filtered)
258
0
    }
259
}
260
261
impl Default for Fields {
262
3
    fn default() -> Self {
263
3
        Self::empty()
264
3
    }
265
}
266
267
impl FromIterator<Field> for Fields {
268
99
    fn from_iter<T: IntoIterator<Item = Field>>(iter: T) -> Self {
269
99
        iter.into_iter().map(Arc::new).collect()
270
99
    }
271
}
272
273
impl FromIterator<FieldRef> for Fields {
274
99
    fn from_iter<T: IntoIterator<Item = FieldRef>>(iter: T) -> Self {
275
99
        Self(iter.into_iter().collect())
276
99
    }
277
}
278
279
impl From<Vec<Field>> for Fields {
    /// Converts a vector of owned fields, delegating to `FromIterator<Field>`
    fn from(value: Vec<Field>) -> Self {
        <Self as FromIterator<Field>>::from_iter(value)
    }
}
284
285
impl From<Vec<FieldRef>> for Fields {
286
54
    fn from(value: Vec<FieldRef>) -> Self {
287
54
        Self(value.into())
288
54
    }
289
}
290
291
impl From<&[FieldRef]> for Fields {
292
0
    fn from(value: &[FieldRef]) -> Self {
293
0
        Self(value.into())
294
0
    }
295
}
296
297
impl<const N: usize> From<[FieldRef; N]> for Fields {
298
0
    fn from(value: [FieldRef; N]) -> Self {
299
0
        Self(Arc::new(value))
300
0
    }
301
}
302
303
impl Deref for Fields {
304
    type Target = [FieldRef];
305
306
923
    fn deref(&self) -> &Self::Target {
307
923
        self.0.as_ref()
308
923
    }
309
}
310
311
impl<'a> IntoIterator for &'a Fields {
312
    type Item = &'a FieldRef;
313
    type IntoIter = std::slice::Iter<'a, FieldRef>;
314
315
278
    fn into_iter(self) -> Self::IntoIter {
316
278
        self.0.iter()
317
278
    }
318
}
319
320
/// A cheaply cloneable, owned collection of [`FieldRef`] and their corresponding type ids
321
#[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
322
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
323
#[cfg_attr(feature = "serde", serde(transparent))]
324
pub struct UnionFields(Arc<[(i8, FieldRef)]>);
325
326
impl std::fmt::Debug for UnionFields {
327
0
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
328
0
        self.0.as_ref().fmt(f)
329
0
    }
330
}
331
332
impl UnionFields {
333
    /// Create a new [`UnionFields`] with no fields
334
2
    pub fn empty() -> Self {
335
2
        Self(Arc::from([]))
336
2
    }
337
338
    /// Create a new [`UnionFields`] from a [`Fields`] and array of type_ids
339
    ///
340
    /// See <https://arrow.apache.org/docs/format/Columnar.html#union-layout>
341
    ///
342
    /// ```
343
    /// use arrow_schema::{DataType, Field, UnionFields};
344
    /// // Create a new UnionFields with type id mapping
345
    /// // 1 -> DataType::UInt8
346
    /// // 3 -> DataType::Utf8
347
    /// UnionFields::new(
348
    ///     vec![1, 3],
349
    ///     vec![
350
    ///         Field::new("field1", DataType::UInt8, false),
351
    ///         Field::new("field3", DataType::Utf8, false),
352
    ///     ],
353
    /// );
354
    /// ```
355
34
    pub fn new<F, T>(type_ids: T, fields: F) -> Self
356
34
    where
357
34
        F: IntoIterator,
358
34
        F::Item: Into<FieldRef>,
359
34
        T: IntoIterator<Item = i8>,
360
    {
361
34
        let fields = fields.into_iter().map(Into::into);
362
34
        let mut set = 0_u128;
363
34
        type_ids
364
34
            .into_iter()
365
60
            .
inspect34
(|&idx| {
366
60
                let mask = 1_u128 << idx;
367
60
                if (set & mask) != 0 {
368
0
                    panic!("duplicate type id: {idx}");
369
60
                } else {
370
60
                    set |= mask;
371
60
                }
372
60
            })
373
34
            .zip(fields)
374
34
            .collect()
375
34
    }
376
377
    /// Return size of this instance in bytes.
378
0
    pub fn size(&self) -> usize {
379
0
        self.iter()
380
0
            .map(|(_, field)| field.size() + std::mem::size_of::<(i8, FieldRef)>())
381
0
            .sum()
382
0
    }
383
384
    /// Returns the number of fields in this [`UnionFields`]
385
92
    pub fn len(&self) -> usize {
386
92
        self.0.len()
387
92
    }
388
389
    /// Returns `true` if this is empty
390
4
    pub fn is_empty(&self) -> bool {
391
4
        self.0.is_empty()
392
4
    }
393
394
    /// Returns an iterator over the fields and type ids in this [`UnionFields`]
395
383
    pub fn iter(&self) -> impl Iterator<Item = (i8, &FieldRef)> + '_ {
396
596
        
self.0.iter()383
.
map383
(|(id, f)| (*id, f))
397
383
    }
398
399
    /// Merge this field into self if it is compatible.
400
    ///
401
    /// See [`Field::try_merge`]
402
0
    pub(crate) fn try_merge(&mut self, other: &Self) -> Result<(), ArrowError> {
403
        // TODO: This currently may produce duplicate type IDs (#3982)
404
0
        let mut output: Vec<_> = self.iter().map(|(id, f)| (id, f.clone())).collect();
405
0
        for (field_type_id, from_field) in other.iter() {
406
0
            let mut is_new_field = true;
407
0
            for (self_type_id, self_field) in output.iter_mut() {
408
0
                if from_field == self_field {
409
                    // If the nested fields in two unions are the same, they must have same
410
                    // type id.
411
0
                    if *self_type_id != field_type_id {
412
0
                        return Err(ArrowError::SchemaError(format!(
413
0
                            "Fail to merge schema field '{}' because the self_type_id = {} does not equal field_type_id = {}",
414
0
                            self_field.name(),
415
0
                            self_type_id,
416
0
                            field_type_id
417
0
                        )));
418
0
                    }
419
420
0
                    is_new_field = false;
421
0
                    break;
422
0
                }
423
            }
424
425
0
            if is_new_field {
426
0
                output.push((field_type_id, from_field.clone()))
427
0
            }
428
        }
429
0
        *self = output.into_iter().collect();
430
0
        Ok(())
431
0
    }
432
}
433
434
impl FromIterator<(i8, FieldRef)> for UnionFields {
435
53
    fn from_iter<T: IntoIterator<Item = (i8, FieldRef)>>(iter: T) -> Self {
436
        // TODO: Should this validate type IDs are unique (#3982)
437
53
        Self(iter.into_iter().collect())
438
53
    }
439
}
440
441
#[cfg(test)]
mod tests {
    use super::*;
    use crate::UnionMode;

    /// Exercises `filter_leaves` / `try_filter_leaves` across every nested type
    /// the filter descends into.
    #[test]
    fn test_filter() {
        let floats = Fields::from(vec![
            Field::new("a", DataType::Float32, false),
            Field::new("b", DataType::Float32, false),
        ]);
        let float_struct = DataType::Struct(floats.clone());

        let fields = Fields::from(vec![
            // Leaf 0
            Field::new("a", DataType::Int32, true),
            // Leaves 1, 2
            Field::new("floats", float_struct.clone(), true),
            // Leaf 3
            Field::new("b", DataType::Int16, true),
            // Leaf 4
            Field::new(
                "c",
                DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
                false,
            ),
            // Leaves 5, 6
            Field::new(
                "d",
                DataType::Dictionary(
                    Box::new(DataType::Int32),
                    Box::new(float_struct.clone()),
                ),
                false,
            ),
            // Leaves 7, 8
            Field::new_list(
                "e",
                Field::new("floats", float_struct.clone(), true),
                true,
            ),
            // Leaf 9
            Field::new_fixed_size_list(
                "f",
                Field::new_list_field(DataType::Int32, false),
                3,
                false,
            ),
            // Leaves 10, 11
            Field::new_map(
                "g",
                "entries",
                Field::new("keys", DataType::LargeUtf8, false),
                Field::new("values", DataType::Int32, true),
                false,
                false,
            ),
            // Leaves 12, 13
            Field::new(
                "h",
                DataType::Union(
                    UnionFields::new(
                        vec![1, 3],
                        vec![
                            Field::new("field1", DataType::UInt8, false),
                            Field::new("field3", DataType::Utf8, false),
                        ],
                    ),
                    UnionMode::Dense,
                ),
                true,
            ),
            // Leaves 14, 15
            Field::new(
                "i",
                DataType::RunEndEncoded(
                    Arc::new(Field::new("run_ends", DataType::Int32, false)),
                    Arc::new(Field::new("values", float_struct.clone(), true)),
                ),
                false,
            ),
        ]);

        // A struct holding only the first of the two float fields.
        let floats_a = DataType::Struct(vec![floats[0].clone()].into());

        // Select by leaf index.
        let first_two = fields.filter_leaves(|idx, _| idx == 0 || idx == 1);
        assert_eq!(first_two.len(), 2);
        assert_eq!(first_two[0], fields[0]);
        assert_eq!(first_two[1].data_type(), &floats_a);

        // Select by leaf name.
        let named_a = fields.filter_leaves(|_, f| f.name() == "a");
        assert_eq!(named_a.len(), 5);
        assert_eq!(named_a[0], fields[0]);
        assert_eq!(named_a[1].data_type(), &floats_a);
        assert_eq!(
            named_a[2].data_type(),
            &DataType::Dictionary(Box::new(DataType::Int32), Box::new(floats_a.clone()))
        );
        assert_eq!(
            named_a[3].as_ref(),
            &Field::new_list("e", Field::new("floats", floats_a.clone(), true), true)
        );
        assert_eq!(
            named_a[4].as_ref(),
            &Field::new(
                "i",
                DataType::RunEndEncoded(
                    Arc::new(Field::new("run_ends", DataType::Int32, false)),
                    Arc::new(Field::new("values", floats_a.clone(), true)),
                ),
                false,
            )
        );

        // "floats" only names non-leaf fields, so nothing is selected.
        let named_floats = fields.filter_leaves(|_, f| f.name() == "floats");
        assert_eq!(named_floats.len(), 0);

        // Fixed size list.
        let fixed_list = fields.filter_leaves(|idx, _| idx == 9);
        assert_eq!(fixed_list.len(), 1);
        assert_eq!(fixed_list[0], fields[6]);

        // Map with both key and value retained.
        let map = fields.filter_leaves(|idx, _| idx == 10 || idx == 11);
        assert_eq!(map.len(), 1);
        assert_eq!(map[0], fields[7]);

        // Union reduced to its first variant.
        let union = DataType::Union(
            UnionFields::new(vec![1], vec![Field::new("field1", DataType::UInt8, false)]),
            UnionMode::Dense,
        );
        let union_first = fields.filter_leaves(|idx, _| idx == 12);
        assert_eq!(union_first.len(), 1);
        assert_eq!(union_first[0].data_type(), &union);

        // Run-end encoded struct with both values retained.
        let run_end = fields.filter_leaves(|idx, _| idx == 14 || idx == 15);
        assert_eq!(run_end.len(), 1);
        assert_eq!(run_end[0], fields[9]);

        // Propagate error
        let failed =
            fields.try_filter_leaves(|_, _| Err(ArrowError::SchemaError("error".to_string())));
        assert!(failed.is_err());
    }
}