/Users/andrewlamb/Software/arrow-rs/arrow-array/src/iterator.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! Idiomatic iterators for [`Array`](crate::Array) |
19 | | |
20 | | use crate::array::{ |
21 | | ArrayAccessor, BooleanArray, FixedSizeBinaryArray, GenericBinaryArray, GenericListArray, |
22 | | GenericStringArray, PrimitiveArray, |
23 | | }; |
24 | | use crate::{FixedSizeListArray, GenericListViewArray, MapArray}; |
25 | | use arrow_buffer::NullBuffer; |
26 | | |
27 | | /// An iterator that yields `Some(T)` for valid slots and `None` for null slots of any [`ArrayAccessor`] |
28 | | /// |
29 | | /// # Performance |
30 | | /// |
31 | | /// [`ArrayIter`] provides an idiomatic way to iterate over an array; however, this |
32 | | /// comes at the cost of performance. In particular, the interleaved handling of |
33 | | /// the null mask is often sub-optimal. |
34 | | /// |
35 | | /// If performing an infallible operation, it is typically faster to perform the operation |
36 | | /// on every index of the array, and handle the null mask separately. For [`PrimitiveArray`] |
37 | | /// this functionality is provided by [`compute::unary`]. |
38 | | /// |
39 | | /// If performing a fallible operation, it isn't possible to perform the operation independently |
40 | | /// of the null mask, as this might result in a spurious failure on a null index. However, |
41 | | /// there are more efficient ways to iterate over just the non-null indices; this functionality |
42 | | /// is provided by [`compute::try_unary`]. |
43 | | /// |
44 | | /// [`PrimitiveArray`]: crate::PrimitiveArray |
45 | | /// [`compute::unary`]: https://docs.rs/arrow/latest/arrow/compute/fn.unary.html |
46 | | /// [`compute::try_unary`]: https://docs.rs/arrow/latest/arrow/compute/fn.try_unary.html |
47 | | #[derive(Debug)] |
48 | | pub struct ArrayIter<T: ArrayAccessor> { |
49 | | array: T, |
50 | | logical_nulls: Option<NullBuffer>, |
51 | | current: usize, |
52 | | current_end: usize, |
53 | | } |
54 | | |
55 | | impl<T: ArrayAccessor> ArrayIter<T> { |
56 | | /// Creates an iterator over every slot of `array`, capturing `array.logical_nulls()` once up front |
57 | 0 | pub fn new(array: T) -> Self { |
58 | 0 | let len = array.len(); |
59 | 0 | let logical_nulls = array.logical_nulls(); |
60 | 0 | ArrayIter { |
61 | 0 | array, |
62 | 0 | logical_nulls, |
63 | 0 | current: 0, |
64 | 0 | current_end: len, |
65 | 0 | } |
66 | 0 | } |
67 | | |
68 | | #[inline] |
69 | 0 | fn is_null(&self, idx: usize) -> bool { |
70 | 0 | self.logical_nulls |
71 | 0 | .as_ref() |
72 | 0 | .map(|x| x.is_null(idx)) |
73 | 0 | .unwrap_or_default() |
74 | 0 | } |
75 | | } |
76 | | |
77 | | impl<T: ArrayAccessor> Iterator for ArrayIter<T> { |
78 | | type Item = Option<T::Item>; |
79 | | /// Yields the next slot from the front: `Some(None)` for a null, `Some(Some(v))` for a value. |
80 | | #[inline] |
81 | 0 | fn next(&mut self) -> Option<Self::Item> { |
82 | 0 | if self.current == self.current_end { |
83 | 0 | None |
84 | 0 | } else if self.is_null(self.current) { |
85 | 0 | self.current += 1; |
86 | 0 | Some(None) |
87 | | } else { |
88 | 0 | let old = self.current; |
89 | 0 | self.current += 1; |
90 | | // SAFETY: |
91 | | // the cursors were just checked to differ and `current` never passes |
92 | | // `current_end`, so `old < current_end`. `current_end` is initialized to |
93 | | // `array.len()` and is only ever decremented, hence `old < array.len()`. |
94 | | // Null slots were already handled by the branch above. |
95 | 0 | unsafe { Some(Some(self.array.value_unchecked(old))) } |
96 | | } |
97 | 0 | } |
98 | | |
99 | 0 | fn size_hint(&self) -> (usize, Option<usize>) { |
100 | | // The unyielded slots are `current..current_end`. Using `array.len()` here would |
101 | | // over-count once `next_back` has consumed slots, breaking `ExactSizeIterator`. |
102 | 0 | let remaining = self.current_end - self.current; |
103 | 0 | (remaining, Some(remaining)) |
104 | 0 | } |
105 | | } |
106 | | |
107 | | impl<T: ArrayAccessor> DoubleEndedIterator for ArrayIter<T> { |
108 | | fn next_back(&mut self) -> Option<Self::Item> { |
109 | | if self.current_end == self.current { |
110 | | None |
111 | | } else { |
112 | | self.current_end -= 1; |
113 | | Some(if self.is_null(self.current_end) { |
114 | | None |
115 | | } else { |
116 | | // SAFETY: |
117 | | // `current_end` is initialized to `array.len()` and is only ever decremented. |
118 | | // The cursors differed (checked above) and `current` never passes `current_end`, |
119 | | // so after the decrement `current <= current_end < array.len()`: in bounds. |
120 | | // Null slots were already handled by the branch above. |
121 | | unsafe { Some(self.array.value_unchecked(self.current_end)) } |
122 | | }) |
123 | | } |
124 | | } |
125 | | } |
126 | | |
127 | | /// All arrays have a known length, so the number of remaining items is always known exactly. |
128 | | impl<T: ArrayAccessor> ExactSizeIterator for ArrayIter<T> {} |
129 | | |
130 | | /// An iterator that returns `Some(T)` or `None`, that can be used on any [`PrimitiveArray`] |
131 | | pub type PrimitiveIter<'a, T> = ArrayIter<&'a PrimitiveArray<T>>; |
132 | | /// An iterator that returns `Some(T)` or `None`, that can be used on any [`BooleanArray`] |
133 | | pub type BooleanIter<'a> = ArrayIter<&'a BooleanArray>; |
134 | | /// An iterator that returns `Some(T)` or `None`, that can be used on any [`GenericStringArray`] |
135 | | pub type GenericStringIter<'a, T> = ArrayIter<&'a GenericStringArray<T>>; |
136 | | /// An iterator that returns `Some(T)` or `None`, that can be used on any [`GenericBinaryArray`] |
137 | | pub type GenericBinaryIter<'a, T> = ArrayIter<&'a GenericBinaryArray<T>>; |
138 | | /// An iterator that returns `Some(T)` or `None`, that can be used on any [`FixedSizeBinaryArray`] |
139 | | pub type FixedSizeBinaryIter<'a> = ArrayIter<&'a FixedSizeBinaryArray>; |
140 | | /// An iterator that returns `Some(T)` or `None`, that can be used on any [`FixedSizeListArray`] |
141 | | pub type FixedSizeListIter<'a> = ArrayIter<&'a FixedSizeListArray>; |
142 | | /// An iterator that returns `Some(T)` or `None`, that can be used on any [`GenericListArray`] |
143 | | pub type GenericListArrayIter<'a, O> = ArrayIter<&'a GenericListArray<O>>; |
144 | | /// An iterator that returns `Some(T)` or `None`, that can be used on any [`MapArray`] |
145 | | pub type MapArrayIter<'a> = ArrayIter<&'a MapArray>; |
146 | | /// An iterator that returns `Some(T)` or `None`, that can be used on any [`GenericListViewArray`] |
147 | | pub type GenericListViewArrayIter<'a, O> = ArrayIter<&'a GenericListViewArray<O>>; |
148 | | #[cfg(test)] |
149 | | mod tests { |
150 | | use std::sync::Arc; |
151 | | |
152 | | use crate::array::{ArrayRef, BinaryArray, BooleanArray, Int32Array, StringArray}; |
153 | | |
154 | | #[test] |
155 | | fn test_primitive_array_iter_round_trip() { |
156 | | let array = Int32Array::from(vec![Some(0), None, Some(2), None, Some(4)]); |
157 | | let array = Arc::new(array) as ArrayRef; |
158 | | |
159 | | let array = array.as_any().downcast_ref::<Int32Array>().unwrap(); |
160 | | |
161 | | // round trip through the iterator, adding 1 to every non-null value |
162 | | let result: Int32Array = array.iter().map(|e| e.map(|e| e + 1)).collect(); |
163 | | |
164 | | let expected = Int32Array::from(vec![Some(1), None, Some(3), None, Some(5)]); |
165 | | assert_eq!(result, expected); |
166 | | |
167 | | // `rev` requires `DoubleEndedIterator`; the reversed output must mirror values and nulls |
168 | | let result: Int32Array = array.iter().rev().collect(); |
169 | | let rev_array = Int32Array::from(vec![Some(4), None, Some(2), None, Some(0)]); |
170 | | assert_eq!(result, rev_array); |
171 | | // `rposition` requires `ExactSizeIterator + DoubleEndedIterator`, so this is a compile-time check |
172 | | let _ = array.iter().rposition(|opt_b| opt_b == Some(1)); |
173 | | } |
174 | | |
175 | | #[test] |
176 | | fn test_double_ended() { |
177 | | let array = Int32Array::from(vec![Some(0), None, Some(2), None, Some(4)]); |
178 | | let mut a = array.iter(); |
179 | | assert_eq!(a.next(), Some(Some(0))); |
180 | | assert_eq!(a.next(), Some(None)); |
181 | | assert_eq!(a.next_back(), Some(Some(4))); |
182 | | assert_eq!(a.next_back(), Some(None)); |
183 | | assert_eq!(a.next_back(), Some(Some(2))); |
184 | | // the two cursors have met: both ends now report exhaustion |
185 | | assert_eq!(a.next_back(), None); |
186 | | assert_eq!(a.next(), None); |
187 | | } |
188 | | |
189 | | #[test] |
190 | | fn test_string_array_iter_round_trip() { |
191 | | let array = StringArray::from(vec![Some("a"), None, Some("aaa"), None, Some("aaaaa")]); |
192 | | let array = Arc::new(array) as ArrayRef; |
193 | | |
194 | | let array = array.as_any().downcast_ref::<StringArray>().unwrap(); |
195 | | |
196 | | // round trip through the iterator, appending 'b' to every non-null value |
197 | | let result: StringArray = array |
198 | | .iter() |
199 | | .map(|e| { |
200 | | e.map(|e| { |
201 | | let mut a = e.to_string(); |
202 | | a.push('b'); |
203 | | a |
204 | | }) |
205 | | }) |
206 | | .collect(); |
207 | | |
208 | | let expected = |
209 | | StringArray::from(vec![Some("ab"), None, Some("aaab"), None, Some("aaaaab")]); |
210 | | assert_eq!(result, expected); |
211 | | |
212 | | // `rev` requires `DoubleEndedIterator`; the reversed output must mirror values and nulls |
213 | | let result: StringArray = array.iter().rev().collect(); |
214 | | let rev_array = StringArray::from(vec![Some("aaaaa"), None, Some("aaa"), None, Some("a")]); |
215 | | assert_eq!(result, rev_array); |
216 | | // `rposition` requires `ExactSizeIterator + DoubleEndedIterator`, so this is a compile-time check |
217 | | let _ = array.iter().rposition(|opt_b| opt_b == Some("a")); |
218 | | } |
219 | | |
220 | | #[test] |
221 | | fn test_binary_array_iter_round_trip() { |
222 | | let array = BinaryArray::from(vec![ |
223 | | Some(b"a" as &[u8]), |
224 | | None, |
225 | | Some(b"aaa"), |
226 | | None, |
227 | | Some(b"aaaaa"), |
228 | | ]); |
229 | | |
230 | | // round trip through the iterator with the values unchanged |
231 | | let result: BinaryArray = array.iter().collect(); |
232 | | |
233 | | assert_eq!(result, array); |
234 | | |
235 | | // `rev` requires `DoubleEndedIterator`; the reversed output must mirror values and nulls |
236 | | let result: BinaryArray = array.iter().rev().collect(); |
237 | | let rev_array = BinaryArray::from(vec![ |
238 | | Some(b"aaaaa" as &[u8]), |
239 | | None, |
240 | | Some(b"aaa"), |
241 | | None, |
242 | | Some(b"a"), |
243 | | ]); |
244 | | assert_eq!(result, rev_array); |
245 | | |
246 | | // `rposition` requires `ExactSizeIterator + DoubleEndedIterator`, so this is a compile-time check |
247 | | let _ = array.iter().rposition(|opt_b| opt_b == Some(&[9])); |
248 | | } |
249 | | |
250 | | #[test] |
251 | | fn test_boolean_array_iter_round_trip() { |
252 | | let array = BooleanArray::from(vec![Some(true), None, Some(false)]); |
253 | | |
254 | | // round trip through the iterator with the values unchanged |
255 | | let result: BooleanArray = array.iter().collect(); |
256 | | |
257 | | assert_eq!(result, array); |
258 | | |
259 | | // `rev` requires `DoubleEndedIterator`; the reversed output must mirror values and nulls |
260 | | let result: BooleanArray = array.iter().rev().collect(); |
261 | | let rev_array = BooleanArray::from(vec![Some(false), None, Some(true)]); |
262 | | assert_eq!(result, rev_array); |
263 | | |
264 | | // `rposition` requires `ExactSizeIterator + DoubleEndedIterator`, so this is a compile-time check |
265 | | let _ = array.iter().rposition(|opt_b| opt_b == Some(true)); |
266 | | } |
267 | | } |