/Users/andrewlamb/Software/arrow-rs/arrow-buffer/src/buffer/run.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use crate::buffer::ScalarBuffer; |
19 | | use crate::ArrowNativeType; |
20 | | |
/// A slice-able buffer of monotonically increasing, positive integers used to store run-ends
///
/// See [Run-End encoded layout] in the Arrow columnar format documentation.
///
/// # Logical vs Physical
///
/// A [`RunEndBuffer`] is used to encode runs of the same value, the index of each run is
/// called the physical index. The logical index is then the corresponding index in the logical
/// run-encoded array, i.e. a single run of length `3`, would have the logical indices `0..3`.
///
/// Each value in [`RunEndBuffer::values`] is the cumulative length of all runs in the
/// logical array, up to that physical index.
///
/// Consider a [`RunEndBuffer`] containing `[3, 4, 6]`. The maximum physical index is `2`,
/// as there are `3` values, and the maximum logical index is `5`, as the maximum run end
/// is `6`. The physical indices are therefore `[0, 0, 0, 1, 2, 2]`
///
/// ```text
///     ┌─────────┐        ┌─────────┐           ┌─────────┐
///     │    3    │        │    0    │ ─┬──────▶ │    0    │
///     ├─────────┤        ├─────────┤  │        ├─────────┤
///     │    4    │        │    1    │ ─┤ ┌────▶ │    1    │
///     ├─────────┤        ├─────────┤  │ │      ├─────────┤
///     │    6    │        │    2    │ ─┘ │ ┌──▶ │    2    │
///     └─────────┘        ├─────────┤    │ │    └─────────┘
///      run ends          │    3    │ ───┘ │  physical indices
///                        ├─────────┤      │
///                        │    4    │ ─────┤
///                        ├─────────┤      │
///                        │    5    │ ─────┘
///                        └─────────┘
///                      logical indices
/// ```
///
/// # Slicing
///
/// In order to provide zero-copy slicing, this container stores a separate offset and length.
///
/// For example, a [`RunEndBuffer`] containing values `[3, 6, 8]` with offset `4` and length `4`
/// would describe the physical indices `1, 1, 2, 2`
///
/// As a further example, a [`RunEndBuffer`] containing values `[6, 8, 9]` with offset `2` and
/// length `5` would describe the physical indices `0, 0, 0, 0, 1`
///
/// [Run-End encoded layout]: https://arrow.apache.org/docs/format/Columnar.html#run-end-encoded-layout
#[derive(Debug, Clone)]
pub struct RunEndBuffer<E: ArrowNativeType> {
    // The run-ends exactly as stored; `offset` and `len` are not applied to them
    run_ends: ScalarBuffer<E>,
    // Logical length of this buffer
    len: usize,
    // Logical offset into `run_ends` at which this buffer starts
    offset: usize,
}
70 | | |
71 | | impl<E> RunEndBuffer<E> |
72 | | where |
73 | | E: ArrowNativeType, |
74 | | { |
75 | | /// Create a new [`RunEndBuffer`] from a [`ScalarBuffer`], an `offset` and `len` |
76 | | /// |
77 | | /// # Panics |
78 | | /// |
79 | | /// - `buffer` does not contain strictly increasing values greater than zero |
80 | | /// - the last value of `buffer` is less than `offset + len` |
81 | | pub fn new(run_ends: ScalarBuffer<E>, offset: usize, len: usize) -> Self { |
82 | | assert!( |
83 | | run_ends.windows(2).all(|w| w[0] < w[1]), |
84 | | "run-ends not strictly increasing" |
85 | | ); |
86 | | |
87 | | if len != 0 { |
88 | | assert!(!run_ends.is_empty(), "non-empty slice but empty run-ends"); |
89 | | let end = E::from_usize(offset.saturating_add(len)).unwrap(); |
90 | | assert!( |
91 | | *run_ends.first().unwrap() > E::usize_as(0), |
92 | | "run-ends not greater than 0" |
93 | | ); |
94 | | assert!( |
95 | | *run_ends.last().unwrap() >= end, |
96 | | "slice beyond bounds of run-ends" |
97 | | ); |
98 | | } |
99 | | |
100 | | Self { |
101 | | run_ends, |
102 | | offset, |
103 | | len, |
104 | | } |
105 | | } |
106 | | |
/// Create a new [`RunEndBuffer`] from a [`ScalarBuffer`], an `offset` and `len`
/// without performing any validation
///
/// The three arguments are stored exactly as provided.
///
/// # Safety
///
/// - `run_ends` must contain strictly increasing values greater than zero
/// - The last value of `run_ends` must be greater than or equal to `offset + len`
pub unsafe fn new_unchecked(run_ends: ScalarBuffer<E>, offset: usize, len: usize) -> Self {
    Self {
        run_ends,
        offset,
        len,
    }
}
120 | | |
/// Returns the logical offset into the run-ends stored by this buffer
///
/// This is the `offset` the buffer was constructed or sliced with.
#[inline]
pub fn offset(&self) -> usize {
    self.offset
}
126 | | |
/// Returns the logical length of the run-ends stored by this buffer
///
/// This is the number of logical indices covered, not the number of stored run-ends.
#[inline]
pub fn len(&self) -> usize {
    self.len
}
132 | | |
/// Returns true if this buffer is empty, i.e. its logical length is zero
///
/// Only the logical length is consulted; the underlying run-ends may still hold values.
#[inline]
pub fn is_empty(&self) -> bool {
    self.len == 0
}
138 | | |
/// Free up unused memory.
///
/// Delegates to `shrink_to_fit` on the underlying run-ends buffer; the logical
/// `offset` and `len` of this buffer are left untouched.
pub fn shrink_to_fit(&mut self) {
    // TODO(emilk): we could shrink even more in the case where we are a small sub-slice of the full buffer
    self.run_ends.shrink_to_fit();
}
144 | | |
/// Returns the values of this [`RunEndBuffer`] not including any offset
///
/// The returned slice is the complete set of stored run-ends; neither the logical
/// `offset` nor the logical `len` of this buffer is applied to it.
#[inline]
pub fn values(&self) -> &[E] {
    &self.run_ends
}
150 | | |
151 | | /// Returns the maximum run-end encoded in the underlying buffer |
152 | | #[inline] |
153 | 0 | pub fn max_value(&self) -> usize { |
154 | 0 | self.values().last().copied().unwrap_or_default().as_usize() |
155 | 0 | } |
156 | | |
157 | | /// Performs a binary search to find the physical index for the given logical index |
158 | | /// |
159 | | /// The result is arbitrary if `logical_index >= self.len()` |
160 | 0 | pub fn get_physical_index(&self, logical_index: usize) -> usize { |
161 | 0 | let logical_index = E::usize_as(self.offset + logical_index); |
162 | 0 | let cmp = |p: &E| p.partial_cmp(&logical_index).unwrap(); |
163 | | |
164 | 0 | match self.run_ends.binary_search_by(cmp) { |
165 | 0 | Ok(idx) => idx + 1, |
166 | 0 | Err(idx) => idx, |
167 | | } |
168 | 0 | } |
169 | | |
170 | | /// Returns the physical index at which the logical array starts |
171 | 0 | pub fn get_start_physical_index(&self) -> usize { |
172 | 0 | if self.offset == 0 || self.len == 0 { |
173 | 0 | return 0; |
174 | 0 | } |
175 | | // Fallback to binary search |
176 | 0 | self.get_physical_index(0) |
177 | 0 | } |
178 | | |
179 | | /// Returns the physical index at which the logical array ends |
180 | 0 | pub fn get_end_physical_index(&self) -> usize { |
181 | 0 | if self.len == 0 { |
182 | 0 | return 0; |
183 | 0 | } |
184 | 0 | if self.max_value() == self.offset + self.len { |
185 | 0 | return self.values().len() - 1; |
186 | 0 | } |
187 | | // Fallback to binary search |
188 | 0 | self.get_physical_index(self.len - 1) |
189 | 0 | } |
190 | | |
191 | | /// Slices this [`RunEndBuffer`] by the provided `offset` and `length` |
192 | 0 | pub fn slice(&self, offset: usize, len: usize) -> Self { |
193 | 0 | assert!( |
194 | 0 | offset.saturating_add(len) <= self.len, |
195 | 0 | "the length + offset of the sliced RunEndBuffer cannot exceed the existing length" |
196 | | ); |
197 | 0 | Self { |
198 | 0 | run_ends: self.run_ends.clone(), |
199 | 0 | offset: self.offset + offset, |
200 | 0 | len, |
201 | 0 | } |
202 | 0 | } |
203 | | |
/// Returns the inner [`ScalarBuffer`]
///
/// The returned buffer holds every stored run-end; the logical `offset` and
/// `len` of this [`RunEndBuffer`] are not applied to it.
pub fn inner(&self) -> &ScalarBuffer<E> {
    &self.run_ends
}
208 | | |
/// Returns the inner [`ScalarBuffer`], consuming self
///
/// The logical `offset` and `len` of this [`RunEndBuffer`] are discarded.
pub fn into_inner(self) -> ScalarBuffer<E> {
    self.run_ends
}
213 | | } |
214 | | |
#[cfg(test)]
mod tests {
    use crate::buffer::RunEndBuffer;

    /// Zero-length buffers, whether obtained by slicing or constructed empty,
    /// must report physical index `0` for both their start and their end.
    #[test]
    fn test_zero_length_slice() {
        // Two runs covering the logical indices `0..1` and `1..4`
        let run_ends = vec![1_i32, 4_i32];
        let full = RunEndBuffer::new(run_ends.into(), 0, 4);
        assert_eq!(full.get_start_physical_index(), 0);
        assert_eq!(full.get_end_physical_index(), 1);
        assert_eq!(full.get_physical_index(3), 1);

        // An empty slice taken at any offset resolves to physical index 0
        (0..4)
            .map(|offset| full.slice(offset, 0))
            .for_each(|empty| {
                assert_eq!(empty.get_start_physical_index(), 0);
                assert_eq!(empty.get_end_physical_index(), 0);
            });

        // A buffer that stores no run-ends at all
        let no_runs = RunEndBuffer::new(Vec::<i32>::new().into(), 0, 0);
        assert_eq!(no_runs.get_start_physical_index(), 0);
        assert_eq!(no_runs.get_end_physical_index(), 0);
    }
}