Coverage Report

Created: 2025-08-26 07:03

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/Users/andrewlamb/Software/arrow-rs/arrow-select/src/coalesce/byte_view.rs
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
use crate::coalesce::InProgressArray;
19
use arrow_array::cast::AsArray;
20
use arrow_array::types::ByteViewType;
21
use arrow_array::{Array, ArrayRef, GenericByteViewArray};
22
use arrow_buffer::{Buffer, NullBufferBuilder};
23
use arrow_data::{ByteView, MAX_INLINE_VIEW_LEN};
24
use arrow_schema::ArrowError;
25
use std::marker::PhantomData;
26
use std::sync::Arc;
27
28
/// InProgressArray for [`StringViewArray`] and [`BinaryViewArray`]
29
///
30
/// This structure buffers the views and data buffers as they are copied from
31
/// the source array, and then produces a new array when `finish` is called. It
32
/// also handles "garbage collection" by copying strings to a new buffer when
33
/// the source buffer is sparse (i.e. uses at least 2x more than the memory it
34
/// needs).
35
///
36
/// [`StringViewArray`]: arrow_array::StringViewArray
37
/// [`BinaryViewArray`]: arrow_array::BinaryViewArray
38
pub(crate) struct InProgressByteViewArray<B: ByteViewType> {
39
    /// The source array and information
40
    source: Option<Source>,
41
    /// the target batch size (and thus size for views allocation)
42
    batch_size: usize,
43
    /// The in progress views
44
    views: Vec<u128>,
45
    /// In progress nulls
46
    nulls: NullBufferBuilder,
47
    /// current buffer
48
    current: Option<Vec<u8>>,
49
    /// completed buffers
50
    completed: Vec<Buffer>,
51
    /// Allocates new buffers of increasing size as needed
52
    buffer_source: BufferSource,
53
    /// Phantom so we can use the same struct for both StringViewArray and
54
    /// BinaryViewArray
55
    _phantom: PhantomData<B>,
56
}
57
58
struct Source {
59
    /// The array to copy form
60
    array: ArrayRef,
61
    /// Should the strings from the source array be copied into new buffers?
62
    need_gc: bool,
63
    /// How many bytes were actually used in the source array's buffers?
64
    ideal_buffer_size: usize,
65
}
66
67
// manually implement Debug because ByteViewType doesn't implement Debug
68
impl<B: ByteViewType> std::fmt::Debug for InProgressByteViewArray<B> {
69
0
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
70
0
        f.debug_struct("InProgressByteViewArray")
71
0
            .field("batch_size", &self.batch_size)
72
0
            .field("views", &self.views.len())
73
0
            .field("nulls", &self.nulls)
74
0
            .field("current", &self.current.as_ref().map(|_| "Some(...)"))
75
0
            .field("completed", &self.completed.len())
76
0
            .finish()
77
0
    }
78
}
79
80
impl<B: ByteViewType> InProgressByteViewArray<B> {
81
0
    pub(crate) fn new(batch_size: usize) -> Self {
82
0
        let buffer_source = BufferSource::new();
83
84
0
        Self {
85
0
            batch_size,
86
0
            source: None,
87
0
            views: Vec::new(),                         // allocate in push
88
0
            nulls: NullBufferBuilder::new(batch_size), // no allocation
89
0
            current: None,
90
0
            completed: vec![],
91
0
            buffer_source,
92
0
            _phantom: PhantomData,
93
0
        }
94
0
    }
95
96
    /// Allocate space for output views and nulls if needed
97
    ///
98
    /// This is done on write (when we know it is necessary) rather than
99
    /// eagerly to avoid allocations that are not used.
100
0
    fn ensure_capacity(&mut self) {
101
0
        self.views.reserve(self.batch_size);
102
0
    }
103
104
    /// Finishes in progress buffer, if any
105
0
    fn finish_current(&mut self) {
106
0
        let Some(next_buffer) = self.current.take() else {
107
0
            return;
108
        };
109
0
        self.completed.push(next_buffer.into());
110
0
    }
111
112
    /// Append views to self.views, updating the buffer index if necessary
113
    #[inline(never)]
114
0
    fn append_views_and_update_buffer_index(&mut self, views: &[u128], buffers: &[Buffer]) {
115
0
        if let Some(buffer) = self.current.take() {
116
0
            self.completed.push(buffer.into());
117
0
        }
118
0
        let starting_buffer: u32 = self.completed.len().try_into().expect("too many buffers");
119
0
        self.completed.extend_from_slice(buffers);
120
121
0
        if starting_buffer == 0 {
122
0
            // If there are no buffers, we can just use the views as is
123
0
            self.views.extend_from_slice(views);
124
0
        } else {
125
            // If there are buffers, we need to update the buffer index
126
0
            let updated_views = views.iter().map(|v| {
127
0
                let mut byte_view = ByteView::from(*v);
128
0
                if byte_view.length > MAX_INLINE_VIEW_LEN {
129
0
                    // Small views (<=12 bytes) are inlined, so only need to update large views
130
0
                    byte_view.buffer_index += starting_buffer;
131
0
                };
132
0
                byte_view.as_u128()
133
0
            });
134
135
0
            self.views.extend(updated_views);
136
        }
137
0
    }
138
139
    /// Append views to self.views, copying the data they reference out of
140
    /// `buffers` into `self.current` (sealing it and starting a new buffer
    /// when it fills up) and updating the buffer index as necessary.
141
    ///
142
    /// # Arguments
143
    /// - `views` - the views to append
144
    /// - `view_buffer_size` - the total number of bytes pointed to by all
145
    ///   views (used to allocate new buffers if needed)
146
    /// - `buffers` - the buffers the views point to
147
    #[inline(never)]
148
0
    fn append_views_and_copy_strings(
149
0
        &mut self,
150
0
        views: &[u128],
151
0
        view_buffer_size: usize,
152
0
        buffers: &[Buffer],
153
0
    ) {
154
        // Note: the calculations below are designed to avoid any reallocations
155
        // of the current buffer, and to only allocate new buffers when
156
        // necessary, which is critical for performance.
157
158
        // Case 1: there is no current buffer. Allocate a new one that can hold
        // `view_buffer_size` bytes and copy everything into it.
159
0
        let Some(current) = self.current.take() else {
160
0
            let new_buffer = self.buffer_source.next_buffer(view_buffer_size);
161
0
            self.append_views_and_copy_strings_inner(views, new_buffer, buffers);
162
0
            return;
163
        };
164
165
        // Case 2: there is a current buffer with enough unused capacity.
        // Append the views and copy the strings into the existing buffer.
166
        // (`current` was taken out of `self.current` above; the inner call
        // puts it back.)
167
0
        let mut remaining_capacity = current.capacity() - current.len();
168
0
        if view_buffer_size <= remaining_capacity {
169
0
            self.append_views_and_copy_strings_inner(views, current, buffers);
170
0
            return;
171
0
        }
172
173
        // Case 3: there is a current buffer, but it doesn't have enough space
174
        // to hold all the strings. Count how many leading views still fit in
175
        // the current buffer; the rest go to a newly allocated buffer.
176
        //
177
        // TODO: should we copy the strings too at the same time?
178
0
        let mut num_view_to_current = 0;
179
0
        for view in views {
180
0
            let b = ByteView::from(*view);
181
0
            let str_len = b.length;
            // NOTE(review): this check also applies to inlined views, which
            // consume no buffer space, so the split can happen earlier than
            // strictly necessary. That looks harmless (the views simply go to
            // the next buffer) -- confirm it is intended.
182
0
            if remaining_capacity < str_len as usize {
183
0
                break;
184
0
            }
            // Only non-inlined views consume space in the data buffer
185
0
            if str_len > MAX_INLINE_VIEW_LEN {
186
0
                remaining_capacity -= str_len as usize;
187
0
            }
188
0
            num_view_to_current += 1;
189
        }
190
191
0
        let first_views = &views[0..num_view_to_current];
        // Bytes that will go to the current buffer: its unused capacity before
        // the loop minus what is left after it
192
0
        let string_bytes_to_copy = current.capacity() - current.len() - remaining_capacity;
193
0
        let remaining_view_buffer_size = view_buffer_size - string_bytes_to_copy;
194
195
        // Fill the current buffer, then seal it (the inner call always leaves
        // the buffer it was given in `self.current`)
0
        self.append_views_and_copy_strings_inner(first_views, current, buffers);
196
0
        let completed = self.current.take().expect("completed");
197
0
        self.completed.push(completed.into());
198
199
        // Copy any remaining views into a new buffer
200
0
        let remaining_views = &views[num_view_to_current..];
201
0
        let new_buffer = self.buffer_source.next_buffer(remaining_view_buffer_size);
202
0
        self.append_views_and_copy_strings_inner(remaining_views, new_buffer, buffers);
203
0
    }
204
205
    /// Append views to self.views, copying data from the buffers into
206
    /// dst_buffer, which is then set as self.current
207
    ///
    /// Non-inlined views are rewritten to point at their copy in `dst_buffer`;
    /// inlined views are appended unchanged.
    ///
208
    /// # Panics
209
    /// If `self.current` is `Some`
210
    ///
211
    /// See `append_views_and_copy_strings` for more details
212
    #[inline(never)]
213
0
    fn append_views_and_copy_strings_inner(
214
0
        &mut self,
215
0
        views: &[u128],
216
0
        mut dst_buffer: Vec<u8>,
217
0
        buffers: &[Buffer],
218
0
    ) {
219
0
        assert!(self.current.is_none(), "current buffer should be None");
220
221
        // Nothing to copy: just install `dst_buffer` as the current buffer
0
        if views.is_empty() {
222
0
            self.current = Some(dst_buffer);
223
0
            return;
224
0
        }
225
226
        // `dst_buffer` is not in `self.completed` yet; once sealed it will be
        // pushed at the end, so its index is the current number of completed
        // buffers
0
        let new_buffer_index: u32 = self.completed.len().try_into().expect("too many buffers");
227
228
        // In debug builds, check that the vector has enough capacity to copy
229
        // the views into it without reallocating.
        // NOTE(review): this compares against the total capacity, not the
        // unused capacity (`capacity() - len()`), so it is weaker than the
        // comment suggests when `dst_buffer` already holds data.
230
        #[cfg(debug_assertions)]
231
        {
232
0
            let total_length: usize = views
233
0
                .iter()
234
0
                .filter_map(|v| {
235
0
                    let b = ByteView::from(*v);
236
0
                    if b.length > MAX_INLINE_VIEW_LEN {
237
0
                        Some(b.length as usize)
238
                    } else {
239
0
                        None
240
                    }
241
0
                })
242
0
                .sum();
243
0
            debug_assert!(
244
0
                dst_buffer.capacity() >= total_length,
245
0
                "dst_buffer capacity {} is less than total length {}",
246
0
                dst_buffer.capacity(),
247
                total_length
248
            );
249
        }
250
251
        // Copy the views, updating the buffer index and copying the data as needed
252
0
        let new_views = views.iter().map(|v| {
253
0
            let mut b: ByteView = ByteView::from(*v);
            // Inlined views carry their data in the view itself: nothing to copy
254
0
            if b.length > MAX_INLINE_VIEW_LEN {
255
0
                let buffer_index = b.buffer_index as usize;
256
0
                let buffer_offset = b.offset as usize;
257
0
                let str_len = b.length as usize;
258
0
259
0
                // Update view to the location the data is about to be written
                // to: the current end of `dst_buffer`
260
0
                b.offset = dst_buffer.len() as u32;
261
0
                b.buffer_index = new_buffer_index;
262
0
263
0
                // SAFETY: relies on the input views being validly constructed,
                // i.e. `buffer_index` is in bounds for `buffers` and
                // `buffer_offset..buffer_offset + str_len` is in bounds for
                // that buffer. Nothing in this function checks this.
264
0
                let src = unsafe {
265
0
                    buffers
266
0
                        .get_unchecked(buffer_index)
267
0
                        .get_unchecked(buffer_offset..buffer_offset + str_len)
268
0
                };
269
0
                dst_buffer.extend_from_slice(src);
270
0
            }
271
0
            b.as_u128()
272
0
        });
273
0
        self.views.extend(new_views);
274
0
        self.current = Some(dst_buffer);
275
0
    }
276
}
277
278
impl<B: ByteViewType> InProgressArray for InProgressByteViewArray<B> {
279
0
    fn set_source(&mut self, source: Option<ArrayRef>) {
280
0
        self.source = source.map(|array| {
281
0
            let s = array.as_byte_view::<B>();
282
283
0
            let (need_gc, ideal_buffer_size) = if s.data_buffers().is_empty() {
284
0
                (false, 0)
285
            } else {
286
0
                let ideal_buffer_size = s.total_buffer_bytes_used();
287
                // We don't use get_buffer_memory_size here, because gc is for the contents of the
288
                // data buffers, not views and nulls.
289
0
                let actual_buffer_size =
290
0
                    s.data_buffers().iter().map(|b| b.capacity()).sum::<usize>();
291
                // copying strings is expensive, so only do it if the array is
292
                // sparse (uses at least 2x the memory it needs)
293
0
                let need_gc =
294
0
                    ideal_buffer_size != 0 && actual_buffer_size > (ideal_buffer_size * 2);
295
0
                (need_gc, ideal_buffer_size)
296
            };
297
298
0
            Source {
299
0
                array,
300
0
                need_gc,
301
0
                ideal_buffer_size,
302
0
            }
303
0
        })
304
0
    }
305
306
0
    fn copy_rows(&mut self, offset: usize, len: usize) -> Result<(), ArrowError> {
307
0
        self.ensure_capacity();
308
0
        let source = self.source.take().ok_or_else(|| {
309
0
            ArrowError::InvalidArgumentError(
310
0
                "Internal Error: InProgressByteViewArray: source not set".to_string(),
311
0
            )
312
0
        })?;
313
314
        // If creating StringViewArray output, ensure input was valid utf8 too
315
0
        let s = source.array.as_byte_view::<B>();
316
317
        // add any nulls, as necessary
318
0
        if let Some(nulls) = s.nulls().as_ref() {
319
0
            let nulls = nulls.slice(offset, len);
320
0
            self.nulls.append_buffer(&nulls);
321
0
        } else {
322
0
            self.nulls.append_n_non_nulls(len);
323
0
        };
324
325
0
        let buffers = s.data_buffers();
326
0
        let views = &s.views().as_ref()[offset..offset + len];
327
328
        // If there are no data buffers in s (all inlined views), can append the
329
        // views/nulls and done
330
0
        if source.ideal_buffer_size == 0 {
331
0
            self.views.extend_from_slice(views);
332
0
            self.source = Some(source);
333
0
            return Ok(());
334
0
        }
335
336
        // Copying the strings into a buffer can be time-consuming so
337
        // only do it if the array is sparse
338
0
        if source.need_gc {
339
0
            self.append_views_and_copy_strings(views, source.ideal_buffer_size, buffers);
340
0
        } else {
341
0
            self.append_views_and_update_buffer_index(views, buffers);
342
0
        }
343
0
        self.source = Some(source);
344
0
        Ok(())
345
0
    }
346
347
0
    fn finish(&mut self) -> Result<ArrayRef, ArrowError> {
348
0
        self.finish_current();
349
0
        assert!(self.current.is_none());
350
0
        let buffers = std::mem::take(&mut self.completed);
351
0
        let views = std::mem::take(&mut self.views);
352
0
        let nulls = self.nulls.finish();
353
0
        self.nulls = NullBufferBuilder::new(self.batch_size);
354
355
        // Safety: we created valid views and buffers above and the
356
        // input arrays had value data and nulls
357
0
        let new_array =
358
0
            unsafe { GenericByteViewArray::<B>::new_unchecked(views.into(), buffers, nulls) };
359
0
        Ok(Arc::new(new_array))
360
0
    }
361
}
362
363
/// Seed of the buffer growth schedule. The size is doubled *before* a buffer
/// is handed out, so the first size actually used is 8KiB.
const STARTING_BLOCK_SIZE: usize = 4 * 1024;
/// Largest size the growth schedule reaches on its own (1MiB). Larger buffers
/// are only produced when a caller explicitly asks for more.
const MAX_BLOCK_SIZE: usize = 1024 * 1024;

/// Manages allocating new buffers for `StringViewArray` in increasing sizes
#[derive(Debug)]
struct BufferSource {
    /// Current position in the growth schedule (size of the last block the
    /// schedule produced, or the seed before the first request)
    current_size: usize,
}

impl BufferSource {
    /// Create a source positioned at the start of the growth schedule
    fn new() -> Self {
        Self {
            current_size: STARTING_BLOCK_SIZE,
        }
    }

    /// Return a new, empty buffer with a capacity of at least `min_size`
    fn next_buffer(&mut self, min_size: usize) -> Vec<u8> {
        Vec::with_capacity(self.next_size(min_size))
    }

    /// Advance the growth schedule and return the size of the next buffer.
    ///
    /// The scheduled size doubles on every call until it reaches
    /// `MAX_BLOCK_SIZE`. If that is still smaller than `min_size`, doubling
    /// continues until the size is large enough or the maximum is reached.
    /// A `min_size` above the maximum is honoured as is and does not move
    /// the schedule beyond `MAX_BLOCK_SIZE`.
    fn next_size(&mut self, min_size: usize) -> usize {
        if self.current_size < MAX_BLOCK_SIZE {
            // The schedule starts at a fixed size and stops at a fixed
            // maximum, so this cannot overflow in practice; saturate anyway.
            self.current_size = self.current_size.saturating_mul(2);
        }
        // Keep doubling only while the size is still *smaller* than the
        // request: a size equal to `min_size` is already sufficient, and
        // doubling it again would allocate twice what was asked for.
        while self.current_size < min_size && self.current_size < MAX_BLOCK_SIZE {
            self.current_size = self.current_size.saturating_mul(2);
        }
        self.current_size.max(min_size)
    }
}
402
403
#[cfg(test)]
mod tests {
    use super::*;

    /// Requests one buffer per `(min_size, expected_capacity)` pair, in
    /// order, from a single fresh `BufferSource` and checks each capacity.
    fn assert_capacities(requests: &[(usize, usize)]) {
        let mut source = BufferSource::new();
        for &(min_size, expected_capacity) in requests {
            assert_eq!(source.next_buffer(min_size).capacity(), expected_capacity);
        }
    }

    #[test]
    fn test_buffer_source() {
        assert_capacities(&[
            (1000, 8192),
            (1000, 16384),
            (1000, 32768),
            (1000, 65536),
            (1000, 131072),
            (1000, 262144),
            (1000, 524288),
            (1000, 1024 * 1024),
            // clamped to max size
            (1000, 1024 * 1024),
            // Can override with larger size request
            (10_000_000, 10_000_000),
        ]);
    }

    #[test]
    fn test_buffer_source_with_min_small() {
        assert_capacities(&[
            // First buffer should be 8kb
            (5_600, 8 * 1024),
            // then 16kb
            (5_600, 16 * 1024),
            // then 32kb
            (5_600, 32 * 1024),
        ]);
    }

    #[test]
    fn test_buffer_source_with_min_large() {
        assert_capacities(&[
            (500_000, 512 * 1024),
            (500_000, 1024 * 1024),
            // clamped to max size
            (500_000, 1024 * 1024),
            // Can override with larger size request
            (2_000_000, 2_000_000),
        ]);
    }
}