/Users/andrewlamb/Software/arrow-rs/arrow-data/src/byte_view.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use arrow_buffer::Buffer; |
19 | | use arrow_schema::ArrowError; |
20 | | |
/// The maximum number of bytes that can be stored inline in a byte view.
///
/// The bound is inclusive: a view whose length is less than or equal to this
/// value carries its bytes inline, while longer values are stored in a
/// separate data buffer and described by a [`ByteView`].
///
/// See [`ByteView`] and [`GenericByteViewArray`] for more information on the
/// layout of the views.
///
/// [`GenericByteViewArray`]: https://docs.rs/arrow/latest/arrow/array/struct.GenericByteViewArray.html
pub const MAX_INLINE_VIEW_LEN: u32 = 12;
28 | | |
/// Helper to access views of [`GenericByteViewArray`] (`StringViewArray` and
/// `BinaryViewArray`) where the length is greater than 12 bytes.
///
/// A view is a `u128` made of four 32-bit fields. Starting from the least
/// significant bits they are `length`, `prefix`, `buffer_index` and `offset`.
/// Use [`ByteView::as_u128`] and `ByteView::from(u128)` to convert between
/// the packed and the unpacked representation.
///
/// See Also:
/// * [`GenericByteViewArray`] for more information on the layout of the views.
/// * [`validate_binary_view`] and [`validate_string_view`] to validate that a
///   set of views is consistent with its data buffers.
///
/// # Example: Create a new u128 view
///
/// ```rust
/// # use arrow_data::ByteView;
/// // Create a view for a string of length 20
/// // first four bytes are "Rust"
/// // stored in buffer 3
/// // at offset 42
/// let prefix = "Rust";
/// let view = ByteView::new(20, prefix.as_bytes())
///   .with_buffer_index(3)
///   .with_offset(42);
///
/// // create the final u128
/// let v = view.as_u128();
/// assert_eq!(v, 0x2a000000037473755200000014);
/// ```
///
/// # Example: decode a `u128` into its constituent fields
/// ```rust
/// # use arrow_data::ByteView;
/// // Convert a u128 to a ByteView
/// // See validate_{string,binary}_view functions to validate
/// let v = ByteView::from(0x2a000000037473755200000014);
///
/// assert_eq!(v.length, 20);
/// assert_eq!(v.prefix, 0x74737552);
/// assert_eq!(v.buffer_index, 3);
/// assert_eq!(v.offset, 42);
/// ```
///
/// [`GenericByteViewArray`]: https://docs.rs/arrow/latest/arrow/array/struct.GenericByteViewArray.html
#[derive(Debug, Copy, Clone, Default)]
#[repr(C)]
pub struct ByteView {
    /// The length of the string/bytes (bits `0..32` of the `u128` view).
    pub length: u32,
    /// First 4 bytes of string/bytes data, read as a little-endian `u32`
    /// (bits `32..64` of the `u128` view).
    pub prefix: u32,
    /// The index of the data buffer that holds the bytes
    /// (bits `64..96` of the `u128` view).
    pub buffer_index: u32,
    /// The offset into the buffer at which the data starts
    /// (bits `96..128` of the `u128` view).
    pub offset: u32,
}
80 | | |
81 | | impl ByteView { |
82 | | /// Construct a [`ByteView`] for data `length` of bytes with the specified prefix. |
83 | | /// |
84 | | /// See example on [`ByteView`] docs |
85 | | /// |
86 | | /// Notes: |
87 | | /// * the length should always be greater than [`MAX_INLINE_VIEW_LEN`] |
88 | | /// (Data less than 12 bytes is stored as an inline view) |
89 | | /// * buffer and offset are set to `0` |
90 | | /// |
91 | | /// # Panics |
92 | | /// If the prefix is not exactly 4 bytes |
93 | | #[inline] |
94 | | pub fn new(length: u32, prefix: &[u8]) -> Self { |
95 | | debug_assert!(length > MAX_INLINE_VIEW_LEN); |
96 | | Self { |
97 | | length, |
98 | | prefix: u32::from_le_bytes(prefix.try_into().unwrap()), |
99 | | buffer_index: 0, |
100 | | offset: 0, |
101 | | } |
102 | | } |
103 | | |
104 | | /// Set the [`Self::buffer_index`] field |
105 | | #[inline] |
106 | 0 | pub fn with_buffer_index(mut self, buffer_index: u32) -> Self { |
107 | 0 | self.buffer_index = buffer_index; |
108 | 0 | self |
109 | 0 | } |
110 | | |
111 | | /// Set the [`Self::offset`] field |
112 | | #[inline] |
113 | | pub fn with_offset(mut self, offset: u32) -> Self { |
114 | | self.offset = offset; |
115 | | self |
116 | | } |
117 | | |
118 | | #[inline(always)] |
119 | | /// Convert `ByteView` to `u128` by concatenating the fields |
120 | 0 | pub fn as_u128(self) -> u128 { |
121 | 0 | (self.length as u128) |
122 | 0 | | ((self.prefix as u128) << 32) |
123 | 0 | | ((self.buffer_index as u128) << 64) |
124 | 0 | | ((self.offset as u128) << 96) |
125 | 0 | } |
126 | | } |
127 | | |
128 | | impl From<u128> for ByteView { |
129 | | #[inline] |
130 | 0 | fn from(value: u128) -> Self { |
131 | 0 | Self { |
132 | 0 | length: value as u32, |
133 | 0 | prefix: (value >> 32) as u32, |
134 | 0 | buffer_index: (value >> 64) as u32, |
135 | 0 | offset: (value >> 96) as u32, |
136 | 0 | } |
137 | 0 | } |
138 | | } |
139 | | |
140 | | impl From<ByteView> for u128 { |
141 | | #[inline] |
142 | 0 | fn from(value: ByteView) -> Self { |
143 | 0 | value.as_u128() |
144 | 0 | } |
145 | | } |
146 | | |
147 | | /// Validates the combination of `views` and `buffers` is a valid BinaryView |
148 | 0 | pub fn validate_binary_view(views: &[u128], buffers: &[Buffer]) -> Result<(), ArrowError> { |
149 | 0 | validate_view_impl(views, buffers, |_, _| Ok(())) |
150 | 0 | } |
151 | | |
152 | | /// Validates the combination of `views` and `buffers` is a valid StringView |
153 | 0 | pub fn validate_string_view(views: &[u128], buffers: &[Buffer]) -> Result<(), ArrowError> { |
154 | 0 | validate_view_impl(views, buffers, |idx, b| { |
155 | 0 | std::str::from_utf8(b).map_err(|e| { |
156 | 0 | ArrowError::InvalidArgumentError(format!( |
157 | 0 | "Encountered non-UTF-8 data at index {idx}: {e}" |
158 | 0 | )) |
159 | 0 | })?; |
160 | 0 | Ok(()) |
161 | 0 | }) |
162 | 0 | } |
163 | | |
164 | 0 | fn validate_view_impl<F>(views: &[u128], buffers: &[Buffer], f: F) -> Result<(), ArrowError> |
165 | 0 | where |
166 | 0 | F: Fn(usize, &[u8]) -> Result<(), ArrowError>, |
167 | | { |
168 | 0 | for (idx, v) in views.iter().enumerate() { |
169 | 0 | let len = *v as u32; |
170 | 0 | if len <= MAX_INLINE_VIEW_LEN { |
171 | 0 | if len < MAX_INLINE_VIEW_LEN && (v >> (32 + len * 8)) != 0 { |
172 | 0 | return Err(ArrowError::InvalidArgumentError(format!( |
173 | 0 | "View at index {idx} contained non-zero padding for string of length {len}", |
174 | 0 | ))); |
175 | 0 | } |
176 | 0 | f(idx, &v.to_le_bytes()[4..4 + len as usize])?; |
177 | | } else { |
178 | 0 | let view = ByteView::from(*v); |
179 | 0 | let data = buffers.get(view.buffer_index as usize).ok_or_else(|| { |
180 | 0 | ArrowError::InvalidArgumentError(format!( |
181 | 0 | "Invalid buffer index at {idx}: got index {} but only has {} buffers", |
182 | 0 | view.buffer_index, |
183 | 0 | buffers.len() |
184 | 0 | )) |
185 | 0 | })?; |
186 | | |
187 | 0 | let start = view.offset as usize; |
188 | 0 | let end = start + len as usize; |
189 | 0 | let b = data.get(start..end).ok_or_else(|| { |
190 | 0 | ArrowError::InvalidArgumentError(format!( |
191 | 0 | "Invalid buffer slice at {idx}: got {start}..{end} but buffer {} has length {}", |
192 | 0 | view.buffer_index, |
193 | 0 | data.len() |
194 | 0 | )) |
195 | 0 | })?; |
196 | | |
197 | 0 | if !b.starts_with(&view.prefix.to_le_bytes()) { |
198 | 0 | return Err(ArrowError::InvalidArgumentError( |
199 | 0 | "Mismatch between embedded prefix and data".to_string(), |
200 | 0 | )); |
201 | 0 | } |
202 | | |
203 | 0 | f(idx, b)?; |
204 | | } |
205 | | } |
206 | 0 | Ok(()) |
207 | 0 | } |