/Users/andrewlamb/Software/arrow-rs/arrow-avro/src/reader/vlq.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
/// Decoder for zig-zag encoded variable length quantity (VLQ) integers
///
/// The decoder is resumable: when the input ends in the middle of an encoded
/// value the partially decoded state is kept, and decoding continues with the
/// bytes supplied to the next call.
///
/// See also:
/// <https://avro.apache.org/docs/1.11.1/specification/#primitive-types-1>
/// <https://protobuf.dev/programming-guides/encoding/#varints>
#[derive(Debug, Default)]
pub struct VLQDecoder {
    /// Scratch space for decoding VLQ integers
    in_progress: u64,
    /// Bit offset at which the next 7-bit group is merged into `in_progress`
    shift: u32,
}

impl VLQDecoder {
    /// Decode a signed long from `buf`
    ///
    /// Bytes are consumed from the front of `buf` as they are decoded.
    ///
    /// Returns `Some(value)` once a byte without the continuation bit (`0x80`)
    /// has been read, and resets the decoder for the next value. Returns `None`
    /// if `buf` is exhausted first; the partial value is retained so that a
    /// subsequent call can finish it.
    ///
    /// Over-long encodings do not panic: 7-bit groups that would land beyond
    /// bit 63 are discarded instead of overflowing the shift.
    pub fn long(&mut self, buf: &mut &[u8]) -> Option<i64> {
        while let Some((&byte, rest)) = buf.split_first() {
            *buf = rest;
            let group = u64::from(byte & 0x7F);
            // `checked_shl` returns `None` once `shift` reaches 64, which only
            // happens for malformed input; a plain `<<` would panic in debug
            // builds and silently wrap the shift amount in release builds.
            self.in_progress |= group.checked_shl(self.shift).unwrap_or(0);
            self.shift = self.shift.saturating_add(7);
            if byte & 0x80 == 0 {
                let val = std::mem::take(&mut self.in_progress);
                self.shift = 0;
                // Undo the zig-zag mapping: the low bit carries the sign
                return Some((val >> 1) as i64 ^ -((val & 1) as i64));
            }
        }
        None
    }
}
47 | | |
48 | | /// Read a varint from `buf` returning the decoded `u64` and the number of bytes read |
49 | | #[inline] |
50 | 9.40k | pub(crate) fn read_varint(buf: &[u8]) -> Option<(u64, usize)> { |
51 | 9.40k | let first9.40k = *buf.first()?1 ; |
52 | 9.40k | if first < 0x80 { |
53 | 7.02k | return Some((first as u64, 1)); |
54 | 2.37k | } |
55 | | |
56 | 2.37k | if let Some(array1.81k ) = buf.get(..10) { |
57 | 1.81k | return read_varint_array(array.try_into().unwrap()); |
58 | 560 | } |
59 | | |
60 | 560 | read_varint_slow(buf) |
61 | 9.40k | } |
62 | | |
/// Decode a varint from a fixed 10 byte window, returning the value and the
/// number of bytes consumed, or `None` if the tenth byte is `0x02` or larger
///
/// Based on
/// - <https://github.com/tokio-rs/prost/blob/master/prost/src/encoding/varint.rs#L71>
/// - <https://github.com/google/protobuf/blob/3.3.x/src/google/protobuf/io/coded_stream.cc#L365-L406>
/// - <https://github.com/protocolbuffers/protobuf-go/blob/v1.27.1/encoding/protowire/wire.go#L358>
#[inline]
fn read_varint_array(buf: [u8; 10]) -> Option<(u64, usize)> {
    let mut acc = 0_u64;
    for (pos, &byte) in buf[..9].iter().enumerate() {
        let shift = 7 * pos;
        // Add the whole byte, continuation bit included ...
        acc += u64::from(byte) << shift;
        if byte < 0x80 {
            return Some((acc, pos + 1));
        }
        // ... and remove the continuation bit again when it was set
        acc -= 0x80_u64 << shift;
    }

    // The tenth byte is shifted to bit 63 and accepted only when it is 0 or 1
    let last = u64::from(buf[9]);
    acc += last << 63;
    if last < 0x02 {
        Some((acc, 10))
    } else {
        None
    }
}
82 | | |
/// Decode a varint from `buf` one byte at a time, reading at most 10 bytes
///
/// Returns the decoded value and the number of bytes consumed. Returns `None`
/// if no terminating byte (one without the `0x80` continuation bit) is found
/// within the first 10 bytes of `buf`, or if the tenth byte is `2` or larger.
#[inline(never)]
#[cold]
fn read_varint_slow(buf: &[u8]) -> Option<(u64, usize)> {
    let mut value = 0_u64;
    for (count, &byte) in buf.iter().take(10).enumerate() {
        value |= u64::from(byte & 0x7F) << (count * 7);
        if byte <= 0x7F {
            // Check for u64::MAX overflow. See [`ConsumeVarint`][1] for details.
            // [1]: https://github.com/protocolbuffers/protobuf-go/blob/v1.27.1/encoding/protowire/wire.go#L358
            return (count != 9 || byte < 2).then_some((value, count + 1));
        }
    }

    None
}
99 | | |
#[cfg(test)]
mod tests {
    use super::*;

    /// Encode `n` as a varint into `dst`, returning the number of bytes written
    fn encode_var(mut n: u64, dst: &mut [u8]) -> usize {
        let mut i = 0;

        while n >= 0x80 {
            dst[i] = 0x80 | (n as u8);
            i += 1;
            n >>= 7;
        }

        dst[i] = n as u8;
        i + 1
    }

    /// Round-trip `a` through `encode_var` and `read_varint`, both with an
    /// exactly sized slice and with the zero padded 10 byte buffer
    fn varint_test(a: u64) {
        let mut buf = [0_u8; 10];
        let len = encode_var(a, &mut buf);
        assert_eq!(read_varint(&buf[..len]).unwrap(), (a, len));
        assert_eq!(read_varint(&buf).unwrap(), (a, len));
    }

    #[test]
    fn test_varint() {
        varint_test(0);
        varint_test(4395932);
        varint_test(u64::MAX);

        for _ in 0..1000 {
            varint_test(rand::random());
        }
    }

    #[test]
    fn test_varint_boundaries() {
        // Values on either side of every power of two, which covers each
        // encoded length transition deterministically
        for shift in 0..64 {
            let v = 1_u64 << shift;
            varint_test(v - 1);
            varint_test(v);
        }
    }

    #[test]
    fn test_varint_truncated() {
        assert_eq!(read_varint(&[]), None);
        assert_eq!(read_varint(&[0x80]), None);
        assert_eq!(read_varint(&[0xFF; 9]), None);
    }

    #[test]
    fn test_varint_overflow() {
        let mut buf = [0xFF_u8; 10];
        assert_eq!(read_varint(&buf), None);
        assert_eq!(read_varint_slow(&buf), None);

        buf[9] = 0x02;
        assert_eq!(read_varint(&buf), None);
        assert_eq!(read_varint_slow(&buf), None);

        buf[9] = 0x01;
        assert_eq!(read_varint(&buf), Some((u64::MAX, 10)));
        assert_eq!(read_varint_slow(&buf), Some((u64::MAX, 10)));
    }

    #[test]
    fn test_vlq_long_resumes() {
        let mut decoder = VLQDecoder::default();

        // Input ends mid-value: nothing is returned but the byte is consumed
        let mut first: &[u8] = &[0x80];
        assert_eq!(decoder.long(&mut first), None);
        assert!(first.is_empty());

        // The next buffer completes the value and contains one more
        let mut second: &[u8] = &[0x01, 0x03];
        assert_eq!(decoder.long(&mut second), Some(64));
        assert_eq!(decoder.long(&mut second), Some(-2));
        assert_eq!(decoder.long(&mut second), None);
    }
}