/Users/andrewlamb/Software/arrow-rs/arrow-avro/src/reader/block.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! Decoder for [`Block`] |
19 | | |
20 | | use crate::reader::vlq::VLQDecoder; |
21 | | use arrow_schema::ArrowError; |
22 | | |
/// One data block of an Avro object container file.
///
/// A block carries a number of serialized objects followed by a 16-byte
/// sync marker.
///
/// <https://avro.apache.org/docs/1.11.1/specification/#object-container-files>
#[derive(Debug, Default)]
pub struct Block {
    /// How many objects are stored in this block
    pub count: usize,
    /// The raw bytes of the serialized objects contained in this block
    pub data: Vec<u8>,
    /// The 16-byte sync marker that terminates this block
    pub sync: [u8; 16],
}
35 | | |
36 | | /// A decoder for [`Block`] |
37 | | #[derive(Debug)] |
38 | | pub struct BlockDecoder { |
39 | | state: BlockDecoderState, |
40 | | in_progress: Block, |
41 | | vlq_decoder: VLQDecoder, |
42 | | bytes_remaining: usize, |
43 | | } |
44 | | |
/// The section of a block that the decoder expects to read next
#[derive(Debug)]
enum BlockDecoderState {
    /// Reading the variable-length object count
    Count,
    /// Reading the variable-length byte size of the block data
    Size,
    /// Copying the serialized block data
    Data,
    /// Copying the 16-byte sync marker
    Sync,
    /// A whole block has been read and is waiting to be flushed
    Finished,
}
53 | | |
54 | | impl Default for BlockDecoder { |
55 | 73 | fn default() -> Self { |
56 | 73 | Self { |
57 | 73 | state: BlockDecoderState::Count, |
58 | 73 | in_progress: Default::default(), |
59 | 73 | vlq_decoder: Default::default(), |
60 | 73 | bytes_remaining: 0, |
61 | 73 | } |
62 | 73 | } |
63 | | } |
64 | | |
65 | | impl BlockDecoder { |
66 | | /// Parse [`Block`] from `buf`, returning the number of bytes read |
67 | | /// |
68 | | /// This method can be called multiple times with consecutive chunks of data, allowing |
69 | | /// integration with chunked IO systems like [`BufRead::fill_buf`] |
70 | | /// |
71 | | /// All errors should be considered fatal, and decoding aborted |
72 | | /// |
73 | | /// Once an entire [`Block`] has been decoded this method will not read any further |
74 | | /// input bytes, until [`Self::flush`] is called. Afterwards [`Self::decode`] |
75 | | /// can then be used again to read the next block, if any |
76 | | /// |
77 | | /// [`BufRead::fill_buf`]: std::io::BufRead::fill_buf |
78 | 73 | pub fn decode(&mut self, mut buf: &[u8]) -> Result<usize, ArrowError> { |
79 | 73 | let max_read = buf.len(); |
80 | 365 | while !buf.is_empty() { |
81 | 292 | match self.state { |
82 | | BlockDecoderState::Count => { |
83 | 73 | if let Some(c) = self.vlq_decoder.long(&mut buf) { |
84 | 73 | self.in_progress.count = c.try_into().map_err(|_| {0 |
85 | 0 | ArrowError::ParseError(format!( |
86 | 0 | "Block count cannot be negative, got {c}" |
87 | 0 | )) |
88 | 0 | })?; |
89 | | |
90 | 73 | self.state = BlockDecoderState::Size; |
91 | 0 | } |
92 | | } |
93 | | BlockDecoderState::Size => { |
94 | 73 | if let Some(c) = self.vlq_decoder.long(&mut buf) { |
95 | 73 | self.bytes_remaining = c.try_into().map_err(|_| {0 |
96 | 0 | ArrowError::ParseError(format!( |
97 | 0 | "Block size cannot be negative, got {c}" |
98 | 0 | )) |
99 | 0 | })?; |
100 | | |
101 | 73 | self.in_progress.data.reserve(self.bytes_remaining); |
102 | 73 | self.state = BlockDecoderState::Data; |
103 | 0 | } |
104 | | } |
105 | | BlockDecoderState::Data => { |
106 | 73 | let to_read = self.bytes_remaining.min(buf.len()); |
107 | 73 | self.in_progress.data.extend_from_slice(&buf[..to_read]); |
108 | 73 | buf = &buf[to_read..]; |
109 | 73 | self.bytes_remaining -= to_read; |
110 | 73 | if self.bytes_remaining == 0 { |
111 | 73 | self.bytes_remaining = 16; |
112 | 73 | self.state = BlockDecoderState::Sync; |
113 | 73 | }0 |
114 | | } |
115 | | BlockDecoderState::Sync => { |
116 | 73 | let to_decode = buf.len().min(self.bytes_remaining); |
117 | 73 | let write = &mut self.in_progress.sync[16 - to_decode..]; |
118 | 73 | write[..to_decode].copy_from_slice(&buf[..to_decode]); |
119 | 73 | self.bytes_remaining -= to_decode; |
120 | 73 | buf = &buf[to_decode..]; |
121 | 73 | if self.bytes_remaining == 0 { |
122 | 73 | self.state = BlockDecoderState::Finished; |
123 | 73 | }0 |
124 | | } |
125 | 0 | BlockDecoderState::Finished => return Ok(max_read - buf.len()), |
126 | | } |
127 | | } |
128 | 73 | Ok(max_read) |
129 | 73 | } |
130 | | |
131 | | /// Flush this decoder returning the parsed [`Block`] if any |
132 | 73 | pub fn flush(&mut self) -> Option<Block> { |
133 | 73 | match self.state { |
134 | | BlockDecoderState::Finished => { |
135 | 73 | self.state = BlockDecoderState::Count; |
136 | 73 | Some(std::mem::take(&mut self.in_progress)) |
137 | | } |
138 | 0 | _ => None, |
139 | | } |
140 | 73 | } |
141 | | } |