|
11 | 11 | //! UTF-8 and UTF-16 decoding iterators |
12 | 12 |
|
13 | 13 | use fmt; |
14 | | -use iter::FusedIterator; |
15 | 14 | use super::from_u32_unchecked; |
16 | 15 |
|
17 | | -/// An iterator that decodes UTF-8 encoded characters from an underlying |
18 | | -/// iterator of bytes. |
19 | | -#[unstable(feature = "decode_utf8", issue = "33906")] |
20 | | -#[rustc_deprecated(since = "1.27.0", reason = "Use str::from_utf8 instead: |
21 | | - https://doc.rust-lang.org/nightly/std/str/struct.Utf8Error.html#examples")] |
22 | | -#[derive(Clone, Debug)] |
23 | | -#[allow(deprecated)] |
24 | | -pub struct DecodeUtf8<I: Iterator<Item = u8>>(::iter::Peekable<I>); |
25 | | - |
26 | | -/// Decodes an `Iterator` of bytes as UTF-8, yielding `Result<char, InvalidSequence>` items. |
27 | | -#[unstable(feature = "decode_utf8", issue = "33906")] |
28 | | -#[rustc_deprecated(since = "1.27.0", reason = "Use str::from_utf8 instead: |
29 | | - https://doc.rust-lang.org/nightly/std/str/struct.Utf8Error.html#examples")] |
30 | | -#[allow(deprecated)] |
31 | | -#[inline] |
32 | | -pub fn decode_utf8<I: IntoIterator<Item = u8>>(i: I) -> DecodeUtf8<I::IntoIter> { |
33 | | - DecodeUtf8(i.into_iter().peekable()) |
34 | | -} |
35 | | - |
36 | | -/// The error that `<DecodeUtf8 as Iterator>::next` yields (inside `Err`) for an invalid input sequence. |
37 | | -#[unstable(feature = "decode_utf8", issue = "33906")] |
38 | | -#[rustc_deprecated(since = "1.27.0", reason = "Use str::from_utf8 instead: |
39 | | - https://doc.rust-lang.org/nightly/std/str/struct.Utf8Error.html#examples")] |
40 | | -#[derive(PartialEq, Eq, Debug)] |
41 | | -#[allow(deprecated)] |
42 | | -pub struct InvalidSequence(()); |
43 | | - |
44 | | -#[unstable(feature = "decode_utf8", issue = "33906")] |
45 | | -#[allow(deprecated)] |
46 | | -impl<I: Iterator<Item = u8>> Iterator for DecodeUtf8<I> { |
47 | | - type Item = Result<char, InvalidSequence>; |
48 | | - #[inline] |
49 | | - |
50 | | - fn next(&mut self) -> Option<Result<char, InvalidSequence>> { |
51 | | - self.0.next().map(|first_byte| { |
52 | | - // Emit `InvalidSequence` according to |
53 | | - // Unicode §5.22 Best Practice for U+FFFD Substitution |
54 | | - // http://www.unicode.org/versions/Unicode9.0.0/ch05.pdf#G40630 |
55 | | - |
56 | | - // Roughly: consume at least one byte (the `self.0.next()` above), |
57 | | - // then validate one byte at a time via `peek()` and stop before the first unexpected byte |
58 | | - // (which might be the valid start of the next byte sequence, so it is left unconsumed). |
59 | | - |
60 | | - let mut code_point; |
61 | | - macro_rules! first_byte { |
62 | | - ($mask: expr) => { |
63 | | - code_point = u32::from(first_byte & $mask) |
64 | | - } |
65 | | - } |
66 | | - macro_rules! continuation_byte { |
67 | | - () => { continuation_byte!(0x80..=0xBF) }; |
68 | | - ($range: pat) => { |
69 | | - match self.0.peek() { |
70 | | - Some(&byte @ $range) => { |
71 | | - code_point = (code_point << 6) | u32::from(byte & 0b0011_1111); |
72 | | - self.0.next(); |
73 | | - } |
74 | | - _ => return Err(InvalidSequence(())) |
75 | | - } |
76 | | - } |
77 | | - } |
78 | | - |
79 | | - match first_byte { |
80 | | - 0x00..=0x7F => { |
81 | | - first_byte!(0b1111_1111); |
82 | | - } |
83 | | - 0xC2..=0xDF => { |
84 | | - first_byte!(0b0001_1111); |
85 | | - continuation_byte!(); |
86 | | - } |
87 | | - 0xE0 => { |
88 | | - first_byte!(0b0000_1111); |
89 | | - continuation_byte!(0xA0..=0xBF); // 0x80..=0x9F here are overlong |
90 | | - continuation_byte!(); |
91 | | - } |
92 | | - 0xE1..=0xEC | 0xEE..=0xEF => { |
93 | | - first_byte!(0b0000_1111); |
94 | | - continuation_byte!(); |
95 | | - continuation_byte!(); |
96 | | - } |
97 | | - 0xED => { |
98 | | - first_byte!(0b0000_1111); |
99 | | - continuation_byte!(0x80..=0x9F); // 0xA0..=0xBF here are surrogates |
100 | | - continuation_byte!(); |
101 | | - } |
102 | | - 0xF0 => { |
103 | | - first_byte!(0b0000_0111); |
104 | | - continuation_byte!(0x90..=0xBF); // 0x80..=0x8F here are overlong |
105 | | - continuation_byte!(); |
106 | | - continuation_byte!(); |
107 | | - } |
108 | | - 0xF1..=0xF3 => { |
109 | | - first_byte!(0b0000_0111); |
110 | | - continuation_byte!(); |
111 | | - continuation_byte!(); |
112 | | - continuation_byte!(); |
113 | | - } |
114 | | - 0xF4 => { |
115 | | - first_byte!(0b0000_0111); |
116 | | - continuation_byte!(0x80..=0x8F); // 0x90..=0xBF here are beyond char::MAX |
117 | | - continuation_byte!(); |
118 | | - continuation_byte!(); |
119 | | - } |
120 | | - _ => return Err(InvalidSequence(())) // Illegal first byte: stray continuation (0x80..=0xBF), overlong (0xC0..=0xC1), or beyond MAX (0xF5..=0xFF) |
121 | | - } |
122 | | - unsafe { |
123 | | - Ok(from_u32_unchecked(code_point)) |
124 | | - } |
125 | | - }) |
126 | | - } |
127 | | - |
128 | | - #[inline] |
129 | | - fn size_hint(&self) -> (usize, Option<usize>) { |
130 | | - let (lower, upper) = self.0.size_hint(); |
131 | | - |
132 | | - // A code point is at most 4 bytes long, so `lower` bytes yield at least `lower / 4` items. |
133 | | - let min_code_points = lower / 4; |
134 | | - |
135 | | - (min_code_points, upper) |
136 | | - } |
137 | | -} |
138 | | - |
// Marker impl: `DecodeUtf8<I>` is declared fused whenever the wrapped byte iterator `I` is fused.
139 | | -#[unstable(feature = "decode_utf8", issue = "33906")] |
140 | | -#[allow(deprecated)] |
141 | | -impl<I: FusedIterator<Item = u8>> FusedIterator for DecodeUtf8<I> {} |
142 | | - |
143 | 16 | /// An iterator that decodes UTF-16 encoded code points from an iterator of `u16`s. |
144 | 17 | #[stable(feature = "decode_utf16", since = "1.9.0")] |
145 | 18 | #[derive(Clone, Debug)] |
|
0 commit comments