|
1 | | -use crate::char; |
2 | | -use crate::fmt::{self, Write}; |
3 | | -use crate::mem; |
| 1 | +use crate::fmt; |
| 2 | +use crate::fmt::Formatter; |
| 3 | +use crate::fmt::Write; |
| 4 | +use crate::iter::FusedIterator; |
4 | 5 |
|
5 | 6 | use super::from_utf8_unchecked; |
6 | 7 | use super::validations::utf8_char_width; |
7 | 8 |
|
8 | | -/// Lossy UTF-8 string. |
9 | | -#[unstable(feature = "str_internals", issue = "none")] |
10 | | -pub struct Utf8Lossy { |
11 | | - bytes: [u8], |
| 9 | +/// An item returned by the [`Utf8Chunks`] iterator. |
| 10 | +/// |
| 11 | +/// A `Utf8Chunk` holds a run of valid UTF-8 together with the invalid byte |
| 12 | +/// sequence, if any, that directly follows it in the input being decoded. |
| 13 | +/// |
| 14 | +/// # Examples |
| 15 | +/// |
| 16 | +/// ``` |
| 17 | +/// #![feature(utf8_chunks)] |
| 18 | +/// |
| 19 | +/// use std::str::Utf8Chunks; |
| 20 | +/// |
| 21 | +/// // A byte string containing an invalid UTF-8 sequence |
| 22 | +/// let bytes = b"foo\xF1\x80bar"; |
| 23 | +/// |
| 24 | +/// // Decode the first `Utf8Chunk` |
| 25 | +/// let chunk = Utf8Chunks::new(bytes).next().unwrap(); |
| 26 | +/// |
| 27 | +/// // The first three characters are valid UTF-8 |
| 28 | +/// assert_eq!("foo", chunk.valid()); |
| 29 | +/// |
| 30 | +/// // They are followed by a broken character, returned as raw bytes |
| 31 | +/// assert_eq!(b"\xF1\x80", chunk.invalid()); |
| 32 | +/// ``` |
| 33 | +#[unstable(feature = "utf8_chunks", issue = "99543")] |
| 34 | +#[derive(Clone, Debug, PartialEq, Eq)] |
| 35 | +pub struct Utf8Chunk<'a> { |
| 36 | + valid: &'a str, |
| 37 | + invalid: &'a [u8], |
12 | 38 | } |
13 | 39 |
|
14 | | -impl Utf8Lossy { |
| 40 | +impl<'a> Utf8Chunk<'a> { |
| 41 | + /// Returns the valid UTF-8 substring at the start of this chunk. |
| 42 | + /// |
| 43 | + /// This substring can be empty at the start of the string or between |
| 44 | + /// broken UTF-8 characters. |
15 | 45 | #[must_use] |
16 | | - pub fn from_bytes(bytes: &[u8]) -> &Utf8Lossy { |
17 | | - // SAFETY: Both use the same memory layout, and UTF-8 correctness isn't required. |
18 | | - unsafe { mem::transmute(bytes) } |
| 46 | + #[unstable(feature = "utf8_chunks", issue = "99543")] |
| 47 | + pub fn valid(&self) -> &'a str { |
| 48 | + self.valid |
19 | 49 | } |
20 | 50 |
|
21 | | - pub fn chunks(&self) -> Utf8LossyChunksIter<'_> { |
22 | | - Utf8LossyChunksIter { source: &self.bytes } |
| 51 | + /// Returns the invalid sequence that caused a failure. |
| 52 | + /// |
| 53 | + /// The returned slice has a maximum length of 3 and starts directly after the |
| 54 | + /// substring given by [`valid`]. Decoding resumes after this sequence. |
| 55 | + /// |
| 56 | + /// If empty, this is the last chunk in the string. If non-empty, an |
| 57 | + /// unexpected byte was encountered or the end of the input was reached |
| 58 | + /// unexpectedly. |
| 59 | + /// |
| 60 | + /// Lossy decoding would replace this sequence with [`U+FFFD REPLACEMENT |
| 61 | + /// CHARACTER`]. |
| 62 | + /// |
| 63 | + /// [`valid`]: Self::valid |
| 64 | + /// [`U+FFFD REPLACEMENT CHARACTER`]: crate::char::REPLACEMENT_CHARACTER |
| 65 | + #[must_use] |
| 66 | + #[unstable(feature = "utf8_chunks", issue = "99543")] |
| 67 | + pub fn invalid(&self) -> &'a [u8] { |
| 68 | + self.invalid |
23 | 69 | } |
24 | 70 | } |
25 | 71 |
|
26 | | -/// Iterator over lossy UTF-8 string |
27 | | -#[must_use = "iterators are lazy and do nothing unless consumed"] |
| | +/// Wraps a byte slice so that it can be `Debug`-formatted as a quoted, escaped string. |
| 72 | +#[must_use] |
| 73 | +#[unstable(feature = "str_internals", issue = "none")] |
| 74 | +pub struct Debug<'a>(&'a [u8]); |
| 75 | + |
28 | 76 | #[unstable(feature = "str_internals", issue = "none")] |
29 | | -#[allow(missing_debug_implementations)] |
30 | | -pub struct Utf8LossyChunksIter<'a> { |
| 77 | +impl fmt::Debug for Debug<'_> { |
| 78 | + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { |
| 79 | + f.write_char('"')?; |
| 80 | + |
| 81 | + for chunk in Utf8Chunks::new(self.0) { |
| 82 | + // Valid part: copied through, with chars escaped where `escape_debug` requires it. |
| 83 | + // This walks UTF-8 that the chunk iterator already decoded, which is suboptimal. |
| 84 | + { |
| 85 | + let valid = chunk.valid(); |
| 86 | + let mut from = 0; |
| 87 | + for (i, c) in valid.char_indices() { |
| 88 | + let esc = c.escape_debug(); |
| 89 | + // If the char needs escaping, flush the unescaped backlog and write the escape; otherwise leave it in the backlog |
| 90 | + if esc.len() != 1 { |
| 91 | + f.write_str(&valid[from..i])?; |
| 92 | + for c in esc { |
| 93 | + f.write_char(c)?; |
| 94 | + } |
| 95 | + from = i + c.len_utf8(); |
| 96 | + } |
| 97 | + } |
| 98 | + f.write_str(&valid[from..])?; |
| 99 | + } |
| 100 | + |
| 101 | + // Invalid bytes are written as `\xNN` escapes with two uppercase hex digits. |
| 102 | + for &b in chunk.invalid() { |
| 103 | + write!(f, "\\x{:02X}", b)?; |
| 104 | + } |
| 105 | + } |
| 106 | + |
| 107 | + f.write_char('"') |
| 108 | + } |
| 109 | +} |
| 110 | + |
| 111 | +/// An iterator that decodes a slice of mostly UTF-8 bytes into [`Utf8Chunk`]s: string |
| 112 | +/// slices ([`&str`]) paired with byte slices ([`&[u8]`][byteslice]). |
| 113 | +/// |
| 114 | +/// If you want a simple conversion from UTF-8 byte slices to string slices, |
| 115 | +/// [`from_utf8`] is easier to use. |
| 116 | +/// |
| 117 | +/// [byteslice]: slice |
| 118 | +/// [`from_utf8`]: super::from_utf8 |
| 119 | +/// |
| 120 | +/// # Examples |
| 121 | +/// |
| 122 | +/// This can be used to create functionality similar to |
| 123 | +/// [`String::from_utf8_lossy`] without allocating heap memory: |
| 124 | +/// |
| 125 | +/// ``` |
| 126 | +/// #![feature(utf8_chunks)] |
| 127 | +/// |
| 128 | +/// use std::str::Utf8Chunks; |
| 129 | +/// |
| 130 | +/// fn from_utf8_lossy<F>(input: &[u8], mut push: F) where F: FnMut(&str) { |
| 131 | +/// for chunk in Utf8Chunks::new(input) { |
| 132 | +/// push(chunk.valid()); |
| 133 | +/// |
| 134 | +/// if !chunk.invalid().is_empty() { |
| 135 | +/// push("\u{FFFD}"); |
| 136 | +/// } |
| 137 | +/// } |
| 138 | +/// } |
| 139 | +/// ``` |
| 140 | +/// |
| 141 | +/// [`String::from_utf8_lossy`]: ../../std/string/struct.String.html#method.from_utf8_lossy |
| 142 | +#[must_use = "iterators are lazy and do nothing unless consumed"] |
| 143 | +#[unstable(feature = "utf8_chunks", issue = "99543")] |
| 144 | +#[derive(Clone)] |
| 145 | +pub struct Utf8Chunks<'a> { |
31 | 146 | source: &'a [u8], |
32 | 147 | } |
33 | 148 |
|
34 | | -#[unstable(feature = "str_internals", issue = "none")] |
35 | | -#[derive(PartialEq, Eq, Debug)] |
36 | | -pub struct Utf8LossyChunk<'a> { |
37 | | - /// Sequence of valid chars. |
38 | | - /// Can be empty between broken UTF-8 chars. |
39 | | - pub valid: &'a str, |
40 | | - /// Single broken char, empty if none. |
41 | | - /// Empty iff iterator item is last. |
42 | | - pub broken: &'a [u8], |
| 149 | +impl<'a> Utf8Chunks<'a> { |
| 150 | + /// Creates a new iterator over `bytes`; this only stores the slice and decodes nothing itself. |
| 151 | + #[unstable(feature = "utf8_chunks", issue = "99543")] |
| 152 | + pub fn new(bytes: &'a [u8]) -> Self { |
| 153 | + Self { source: bytes } |
| 154 | + } |
| 155 | + |
| 156 | + #[doc(hidden)] |
| 157 | + #[unstable(feature = "str_internals", issue = "none")] |
| 158 | + pub fn debug(&self) -> Debug<'_> { |
| 159 | + Debug(self.source) |
| 160 | + } |
43 | 161 | } |
44 | 162 |
|
45 | | -impl<'a> Iterator for Utf8LossyChunksIter<'a> { |
46 | | - type Item = Utf8LossyChunk<'a>; |
| 163 | +#[unstable(feature = "utf8_chunks", issue = "99543")] |
| 164 | +impl<'a> Iterator for Utf8Chunks<'a> { |
| 165 | + type Item = Utf8Chunk<'a>; |
47 | 166 |
|
48 | | - fn next(&mut self) -> Option<Utf8LossyChunk<'a>> { |
| 167 | + fn next(&mut self) -> Option<Utf8Chunk<'a>> { |
49 | 168 | if self.source.is_empty() { |
50 | 169 | return None; |
51 | 170 | } |
@@ -130,71 +249,22 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> { |
130 | 249 |
|
131 | 250 | // SAFETY: `valid_up_to <= i` because it is only ever assigned via |
132 | 251 | // `valid_up_to = i` and `i` only increases. |
133 | | - let (valid, broken) = unsafe { inspected.split_at_unchecked(valid_up_to) }; |
| 252 | + let (valid, invalid) = unsafe { inspected.split_at_unchecked(valid_up_to) }; |
134 | 253 |
|
135 | | - Some(Utf8LossyChunk { |
| 254 | + Some(Utf8Chunk { |
136 | 255 | // SAFETY: All bytes up to `valid_up_to` are valid UTF-8. |
137 | 256 | valid: unsafe { from_utf8_unchecked(valid) }, |
138 | | - broken, |
| 257 | + invalid, |
139 | 258 | }) |
140 | 259 | } |
141 | 260 | } |
142 | 261 |
|
143 | | -impl fmt::Display for Utf8Lossy { |
144 | | - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
145 | | - // If we're the empty string then our iterator won't actually yield |
146 | | - // anything, so perform the formatting manually |
147 | | - if self.bytes.is_empty() { |
148 | | - return "".fmt(f); |
149 | | - } |
150 | | - |
151 | | - for Utf8LossyChunk { valid, broken } in self.chunks() { |
152 | | - // If we successfully decoded the whole chunk as a valid string then |
153 | | - // we can return a direct formatting of the string which will also |
154 | | - // respect various formatting flags if possible. |
155 | | - if valid.len() == self.bytes.len() { |
156 | | - assert!(broken.is_empty()); |
157 | | - return valid.fmt(f); |
158 | | - } |
159 | | - |
160 | | - f.write_str(valid)?; |
161 | | - if !broken.is_empty() { |
162 | | - f.write_char(char::REPLACEMENT_CHARACTER)?; |
163 | | - } |
164 | | - } |
165 | | - Ok(()) |
166 | | - } |
167 | | -} |
168 | | - |
169 | | -impl fmt::Debug for Utf8Lossy { |
170 | | - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
171 | | - f.write_char('"')?; |
| | +// `next` returns `None` whenever `source` is empty. NOTE(review): fusing assumes `source` never grows again; confirm against the full `next` body. |
| 262 | +#[unstable(feature = "utf8_chunks", issue = "99543")] |
| 263 | +impl FusedIterator for Utf8Chunks<'_> {} |
172 | 264 |
|
173 | | - for Utf8LossyChunk { valid, broken } in self.chunks() { |
174 | | - // Valid part. |
175 | | - // Here we partially parse UTF-8 again which is suboptimal. |
176 | | - { |
177 | | - let mut from = 0; |
178 | | - for (i, c) in valid.char_indices() { |
179 | | - let esc = c.escape_debug(); |
180 | | - // If char needs escaping, flush backlog so far and write, else skip |
181 | | - if esc.len() != 1 { |
182 | | - f.write_str(&valid[from..i])?; |
183 | | - for c in esc { |
184 | | - f.write_char(c)?; |
185 | | - } |
186 | | - from = i + c.len_utf8(); |
187 | | - } |
188 | | - } |
189 | | - f.write_str(&valid[from..])?; |
190 | | - } |
191 | | - |
192 | | - // Broken parts of string as hex escape. |
193 | | - for &b in broken { |
194 | | - write!(f, "\\x{:02x}", b)?; |
195 | | - } |
196 | | - } |
197 | | - |
198 | | - f.write_char('"') |
| | +// Formats as a `Utf8Chunks` struct with a single `source` field, rendered through the adapter returned by `debug()`. |
| 265 | +#[unstable(feature = "utf8_chunks", issue = "99543")] |
| 266 | +impl fmt::Debug for Utf8Chunks<'_> { |
| 267 | + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { |
| 268 | + f.debug_struct("Utf8Chunks").field("source", &self.debug()).finish() |
199 | 269 | } |
200 | 270 | } |
0 commit comments