11use super :: from_utf8_unchecked;
22use super :: validations:: utf8_char_width;
3- use crate :: fmt;
43use crate :: fmt:: { Formatter , Write } ;
54use crate :: iter:: FusedIterator ;
5+ use crate :: { fmt, slice} ;
66
77impl [ u8 ] {
88 /// Creates an iterator over the contiguous valid UTF-8 ranges of this
@@ -152,7 +152,7 @@ impl fmt::Debug for Debug<'_> {
152152///
153153/// See the [`Utf8Chunk`] type for documentation of the items yielded by this iterator.
154154///
155- /// [byteslice]: slice
155+ /// [byteslice]: prim@ slice
156156/// [`from_utf8`]: super::from_utf8
157157///
158158/// # Examples
@@ -197,86 +197,29 @@ impl<'a> Iterator for Utf8Chunks<'a> {
197197 return None ;
198198 }
199199
200- const TAG_CONT_U8 : u8 = 128 ;
201- fn safe_get ( xs : & [ u8 ] , i : usize ) -> u8 {
202- * xs. get ( i) . unwrap_or ( & 0 )
203- }
204-
205- let mut i = 0 ;
206- let mut valid_up_to = 0 ;
207- while i < self . source . len ( ) {
208- // SAFETY: `i < self.source.len()` per previous line.
209- // For some reason the following are both significantly slower:
210- // while let Some(&byte) = self.source.get(i) {
211- // while let Some(byte) = self.source.get(i).copied() {
212- let byte = unsafe { * self . source . get_unchecked ( i) } ;
213- i += 1 ;
214-
215- if byte < 128 {
216- // This could be a `1 => ...` case in the match below, but for
217- // the common case of all-ASCII inputs, we bypass loading the
218- // sizeable UTF8_CHAR_WIDTH table into cache.
219- } else {
220- let w = utf8_char_width ( byte) ;
221-
222- match w {
223- 2 => {
224- if safe_get ( self . source , i) & 192 != TAG_CONT_U8 {
225- break ;
226- }
227- i += 1 ;
228- }
229- 3 => {
230- match ( byte, safe_get ( self . source , i) ) {
231- ( 0xE0 , 0xA0 ..=0xBF ) => ( ) ,
232- ( 0xE1 ..=0xEC , 0x80 ..=0xBF ) => ( ) ,
233- ( 0xED , 0x80 ..=0x9F ) => ( ) ,
234- ( 0xEE ..=0xEF , 0x80 ..=0xBF ) => ( ) ,
235- _ => break ,
236- }
237- i += 1 ;
238- if safe_get ( self . source , i) & 192 != TAG_CONT_U8 {
239- break ;
240- }
241- i += 1 ;
242- }
243- 4 => {
244- match ( byte, safe_get ( self . source , i) ) {
245- ( 0xF0 , 0x90 ..=0xBF ) => ( ) ,
246- ( 0xF1 ..=0xF3 , 0x80 ..=0xBF ) => ( ) ,
247- ( 0xF4 , 0x80 ..=0x8F ) => ( ) ,
248- _ => break ,
249- }
250- i += 1 ;
251- if safe_get ( self . source , i) & 192 != TAG_CONT_U8 {
252- break ;
253- }
254- i += 1 ;
255- if safe_get ( self . source , i) & 192 != TAG_CONT_U8 {
256- break ;
257- }
258- i += 1 ;
259- }
260- _ => break ,
261- }
200+ let mut iter = self . source . iter ( ) ;
201+ let mut len_after_valid = iter. len ( ) ;
202+ while !iter. is_empty ( ) {
203+ if !advance_utf8 ( & mut iter) {
204+ // Stop at the first invalid sequence.
205+ break ;
262206 }
263-
264- valid_up_to = i;
207+ len_after_valid = iter. len ( ) ;
265208 }
209+ let valid_up_to = self . source . len ( ) - len_after_valid;
210+ let inspected_len = self . source . len ( ) - iter. len ( ) ;
266211
267- // SAFETY: `i <= self.source.len()` because it is only ever incremented
268- // via `i += 1` and in between every single one of those increments, `i`
269- // is compared against `self.source.len()`. That happens either
270- // literally by `i < self.source.len()` in the while-loop's condition,
271- // or indirectly by `safe_get(self.source, i) & 192 != TAG_CONT_U8`. The
272- // loop is terminated as soon as the latest `i += 1` has made `i` no
273- // longer less than `self.source.len()`, which means it'll be at most
274- // equal to `self.source.len()`.
275- let ( inspected, remaining) = unsafe { self . source . split_at_unchecked ( i) } ;
212+ // SAFETY: The length of the remaining bytes in `iter` only decreases,
213+ // so `iter.len() <= self.source.len()`. The length of inspected bytes,
214+ // `self.source.len() - iter.len()`, then only increases and can be at
215+ // most `self.source.len()`.
216+ let ( inspected, remaining) = unsafe { self . source . split_at_unchecked ( inspected_len) } ;
276217 self . source = remaining;
277218
278- // SAFETY: `valid_up_to <= i` because it is only ever assigned via
279- // `valid_up_to = i` and `i` only increases.
219+ // SAFETY: Since `iter.len()` only decreases and `len_after_valid` is
220+ // a value that `iter.len()` held at or before the end of the loop, it
221+ // follows that `iter.len() <= len_after_valid`, which is equivalent to
222+ // `valid_up_to <= inspected_len` by simple substitution.
280223 let ( valid, invalid) = unsafe { inspected. split_at_unchecked ( valid_up_to) } ;
281224
282225 Some ( Utf8Chunk {
@@ -296,3 +239,65 @@ impl fmt::Debug for Utf8Chunks<'_> {
296239 f. debug_struct ( "Utf8Chunks" ) . field ( "source" , & self . debug ( ) ) . finish ( )
297240 }
298241}
242+
243+ /// Advances the byte iterator by one UTF-8 scalar value, allowing invalid UTF-8
244+ /// sequences. When the current sequence is invalid, the maximal prefix of a
245+ /// valid UTF-8 code unit sequence is consumed. Returns whether the sequence is
246+ /// a valid Unicode scalar value.
247+ #[ inline]
248+ fn advance_utf8 ( bytes : & mut slice:: Iter < ' _ , u8 > ) -> bool {
249+ const TAG_CONT_U8 : u8 = 128 ;
250+ #[ inline]
251+ fn peek ( bytes : & slice:: Iter < ' _ , u8 > ) -> u8 {
252+ * bytes. clone ( ) . next ( ) . unwrap_or ( & 0 )
253+ }
254+
255+ let Some ( & byte) = bytes. next ( ) else { return false } ;
256+ if byte < 128 {
257+ // This could be a `1 => ...` case in the match below, but for the
258+ // common case of all-ASCII inputs, we bypass loading the sizeable
259+ // UTF8_CHAR_WIDTH table into cache.
260+ } else {
261+ match utf8_char_width ( byte) {
262+ 2 => {
263+ if peek ( bytes) & 192 != TAG_CONT_U8 {
264+ return false ;
265+ }
266+ bytes. next ( ) ;
267+ }
268+ 3 => {
269+ match ( byte, peek ( bytes) ) {
270+ ( 0xE0 , 0xA0 ..=0xBF ) => { }
271+ ( 0xE1 ..=0xEC , 0x80 ..=0xBF ) => { }
272+ ( 0xED , 0x80 ..=0x9F ) => { }
273+ ( 0xEE ..=0xEF , 0x80 ..=0xBF ) => { }
274+ _ => return false ,
275+ }
276+ bytes. next ( ) ;
277+ if peek ( bytes) & 192 != TAG_CONT_U8 {
278+ return false ;
279+ }
280+ bytes. next ( ) ;
281+ }
282+ 4 => {
283+ match ( byte, peek ( bytes) ) {
284+ ( 0xF0 , 0x90 ..=0xBF ) => { }
285+ ( 0xF1 ..=0xF3 , 0x80 ..=0xBF ) => { }
286+ ( 0xF4 , 0x80 ..=0x8F ) => { }
287+ _ => return false ,
288+ }
289+ bytes. next ( ) ;
290+ if peek ( bytes) & 192 != TAG_CONT_U8 {
291+ return false ;
292+ }
293+ bytes. next ( ) ;
294+ if peek ( bytes) & 192 != TAG_CONT_U8 {
295+ return false ;
296+ }
297+ bytes. next ( ) ;
298+ }
299+ _ => return false ,
300+ }
301+ }
302+ true
303+ }
0 commit comments