@@ -89,6 +89,24 @@ impl CodePoint {
8989 self . value
9090 }
9191
92+ /// Returns the numeric value of the code point if it is a leading surrogate.
93+ #[ inline]
94+ pub fn to_lead_surrogate ( & self ) -> Option < u16 > {
95+ match self . value {
96+ lead @ 0xD800 ..=0xDBFF => Some ( lead as u16 ) ,
97+ _ => None ,
98+ }
99+ }
100+
101+ /// Returns the numeric value of the code point if it is a trailing surrogate.
102+ #[ inline]
103+ pub fn to_trail_surrogate ( & self ) -> Option < u16 > {
104+ match self . value {
105+ trail @ 0xDC00 ..=0xDFFF => Some ( trail as u16 ) ,
106+ _ => None ,
107+ }
108+ }
109+
92110 /// Optionally returns a Unicode scalar value for the code point.
93111 ///
94112 /// Returns `None` if the code point is a surrogate (from U+D800 to U+DFFF).
@@ -117,6 +135,14 @@ impl CodePoint {
#[derive(Clone)]
pub struct Wtf8Buf {
    bytes: Vec<u8>,

    /// Do we know that `bytes` holds a valid UTF-8 encoding? We can easily
    /// know this if we're constructed from a `String` or `&str`.
    ///
    /// It is possible for `bytes` to have valid UTF-8 without this being
    /// set, such as when we're concatenating `&Wtf8`'s and surrogates become
    /// paired, as we don't bother to rescan the entire string.
    is_known_utf8: bool,
}

// Comparisons are implemented by hand rather than derived: `is_known_utf8` is
// only a cached hint about `bytes`, not part of the string's value. Two
// buffers holding the same bytes must compare equal (and order identically)
// no matter how each one was constructed.

impl PartialEq for Wtf8Buf {
    #[inline]
    fn eq(&self, other: &Wtf8Buf) -> bool {
        self.bytes == other.bytes
    }
}

impl Eq for Wtf8Buf {}

impl PartialOrd for Wtf8Buf {
    #[inline]
    fn partial_cmp(&self, other: &Wtf8Buf) -> Option<core::cmp::Ordering> {
        Some(Ord::cmp(&self.bytes, &other.bytes))
    }
}

impl Ord for Wtf8Buf {
    #[inline]
    fn cmp(&self, other: &Wtf8Buf) -> core::cmp::Ordering {
        Ord::cmp(&self.bytes, &other.bytes)
    }
}
121147
122148impl ops:: Deref for Wtf8Buf {
@@ -147,13 +173,13 @@ impl Wtf8Buf {
147173 /// Creates a new, empty WTF-8 string.
148174 #[ inline]
149175 pub fn new ( ) -> Wtf8Buf {
150- Wtf8Buf { bytes : Vec :: new ( ) }
176+ Wtf8Buf { bytes : Vec :: new ( ) , is_known_utf8 : true }
151177 }
152178
153179 /// Creates a new, empty WTF-8 string with pre-allocated capacity for `capacity` bytes.
154180 #[ inline]
155181 pub fn with_capacity ( capacity : usize ) -> Wtf8Buf {
156- Wtf8Buf { bytes : Vec :: with_capacity ( capacity) }
182+ Wtf8Buf { bytes : Vec :: with_capacity ( capacity) , is_known_utf8 : true }
157183 }
158184
159185 /// Creates a WTF-8 string from a UTF-8 `String`.
@@ -163,7 +189,7 @@ impl Wtf8Buf {
163189 /// Since WTF-8 is a superset of UTF-8, this always succeeds.
164190 #[ inline]
165191 pub fn from_string ( string : String ) -> Wtf8Buf {
166- Wtf8Buf { bytes : string. into_bytes ( ) }
192+ Wtf8Buf { bytes : string. into_bytes ( ) , is_known_utf8 : true }
167193 }
168194
169195 /// Creates a WTF-8 string from a UTF-8 `&str` slice.
@@ -173,11 +199,12 @@ impl Wtf8Buf {
173199 /// Since WTF-8 is a superset of UTF-8, this always succeeds.
174200 #[ inline]
175201 pub fn from_str ( str : & str ) -> Wtf8Buf {
176- Wtf8Buf { bytes : <[ _ ] >:: to_vec ( str. as_bytes ( ) ) }
202+ Wtf8Buf { bytes : <[ _ ] >:: to_vec ( str. as_bytes ( ) ) , is_known_utf8 : true }
177203 }
178204
179205 pub fn clear ( & mut self ) {
180- self . bytes . clear ( )
206+ self . bytes . clear ( ) ;
207+ self . is_known_utf8 = true ;
181208 }
182209
183210 /// Creates a WTF-8 string from a potentially ill-formed UTF-16 slice of 16-bit code units.
@@ -193,17 +220,19 @@ impl Wtf8Buf {
193220 let surrogate = surrogate. unpaired_surrogate ( ) ;
194221 // Surrogates are known to be in the code point range.
195222 let code_point = unsafe { CodePoint :: from_u32_unchecked ( surrogate as u32 ) } ;
223+ // The string will now contain an unpaired surrogate.
224+ string. is_known_utf8 = false ;
196225 // Skip the WTF-8 concatenation check,
197226 // surrogate pairs are already decoded by decode_utf16
198- string. push_code_point_unchecked ( code_point)
227+ string. push_code_point_unchecked ( code_point) ;
199228 }
200229 }
201230 }
202231 string
203232 }
204233
205234 /// Copied from String::push
206- /// This does **not** include the WTF-8 concatenation check.
235+ /// This does **not** include the WTF-8 concatenation check or `is_known_utf8` check .
207236 fn push_code_point_unchecked ( & mut self , code_point : CodePoint ) {
208237 let mut bytes = [ 0 ; 4 ] ;
209238 let bytes = char:: encode_utf8_raw ( code_point. value , & mut bytes) ;
@@ -217,6 +246,9 @@ impl Wtf8Buf {
217246
218247 #[ inline]
219248 pub fn as_mut_slice ( & mut self ) -> & mut Wtf8 {
249+ // Safety: `Wtf8` doesn't expose any way to mutate the bytes that would
250+ // cause them to change from well-formed UTF-8 to ill-formed UTF-8,
251+ // which would break the assumptions of the `is_known_utf8` field.
220252 unsafe { Wtf8 :: from_mut_bytes_unchecked ( & mut self . bytes ) }
221253 }
222254
@@ -314,7 +346,15 @@ impl Wtf8Buf {
314346 self . push_char ( decode_surrogate_pair ( lead, trail) ) ;
315347 self . bytes . extend_from_slice ( other_without_trail_surrogate) ;
316348 }
317- _ => self . bytes . extend_from_slice ( & other. bytes ) ,
349+ _ => {
350+ // If we'll be pushing a string containing a surrogate, we may
351+ // no longer have UTF-8.
352+ if other. next_surrogate ( 0 ) . is_some ( ) {
353+ self . is_known_utf8 = false ;
354+ }
355+
356+ self . bytes . extend_from_slice ( & other. bytes ) ;
357+ }
318358 }
319359 }
320360
@@ -331,13 +371,19 @@ impl Wtf8Buf {
331371 /// like concatenating ill-formed UTF-16 strings effectively would.
332372 #[ inline]
333373 pub fn push ( & mut self , code_point : CodePoint ) {
334- if let trail @ 0xDC00 ..= 0xDFFF = code_point. to_u32 ( ) {
374+ if let Some ( trail) = code_point. to_trail_surrogate ( ) {
335375 if let Some ( lead) = ( & * self ) . final_lead_surrogate ( ) {
336376 let len_without_lead_surrogate = self . len ( ) - 3 ;
337377 self . bytes . truncate ( len_without_lead_surrogate) ;
338- self . push_char ( decode_surrogate_pair ( lead, trail as u16 ) ) ;
378+ self . push_char ( decode_surrogate_pair ( lead, trail) ) ;
339379 return ;
340380 }
381+
382+ // We're pushing a trailing surrogate.
383+ self . is_known_utf8 = false ;
384+ } else if code_point. to_lead_surrogate ( ) . is_some ( ) {
385+ // We're pushing a leading surrogate.
386+ self . is_known_utf8 = false ;
341387 }
342388
343389 // No newly paired surrogates at the boundary.
@@ -364,9 +410,10 @@ impl Wtf8Buf {
364410 /// (that is, if the string contains surrogates),
365411 /// the original WTF-8 string is returned instead.
366412 pub fn into_string ( self ) -> Result < String , Wtf8Buf > {
367- match self . next_surrogate ( 0 ) {
368- None => Ok ( unsafe { String :: from_utf8_unchecked ( self . bytes ) } ) ,
369- Some ( _) => Err ( self ) ,
413+ if self . is_known_utf8 || self . next_surrogate ( 0 ) . is_none ( ) {
414+ Ok ( unsafe { String :: from_utf8_unchecked ( self . bytes ) } )
415+ } else {
416+ Err ( self )
370417 }
371418 }
372419
@@ -376,6 +423,11 @@ impl Wtf8Buf {
376423 ///
377424 /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”)
378425 pub fn into_string_lossy ( mut self ) -> String {
426+ // Fast path: If we already have UTF-8, we can return it immediately.
427+ if self . is_known_utf8 {
428+ return unsafe { String :: from_utf8_unchecked ( self . bytes ) } ;
429+ }
430+
379431 let mut pos = 0 ;
380432 loop {
381433 match self . next_surrogate ( pos) {
@@ -398,7 +450,7 @@ impl Wtf8Buf {
398450 /// Converts a `Box<Wtf8>` into a `Wtf8Buf`.
399451 pub fn from_box ( boxed : Box < Wtf8 > ) -> Wtf8Buf {
400452 let bytes: Box < [ u8 ] > = unsafe { mem:: transmute ( boxed) } ;
401- Wtf8Buf { bytes : bytes. into_vec ( ) }
453+ Wtf8Buf { bytes : bytes. into_vec ( ) , is_known_utf8 : false }
402454 }
403455}
404456
@@ -576,6 +628,11 @@ impl Wtf8 {
576628 }
577629 }
578630
631+ /// Creates an owned `Wtf8Buf` from a borrowed `Wtf8`.
632+ pub fn to_owned ( & self ) -> Wtf8Buf {
633+ Wtf8Buf { bytes : self . bytes . to_vec ( ) , is_known_utf8 : false }
634+ }
635+
579636 /// Lossily converts the string to UTF-8.
580637 /// Returns a UTF-8 `&str` slice if the contents are well-formed in UTF-8.
581638 ///
@@ -665,7 +722,8 @@ impl Wtf8 {
665722 }
666723
667724 pub fn clone_into ( & self , buf : & mut Wtf8Buf ) {
668- self . bytes . clone_into ( & mut buf. bytes )
725+ buf. is_known_utf8 = false ;
726+ self . bytes . clone_into ( & mut buf. bytes ) ;
669727 }
670728
671729 /// Boxes this `Wtf8`.
@@ -705,12 +763,12 @@ impl Wtf8 {
705763
706764 #[ inline]
707765 pub fn to_ascii_lowercase ( & self ) -> Wtf8Buf {
708- Wtf8Buf { bytes : self . bytes . to_ascii_lowercase ( ) }
766+ Wtf8Buf { bytes : self . bytes . to_ascii_lowercase ( ) , is_known_utf8 : false }
709767 }
710768
711769 #[ inline]
712770 pub fn to_ascii_uppercase ( & self ) -> Wtf8Buf {
713- Wtf8Buf { bytes : self . bytes . to_ascii_uppercase ( ) }
771+ Wtf8Buf { bytes : self . bytes . to_ascii_uppercase ( ) , is_known_utf8 : false }
714772 }
715773
716774 #[ inline]
0 commit comments