@@ -85,7 +85,7 @@ impl EscapeError {
8585///
8686/// Values are returned by invoking `callback`. For `Char` and `Byte` modes,
8787/// the callback will be called exactly once.
88- pub fn unescape_unicode < F > ( src : & str , mode : Mode , callback : & mut F )
88+ pub fn unescape_unicode < F > ( src : & str , mode : Mode , callback : & mut F ) -> Rfc3349
8989where
9090 F : FnMut ( Range < usize > , Result < char , EscapeError > ) ,
9191{
@@ -94,16 +94,17 @@ where
9494 let mut chars = src. chars ( ) ;
9595 let res = unescape_char_or_byte ( & mut chars, mode) ;
9696 callback ( 0 ..( src. len ( ) - chars. as_str ( ) . len ( ) ) , res) ;
97+ Rfc3349 :: Unused // rfc3349 not relevant for `Mode::{Char,Byte}`
9798 }
98- Str | ByteStr => unescape_non_raw_common ( src, mode, callback) ,
99+ Str => unescape_non_raw_common ( src, mode, callback) ,
99100 RawStr | RawByteStr => check_raw_common ( src, mode, callback) ,
100101 RawCStr => check_raw_common ( src, mode, & mut |r, mut result| {
101102 if let Ok ( '\0' ) = result {
102103 result = Err ( EscapeError :: NulInCStr ) ;
103104 }
104105 callback ( r, result)
105106 } ) ,
106- CStr => unreachable ! ( ) ,
107+ ByteStr | CStr => unreachable ! ( ) ,
107108 }
108109}
109110
@@ -142,18 +143,19 @@ impl From<u8> for MixedUnit {
142143/// a sequence of escaped characters or errors.
143144///
144145/// Values are returned by invoking `callback`.
145- pub fn unescape_mixed < F > ( src : & str , mode : Mode , callback : & mut F )
146+ pub fn unescape_mixed < F > ( src : & str , mode : Mode , callback : & mut F ) -> Rfc3349
146147where
147148 F : FnMut ( Range < usize > , Result < MixedUnit , EscapeError > ) ,
148149{
149150 match mode {
151+ ByteStr => unescape_non_raw_common ( src, mode, callback) ,
150152 CStr => unescape_non_raw_common ( src, mode, & mut |r, mut result| {
151153 if let Ok ( MixedUnit :: Char ( '\0' ) ) = result {
152154 result = Err ( EscapeError :: NulInCStr ) ;
153155 }
154156 callback ( r, result)
155157 } ) ,
156- Char | Byte | Str | RawStr | ByteStr | RawByteStr | RawCStr => unreachable ! ( ) ,
158+ Char | Byte | Str | RawStr | RawByteStr | RawCStr => unreachable ! ( ) ,
157159 }
158160}
159161
@@ -169,6 +171,15 @@ pub fn unescape_byte(src: &str) -> Result<u8, EscapeError> {
169171 unescape_char_or_byte ( & mut src. chars ( ) , Byte ) . map ( byte_from_char)
170172}
171173
/// Used to indicate if rfc3349 (mixed-utf8-literals) was required for the
/// literal to be valid. Once rfc3349 is stabilized this type can be removed.
///
/// The type is a plain two-state flag, so it is `Copy` and fully comparable.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[must_use]
pub enum Rfc3349 {
    /// The literal was only accepted because rfc3349 rules were applied.
    Used,
    /// The literal did not need rfc3349.
    Unused,
}
182+
172183/// What kind of literal do we parse.
173184#[ derive( Debug , Clone , Copy , PartialEq ) ]
174185pub enum Mode {
@@ -205,17 +216,25 @@ impl Mode {
205216
206217 /// Are unicode (non-ASCII) chars allowed?
207218 #[ inline]
208- fn allow_unicode_chars ( self ) -> bool {
219+ fn allow_unicode_chars ( self , rfc3349 : & mut Rfc3349 ) -> bool {
209220 match self {
210- Byte | ByteStr | RawByteStr => false ,
221+ Byte => false ,
222+ ByteStr | RawByteStr => {
223+ * rfc3349 = Rfc3349 :: Used ;
224+ true
225+ }
211226 Char | Str | RawStr | CStr | RawCStr => true ,
212227 }
213228 }
214229
215230 /// Are unicode escapes (`\u`) allowed?
216- fn allow_unicode_escapes ( self ) -> bool {
231+ fn allow_unicode_escapes ( self , rfc3349 : & mut Rfc3349 ) -> bool {
217232 match self {
218- Byte | ByteStr => false ,
233+ Byte => false ,
234+ ByteStr => {
235+ * rfc3349 = Rfc3349 :: Used ;
236+ true
237+ }
219238 Char | Str | CStr => true ,
220239 RawByteStr | RawStr | RawCStr => unreachable ! ( ) ,
221240 }
@@ -233,6 +252,7 @@ impl Mode {
233252fn scan_escape < T : From < char > + From < u8 > > (
234253 chars : & mut Chars < ' _ > ,
235254 mode : Mode ,
255+ rfc3349 : & mut Rfc3349 ,
236256) -> Result < T , EscapeError > {
237257 // Previous character was '\\', unescape what follows.
238258 let res: char = match chars. next ( ) . ok_or ( EscapeError :: LoneSlash ) ? {
@@ -262,13 +282,17 @@ fn scan_escape<T: From<char> + From<u8>>(
262282 Ok ( T :: from ( value as u8 ) )
263283 } ;
264284 }
265- 'u' => return scan_unicode ( chars, mode. allow_unicode_escapes ( ) ) . map ( T :: from) ,
285+ 'u' => return scan_unicode ( chars, mode, rfc3349 ) . map ( T :: from) ,
266286 _ => return Err ( EscapeError :: InvalidEscape ) ,
267287 } ;
268288 Ok ( T :: from ( res) )
269289}
270290
271- fn scan_unicode ( chars : & mut Chars < ' _ > , allow_unicode_escapes : bool ) -> Result < char , EscapeError > {
291+ fn scan_unicode (
292+ chars : & mut Chars < ' _ > ,
293+ mode : Mode ,
294+ rfc3349 : & mut Rfc3349 ,
295+ ) -> Result < char , EscapeError > {
272296 // We've parsed '\u', now we have to parse '{..}'.
273297
274298 if chars. next ( ) != Some ( '{' ) {
@@ -296,7 +320,7 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<ch
296320
297321 // Incorrect syntax has higher priority for error reporting
298322 // than unallowed value for a literal.
299- if !allow_unicode_escapes {
323+ if !mode . allow_unicode_escapes ( rfc3349 ) {
300324 return Err ( EscapeError :: UnicodeEscapeInByte ) ;
301325 }
302326
@@ -322,18 +346,27 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<ch
322346}
323347
324348#[ inline]
325- fn ascii_check ( c : char , allow_unicode_chars : bool ) -> Result < char , EscapeError > {
326- if allow_unicode_chars || c. is_ascii ( ) { Ok ( c) } else { Err ( EscapeError :: NonAsciiCharInByte ) }
349+ fn ascii_check ( c : char , mode : Mode , rfc3349 : & mut Rfc3349 ) -> Result < char , EscapeError > {
350+ // We must check `is_ascii` first, to avoid setting `rfc3349` unnecessarily.
351+ if c. is_ascii ( ) || mode. allow_unicode_chars ( rfc3349) {
352+ Ok ( c)
353+ } else {
354+ Err ( EscapeError :: NonAsciiCharInByte )
355+ }
327356}
328357
329358fn unescape_char_or_byte ( chars : & mut Chars < ' _ > , mode : Mode ) -> Result < char , EscapeError > {
330359 let c = chars. next ( ) . ok_or ( EscapeError :: ZeroChars ) ?;
360+ let mut rfc3349 = Rfc3349 :: Unused ;
331361 let res = match c {
332- '\\' => scan_escape ( chars, mode) ,
362+ '\\' => scan_escape ( chars, mode, & mut rfc3349 ) ,
333363 '\n' | '\t' | '\'' => Err ( EscapeError :: EscapeOnlyChar ) ,
334364 '\r' => Err ( EscapeError :: BareCarriageReturn ) ,
335- _ => ascii_check ( c, mode. allow_unicode_chars ( ) ) ,
365+ _ => ascii_check ( c, mode, & mut rfc3349 ) ,
336366 } ?;
367+
368+ assert_eq ! ( rfc3349, Rfc3349 :: Unused ) ; // rfc3349 not relevant for `Mode::{Char,Byte}`
369+
337370 if chars. next ( ) . is_some ( ) {
338371 return Err ( EscapeError :: MoreThanOneChar ) ;
339372 }
@@ -342,12 +375,16 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, Esca
342375
343376/// Takes the contents of a string literal (without quotes) and produces a
344377/// sequence of escaped characters or errors.
345- fn unescape_non_raw_common < F , T : From < char > + From < u8 > > ( src : & str , mode : Mode , callback : & mut F )
378+ fn unescape_non_raw_common < F , T : From < char > + From < u8 > > (
379+ src : & str ,
380+ mode : Mode ,
381+ callback : & mut F ,
382+ ) -> Rfc3349
346383where
347384 F : FnMut ( Range < usize > , Result < T , EscapeError > ) ,
348385{
349386 let mut chars = src. chars ( ) ;
350- let allow_unicode_chars = mode . allow_unicode_chars ( ) ; // get this outside the loop
387+ let mut rfc3349 = Rfc3349 :: Unused ;
351388
352389 // The `start` and `end` computation here is complicated because
353390 // `skip_ascii_whitespace` makes us skip over chars without counting
@@ -367,16 +404,17 @@ where
367404 } ) ;
368405 continue ;
369406 }
370- _ => scan_escape :: < T > ( & mut chars, mode) ,
407+ _ => scan_escape :: < T > ( & mut chars, mode, & mut rfc3349 ) ,
371408 }
372409 }
373410 '"' => Err ( EscapeError :: EscapeOnlyChar ) ,
374411 '\r' => Err ( EscapeError :: BareCarriageReturn ) ,
375- _ => ascii_check ( c, allow_unicode_chars ) . map ( T :: from) ,
412+ _ => ascii_check ( c, mode , & mut rfc3349 ) . map ( T :: from) ,
376413 } ;
377414 let end = src. len ( ) - chars. as_str ( ) . len ( ) ;
378415 callback ( start..end, res) ;
379416 }
417+ rfc3349
380418}
381419
382420fn skip_ascii_whitespace < F > ( chars : & mut Chars < ' _ > , start : usize , callback : & mut F )
@@ -409,12 +447,12 @@ where
409447/// sequence of characters or errors.
410448/// NOTE: Raw strings do not perform any explicit character escaping, here we
411449/// only produce errors on bare CR.
412- fn check_raw_common < F > ( src : & str , mode : Mode , callback : & mut F )
450+ fn check_raw_common < F > ( src : & str , mode : Mode , callback : & mut F ) -> Rfc3349
413451where
414452 F : FnMut ( Range < usize > , Result < char , EscapeError > ) ,
415453{
416454 let mut chars = src. chars ( ) ;
417- let allow_unicode_chars = mode . allow_unicode_chars ( ) ; // get this outside the loop
455+ let mut rfc3349 = Rfc3349 :: Unused ;
418456
419457 // The `start` and `end` computation here matches the one in
420458 // `unescape_non_raw_common` for consistency, even though this function
@@ -423,16 +461,17 @@ where
423461 let start = src. len ( ) - chars. as_str ( ) . len ( ) - c. len_utf8 ( ) ;
424462 let res = match c {
425463 '\r' => Err ( EscapeError :: BareCarriageReturnInRawString ) ,
426- _ => ascii_check ( c, allow_unicode_chars ) ,
464+ _ => ascii_check ( c, mode , & mut rfc3349 ) ,
427465 } ;
428466 let end = src. len ( ) - chars. as_str ( ) . len ( ) ;
429467 callback ( start..end, res) ;
430468 }
469+ rfc3349
431470}
432471
/// Converts a `char` whose code point fits in one byte into that `u8`.
///
/// Debug builds assert that the code point is at most `u8::MAX`. Without
/// debug assertions a larger code point is truncated to its low 8 bits by
/// the final cast.
#[inline]
pub(crate) fn byte_from_char(c: char) -> u8 {
    // `char -> u32` and `u8 -> u32` are lossless, so use `From` rather than `as`.
    let res = u32::from(c);
    debug_assert!(res <= u32::from(u8::MAX), "guaranteed because of Byte");
    // Deliberately narrowing: the assertion above documents the invariant.
    res as u8
}
0 commit comments