@@ -898,67 +898,66 @@ fn parse_unicode_escape<'de, R: Read<'de>>(
898898 validate : bool ,
899899 scratch : & mut Vec < u8 > ,
900900) -> Result < ( ) > {
901- let c = match tri ! ( read. decode_hex_escape( ) ) {
902- n @ 0xDC00 ..=0xDFFF => {
903- return if validate {
904- error ( read, ErrorCode :: LoneLeadingSurrogateInHexEscape )
905- } else {
906- push_wtf8_codepoint ( n as u32 , scratch) ;
907- Ok ( ( ) )
908- } ;
909- }
901+ let n = tri ! ( read. decode_hex_escape( ) ) ;
910902
911- // Non-BMP characters are encoded as a sequence of two hex
912- // escapes, representing UTF-16 surrogates. If deserializing a
913- // utf-8 string the surrogates are required to be paired,
914- // whereas deserializing a byte string accepts lone surrogates.
915- n1 @ 0xD800 ..=0xDBFF => {
916- if tri ! ( peek_or_eof( read) ) == b'\\' {
917- read. discard ( ) ;
918- } else {
919- return if validate {
920- read. discard ( ) ;
921- error ( read, ErrorCode :: UnexpectedEndOfHexEscape )
922- } else {
923- push_wtf8_codepoint ( n1 as u32 , scratch) ;
924- Ok ( ( ) )
925- } ;
926- }
903+ // Non-BMP characters are encoded as a sequence of two hex
904+ // escapes, representing UTF-16 surrogates. If deserializing a
905+ // utf-8 string the surrogates are required to be paired,
906+ // whereas deserializing a byte string accepts lone surrogates.
907+ if validate && n >= 0xDC00 && n <= 0xDFFF {
908+ // XXX: This is actually a trailing surrogate.
909+ return error ( read, ErrorCode :: LoneLeadingSurrogateInHexEscape ) ;
910+ }
927911
928- if tri ! ( peek_or_eof( read) ) == b'u' {
929- read. discard ( ) ;
930- } else {
931- return if validate {
932- read. discard ( ) ;
933- error ( read, ErrorCode :: UnexpectedEndOfHexEscape )
934- } else {
935- push_wtf8_codepoint ( n1 as u32 , scratch) ;
936- // The \ prior to this byte started an escape sequence,
937- // so we need to parse that now. This recursive call
938- // does not blow the stack on malicious input because
939- // the escape is not \u, so it will be handled by one
940- // of the easy nonrecursive cases.
941- parse_escape ( read, validate, scratch)
942- } ;
943- }
912+ if n < 0xD800 || n > 0xDBFF {
913+ // Every u16 outside of the surrogate ranges is guaranteed to be a
914+ // legal char.
915+ push_wtf8_codepoint ( n as u32 , scratch) ;
916+ return Ok ( ( ) ) ;
917+ }
944918
945- let n2 = tri ! ( read. decode_hex_escape( ) ) ;
919+ // n is a leading surrogate, we now expect a trailing surrogate.
920+ let n1 = n;
946921
947- if n2 < 0xDC00 || n2 > 0xDFFF {
948- return error ( read, ErrorCode :: LoneLeadingSurrogateInHexEscape ) ;
949- }
922+ if tri ! ( peek_or_eof( read) ) == b'\\' {
923+ read. discard ( ) ;
924+ } else {
925+ return if validate {
926+ read. discard ( ) ;
927+ error ( read, ErrorCode :: UnexpectedEndOfHexEscape )
928+ } else {
929+ push_wtf8_codepoint ( n1 as u32 , scratch) ;
930+ Ok ( ( ) )
931+ } ;
932+ }
950933
951- // This value is in range U+10000..=U+10FFFF, which is always a
952- // valid codepoint.
953- ( ( ( n1 - 0xD800 ) as u32 ) << 10 | ( n2 - 0xDC00 ) as u32 ) + 0x1_0000
954- }
934+ if tri ! ( peek_or_eof( read) ) == b'u' {
935+ read. discard ( ) ;
936+ } else {
937+ return if validate {
938+ read. discard ( ) ;
939+ error ( read, ErrorCode :: UnexpectedEndOfHexEscape )
940+ } else {
941+ push_wtf8_codepoint ( n1 as u32 , scratch) ;
942+ // The \ prior to this byte started an escape sequence,
943+ // so we need to parse that now. This recursive call
944+ // does not blow the stack on malicious input because
945+ // the escape is not \u, so it will be handled by one
946+ // of the easy nonrecursive cases.
947+ parse_escape ( read, validate, scratch)
948+ } ;
949+ }
955950
956- // Every u16 outside of the surrogate ranges above is guaranteed
957- // to be a legal char.
958- n => n as u32 ,
959- } ;
951+ let n2 = tri ! ( read. decode_hex_escape( ) ) ;
952+
953+ if n2 < 0xDC00 || n2 > 0xDFFF {
954+ return error ( read, ErrorCode :: LoneLeadingSurrogateInHexEscape ) ;
955+ }
960956
961- push_wtf8_codepoint ( c, scratch) ;
957+ // This value is in range U+10000..=U+10FFFF, which is always a
958+ // valid codepoint.
959+ let n = ( ( ( n1 - 0xD800 ) as u32 ) << 10 | ( n2 - 0xDC00 ) as u32 ) + 0x1_0000 ;
960+ push_wtf8_codepoint ( n, scratch) ;
962961 Ok ( ( ) )
963962}
964963
0 commit comments