diff --git a/crates/swc_common/src/input.rs b/crates/swc_common/src/input.rs index 6cd4d8e35e83..f46a436df776 100644 --- a/crates/swc_common/src/input.rs +++ b/crates/swc_common/src/input.rs @@ -10,8 +10,8 @@ pub type SourceFileInput<'a> = StringInput<'a>; #[derive(Clone)] pub struct StringInput<'a> { last_pos: BytePos, - /// Current cursor - iter: str::Chars<'a>, + /// Remaining input as str - we slice this as we consume bytes + remaining: &'a str, orig: &'a str, /// Original start position. orig_start: BytePos, @@ -33,7 +33,7 @@ impl<'a> StringInput<'a> { StringInput { last_pos: start, orig: src, - iter: src.chars(), + remaining: src, orig_start: start, orig_end: end, } @@ -41,7 +41,7 @@ impl<'a> StringInput<'a> { #[inline(always)] pub fn as_str(&self) -> &str { - self.iter.as_str() + self.remaining } #[inline(always)] @@ -68,21 +68,22 @@ impl<'a> StringInput<'a> { let ret = unsafe { s.get_unchecked(start_idx..end_idx) }; - self.iter = unsafe { s.get_unchecked(end_idx..) }.chars(); + self.remaining = unsafe { s.get_unchecked(end_idx..) }; ret } #[inline] pub fn bump_bytes(&mut self, n: usize) { - let s = self.iter.as_str(); - self.iter = unsafe { s.get_unchecked(n..) }.chars(); + debug_assert!(n <= self.remaining.len()); + self.remaining = unsafe { self.remaining.get_unchecked(n..) }; self.last_pos.0 += n as u32; } #[inline] pub fn bump_one(&mut self) { - if self.iter.next().is_some() { + if !self.remaining.is_empty() { + self.remaining = unsafe { self.remaining.get_unchecked(1..) }; self.last_pos.0 += 1; } else { unsafe { @@ -114,41 +115,56 @@ impl<'a> From<&'a SourceFile> for StringInput<'a> { impl<'a> Input<'a> for StringInput<'a> { #[inline] - fn cur(&self) -> Option { - self.iter.clone().next() + fn cur(&self) -> Option { + self.remaining.as_bytes().first().copied() } #[inline] - fn peek(&self) -> Option { - let mut iter = self.iter.clone(); - // https://github.com/rust-lang/rust/blob/1.86.0/compiler/rustc_lexer/src/cursor.rs#L56 say `next` is faster. 
- iter.next(); - iter.next() + fn peek(&self) -> Option { + self.remaining.as_bytes().get(1).copied() } #[inline] - fn peek_ahead(&self) -> Option { - let mut iter = self.iter.clone(); - // https://github.com/rust-lang/rust/blob/1.86.0/compiler/rustc_lexer/src/cursor.rs#L56 say `next` is faster - iter.next(); - iter.next(); - iter.next() + fn peek_ahead(&self) -> Option { + self.remaining.as_bytes().get(2).copied() } #[inline] unsafe fn bump(&mut self) { - if let Some(c) = self.iter.next() { - self.last_pos = self.last_pos + BytePos((c.len_utf8()) as u32); - } else { + let bytes = self.remaining.as_bytes(); + if bytes.is_empty() { unsafe { debug_unreachable!("bump should not be called when cur() == None"); } } + + let first_byte = unsafe { *bytes.get_unchecked(0) }; + + // Calculate the number of bytes in this UTF-8 character + let len = if first_byte < 0x80 { + 1 // ASCII + } else if first_byte < 0xe0 { + 2 // 2-byte UTF-8 + } else if first_byte < 0xf0 { + 3 // 3-byte UTF-8 + } else { + 4 // 4-byte UTF-8 + }; + + self.remaining = unsafe { self.remaining.get_unchecked(len..) }; + self.last_pos = self.last_pos + BytePos(len as u32); + } + + #[inline] + fn bump_bytes(&mut self, n: usize) { + debug_assert!(n <= self.remaining.len()); + self.remaining = unsafe { self.remaining.get_unchecked(n..) }; + self.last_pos.0 += n as u32; } #[inline] fn cur_as_ascii(&self) -> Option { - let first_byte = *self.as_str().as_bytes().first()?; + let first_byte = *self.remaining.as_bytes().first()?; if first_byte <= 0x7f { Some(first_byte) } else { @@ -156,6 +172,11 @@ impl<'a> Input<'a> for StringInput<'a> { } } + #[inline] + fn cur_as_char(&self) -> Option { + self.remaining.chars().next() + } + #[inline] fn is_at_start(&self) -> bool { self.orig_start == self.last_pos @@ -184,7 +205,7 @@ impl<'a> Input<'a> for StringInput<'a> { let ret = unsafe { s.get_unchecked(start_idx..end_idx) }; - self.iter = unsafe { s.get_unchecked(end_idx..) 
}.chars(); + self.remaining = unsafe { s.get_unchecked(end_idx..) }; self.last_pos = end; ret @@ -197,7 +218,7 @@ impl<'a> Input<'a> for StringInput<'a> { { let last = { let mut last = 0; - for c in self.iter.clone() { + for c in self.remaining.chars() { if pred(c) { last += c.len_utf8(); } else { @@ -207,12 +228,11 @@ impl<'a> Input<'a> for StringInput<'a> { last }; - let s = self.iter.as_str(); - debug_assert!(last <= s.len()); - let ret = unsafe { s.get_unchecked(..last) }; + debug_assert!(last <= self.remaining.len()); + let ret = unsafe { self.remaining.get_unchecked(..last) }; self.last_pos = self.last_pos + BytePos(last as _); - self.iter = unsafe { s.get_unchecked(last..) }.chars(); + self.remaining = unsafe { self.remaining.get_unchecked(last..) }; ret } @@ -228,15 +248,13 @@ impl<'a> Input<'a> for StringInput<'a> { let idx = (to - self.orig_start).0 as usize; debug_assert!(idx <= orig.len()); - let s = unsafe { orig.get_unchecked(idx..) }; - self.iter = s.chars(); + self.remaining = unsafe { orig.get_unchecked(idx..) }; self.last_pos = to; } #[inline] fn is_byte(&self, c: u8) -> bool { - self.iter - .as_str() + self.remaining .as_bytes() .first() .map(|b| *b == c) @@ -245,13 +263,13 @@ impl<'a> Input<'a> for StringInput<'a> { #[inline] fn is_str(&self, s: &str) -> bool { - self.as_str().starts_with(s) + self.remaining.starts_with(s) } #[inline] fn eat_byte(&mut self, c: u8) -> bool { if self.is_byte(c) { - self.iter.next(); + self.remaining = unsafe { self.remaining.get_unchecked(1..) }; self.last_pos = self.last_pos + BytePos(1_u32); true } else { @@ -261,9 +279,14 @@ impl<'a> Input<'a> for StringInput<'a> { } pub trait Input<'a>: Clone { - fn cur(&self) -> Option; - fn peek(&self) -> Option; - fn peek_ahead(&self) -> Option; + /// Returns the current byte. Returns [None] if at end of input. + fn cur(&self) -> Option; + + /// Returns the next byte without consuming the current byte. 
+ fn peek(&self) -> Option; + + /// Returns the byte after the next byte without consuming anything. + fn peek_ahead(&self) -> Option; /// # Safety /// @@ -271,18 +294,31 @@ pub trait Input<'a>: Clone { /// when the Input is not empty. unsafe fn bump(&mut self); - /// Returns [None] if it's end of input **or** current character is not an - /// ascii character. + /// Advances the input by `n` bytes. This is more efficient than calling + /// `bump()` when you already know the number of bytes to advance (e.g., + /// when you've just decoded a UTF-8 character and know its length). + /// + /// # Safety + /// + /// - The caller must ensure that `n` bytes are available in the input + /// - `n` must not exceed the actual number of bytes remaining + /// - For UTF-8 inputs, `n` must align with character boundaries + fn bump_bytes(&mut self, n: usize); + + /// Returns the current byte as ASCII if it's valid ASCII (0x00-0x7F). + /// Returns [None] if it's end of input or if the byte is not ASCII. #[inline] fn cur_as_ascii(&self) -> Option { - self.cur().and_then(|i| { - if i.is_ascii() { - return Some(i as u8); - } - None - }) + self.cur() + .and_then(|b| if b <= 0x7f { Some(b) } else { None }) } + /// Returns the current position as a UTF-8 char for cases where we need + /// full character processing (identifiers, strings, etc). + /// Returns [None] if at end of input or if the bytes don't form valid + /// UTF-8. + fn cur_as_char(&self) -> Option; + fn is_at_start(&self) -> bool; fn cur_pos(&self) -> BytePos; @@ -306,16 +342,12 @@ pub trait Input<'a>: Clone { /// - `to` be in the valid range of input. unsafe fn reset_to(&mut self, to: BytePos); - /// Implementors can override the method to make it faster. - /// - /// `c` must be ASCII. + /// Check if the current byte equals the given byte. + /// `c` should typically be an ASCII byte for performance. 
#[inline] #[allow(clippy::wrong_self_convention)] fn is_byte(&self, c: u8) -> bool { - match self.cur() { - Some(ch) => ch == c as char, - _ => false, - } + self.cur() == Some(c) } /// Implementors can override the method to make it faster. @@ -360,12 +392,12 @@ mod tests { with_test_sess("foo/d", |mut i| { assert_eq!(unsafe { i.slice(BytePos(1), BytePos(2)) }, "f"); assert_eq!(i.last_pos, BytePos(2)); - assert_eq!(i.cur(), Some('o')); + assert_eq!(i.cur(), Some(b'o')); assert_eq!(unsafe { i.slice(BytePos(2), BytePos(4)) }, "oo"); assert_eq!(unsafe { i.slice(BytePos(1), BytePos(4)) }, "foo"); assert_eq!(i.last_pos, BytePos(4)); - assert_eq!(i.cur(), Some('/')); + assert_eq!(i.cur(), Some(b'/')); }); } @@ -374,10 +406,10 @@ mod tests { with_test_sess("load", |mut i| { assert_eq!(unsafe { i.slice(BytePos(1), BytePos(3)) }, "lo"); assert_eq!(i.last_pos, BytePos(3)); - assert_eq!(i.cur(), Some('a')); + assert_eq!(i.cur(), Some(b'a')); unsafe { i.reset_to(BytePos(1)) }; - assert_eq!(i.cur(), Some('l')); + assert_eq!(i.cur(), Some(b'l')); assert_eq!(i.last_pos, BytePos(1)); }); } @@ -391,13 +423,13 @@ mod tests { // assert_eq!(i.cur_pos(), BytePos(4)); assert_eq!(i.last_pos, BytePos(4)); - assert_eq!(i.cur(), Some('/')); + assert_eq!(i.cur(), Some(b'/')); unsafe { i.bump(); } assert_eq!(i.last_pos, BytePos(5)); - assert_eq!(i.cur(), Some('d')); + assert_eq!(i.cur(), Some(b'd')); unsafe { i.bump(); diff --git a/crates/swc_css_parser/src/lexer/mod.rs b/crates/swc_css_parser/src/lexer/mod.rs index 904633584aab..2d00990a5ebf 100644 --- a/crates/swc_css_parser/src/lexer/mod.rs +++ b/crates/swc_css_parser/src/lexer/mod.rs @@ -26,7 +26,7 @@ where comments: Option<&'a dyn Comments>, pending_leading_comments: Vec, input: I, - cur: Option, + cur: Option, cur_pos: BytePos, start_pos: BytePos, /// Used to override last_pos @@ -172,7 +172,11 @@ where loop { self.read_comments(); - if self.input.uncons_while(is_whitespace).is_empty() { + if self + .input + .uncons_while(|c| 
is_whitespace(c as u8)) + .is_empty() + { break; } } @@ -190,36 +194,42 @@ where I: Input<'a>, { #[inline(always)] - fn cur(&mut self) -> Option { + fn cur(&mut self) -> Option { self.cur } #[inline(always)] - fn next(&mut self) -> Option { + fn next(&mut self) -> Option { self.input.cur() } #[inline(always)] - fn next_next(&mut self) -> Option { + fn next_next(&mut self) -> Option { self.input.peek() } #[inline(always)] - fn next_next_next(&mut self) -> Option { + fn next_next_next(&mut self) -> Option { self.input.peek_ahead() } #[inline(always)] - fn consume(&mut self) -> Option { + fn consume(&mut self) -> Option { let cur = self.input.cur(); self.cur = cur; self.cur_pos = self.input.last_pos(); - if cur.is_some() { - unsafe { - // Safety: cur is Some - self.input.bump(); + if let Some(c) = cur { + // Optimize: use bump_bytes directly for ASCII (fast path) + if c < 0x80 { + self.input.bump_bytes(1); + } else { + // Non-ASCII: use bump() which calculates UTF-8 length + unsafe { + // Safety: cur is Some + self.input.bump(); + } } } @@ -258,7 +268,7 @@ where // whitespace // Consume as much whitespace as possible. Return a . Some(c) if is_whitespace(c) => self.with_buf(|l, buf| { - buf.push(c); + buf.push(c as char); loop { let c = l.next(); @@ -267,7 +277,7 @@ where Some(c) if is_whitespace(c) => { l.consume(); - buf.push(c); + buf.push(c as char); } _ => { break; @@ -281,9 +291,9 @@ where }), // U+0022 QUOTATION MARK (") // Consume a string token and return it. - Some('"') => self.read_str(None), + Some(b'"') => self.read_str(None), // U+0023 NUMBER SIGN (#) - Some('#') => { + Some(b'#') => { let first = self.next(); let second = self.next_next(); @@ -311,19 +321,21 @@ where }); } - Ok(Token::Delim { value: '#' }) + Ok(Token::Delim { + value: b'#' as char, + }) } // U+0027 APOSTROPHE (') // Consume a string token and return it. - Some('\'') => self.read_str(None), + Some(b'\'') => self.read_str(None), // U+0028 LEFT PARENTHESIS (() // Return a <(-token>. 
- Some('(') => Ok(tok!("(")), + Some(b'(') => Ok(tok!("(")), // U+0029 RIGHT PARENTHESIS ()) // Return a <)-token>. - Some(')') => Ok(tok!(")")), + Some(b')') => Ok(tok!(")")), // U+002B PLUS SIGN (+) - Some('+') => { + Some(b'+') => { // If the input stream starts with a number, reconsume the current input code // point, consume a numeric token and return it. if self.would_start_number(None, None, None) { @@ -338,9 +350,9 @@ where } // U+002C COMMA (,) // Return a . - Some(',') => Ok(tok!(",")), + Some(b',') => Ok(tok!(",")), // U+002D HYPHEN-MINUS (-) - Some('-') => { + Some(b'-') => { // If the input stream starts with a number, reconsume the current input code // point, consume a numeric token, and return it. if self.would_start_number(None, None, None) { @@ -350,7 +362,7 @@ where } // Otherwise, if the next 2 input code points are U+002D HYPHEN-MINUS U+003E // GREATER-THAN SIGN (->), consume them and return a . - else if self.next() == Some('-') && self.next_next() == Some('>') { + else if self.next() == Some(b'-') && self.next_next() == Some(b'>') { self.consume(); self.consume(); @@ -369,7 +381,7 @@ where Ok(tok!("-")) } // U+002E FULL STOP (.) - Some('.') => { + Some(b'.') => { // If the input stream starts with a number, reconsume the current input code // point, consume a numeric token, and return it. if self.would_start_number(None, None, None) { @@ -384,18 +396,18 @@ where } // U+003A COLON (:) // Return a . - Some(':') => Ok(tok!(":")), + Some(b':') => Ok(tok!(":")), // U+003B SEMICOLON (;) // Return a . - Some(';') => Ok(tok!(";")), + Some(b';') => Ok(tok!(";")), // U+003C LESS-THAN SIGN (<) - Some('<') => { + Some(b'<') => { // If the next 3 input code points are U+0021 EXCLAMATION MARK U+002D // HYPHEN-MINUS U+002D HYPHEN-MINUS (!--), consume them and return a // . 
- if self.next() == Some('!') - && self.next_next() == Some('-') - && self.next_next_next() == Some('-') + if self.next() == Some(b'!') + && self.next_next() == Some(b'-') + && self.next_next_next() == Some(b'-') { self.consume(); // ! self.consume(); // - @@ -409,7 +421,7 @@ where Ok(tok!("<")) } // U+0040 COMMERCIAL AT (@) - Some('@') => { + Some(b'@') => { let first = self.next(); let second = self.next_next(); let third = self.next_next_next(); @@ -428,13 +440,15 @@ where // Otherwise, return a with its value set to the current input // code point. - Ok(Token::Delim { value: '@' }) + Ok(Token::Delim { + value: b'@' as char, + }) } // U+005B LEFT SQUARE BRACKET ([) // Return a <[-token>. - Some('[') => Ok(tok!("[")), + Some(b'[') => Ok(tok!("[")), // U+005C REVERSE SOLIDUS (\) - Some('\\') => { + Some(b'\\') => { // If the input stream starts with a valid escape, reconsume the current input // code point, consume an ident-like token, and return it. if self.is_valid_escape(None, None) { @@ -447,20 +461,22 @@ where // to the current input code point. self.emit_error(ErrorKind::InvalidEscape); - Ok(Token::Delim { value: '\\' }) + Ok(Token::Delim { + value: b'\\' as char, + }) } // U+005D RIGHT SQUARE BRACKET (]) // Return a <]-token>. - Some(']') => Ok(tok!("]")), + Some(b']') => Ok(tok!("]")), // U+007B LEFT CURLY BRACKET ({) // Return a <{-token>. - Some('{') => Ok(tok!("{")), + Some(b'{') => Ok(tok!("{")), // U+007D RIGHT CURLY BRACKET (}) // Return a <}-token>. - Some('}') => Ok(tok!("}")), + Some(b'}') => Ok(tok!("}")), // digit // Reconsume the current input code point, consume a numeric token, and return it. - Some('0'..='9') => { + Some(b'0'..=b'9') => { self.reconsume(); self.read_numeric() @@ -477,7 +493,7 @@ where None => Err(ErrorKind::Eof), // anything else // Return a with its value set to the current input code point. 
- Some(c) => Ok(Token::Delim { value: c }), + Some(c) => Ok(Token::Delim { value: c as char }), } } @@ -490,16 +506,16 @@ where // the first U+002A ASTERISK (*) followed by a U+002F SOLIDUS (/), or up to an // EOF code point. Return to the start of this step. // NOTE: We allow to parse line comments under the option. - if self.next() == Some('/') && self.next_next() == Some('*') { + if self.next() == Some(b'/') && self.next_next() == Some(b'*') { let cmt_start = self.input.last_pos(); - while self.next() == Some('/') && self.next_next() == Some('*') { + while self.next() == Some(b'/') && self.next_next() == Some(b'*') { self.consume(); // '*' self.consume(); // '/' loop { match self.consume() { - Some('*') if self.next() == Some('/') => { + Some(b'*') if self.next() == Some(b'/') => { self.consume(); // '/' if self.comments.is_some() { @@ -532,10 +548,10 @@ where } } } else if self.config.allow_wrong_line_comments - && self.next() == Some('/') - && self.next_next() == Some('/') + && self.next() == Some(b'/') + && self.next_next() == Some(b'/') { - while self.next() == Some('/') && self.next_next() == Some('/') { + while self.next() == Some(b'/') && self.next_next() == Some(b'/') { self.consume(); // '/' self.consume(); // '/' @@ -600,7 +616,7 @@ where } // Otherwise, if the next input code point is U+0025 PERCENTAGE SIGN (%), consume it. Create // a with the same value as number, and return it. - else if next_first == Some('%') { + else if next_first == Some(b'%') { self.consume(); return Ok(Token::Percentage { @@ -625,9 +641,9 @@ where // Consume a name, and let string be the result. let ident_sequence = self.read_ident_sequence()?; - // If string’s value is an ASCII case-insensitive match for "url", and the next + // If string's value is an ASCII case-insensitive match for "url", and the next // input code point is U+0028 LEFT PARENTHESIS ((), consume it. 
- if matches_eq_ignore_ascii_case!(ident_sequence.0, "url") && self.next() == Some('(') { + if matches_eq_ignore_ascii_case!(ident_sequence.0, "url") && self.next() == Some(b'(') { self.consume(); let start_whitespace = self.input.last_pos(); @@ -639,7 +655,7 @@ where if is_whitespace(next) && is_whitespace(next_next) { l.consume(); - buf.push(next); + buf.push(next as char); } else { break; } @@ -655,7 +671,7 @@ where // return it. Some(c) if is_whitespace(c) - && (self.next_next() == Some('"') || self.next_next() == Some('\'')) => + && (self.next_next() == Some(b'"') || self.next_next() == Some(b'\'')) => { // Override last position because we consumed whitespaces, but they // should not be part of token @@ -666,7 +682,7 @@ where raw: ident_sequence.1, }); } - Some('"' | '\'') => { + Some(b'"' | b'\'') => { return Ok(Token::Function { value: ident_sequence.0, raw: ident_sequence.1, @@ -680,7 +696,7 @@ where } // Otherwise, if the next input code point is U+0028 LEFT PARENTHESIS ((), consume it. // Create a with its value set to string and return it. - else if self.next() == Some('(') { + else if self.next() == Some(b'(') { self.consume(); return Ok(Token::Function { @@ -699,7 +715,7 @@ where // This section describes how to consume a string token from a stream of code // points. It returns either a or . - fn read_str(&mut self, maybe_ending_code_point: Option) -> LexResult { + fn read_str(&mut self, maybe_ending_code_point: Option) -> LexResult { self.with_buf_and_raw_buf(|l, buf, raw| { // This algorithm may be called with an ending code point, which denotes the // code point that ends the string. If an ending code point is not specified, @@ -709,15 +725,27 @@ where // Initially create a with its value set to the empty string. 
// Done above - raw.push(ending_code_point.unwrap()); + raw.push(ending_code_point.unwrap() as char); // Repeatedly consume the next input code point from the stream: loop { + // Get the full character before consuming (for non-ASCII) + let cur_byte = l.input.cur(); + let cur_char = if let Some(b) = cur_byte { + if is_non_ascii(b) { + l.input.cur_as_char() + } else { + Some(b as char) + } + } else { + None + }; + match l.consume() { // ending code point // Return the . Some(c) if c == ending_code_point.unwrap() => { - raw.push(c); + raw.push(c as char); break; } @@ -746,7 +774,7 @@ where } // U+005C REVERSE SOLIDUS (\) - Some(c) if c == '\\' => { + Some(c) if c == b'\\' => { let next = l.next(); // If the next input code point is EOF, do nothing. @@ -757,26 +785,28 @@ where else if l.next().is_some() && is_newline(l.next().unwrap()) { l.consume(); - raw.push(c); - raw.push(next.unwrap()); + raw.push(c as char); + raw.push(next.unwrap() as char); } // Otherwise, (the stream starts with a valid escape) consume an escaped // code point and append the returned code point to - // the ’s value. + // the 's value. else if l.is_valid_escape(None, None) { let escape = l.read_escape()?; buf.push(escape.0); - raw.push(c); + raw.push(c as char); raw.push_str(&escape.1); } } // Anything else - // Append the current input code point to the ’s value. - Some(c) => { - buf.push(c); - raw.push(c); + // Append the current input code point to the 's value. + Some(_) => { + if let Some(ch) = cur_char { + buf.push(ch); + raw.push(ch); + } } } } @@ -798,9 +828,15 @@ where // Consume as much whitespace as possible. 
while let Some(c) = l.next() { if is_whitespace(c) { + // Get char before consuming + let ch = if is_non_ascii(c) { + l.input.cur_as_char().unwrap_or(c as char) + } else { + c as char + }; l.consume(); - raw.push(c); + raw.push(ch); } else { break; } @@ -808,10 +844,22 @@ where // Repeatedly consume the next input code point from the stream: loop { + // Get the full character before consuming (for non-ASCII) + let cur_byte = l.input.cur(); + let cur_char = if let Some(b) = cur_byte { + if is_non_ascii(b) { + l.input.cur_as_char() + } else { + Some(b as char) + } + } else { + None + }; + match l.consume() { // U+0029 RIGHT PARENTHESIS ()) // Return the . - Some(')') => { + Some(b')') => { return Ok(Token::Url { value: l.atoms.atom(&**out), raw: Box::new(UrlKeyValue(name.1, l.atoms.atom(&**raw))), @@ -833,13 +881,21 @@ where Some(c) if is_whitespace(c) => { // Consume as much whitespace as possible. let whitespaces: String = l.with_sub_buf(|l, buf| { - buf.push(c); + if let Some(ch) = cur_char { + buf.push(ch); + } while let Some(c) = l.next() { if is_whitespace(c) { + // Get char before consuming + let ch = if is_non_ascii(c) { + l.input.cur_as_char().unwrap_or(c as char) + } else { + c as char + }; l.consume(); - buf.push(c); + buf.push(ch); } else { break; } @@ -852,7 +908,7 @@ where // consume it and return the (if EOF was // encountered, this is a parse error); match l.next() { - Some(')') => { + Some(b')') => { l.consume(); raw.push_str(&whitespaces); @@ -894,12 +950,12 @@ where // non-printable code point // This is a parse error. Consume the remnants of a bad url, create a // , and return it. 
- Some(c) if c == '"' || c == '\'' || c == '(' || is_non_printable(c) => { + Some(c) if c == b'"' || c == b'\'' || c == b'(' || is_non_printable(c) => { l.emit_error(ErrorKind::UnexpectedCharInUrl); let remnants = l.read_bad_url_remnants()?; - raw.push(c); + raw.push(c as char); raw.push_str(&remnants); return Ok(Token::BadUrl { @@ -908,15 +964,15 @@ where } // U+005C REVERSE SOLIDUS (\) - Some(c) if c == '\\' => { + Some(c) if c == b'\\' => { // If the stream starts with a valid escape, consume an escaped code point // and append the returned code point to the - // ’s value. + // 's value. if l.is_valid_escape(None, None) { let escaped = l.read_escape()?; out.push(escaped.0); - raw.push(c); + raw.push(c as char); raw.push_str(&escaped.1); } // Otherwise, this is a parse error. Consume the remnants of a bad url, @@ -926,7 +982,7 @@ where let remnants = l.read_bad_url_remnants()?; - raw.push(c); + raw.push(c as char); raw.push_str(&remnants); return Ok(Token::BadUrl { @@ -936,10 +992,12 @@ where } // anything else - // Append the current input code point to the ’s value. - Some(c) => { - out.push(c); - raw.push(c); + // Append the current input code point to the 's value. + Some(_) => { + if let Some(ch) = cur_char { + out.push(ch); + raw.push(ch); + } } } } @@ -953,26 +1011,38 @@ where // will return a code point. fn read_escape(&mut self) -> LexResult<(char, String)> { self.with_sub_buf(|l, buf| { + // Get the full character before consuming (for non-ASCII) + let cur_byte = l.input.cur(); + let cur_char = if let Some(b) = cur_byte { + if is_non_ascii(b) { + l.input.cur_as_char() + } else { + Some(b as char) + } + } else { + None + }; + // Consume the next input code point. match l.consume() { // hex digit Some(c) if is_hex_digit(c) => { - let mut hex = c.to_digit(16).unwrap(); + let mut hex = (c as char).to_digit(16).unwrap(); - buf.push(c); + buf.push(c as char); // Consume as many hex digits as possible, but no more than 5. 
// Note that this means 1-6 hex digits have been consumed in total. for _ in 0..5 { let next = l.next(); - let digit = match next.and_then(|c| c.to_digit(16)) { + let digit = match next.and_then(|c| (c as char).to_digit(16)) { Some(v) => v, None => break, }; l.consume(); - buf.push(next.unwrap()); + buf.push(next.unwrap() as char); hex = hex * 16 + digit; } @@ -983,7 +1053,7 @@ where if is_whitespace(next) { l.consume(); - buf.push(next); + buf.push(next as char); } } @@ -1017,9 +1087,10 @@ where // anything else // Return the current input code point. Some(c) => { - buf.push(c); + let ch = cur_char.unwrap_or(c as char); + buf.push(ch); - Ok((c, (&**buf).into())) + Ok((ch, (&**buf).into())) } } }) @@ -1031,9 +1102,9 @@ where // or can be called with the input stream itself. In the latter case, the two // code points in question are the current input code point and the next input // code point, in that order. - fn is_valid_escape(&mut self, maybe_first: Option, maybe_second: Option) -> bool { + fn is_valid_escape(&mut self, maybe_first: Option, maybe_second: Option) -> bool { // If the first code point is not U+005C REVERSE SOLIDUS (\), return false. - if maybe_first.or_else(|| self.cur()) != Some('\\') { + if maybe_first.or_else(|| self.cur()) != Some(b'\\') { return false; } @@ -1053,16 +1124,16 @@ where // the next two input code points, in that order. fn would_start_ident( &mut self, - maybe_first: Option, - maybe_second: Option, - maybe_third: Option, + maybe_first: Option, + maybe_second: Option, + maybe_third: Option, ) -> bool { // Look at the first code point: let first = maybe_first.or_else(|| self.cur()); match first { // U+002D HYPHEN-MINUS - Some('-') => { + Some(b'-') => { let second = maybe_second.or_else(|| self.next()); match second { @@ -1071,7 +1142,7 @@ where Some(c) if is_name_start(c) => true, // or a U+002D HYPHEN-MINUS, // return true. 
- Some('-') => true, + Some(b'-') => true, // or the second and third code points are a valid escape // return true. Some(_) => { @@ -1089,7 +1160,7 @@ where // U+005C REVERSE SOLIDUS (\) // If the first and second code points are a valid escape, return true. Otherwise, // return false. - Some('\\') => { + Some(b'\\') => { let second = maybe_second.or_else(|| self.next()); self.is_valid_escape(first, second) @@ -1107,9 +1178,9 @@ where #[allow(clippy::needless_return)] fn would_start_number( &mut self, - maybe_first: Option, - maybe_second: Option, - maybe_third: Option, + maybe_first: Option, + maybe_second: Option, + maybe_third: Option, ) -> bool { // Look at the first code point: let first = maybe_first.or_else(|| self.cur()); @@ -1117,13 +1188,13 @@ where match first { // U+002B PLUS SIGN (+) // U+002D HYPHEN-MINUS (-) - Some('+') | Some('-') => { + Some(b'+') | Some(b'-') => { match maybe_second.or_else(|| self.next()) { // If the second code point is a digit, return true. Some(second) if second.is_ascii_digit() => return true, // Otherwise, if the second code point is a U+002E FULL STOP (.) and the // third code point is a digit, return true. - Some('.') => { + Some(b'.') => { if let Some(third) = maybe_third.or_else(|| self.next_next()) { if third.is_ascii_digit() { return true; @@ -1137,7 +1208,7 @@ where }; } // U+002E FULL STOP (.) - Some('.') => { + Some(b'.') => { // If the second code point is a digit, return true. if let Some(second) = self.next() { if second.is_ascii_digit() { @@ -1168,20 +1239,37 @@ where // Repeatedly consume the next input code point from the stream: loop { - match l.consume() { + // For non-ASCII bytes, we need to get the full UTF-8 character before consuming + let cur_byte = l.input.cur(); + let cur_char = if let Some(b) = cur_byte { + if is_non_ascii(b) { + l.input.cur_as_char() + } else { + Some(b as char) + } + } else { + None + }; + + let c = l.consume(); + + match c { // name code point // Append the code point to result. 
- Some(c) if is_name(c) => { - buf.push(c); - raw.push(c); + Some(byte) if is_name(byte) => { + // Use the full character we got earlier + if let Some(ch) = cur_char { + buf.push(ch); + raw.push(ch); + } } // the stream starts with a valid escape // Consume an escaped code point. Append the returned code point to result. - Some(c) if l.is_valid_escape(None, None) => { + Some(byte) if l.is_valid_escape(None, None) => { let escaped = l.read_escape()?; buf.push(escaped.0); - raw.push(c); + raw.push(byte as char); raw.push_str(&escaped.1); } // anything else @@ -1209,10 +1297,10 @@ where // (-), consume it and append it to repr. let next = l.next(); - if next == Some('+') || next == Some('-') { + if next == Some(b'+') || next == Some(b'-') { l.consume(); - out.push(next.unwrap()); + out.push(next.unwrap() as char); } // While the next input code point is a digit, consume it and append it to repr. @@ -1220,7 +1308,7 @@ where if c.is_ascii_digit() { l.consume(); - out.push(c); + out.push(c as char); } else { break; } @@ -1230,7 +1318,7 @@ where // then: let next = l.next(); - if next == Some('.') { + if next == Some(b'.') { if let Some(n) = l.next_next() { if n.is_ascii_digit() { // Consume them. @@ -1238,8 +1326,8 @@ where l.consume(); // Append them to repr. - out.push(next.unwrap()); - out.push(n); + out.push(next.unwrap() as char); + out.push(n as char); // Set type to "number". 
type_flag = NumberType::Number; @@ -1250,7 +1338,7 @@ where if c.is_ascii_digit() { l.consume(); - out.push(c); + out.push(c as char); } else { break; } @@ -1264,12 +1352,12 @@ where // (-) or U+002B PLUS SIGN (+), followed by a digit, then: let next = l.next(); - if next == Some('E') || next == Some('e') { + if next == Some(b'E') || next == Some(b'e') { let next_next = l.next_next(); let next_next_next = l.next_next_next(); - if (next_next == Some('-') - || next_next == Some('+') + if (next_next == Some(b'-') + || next_next == Some(b'+') && next_next_next.is_some() && next_next_next.unwrap().is_ascii_digit()) || next_next.is_some() && next_next.unwrap().is_ascii_digit() @@ -1279,8 +1367,8 @@ where l.consume(); // Append them to repr. - out.push(next.unwrap()); - out.push(next_next.unwrap()); + out.push(next.unwrap() as char); + out.push(next_next.unwrap() as char); // Set type to "number". type_flag = NumberType::Number; @@ -1291,7 +1379,7 @@ where if c.is_ascii_digit() { l.consume(); - out.push(c); + out.push(c as char); } else { break; } @@ -1322,12 +1410,24 @@ where self.with_sub_buf(|l, raw| { // Repeatedly consume the next input code point from the stream: loop { + // Get the full character before consuming (for non-ASCII) + let cur_byte = l.input.cur(); + let cur_char = if let Some(b) = cur_byte { + if is_non_ascii(b) { + l.input.cur_as_char() + } else { + Some(b as char) + } + } else { + None + }; + match l.consume() { // U+0029 RIGHT PARENTHESIS ()) // EOF // Return. - Some(c @ ')') => { - raw.push(c); + Some(c @ b')') => { + raw.push(c as char); break; } @@ -1340,13 +1440,15 @@ where // ("\)") to be encountered without ending the . let escaped = l.read_escape()?; - raw.push(c); + raw.push(c as char); raw.push_str(&escaped.1); } // anything else // Do nothing. 
- Some(c) => { - raw.push(c); + Some(_) => { + if let Some(ch) = cur_char { + raw.push(ch); + } } } } @@ -1357,61 +1459,61 @@ where } #[inline(always)] -fn is_digit(c: char) -> bool { +fn is_digit(c: u8) -> bool { c.is_ascii_digit() } #[inline(always)] -fn is_hex_digit(c: char) -> bool { +fn is_hex_digit(c: u8) -> bool { match c { c if is_digit(c) => true, - 'A'..='F' => true, - 'a'..='f' => true, + b'A'..=b'F' => true, + b'a'..=b'f' => true, _ => false, } } #[inline(always)] -fn is_uppercase_letter(c: char) -> bool { +fn is_uppercase_letter(c: u8) -> bool { c.is_ascii_uppercase() } #[inline(always)] -fn is_lowercase_letter(c: char) -> bool { +fn is_lowercase_letter(c: u8) -> bool { c.is_ascii_lowercase() } #[inline(always)] -fn is_letter(c: char) -> bool { +fn is_letter(c: u8) -> bool { is_uppercase_letter(c) || is_lowercase_letter(c) } #[inline(always)] -fn is_non_ascii(c: char) -> bool { - c as u32 >= 0x80 +fn is_non_ascii(c: u8) -> bool { + c >= 0x80 } #[inline(always)] -fn is_name_start(c: char) -> bool { - matches!(c, c if is_letter(c) || is_non_ascii(c) || c == '_' || c == '\x00') +fn is_name_start(c: u8) -> bool { + matches!(c, c if is_letter(c) || is_non_ascii(c) || c == b'_' || c == 0x00) } #[inline(always)] -fn is_name(c: char) -> bool { - is_name_start(c) || matches!(c, c if c.is_ascii_digit() || c == '-') +fn is_name(c: u8) -> bool { + is_name_start(c) || matches!(c, c if c.is_ascii_digit() || c == b'-') } #[inline(always)] -fn is_non_printable(c: char) -> bool { - matches!(c, '\x00'..='\x08' | '\x0B' | '\x0E'..='\x1F' | '\x7F') +fn is_non_printable(c: u8) -> bool { + matches!(c, 0x00..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F) } #[inline(always)] -fn is_newline(c: char) -> bool { - matches!(c, '\n' | '\r' | '\x0C') +fn is_newline(c: u8) -> bool { + matches!(c, b'\n' | b'\r' | 0x0c) } #[inline(always)] -fn is_whitespace(c: char) -> bool { - matches!(c, c if c == ' ' || c == '\t' || is_newline(c)) +fn is_whitespace(c: u8) -> bool { + matches!(c, c if c == b' ' 
|| c == b'\t' || is_newline(c)) } diff --git a/crates/swc_ecma_lexer/src/common/lexer/char.rs b/crates/swc_ecma_lexer/src/common/lexer/char.rs index 705a3fd05f70..62f4e4d08a49 100644 --- a/crates/swc_ecma_lexer/src/common/lexer/char.rs +++ b/crates/swc_ecma_lexer/src/common/lexer/char.rs @@ -1,8 +1,8 @@ -/// Implemented for `char`. +/// Implemented for `u8` - operates on bytes for performance. pub trait CharExt: Copy { fn to_char(self) -> Option; - /// Test whether a given character code starts an identifier. + /// Test whether a given byte/character starts an identifier. /// /// https://tc39.github.io/ecma262/#prod-IdentifierStart #[inline] @@ -14,7 +14,7 @@ pub trait CharExt: Copy { swc_ecma_ast::Ident::is_valid_start(c) } - /// Test whether a given character is part of an identifier. + /// Test whether a given byte/character is part of an identifier. #[inline] fn is_ident_part(self) -> bool { let c = match self.to_char() { @@ -65,6 +65,20 @@ pub trait CharExt: Copy { } } +impl CharExt for u8 { + #[inline(always)] + fn to_char(self) -> Option { + // For ASCII bytes, this is a fast path + if self <= 0x7f { + Some(self as char) + } else { + // For non-ASCII bytes, we can't convert a single byte to a char + // The caller should use cur_as_char() on the Input trait instead + None + } + } +} + impl CharExt for char { #[inline(always)] fn to_char(self) -> Option { diff --git a/crates/swc_ecma_lexer/src/common/lexer/mod.rs b/crates/swc_ecma_lexer/src/common/lexer/mod.rs index 852877860f3f..3bc438454779 100644 --- a/crates/swc_ecma_lexer/src/common/lexer/mod.rs +++ b/crates/swc_ecma_lexer/src/common/lexer/mod.rs @@ -175,20 +175,25 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { } #[inline(always)] - fn cur(&self) -> Option { + fn cur(&self) -> Option { self.input().cur() } #[inline(always)] - fn peek(&self) -> Option { + fn peek(&self) -> Option { self.input().peek() } #[inline(always)] - fn peek_ahead(&self) -> Option { + fn peek_ahead(&self) -> Option { 
self.input().peek_ahead() } + #[inline(always)] + fn cur_as_char(&self) -> Option { + self.input().cur_as_char() + } + #[inline(always)] fn cur_pos(&self) -> BytePos { self.input().cur_pos() @@ -364,8 +369,8 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { fn skip_block_comment(&mut self) { let start = self.cur_pos(); - debug_assert_eq!(self.cur(), Some('/')); - debug_assert_eq!(self.peek(), Some('*')); + debug_assert_eq!(self.cur(), Some(b'/')); + debug_assert_eq!(self.peek(), Some(b'*')); // Consume initial "/*" self.input_mut().bump_bytes(2); @@ -417,7 +422,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { match matched_byte { b'*' => { - if self.peek() == Some('/') { + if self.peek() == Some(b'/') { // Consume "*/" self.input_mut().bump_bytes(2); @@ -476,13 +481,13 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { b'\r' => { should_mark_had_line_break = true; self.bump(); - if self.peek() == Some('\n') { + if self.peek() == Some(b'\n') { self.bump(); } } _ => { // Unicode line terminator (LS/PS) or other character - if let Some('\u{2028}' | '\u{2029}') = self.cur() { + if let Some('\u{2028}' | '\u{2029}') = self.cur_as_char() { should_mark_had_line_break = true; } self.bump(); @@ -516,10 +521,10 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { if LEX_COMMENTS && self.input().is_byte(b'/') { if let Some(c) = self.peek() { - if c == '/' { + if c == b'/' { self.skip_line_comment(2); continue; - } else if c == '*' { + } else if c == b'*' { self.skip_block_comment(); continue; } @@ -575,24 +580,24 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { let mut prev = None; while let Some(c) = self.cur() { - if c == '_' { + if c == b'_' { *has_underscore = true; if allow_num_separator { - let is_allowed = |c: Option| { + let is_allowed = |c: Option| { let Some(c) = c else { return false; }; - c.is_digit(RADIX as _) + (c as char).is_digit(RADIX as _) }; - let is_forbidden = |c: Option| { + let is_forbidden = |c: Option| { let Some(c) = c else { 
return false; }; if RADIX == 16 { - matches!(c, '.' | 'X' | '_' | 'x') + matches!(c, b'.' | b'X' | b'_' | b'x') } else { - matches!(c, '.' | 'B' | 'E' | 'O' | '_' | 'b' | 'e' | 'o') + matches!(c, b'.' | b'B' | b'E' | b'O' | b'_' | b'b' | b'e' | b'o') } }; @@ -616,7 +621,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { } // e.g. (val for a) = 10 where radix = 16 - let val = if let Some(val) = c.to_digit(RADIX as _) { + let val = if let Some(val) = (c as char).to_digit(RADIX as _) { val } else { return Ok(total); @@ -692,7 +697,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { let lazy_integer = if START_WITH_DOT { // first char is '.' debug_assert!( - self.cur().is_some_and(|c| c == '.'), + self.cur().is_some_and(|c| c == b'.'), "read_number expects current char to be '.'" ); LazyInteger { @@ -703,7 +708,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { } } else { debug_assert!(!START_WITH_DOT); - debug_assert!(!START_WITH_ZERO || self.cur().unwrap() == '0'); + debug_assert!(!START_WITH_ZERO || self.cur().unwrap() == b'0'); // Use read_number_no_dot to support long numbers. let lazy_integer = self.read_number_no_dot_as_str::<10>()?; @@ -771,7 +776,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { has_underscore |= lazy_integer.has_underscore; // At this point, number cannot be an octal literal. - let has_dot = self.cur() == Some('.'); + let has_dot = self.cur() == Some(b'.'); // `0.a`, `08.a`, `102.a` are invalid. 
// // `.1.a`, `.1e-4.a` are valid, @@ -785,7 +790,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { self.read_digits::<_, (), 10>(|_, _, _| Ok(((), true)), true, &mut has_underscore)?; } - let has_e = self.cur().is_some_and(|c| c == 'e' || c == 'E'); + let has_e = self.cur().is_some_and(|c| c == b'e' || c == b'E'); // Handle 'e' and 'E' // // .5e1 = 5 @@ -803,7 +808,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { } }; - if next == '+' || next == '-' { + if next == b'+' || next == b'-' { self.bump(); // remove '+', '-' } @@ -875,12 +880,12 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { ); let start = self.cur_pos(); - debug_assert_eq!(self.cur(), Some('0')); + debug_assert_eq!(self.cur(), Some(b'0')); self.bump(); debug_assert!(self .cur() - .is_some_and(|c| matches!(c, 'b' | 'B' | 'o' | 'O' | 'x' | 'X'))); + .is_some_and(|c| matches!(c, b'b' | b'B' | b'o' | b'O' | b'x' | b'X'))); self.bump(); let lazy_integer = self.read_number_no_dot_as_str::()?; @@ -996,14 +1001,14 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { let mut s = SmartString::::default(); - debug_assert!(self.input().cur().is_some_and(|c| c == '&')); + debug_assert!(self.input().cur().is_some_and(|c| c == b'&')); self.bump(); let start_pos = self.input().cur_pos(); for _ in 0..10 { let c = match self.input().cur() { - Some(c) => c, + Some(c) => c as char, None => break, }; self.bump(); @@ -1041,10 +1046,10 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { fn read_jsx_new_line(&mut self, normalize_crlf: bool) -> LexResult> { debug_assert!(self.syntax().jsx()); - let ch = self.input().cur().unwrap(); + let ch = self.input().cur().unwrap() as char; self.bump(); - let out = if ch == '\r' && self.input().cur() == Some('\n') { + let out = if ch == '\r' && self.input().cur() == Some(b'\n') { self.bump(); // `\n` Either::Left(if normalize_crlf { "\n" } else { "\r\n" }) } else { @@ -1064,7 +1069,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { let mut chunk_start = 
self.input().cur_pos(); loop { let ch = match self.input().cur() { - Some(c) => c, + Some(c) => c as char, None => { self.emit_error(start, SyntaxError::UnterminatedStrLit); break; @@ -1185,8 +1190,8 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { // returned `Some`, and already exited. debug_assert!(high >= MIN_HIGH); let is_pair = high <= MAX_HIGH - && self.input().cur() == Some('\\') - && self.input().peek() == Some('u'); + && self.input().cur() == Some(b'\\') + && self.input().peek() == Some(b'u'); if !is_pair { return Ok(Some(UnicodeEscape::LoneSurrogate(high))); } @@ -1220,7 +1225,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { } fn read_unicode_escape(&mut self) -> LexResult { - debug_assert_eq!(self.cur(), Some('u')); + debug_assert_eq!(self.cur(), Some(b'u')); let mut is_curly = false; @@ -1304,7 +1309,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { #[cold] fn read_shebang(&mut self) -> LexResult> { - if self.input().cur() != Some('#') || self.input().peek() != Some('!') { + if self.input().cur() != Some(b'#') || self.input().peek() != Some(b'!') { return Ok(None); } self.bump(); // `#` @@ -1336,11 +1341,11 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { // Handle edge case for immediate template end if start == self.cur_pos() && self.state().last_was_tpl_element() { if let Some(c) = self.cur() { - if c == '$' && self.peek() == Some('{') { + if c == b'$' && self.peek() == Some(b'{') { self.bump(); // '$' self.bump(); // '{' return Ok(Self::Token::DOLLAR_LBRACE); - } else if c == '`' { + } else if c == b'`' { self.bump(); // '`' return Ok(Self::Token::BACKQUOTE); } @@ -1361,7 +1366,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { match matched_byte { b'$' => { // Check if this is ${ - if self.peek() == Some('{') { + if self.peek() == Some(b'{') { // Found template substitution let cooked = if cooked_slice_start == raw_slice_start { let last_pos = self.cur_pos(); @@ -1413,7 +1418,7 @@ pub trait Lexer<'a, TokenAndSpan>: 
Tokens + Sized { // Handle carriage return - consume \r and optionally \n, normalize to \n self.bump(); // '\r' - if self.peek() == Some('\n') { + if self.peek() == Some(b'\n') { self.bump(); // '\n' } @@ -1449,14 +1454,14 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { /// /// In template literal, we should preserve raw string. fn read_escaped_char(&mut self, in_template: bool) -> LexResult> { - debug_assert_eq!(self.cur(), Some('\\')); + debug_assert_eq!(self.cur(), Some(b'\\')); let start = self.cur_pos(); self.bump(); // '\' let c = match self.cur() { - Some(c) => c, + Some(c) => c as char, None => self.error_span(pos_span(start), SyntaxError::InvalidStrEscape)?, }; @@ -1510,7 +1515,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { let first_c = if c == '0' { match self.cur() { - Some(next) if next.is_digit(8) => c, + Some(next) if (next as char).is_digit(8) => c, // \0 is not an octal literal nor decimal literal. _ => return Ok(Some(CodePoint::from_char('\u{0000}'))), } @@ -1531,7 +1536,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { ($check:expr) => {{ let cur = self.cur(); - match cur.and_then(|c| c.to_digit(8)) { + match cur.and_then(|c| (c as char).to_digit(8)) { Some(v) => { value = if $check { let new_val = value @@ -1575,7 +1580,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { self.input_mut().reset_to(start); } - debug_assert_eq!(self.cur(), Some('/')); + debug_assert_eq!(self.cur(), Some(b'/')); let start = self.cur_pos(); @@ -1586,6 +1591,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { let (mut escaped, mut in_class) = (false, false); while let Some(c) = self.cur() { + let c = c as char; // This is ported from babel. // Seems like regexp literal cannot contain linebreak. 
if c.is_line_terminator() { @@ -1780,7 +1786,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { // ASCII but not a valid identifier break; - } else if let Some(c) = self.input().cur() { + } else if let Some(c) = self.input().cur_as_char() { if Ident::is_valid_non_ascii_continue(c) { self.bump(); continue; @@ -1813,14 +1819,14 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { /// `#` fn read_token_number_sign(&mut self) -> LexResult { - debug_assert!(self.cur().is_some_and(|c| c == '#')); + debug_assert!(self.cur().is_some_and(|c| c == b'#')); self.bump(); // '#' // `#` can also be a part of shebangs, however they should have been // handled by `read_shebang()` debug_assert!( - !self.input().is_at_start() || self.cur() != Some('!'), + !self.input().is_at_start() || self.cur() != Some(b'!'), "#! should have already been handled by read_shebang()" ); Ok(Self::Token::HASH) @@ -1831,7 +1837,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { /// This is extracted as a method to reduce size of `read_token`. #[inline(never)] fn read_token_dot(&mut self) -> LexResult { - debug_assert!(self.cur().is_some_and(|c| c == '.')); + debug_assert!(self.cur().is_some_and(|c| c == b'.')); // Check for eof let next = match self.input().peek() { Some(next) => next, @@ -1849,7 +1855,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { self.bump(); // 1st `.` - if next == '.' && self.input().peek() == Some('.') { + if next == b'.' && self.input().peek() == Some(b'.') { self.bump(); // 2nd `.` self.bump(); // 3rd `.` @@ -1864,7 +1870,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { /// This is extracted as a method to reduce size of `read_token`. 
#[inline(never)] fn read_token_question_mark(&mut self) -> LexResult { - debug_assert!(self.cur().is_some_and(|c| c == '?')); + debug_assert!(self.cur().is_some_and(|c| c == b'?')); self.bump(); if self.input_mut().eat_byte(b'?') { if self.input_mut().eat_byte(b'=') { @@ -1882,7 +1888,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { /// This is extracted as a method to reduce size of `read_token`. #[inline(never)] fn read_token_colon(&mut self) -> LexResult { - debug_assert!(self.cur().is_some_and(|c| c == ':')); + debug_assert!(self.cur().is_some_and(|c| c == b':')); self.bump(); // ':' Ok(Self::Token::COLON) } @@ -1892,13 +1898,13 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { /// This is extracted as a method to reduce size of `read_token`. #[inline(never)] fn read_token_zero(&mut self) -> LexResult { - debug_assert_eq!(self.cur(), Some('0')); + debug_assert_eq!(self.cur(), Some(b'0')); let next = self.input().peek(); let bigint = match next { - Some('x') | Some('X') => self.read_radix_number::<16>(), - Some('o') | Some('O') => self.read_radix_number::<8>(), - Some('b') | Some('B') => self.read_radix_number::<2>(), + Some(b'x') | Some(b'X') => self.read_radix_number::<16>(), + Some(b'o') | Some(b'O') => self.read_radix_number::<8>(), + Some(b'b') | Some(b'B') => self.read_radix_number::<2>(), _ => { return self.read_number::().map(|v| match v { Left((value, raw)) => Self::Token::num(value, raw, self), @@ -1944,13 +1950,13 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { } // '||', '&&' - if self.input().cur() == Some(C as char) { + if self.input().cur() == Some(C) { unsafe { // Safety: cur() is Some(c) self.input_mut().bump(); } - if self.input().cur() == Some('=') { + if self.input().cur() == Some(b'=') { unsafe { // Safety: cur() is Some('=') self.input_mut().bump(); @@ -1990,7 +1996,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { /// This is extracted as a method to reduce size of `read_token`. 
#[inline(never)] fn read_token_mul_mod(&mut self, is_mul: bool) -> LexResult { - debug_assert!(self.cur().is_some_and(|c| c == '*' || c == '%')); + debug_assert!(self.cur().is_some_and(|c| c == b'*' || c == b'%')); self.bump(); let token = if is_mul { if self.input_mut().eat_byte(b'*') { @@ -2019,7 +2025,7 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { #[inline(never)] fn read_slash(&mut self) -> LexResult { - debug_assert_eq!(self.cur(), Some('/')); + debug_assert_eq!(self.cur(), Some(b'/')); self.bump(); // '/' Ok(if self.eat(b'=') { Self::Token::DIV_EQ @@ -2047,9 +2053,9 @@ pub trait Lexer<'a, TokenAndSpan>: Tokens + Sized { /// See https://tc39.github.io/ecma262/#sec-literals-string-literals // TODO: merge `read_str_lit` and `read_jsx_str` fn read_str_lit(&mut self) -> LexResult { - debug_assert!(self.cur() == Some('\'') || self.cur() == Some('"')); + debug_assert!(self.cur() == Some(b'\'') || self.cur() == Some(b'"')); let start = self.cur_pos(); - let quote = self.cur().unwrap() as u8; + let quote = self.cur().unwrap(); self.bump(); // '"' or '\'' diff --git a/crates/swc_ecma_lexer/src/lexer/jsx.rs b/crates/swc_ecma_lexer/src/lexer/jsx.rs index 5fbc92266b80..82825c00459f 100644 --- a/crates/swc_ecma_lexer/src/lexer/jsx.rs +++ b/crates/swc_ecma_lexer/src/lexer/jsx.rs @@ -12,7 +12,7 @@ impl Lexer<'_> { loop { let cur = match self.input.cur() { - Some(c) => c, + Some(c) => c as char, None => { let start = self.state.start; self.error(start, SyntaxError::UnterminatedJSXContents)? diff --git a/crates/swc_ecma_lexer/src/lexer/mod.rs b/crates/swc_ecma_lexer/src/lexer/mod.rs index 1f245fc57a0c..49e5be16bfc5 100644 --- a/crates/swc_ecma_lexer/src/lexer/mod.rs +++ b/crates/swc_ecma_lexer/src/lexer/mod.rs @@ -176,7 +176,7 @@ impl<'a> Lexer<'a> { } // '++', '--' - Ok(if self.input.cur() == Some(C as char) { + Ok(if self.input.cur() == Some(C) { unsafe { // Safety: cur() is Some(c) self.input.bump(); @@ -273,7 +273,10 @@ impl Lexer<'_> { } // XML style comment. 
`")); } // U+0021 EXCLAMATION MARK (!) // Switch to the comment end bang state. - Some('!') => { + Some(b'!') => { self.state = State::CommentEndBang; } // U+002D HYPHEN-MINUS (-) // Append a U+002D HYPHEN-MINUS character (-) to the comment token's data. - Some(c @ '-') => { - self.append_to_comment_token(c, c); + Some(c @ b'-') => { + self.append_to_comment_token(c as char, c as char); } // EOF // This is an eof-in-comment parse error. Emit the current comment token. @@ -3275,16 +3349,16 @@ where // Append two U+002D HYPHEN-MINUS characters (-) and a U+0021 EXCLAMATION // MARK character (!) to the comment token's data. Switch to the comment end // dash state. - Some(c @ '-') => { - self.append_to_comment_token(c, c); - self.append_to_comment_token('-', '-'); + Some(c @ b'-') => { + self.append_to_comment_token(c as char, c as char); + self.append_to_comment_token(c as char, c as char); self.append_to_comment_token('!', '!'); self.state = State::CommentEndDash; } // U+003E GREATER-THAN SIGN (>) // This is an incorrectly-closed-comment parse error. Switch to the data // state. Emit the current comment token. - Some('>') => { + Some(b'>') => { self.emit_error(ErrorKind::IncorrectlyClosedComment); self.state = State::Data; self.emit_comment_token(Some(">")); @@ -3326,7 +3400,7 @@ where } // U+003E GREATER-THAN SIGN (>) // Reconsume in the before DOCTYPE name state. - Some('>') => { + Some(b'>') => { self.reconsume_in_state(State::BeforeDoctypeName); } // EOF @@ -3370,14 +3444,14 @@ where Some(c) if is_ascii_upper_alpha(c) => { self.append_raw_to_doctype_token(c); self.create_doctype_token(); - self.set_doctype_token_name(c.to_ascii_lowercase()); + self.set_doctype_token_name(c.to_ascii_lowercase() as char); self.state = State::DoctypeName; } // U+0000 NULL // This is an unexpected-null-character parse error. Create a new DOCTYPE // token. Set the token's name to a U+FFFD REPLACEMENT CHARACTER character. // Switch to the DOCTYPE name state. 
- Some(c @ '\x00') => { + Some(c @ b'\x00') => { self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::UnexpectedNullCharacter); self.create_doctype_token(); @@ -3388,7 +3462,7 @@ where // This is a missing-doctype-name parse error. Create a new DOCTYPE token. // Set its force-quirks flag to on. Switch to the data state. Emit the // current token. - Some(c @ '>') => { + Some(c @ b'>') => { self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::MissingDoctypeName); self.create_doctype_token(); @@ -3416,7 +3490,12 @@ where self.validate_input_stream_character(c); self.append_raw_to_doctype_token(c); self.create_doctype_token(); - self.set_doctype_token_name(c); + let ch = if is_non_ascii(c) { + self.current_char.unwrap_or(c as char) + } else { + c as char + }; + self.set_doctype_token_name(ch); self.state = State::DoctypeName; } } @@ -3437,7 +3516,7 @@ where } // U+003E GREATER-THAN SIGN (>) // Switch to the data state. Emit the current DOCTYPE token. - Some(c @ '>') => { + Some(c @ b'>') => { self.append_raw_to_doctype_token(c); self.finish_doctype_token_name(); self.state = State::Data; @@ -3447,15 +3526,18 @@ where // Append the lowercase version of the current input character (add 0x0020 // to the character's code point) to the current DOCTYPE token's name. Some(c) if is_ascii_upper_alpha(c) => { - self.consume_and_append_to_doctype_token_name(c, is_ascii_upper_alpha); + self.consume_and_append_to_doctype_token_name(c, is_ascii_upper_alpha_char); } // U+0000 NULL // This is an unexpected-null-character parse error. Append a U+FFFD // REPLACEMENT CHARACTER character to the current DOCTYPE token's name. 
- Some(c @ '\x00') => { + Some(c @ b'\x00') => { self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::UnexpectedNullCharacter); - self.append_to_doctype_token(Some(REPLACEMENT_CHARACTER), None, None); + + let b = self.buf.clone(); + let mut buf = b.borrow_mut(); + buf.push(REPLACEMENT_CHARACTER); } // EOF // This is an eof-in-doctype parse error. Set the current DOCTYPE token's @@ -3474,12 +3556,14 @@ where // Append the current input character to the current DOCTYPE token's name. Some(c) => { self.validate_input_stream_character(c); - self.consume_and_append_to_doctype_token_name(c, |c| { - if !is_allowed_character(c) { + self.consume_and_append_to_doctype_token_name(c, |ch| { + if !is_allowed_character(ch) { return false; } - !is_spacy(c) && !matches!(c, '>' | '\x00') && !is_ascii_upper_alpha(c) + !is_spacy_char(ch) + && !matches!(ch, '>' | '\x00') + && !is_ascii_upper_alpha_char(ch) }); } } @@ -3500,7 +3584,7 @@ where } // U+003E GREATER-THAN SIGN (>) // Switch to the data state. Emit the current DOCTYPE token. - Some(c @ '>') => { + Some(c @ b'>') => { self.append_raw_to_doctype_token(c); self.state = State::Data; self.emit_doctype_token(); @@ -3534,12 +3618,12 @@ where let b = self.buf.clone(); let mut buf = b.borrow_mut(); - buf.push(c); + buf.push(c as char); for _ in 0..5 { match self.consume_next_char() { Some(c) => { - buf.push(c); + buf.push(c as char); } _ => { break; @@ -3602,7 +3686,7 @@ where // Set the current DOCTYPE token's public identifier to the empty string // (not missing), then switch to the DOCTYPE public identifier // (double-quoted) state. - Some(c @ '"') => { + Some(c @ b'"') => { self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::MissingWhitespaceAfterDoctypePublicKeyword); self.set_doctype_token_public_id(); @@ -3613,7 +3697,7 @@ where // Set the current DOCTYPE token's public identifier to the empty string // (not missing), then switch to the DOCTYPE public identifier // (single-quoted) state. 
- Some(c @ '\'') => { + Some(c @ b'\'') => { self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::MissingWhitespaceAfterDoctypePublicKeyword); self.set_doctype_token_public_id(); @@ -3623,7 +3707,7 @@ where // This is a missing-doctype-public-identifier parse error. Set the current // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit // the current DOCTYPE token. - Some(c @ '>') => { + Some(c @ b'>') => { self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::MissingDoctypePublicIdentifier); self.set_doctype_token_force_quirks(); @@ -3669,7 +3753,7 @@ where // Set the current DOCTYPE token's public identifier to the empty string // (not missing), then switch to the DOCTYPE public identifier // (double-quoted) state. - Some(c @ '"') => { + Some(c @ b'"') => { self.append_raw_to_doctype_token(c); self.set_doctype_token_public_id(); self.state = State::DoctypePublicIdentifierDoubleQuoted; @@ -3678,7 +3762,7 @@ where // Set the current DOCTYPE token's public identifier to the empty string // (not missing), then switch to the DOCTYPE public identifier // (single-quoted) state. - Some(c @ '\'') => { + Some(c @ b'\'') => { self.append_raw_to_doctype_token(c); self.set_doctype_token_public_id(); self.state = State::DoctypePublicIdentifierSingleQuoted; @@ -3687,7 +3771,7 @@ where // This is a missing-doctype-public-identifier parse error. Set the current // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit // the current DOCTYPE token. - Some(c @ '>') => { + Some(c @ b'>') => { self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::MissingDoctypePublicIdentifier); self.set_doctype_token_force_quirks(); @@ -3723,7 +3807,7 @@ where match self.consume_next_char() { // U+0022 QUOTATION MARK (") // Switch to the after DOCTYPE public identifier state. 
- Some(c @ '"') => { + Some(c @ b'"') => { self.append_raw_to_doctype_token(c); self.finish_doctype_token_public_id(); self.state = State::AfterDoctypePublicIdentifier; @@ -3732,16 +3816,18 @@ where // This is an unexpected-null-character parse error. Append a U+FFFD // REPLACEMENT CHARACTER character to the current DOCTYPE token's public // identifier. - Some(c @ '\x00') => { + Some(c @ b'\x00') => { self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::UnexpectedNullCharacter); - self.append_to_doctype_token(None, Some(REPLACEMENT_CHARACTER), None); + let b = self.buf.clone(); + let mut buf = b.borrow_mut(); + buf.push(REPLACEMENT_CHARACTER); } // U+003E GREATER-THAN SIGN (>) // This is an abrupt-doctype-public-identifier parse error. Set the current // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit // the current DOCTYPE token. - Some(c @ '>') => { + Some(c @ b'>') => { self.append_raw_to_doctype_token(c); self.finish_doctype_token_public_id(); self.emit_error(ErrorKind::AbruptDoctypePublicIdentifier); @@ -3767,12 +3853,12 @@ where // identifier. Some(c) => { self.validate_input_stream_character(c); - self.consume_and_append_to_doctype_token_public_id(c, |c| { - if !is_allowed_character(c) { + self.consume_and_append_to_doctype_token_public_id(c, |ch| { + if !is_allowed_character(ch) { return false; } - !matches!(c, '"' | '\x00' | '>' | '\r') + !matches!(ch, '"' | '\x00' | '>' | '\r') }); } } @@ -3783,7 +3869,7 @@ where match self.consume_next_char() { // U+0027 APOSTROPHE (') // Switch to the after DOCTYPE public identifier state. - Some(c @ '\'') => { + Some(c @ b'\'') => { self.finish_doctype_token_public_id(); self.append_raw_to_doctype_token(c); self.state = State::AfterDoctypePublicIdentifier; @@ -3792,16 +3878,18 @@ where // This is an unexpected-null-character parse error. Append a U+FFFD // REPLACEMENT CHARACTER character to the current DOCTYPE token's public // identifier. 
- Some(c @ '\x00') => { + Some(c @ b'\x00') => { self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::UnexpectedNullCharacter); - self.append_to_doctype_token(None, Some(REPLACEMENT_CHARACTER), None); + let b = self.buf.clone(); + let mut buf = b.borrow_mut(); + buf.push(REPLACEMENT_CHARACTER); } // U+003E GREATER-THAN SIGN (>) // This is an abrupt-doctype-public-identifier parse error. Set the current // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit // the current DOCTYPE token. - Some(c @ '>') => { + Some(c @ b'>') => { self.finish_doctype_token_public_id(); self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::AbruptDoctypePublicIdentifier); @@ -3827,12 +3915,12 @@ where // identifier. Some(c) => { self.validate_input_stream_character(c); - self.consume_and_append_to_doctype_token_public_id(c, |c| { - if !is_allowed_character(c) { + self.consume_and_append_to_doctype_token_public_id(c, |ch| { + if !is_allowed_character(ch) { return false; } - !matches!(c, '\'' | '\x00' | '>' | '\r') + !matches!(ch, '\'' | '\x00' | '>' | '\r') }); } } @@ -3852,7 +3940,7 @@ where } // U+003E GREATER-THAN SIGN (>) // Switch to the data state. Emit the current DOCTYPE token. - Some(c @ '>') => { + Some(c @ b'>') => { self.append_raw_to_doctype_token(c); self.state = State::Data; self.emit_doctype_token(); @@ -3862,7 +3950,7 @@ where // parse error. Set the current DOCTYPE token's system // identifier to the empty string (not missing), then switch // to the DOCTYPE system identifier (double-quoted) state. - Some(c @ '"') => { + Some(c @ b'"') => { self.append_raw_to_doctype_token(c); self.emit_error( ErrorKind::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers, @@ -3875,7 +3963,7 @@ where // parse error. Set the current DOCTYPE token's system // identifier to the empty string (not missing), then switch // to the DOCTYPE system identifier (single-quoted) state. 
- Some(c @ '\'') => { + Some(c @ b'\'') => { self.append_raw_to_doctype_token(c); self.emit_error( ErrorKind::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers, @@ -3920,7 +4008,7 @@ where } // U+003E GREATER-THAN SIGN (>) // Switch to the data state. Emit the current DOCTYPE token. - Some(c @ '>') => { + Some(c @ b'>') => { self.append_raw_to_doctype_token(c); self.state = State::Data; self.emit_doctype_token(); @@ -3929,7 +4017,7 @@ where // Set the current DOCTYPE token's system identifier to the empty string // (not missing), then switch to the DOCTYPE system identifier // (double-quoted) state. - Some(c @ '"') => { + Some(c @ b'"') => { self.append_raw_to_doctype_token(c); self.set_doctype_token_system_id(); self.state = State::DoctypeSystemIdentifierDoubleQuoted; @@ -3938,7 +4026,7 @@ where // Set the current DOCTYPE token's system identifier to the empty string // (not missing), then switch to the DOCTYPE system identifier // (single-quoted) state. - Some(c @ '\'') => { + Some(c @ b'\'') => { self.append_raw_to_doctype_token(c); self.set_doctype_token_system_id(); self.state = State::DoctypeSystemIdentifierSingleQuoted; @@ -3984,7 +4072,7 @@ where // Set the current DOCTYPE token's system identifier to the empty string // (not missing), then switch to the DOCTYPE system identifier // (double-quoted) state. - Some(c @ '"') => { + Some(c @ b'"') => { self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::MissingWhitespaceAfterDoctypeSystemKeyword); self.set_doctype_token_system_id(); @@ -3995,7 +4083,7 @@ where // Set the current DOCTYPE token's system identifier to the empty string // (not missing), then switch to the DOCTYPE system identifier // (single-quoted) state. - Some(c @ '\'') => { + Some(c @ b'\'') => { self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::MissingWhitespaceAfterDoctypeSystemKeyword); self.set_doctype_token_system_id(); @@ -4005,7 +4093,7 @@ where // This is a missing-doctype-system-identifier parse error. 
Set the current // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit // the current DOCTYPE token. - Some(c @ '>') => { + Some(c @ b'>') => { self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::MissingDoctypeSystemIdentifier); self.set_doctype_token_force_quirks(); @@ -4051,7 +4139,7 @@ where // Set the current DOCTYPE token's system identifier to the empty string // (not missing), then switch to the DOCTYPE system identifier // (double-quoted) state. - Some(c @ '"') => { + Some(c @ b'"') => { self.append_raw_to_doctype_token(c); self.set_doctype_token_system_id(); self.state = State::DoctypeSystemIdentifierDoubleQuoted; @@ -4060,7 +4148,7 @@ where // Set the current DOCTYPE token's system identifier to the empty string // (not missing), then switch to the DOCTYPE system identifier // (single-quoted) state. - Some(c @ '\'') => { + Some(c @ b'\'') => { self.append_raw_to_doctype_token(c); self.set_doctype_token_system_id(); self.state = State::DoctypeSystemIdentifierSingleQuoted; @@ -4069,7 +4157,7 @@ where // This is a missing-doctype-system-identifier parse error. Set the current // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit // the current DOCTYPE token. - Some(c @ '>') => { + Some(c @ b'>') => { self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::EofInDoctype); self.set_doctype_token_force_quirks(); @@ -4105,7 +4193,7 @@ where match self.consume_next_char() { // U+0027 APOSTROPHE (') // Switch to the after DOCTYPE system identifier state. - Some(c @ '"') => { + Some(c @ b'"') => { self.finish_doctype_token_system_id(); self.append_raw_to_doctype_token(c); self.state = State::AfterDoctypeSystemIdentifier; @@ -4114,16 +4202,18 @@ where // This is an unexpected-null-character parse error. Append a U+FFFD // REPLACEMENT CHARACTER character to the current DOCTYPE token's system // identifier. 
- Some(c @ '\x00') => { + Some(c @ b'\x00') => { self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::UnexpectedNullCharacter); - self.append_to_doctype_token(None, None, Some(REPLACEMENT_CHARACTER)); + let b = self.buf.clone(); + let mut buf = b.borrow_mut(); + buf.push(REPLACEMENT_CHARACTER); } // U+003E GREATER-THAN SIGN (>) // This is an abrupt-doctype-system-identifier parse error. Set the current // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit // the current DOCTYPE token. - Some(c @ '>') => { + Some(c @ b'>') => { self.finish_doctype_token_system_id(); self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::AbruptDoctypeSystemIdentifier); @@ -4149,12 +4239,12 @@ where // identifier. Some(c) => { self.validate_input_stream_character(c); - self.consume_and_append_to_doctype_token_system_id(c, |c| { - if !is_allowed_character(c) { + self.consume_and_append_to_doctype_token_system_id(c, |ch| { + if !is_allowed_character(ch) { return false; } - !matches!(c, '"' | '\x00' | '>' | '\r') + !matches!(ch, '"' | '\x00' | '>' | '\r') }); } } @@ -4165,7 +4255,7 @@ where match self.consume_next_char() { // U+0027 APOSTROPHE (') // Switch to the after DOCTYPE system identifier state. - Some(c @ '\'') => { + Some(c @ b'\'') => { self.finish_doctype_token_system_id(); self.append_raw_to_doctype_token(c); self.state = State::AfterDoctypeSystemIdentifier; @@ -4174,16 +4264,18 @@ where // This is an unexpected-null-character parse error. Append a U+FFFD // REPLACEMENT CHARACTER character to the current DOCTYPE token's system // identifier. - Some(c @ '\x00') => { + Some(c @ b'\x00') => { self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::UnexpectedNullCharacter); - self.append_to_doctype_token(None, None, Some(REPLACEMENT_CHARACTER)); + let b = self.buf.clone(); + let mut buf = b.borrow_mut(); + buf.push(REPLACEMENT_CHARACTER); } // U+003E GREATER-THAN SIGN (>) // This is an abrupt-doctype-system-identifier parse error. 
Set the current // DOCTYPE token's force-quirks flag to on. Switch to the data state. Emit // the current DOCTYPE token. - Some(c @ '>') => { + Some(c @ b'>') => { self.finish_doctype_token_system_id(); self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::AbruptDoctypeSystemIdentifier); @@ -4209,12 +4301,12 @@ where // identifier. Some(c) => { self.validate_input_stream_character(c); - self.consume_and_append_to_doctype_token_system_id(c, |c| { - if !is_allowed_character(c) { + self.consume_and_append_to_doctype_token_system_id(c, |ch| { + if !is_allowed_character(ch) { return false; } - !matches!(c, '\'' | '\x00' | '>' | '\r') + !matches!(ch, '\'' | '\x00' | '>' | '\r') }); } } @@ -4233,7 +4325,7 @@ where } // U+003E GREATER-THAN SIGN (>) // Switch to the data state. Emit the current DOCTYPE token. - Some(c @ '>') => { + Some(c @ b'>') => { self.append_raw_to_doctype_token(c); self.state = State::Data; self.emit_doctype_token(); @@ -4266,14 +4358,14 @@ where match self.consume_next_char() { // U+003E GREATER-THAN SIGN (>) // Switch to the data state. Emit the DOCTYPE token. - Some(c @ '>') => { + Some(c @ b'>') => { self.append_raw_to_doctype_token(c); self.state = State::Data; self.emit_doctype_token(); } // U+0000 NULL // This is an unexpected-null-character parse error. Ignore the character. - Some(c @ '\x00') => { + Some(c @ b'\x00') => { self.append_raw_to_doctype_token(c); self.emit_error(ErrorKind::UnexpectedNullCharacter); } @@ -4299,7 +4391,7 @@ where match self.consume_next_char() { // U+005D RIGHT SQUARE BRACKET (]) // Switch to the CDATA section bracket state. - Some(']') => { + Some(b']') => { self.state = State::CdataSectionBracket; } // EOF @@ -4324,14 +4416,14 @@ where match self.consume_next_char() { // U+005D RIGHT SQUARE BRACKET (]) // Switch to the CDATA section end state. - Some(']') => { + Some(b']') => { self.state = State::CdataSectionEnd; } // Anything else // Emit a U+005D RIGHT SQUARE BRACKET character token. 
Reconsume in the // CDATA section state. _ => { - self.emit_character_token(']'); + self.emit_character_token(b']'); self.reconsume_in_state(State::CdataSection); } } @@ -4342,20 +4434,20 @@ where match self.consume_next_char() { // U+005D RIGHT SQUARE BRACKET (]) // Emit a U+005D RIGHT SQUARE BRACKET character token. - Some(c @ ']') => { + Some(c @ b']') => { self.emit_character_token_with_raw(']', c); } // U+003E GREATER-THAN SIGN character // Switch to the data state. - Some('>') => { + Some(b'>') => { self.state = State::Data; } // Anything else // Emit two U+005D RIGHT SQUARE BRACKET character tokens. Reconsume in the // CDATA section state. _ => { - self.emit_character_token(']'); - self.emit_character_token(']'); + self.emit_character_token(b']'); + self.emit_character_token(b']'); self.reconsume_in_state(State::CdataSection); } } @@ -4377,8 +4469,8 @@ where // U+0023 NUMBER SIGN (#) // Append the current input character to the temporary buffer. Switch to the // numeric character reference state. - Some(c @ '#') => { - self.temporary_buffer.push(c); + Some(c @ b'#') => { + self.temporary_buffer.push(c as char); self.state = State::NumericCharacterReference; } // Anything else @@ -4409,7 +4501,7 @@ where // No need to validate input, because we reset position if nothing was found while let Some(c) = &self.consume_next_char() { - entity_temporary_buffer.push(*c); + entity_temporary_buffer.push(*c as char); if let Some(found_entity) = HTML_ENTITIES.get(&entity_temporary_buffer) { entity = Some(found_entity); @@ -4448,7 +4540,7 @@ where match entity { Some(entity) => { let is_next_equals_sign_or_ascii_alphanumeric = match self.next() { - Some('=') => true, + Some(b'=') => true, Some(c) if c.is_ascii_alphanumeric() => true, _ => false, }; @@ -4514,7 +4606,7 @@ where // Otherwise, emit the current input character as a character token. 
Some(c) if c.is_ascii_alphanumeric() => { if self.is_consumed_as_part_of_an_attribute() { - self.append_to_attribute_token_value(Some(c), Some(c)); + self.append_to_attribute_token_value(Some(c as char), Some(c as char)); } else { self.emit_character_token(c); } @@ -4522,7 +4614,7 @@ where // U+003B SEMICOLON (;) // This is an unknown-named-character-reference parse error. Reconsume in // the return state. - Some(';') => { + Some(b';') => { self.emit_error(ErrorKind::UnknownNamedCharacterReference); self.reconsume_in_state(self.return_state.clone()); } @@ -4543,8 +4635,8 @@ where // U+0058 LATIN CAPITAL LETTER X // Append the current input character to the temporary buffer. Switch to the // hexadecimal character reference start state. - Some(c @ 'x' | c @ 'X') => { - self.temporary_buffer.push(c); + Some(c @ b'x' | c @ b'X') => { + self.temporary_buffer.push(c as char); self.state = State::HexademicalCharacterReferenceStart; } // Anything else @@ -4604,7 +4696,7 @@ where // to the character reference code. Some(c) if c.is_ascii_digit() => match &mut self.character_reference_code { Some(character_reference_code) => { - character_reference_code.push((16, c as u32 - 0x30, Some(c))); + character_reference_code.push((16, c as u32 - 0x30, Some(c as char))); } _ => { unreachable!(); @@ -4616,7 +4708,7 @@ where // character's code point) to the character reference code. Some(c) if is_upper_hex_digit(c) => match &mut self.character_reference_code { Some(character_reference_code) => { - character_reference_code.push((16, c as u32 - 0x37, Some(c))); + character_reference_code.push((16, c as u32 - 0x37, Some(c as char))); } _ => { unreachable!(); @@ -4628,7 +4720,7 @@ where // character's code point) to the character reference code. 
Some(c) if is_lower_hex_digit(c) => match &mut self.character_reference_code { Some(character_reference_code) => { - character_reference_code.push((16, c as u32 - 0x57, Some(c))); + character_reference_code.push((16, c as u32 - 0x57, Some(c as char))); } _ => { unreachable!(); @@ -4636,7 +4728,7 @@ where }, // U+003B SEMICOLON // Switch to the numeric character reference end state. - Some(';') => { + Some(b';') => { self.state = State::NumericCharacterReferenceEnd; } // Anything else @@ -4658,7 +4750,7 @@ where // to the character reference code. Some(c) if c.is_ascii_digit() => match &mut self.character_reference_code { Some(character_reference_code) => { - character_reference_code.push((10, c as u32 - 0x30, Some(c))); + character_reference_code.push((10, c as u32 - 0x30, Some(c as char))); } _ => { unreachable!(); @@ -4666,7 +4758,7 @@ where }, // U+003B SEMICOLON // Switch to the numeric character reference end state. - Some(';') => self.state = State::NumericCharacterReferenceEnd, + Some(b';') => self.state = State::NumericCharacterReferenceEnd, // Anything else // This is a missing-semicolon-after-character-reference parse error. // Reconsume in the numeric character reference end state. 
@@ -4832,7 +4924,7 @@ where raw.push_str(&old_temporary_buffer); raw.push_str(&raw_char_ref); - if self.cur == Some(';') { + if self.cur == Some(b';') { raw.push(';'); } @@ -4855,12 +4947,10 @@ where } #[inline(always)] - fn skip_whitespaces(&mut self, c: char) { - if c == '\r' && self.input.cur() == Some('\n') { - unsafe { - // Safety: cur() is Some - self.input.bump(); - } + fn skip_whitespaces(&mut self, c: u8) { + if c == b'\r' && self.input.cur() == Some(b'\n') { + // ASCII newline is always 1 byte + self.input.bump_bytes(1); } } } @@ -4868,8 +4958,13 @@ where // By spec '\r` removed before tokenizer, but we keep them to have better AST // and don't break logic to ignore characters #[inline(always)] -fn is_spacy(c: char) -> bool { - matches!(c, '\x09' | '\x0a' | '\x0d' | '\x0c' | '\x20') +fn is_spacy(c: u8) -> bool { + matches!(c, b'\x09' | b'\x0a' | b'\x0d' | b'\x0c' | b'\x20') +} + +#[inline(always)] +fn is_spacy_char(c: char) -> bool { + c.is_ascii() && is_spacy(c as u8) +} #[inline(always)] @@ -4932,35 +5027,40 @@ fn is_noncharacter(c: u32) -> bool { } #[inline(always)] -fn is_upper_hex_digit(c: char) -> bool { - matches!(c, '0'..='9' | 'A'..='F') +fn is_upper_hex_digit(c: u8) -> bool { + matches!(c, b'0'..=b'9' | b'A'..=b'F') } #[inline(always)] -fn is_lower_hex_digit(c: char) -> bool { - matches!(c, '0'..='9' | 'a'..='f') +fn is_lower_hex_digit(c: u8) -> bool { + matches!(c, b'0'..=b'9' | b'a'..=b'f') } #[inline(always)] -fn is_ascii_hex_digit(c: char) -> bool { +fn is_ascii_hex_digit(c: u8) -> bool { is_upper_hex_digit(c) || is_lower_hex_digit(c) } #[inline(always)] -fn is_ascii_upper_alpha(c: char) -> bool { +fn is_ascii_upper_alpha(c: u8) -> bool { c.is_ascii_uppercase() } #[inline(always)] -fn is_ascii_lower_alpha(c: char) -> bool { +fn is_ascii_lower_alpha(c: u8) -> bool { c.is_ascii_lowercase() } #[inline(always)] -fn is_ascii_alpha(c: char) -> bool { +fn is_ascii_alpha(c: u8) -> bool { is_ascii_upper_alpha(c) || is_ascii_lower_alpha(c) } +#[inline(always)] +fn 
is_ascii_upper_alpha_char(c: char) -> bool { + c.is_ascii_uppercase() +} + #[inline(always)] fn is_allowed_control_character(c: u32) -> bool { c != 0x00 && is_control(c) @@ -4976,3 +5076,8 @@ fn is_allowed_character(c: char) -> bool { return true; } + +#[inline(always)] +fn is_non_ascii(c: u8) -> bool { + c >= 0x80 +} diff --git a/crates/swc_xml_parser/src/lexer/mod.rs b/crates/swc_xml_parser/src/lexer/mod.rs index 95b4c5057614..ebac6ad14726 100644 --- a/crates/swc_xml_parser/src/lexer/mod.rs +++ b/crates/swc_xml_parser/src/lexer/mod.rs @@ -173,9 +173,9 @@ where // A leading Byte Order Mark (BOM) causes the character encoding argument to be // ignored and will itself be skipped. - if lexer.input.is_at_start() && lexer.input.cur() == Some('\u{feff}') { + if lexer.input.is_at_start() && lexer.input.cur_as_char() == Some('\u{feff}') { unsafe { - // Safety: cur() is Some('\u{feff}') + // Safety: cur_as_char() is Some('\u{feff}') lexer.input.bump(); } } @@ -224,7 +224,7 @@ where { #[inline(always)] fn next(&mut self) -> Option { - self.input.cur() + self.input.cur_as_char() } // Any occurrences of surrogates are surrogate-in-input-stream parse errors. 
Any @@ -249,12 +249,12 @@ where #[inline(always)] fn consume(&mut self) { - self.cur = self.input.cur(); + self.cur = self.input.cur_as_char(); self.cur_pos = self.input.cur_pos(); if self.cur.is_some() { unsafe { - // Safety: cur() is Some(c) + // Safety: cur_as_char() is Some(c) self.input.bump(); } } @@ -573,9 +573,9 @@ where raw.push(c); - if self.input.cur() == Some('\n') { + if self.input.cur() == Some(b'\n') { unsafe { - // Safety: cur() is Some('\n') + // Safety: cur() is Some(b'\n') self.input.bump(); } @@ -895,9 +895,9 @@ where raw_c.push(c); - if self.input.cur() == Some('\n') { + if self.input.cur() == Some(b'\n') { unsafe { - // Safety: cur() is Some('\n') + // Safety: cur() is Some(b'\n') self.input.bump(); } @@ -962,9 +962,9 @@ where raw.push(c); - if self.input.cur() == Some('\n') { + if self.input.cur() == Some(b'\n') { unsafe { - // Safety: cur() is Some('\n') + // Safety: cur() is Some(b'\n') self.input.bump(); } @@ -3104,9 +3104,9 @@ where #[inline(always)] fn skip_next_lf(&mut self, c: char) { - if c == '\r' && self.input.cur() == Some('\n') { + if c == '\r' && self.input.cur() == Some(b'\n') { unsafe { - // Safety: cur() is Some('\n') + // Safety: cur() is Some(b'\n') self.input.bump(); } }