Skip to content

Commit dd120a9

Browse files
committed
Require escaping of [, &, - and ~ in classes.
The escaping of &, - and ~ is only required when the characters are repeated adjacently, which should be quite rare. Escaping of [ is always required, unless it appears in the second position of a range. These rules enable us to add character class sets as described in UTS#18 RL1.3 in a backward compatible way.
1 parent bc06024 commit dd120a9

File tree

5 files changed

+77
-11
lines changed

5 files changed

+77
-11
lines changed

regex-syntax/src/lib.rs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1401,6 +1401,17 @@ pub enum ErrorKind {
14011401
/// A character class was constructed such that it is empty.
14021402
/// e.g., `[^\d\D]`.
14031403
EmptyClass,
1404+
/// Indicates that unsupported notation was used in a character class.
1405+
///
1406+
/// The char in this error corresponds to the illegal character.
1407+
///
1408+
/// The intent of this error is to carve a path to support set notation
1409+
/// as described in UTS#18 RL1.3. We do this by rejecting regexes that
1410+
/// would use the notation.
1411+
///
1412+
/// The workaround for end users is to escape the character included in
1413+
/// this error message.
1414+
UnsupportedClassChar(char),
14041415
/// Hints that destructuring should not be exhaustive.
14051416
///
14061417
/// This enum may grow additional variants, so this makes sure clients
@@ -1464,6 +1475,7 @@ impl ErrorKind {
14641475
UnicodeNotAllowed => "Unicode features not allowed",
14651476
InvalidUtf8 => "matching arbitrary bytes is not allowed",
14661477
EmptyClass => "empty character class",
1478+
UnsupportedClassChar(_) => "unsupported class notation",
14671479
__Nonexhaustive => unreachable!(),
14681480
}
14691481
}
@@ -1576,6 +1588,9 @@ repetition operator."),
15761588
write!(f, "Matching arbitrary bytes is not allowed."),
15771589
EmptyClass =>
15781590
write!(f, "Empty character classes are not allowed."),
1591+
UnsupportedClassChar(c) =>
1592+
write!(f, "Use of unescaped '{}' in character class is \
1593+
not allowed.", c),
15791594
__Nonexhaustive => unreachable!(),
15801595
}
15811596
}

regex-syntax/src/parser.rs

Lines changed: 59 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -548,8 +548,8 @@ impl Parser {
548548
'[' => match self.maybe_parse_ascii() {
549549
Some(class2) => class.ranges.extend(class2),
550550
None => {
551-
self.bump();
552-
try!(self.parse_class_range(&mut class, '['))
551+
return Err(self.err(
552+
ErrorKind::UnsupportedClassChar('[')));
553553
}
554554
},
555555
'\\' => match try!(self.parse_escape()) {
@@ -582,6 +582,16 @@ impl Parser {
582582
let _ = try!(self.codepoint_to_one_byte(start));
583583
}
584584
self.bump();
585+
match start {
586+
'&'|'~'|'-' => {
587+
// Only report an error if we see && or ~~ or --.
588+
if self.peek_is(start) {
589+
return Err(self.err(
590+
ErrorKind::UnsupportedClassChar(start)));
591+
}
592+
}
593+
_ => {}
594+
}
585595
try!(self.parse_class_range(&mut class, start));
586596
}
587597
}
@@ -654,8 +664,11 @@ impl Parser {
654664
// Because `parse_escape` can never return `LeftParen`.
655665
_ => unreachable!(),
656666
},
657-
_ => {
658-
let c = self.bump();
667+
c => {
668+
self.bump();
669+
if c == '-' {
670+
return Err(self.err(ErrorKind::UnsupportedClassChar('-')));
671+
}
659672
if !self.flags.unicode {
660673
let _ = try!(self.codepoint_to_one_byte(c));
661674
}
@@ -1212,7 +1225,7 @@ fn is_valid_capture_char(c: char) -> bool {
12121225
pub fn is_punct(c: char) -> bool {
12131226
match c {
12141227
'\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' |
1215-
'[' | ']' | '{' | '}' | '^' | '$' | '#' => true,
1228+
'[' | ']' | '{' | '}' | '^' | '$' | '#' | '&' | '-' | '~' => true,
12161229
_ => false,
12171230
}
12181231
}
@@ -2191,9 +2204,9 @@ mod tests {
21912204

21922205
#[test]
21932206
fn class_brackets() {
2194-
assert_eq!(p("[]]"), Expr::Class(class(&[(']', ']')])));
2195-
assert_eq!(p("[][]"), Expr::Class(class(&[('[', '['), (']', ']')])));
2196-
assert_eq!(p("[[]]"), Expr::Concat(vec![
2207+
assert_eq!(p(r"[]]"), Expr::Class(class(&[(']', ']')])));
2208+
assert_eq!(p(r"[]\[]"), Expr::Class(class(&[('[', '['), (']', ']')])));
2209+
assert_eq!(p(r"[\[]]"), Expr::Concat(vec![
21972210
Expr::Class(class(&[('[', '[')])),
21982211
lit(']'),
21992212
]));
@@ -2208,6 +2221,31 @@ mod tests {
22082221
]));
22092222
}
22102223

2224+
#[test]
2225+
fn class_special_escaped_set_chars() {
2226+
// These tests ensure that some special characters require escaping
2227+
// for use in character classes. The intention is to use these
2228+
// characters to implement sets as described in UTS#18 RL1.3. Once
2229+
// that's done, these tests should be removed and replaced with others.
2230+
assert_eq!(p(r"[\[]"), Expr::Class(class(&[('[', '[')])));
2231+
assert_eq!(p(r"[&]"), Expr::Class(class(&[('&', '&')])));
2232+
assert_eq!(p(r"[\&]"), Expr::Class(class(&[('&', '&')])));
2233+
assert_eq!(p(r"[\&\&]"), Expr::Class(class(&[('&', '&')])));
2234+
assert_eq!(p(r"[\x00-&]"), Expr::Class(class(&[('\u{0}', '&')])));
2235+
assert_eq!(p(r"[&-\xFF]"), Expr::Class(class(&[('&', '\u{FF}')])));
2236+
2237+
assert_eq!(p(r"[~]"), Expr::Class(class(&[('~', '~')])));
2238+
assert_eq!(p(r"[\~]"), Expr::Class(class(&[('~', '~')])));
2239+
assert_eq!(p(r"[\~\~]"), Expr::Class(class(&[('~', '~')])));
2240+
assert_eq!(p(r"[\x00-~]"), Expr::Class(class(&[('\u{0}', '~')])));
2241+
assert_eq!(p(r"[~-\xFF]"), Expr::Class(class(&[('~', '\u{FF}')])));
2242+
2243+
assert_eq!(p(r"[+-\-]"), Expr::Class(class(&[('+', '-')])));
2244+
assert_eq!(p(r"[a-a\--\xFF]"), Expr::Class(class(&[
2245+
('-', '\u{FF}'),
2246+
])));
2247+
}
2248+
22112249
#[test]
22122250
fn class_overlapping() {
22132251
assert_eq!(p("[a-fd-h]"), Expr::Class(class(&[('a', 'h')])));
@@ -2759,6 +2797,19 @@ mod tests {
27592797
test_err!(r"(?-u)[^\x00-\xFF]", 17, ErrorKind::EmptyClass, flags);
27602798
}
27612799

2800+
#[test]
2801+
fn error_class_unsupported_char() {
2802+
// These tests ensure that some unescaped special characters are
2803+
// rejected in character classes. The intention is to use these
2804+
// characters to implement sets as described in UTS#18 RL1.3. Once
2805+
// that's done, these tests should be removed and replaced with others.
2806+
test_err!("[[]", 1, ErrorKind::UnsupportedClassChar('['));
2807+
test_err!("[&&]", 2, ErrorKind::UnsupportedClassChar('&'));
2808+
test_err!("[~~]", 2, ErrorKind::UnsupportedClassChar('~'));
2809+
test_err!("[+--]", 4, ErrorKind::UnsupportedClassChar('-'));
2810+
test_err!(r"[a-a--\xFF]", 5, ErrorKind::UnsupportedClassChar('-'));
2811+
}
2812+
27622813
#[test]
27632814
fn error_duplicate_capture_name() {
27642815
test_err!("(?P<a>.)(?P<a>.)", 14,

tests/api.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ fn quoted_bracket_set() {
4040

4141
#[test]
4242
fn first_range_starts_with_left_bracket() {
43-
let re = regex!(r"([[-z])");
43+
let re = regex!(r"([\[-z])");
4444
assert_eq!(vec![(0, 1), (1, 2)], findall!(re, "[]"));
4545
}
4646

tests/bytes.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ matiter!(invalidutf8_anchor1,
4747
R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"),
4848
(0, 0));
4949
matiter!(invalidutf8_anchor2,
50-
r"(?-u)^\xf7|4\xff\d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########[] d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########[] #####\x80\S7|$",
50+
r"(?-u)^\xf7|4\xff\d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########[] d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########\[] #####\x80\S7|$",
5151
R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"),
5252
(22, 22));
5353
matiter!(invalidutf8_anchor3,

tests/regression.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ mat!(regression_negated_char_class_1, r"(?i)[^x]", "x", None);
2323
mat!(regression_negated_char_class_2, r"(?i)[^x]", "X", None);
2424

2525
// See: https://github.com/rust-lang/regex/issues/101
26-
mat!(regression_ascii_word_underscore, r"[:word:]", "_", Some((0, 1)));
26+
mat!(regression_ascii_word_underscore, r"[[:word:]]", "_", Some((0, 1)));
2727

2828
// See: https://github.com/rust-lang-nursery/regex/issues/129
2929
#[test]

0 commit comments

Comments
 (0)