Require escaping of [, &, - and ~ in classes.

BurntSushi · BurntSushi · commit dd120a963a48 · 2016-12-30T01:06:18.000-05:00
The escaping of &, - and ~ is only required when the characters are
repeated adjacently, which should be quite rare. Escaping of [ is always
required, unless it appears in the second position of a range.

These rules enable us to add character class sets as described in
UTS#18 RL1.3 in a backward compatible way.
diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs
@@ -1401,6 +1401,17 @@ pub enum ErrorKind {
     /// A character class was constructed such that it is empty.
     /// e.g., `[^\d\D]`.
     EmptyClass,
+    /// Indicates that unsupported notation was used in a character class.
+    ///
+    /// The char in this error corresponds to the illegal character.
+    ///
+    /// The intent of this error is to carve a path to support set notation
+    /// as described in UTS#18 RL1.3. We do this by rejecting regexes that
+    /// would use the notation.
+    ///
+    /// The workaround for end users is to escape the character included in
+    /// this error message.
+    UnsupportedClassChar(char),
     /// Hints that destructuring should not be exhaustive.
     ///
     /// This enum may grow additional variants, so this makes sure clients
@@ -1464,6 +1475,7 @@ impl ErrorKind {
             UnicodeNotAllowed => "Unicode features not allowed",
             InvalidUtf8 => "matching arbitrary bytes is not allowed",
             EmptyClass => "empty character class",
+            UnsupportedClassChar(_) => "unsupported class notation",
             __Nonexhaustive => unreachable!(),
         }
     }
@@ -1576,6 +1588,9 @@ repetition operator."),
                 write!(f, "Matching arbitrary bytes is not allowed."),
             EmptyClass =>
                 write!(f, "Empty character classes are not allowed."),
+            UnsupportedClassChar(c) =>
+                write!(f, "Use of unescaped '{}' in character class is \
+                           not allowed.", c),
             __Nonexhaustive => unreachable!(),
         }
     }
diff --git a/regex-syntax/src/parser.rs b/regex-syntax/src/parser.rs
@@ -548,8 +548,8 @@ impl Parser {
                 '[' => match self.maybe_parse_ascii() {
                     Some(class2) => class.ranges.extend(class2),
                     None => {
-                        self.bump();
-                        try!(self.parse_class_range(&mut class, '['))
+                        return Err(self.err(
+                            ErrorKind::UnsupportedClassChar('[')));
                     }
                 },
                 '\\' => match try!(self.parse_escape()) {
@@ -582,6 +582,16 @@ impl Parser {
                         let _ = try!(self.codepoint_to_one_byte(start));
                     }
                     self.bump();
+                    match start {
+                        '&'|'~'|'-' => {
+                            // Only report an error if we see && or ~~ or --.
+                            if self.peek_is(start) {
+                                return Err(self.err(
+                                    ErrorKind::UnsupportedClassChar(start)));
+                            }
+                        }
+                        _ => {}
+                    }
                     try!(self.parse_class_range(&mut class, start));
                 }
             }
@@ -654,8 +664,11 @@ impl Parser {
                 // Because `parse_escape` can never return `LeftParen`.
                 _ => unreachable!(),
             },
-            _ => {
-                let c = self.bump();
+            c => {
+                self.bump();
+                if c == '-' {
+                    return Err(self.err(ErrorKind::UnsupportedClassChar('-')));
+                }
                 if !self.flags.unicode {
                     let _ = try!(self.codepoint_to_one_byte(c));
                 }
@@ -1212,7 +1225,7 @@ fn is_valid_capture_char(c: char) -> bool {
 pub fn is_punct(c: char) -> bool {
     match c {
         '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' |
-        '[' | ']' | '{' | '}' | '^' | '$' | '#' => true,
+        '[' | ']' | '{' | '}' | '^' | '$' | '#' | '&' | '-' | '~' => true,
         _ => false,
     }
 }
@@ -2191,9 +2204,9 @@ mod tests {
 
     #[test]
     fn class_brackets() {
-        assert_eq!(p("[]]"), Expr::Class(class(&[(']', ']')])));
-        assert_eq!(p("[][]"), Expr::Class(class(&[('[', '['), (']', ']')])));
-        assert_eq!(p("[[]]"), Expr::Concat(vec![
+        assert_eq!(p(r"[]]"), Expr::Class(class(&[(']', ']')])));
+        assert_eq!(p(r"[]\[]"), Expr::Class(class(&[('[', '['), (']', ']')])));
+        assert_eq!(p(r"[\[]]"), Expr::Concat(vec![
             Expr::Class(class(&[('[', '[')])),
             lit(']'),
         ]));
@@ -2208,6 +2221,31 @@ mod tests {
         ]));
     }
 
+    #[test]
+    fn class_special_escaped_set_chars() {
+        // These tests ensure that some special characters require escaping
+        // for use in character classes. The intention is to use these
+        // characters to implement sets as described in UTS#18 RL1.3. Once
+        // that's done, these tests should be removed and replaced with others.
+        assert_eq!(p(r"[\[]"), Expr::Class(class(&[('[', '[')])));
+        assert_eq!(p(r"[&]"), Expr::Class(class(&[('&', '&')])));
+        assert_eq!(p(r"[\&]"), Expr::Class(class(&[('&', '&')])));
+        assert_eq!(p(r"[\&\&]"), Expr::Class(class(&[('&', '&')])));
+        assert_eq!(p(r"[\x00-&]"), Expr::Class(class(&[('\u{0}', '&')])));
+        assert_eq!(p(r"[&-\xFF]"), Expr::Class(class(&[('&', '\u{FF}')])));
+
+        assert_eq!(p(r"[~]"), Expr::Class(class(&[('~', '~')])));
+        assert_eq!(p(r"[\~]"), Expr::Class(class(&[('~', '~')])));
+        assert_eq!(p(r"[\~\~]"), Expr::Class(class(&[('~', '~')])));
+        assert_eq!(p(r"[\x00-~]"), Expr::Class(class(&[('\u{0}', '~')])));
+        assert_eq!(p(r"[~-\xFF]"), Expr::Class(class(&[('~', '\u{FF}')])));
+
+        assert_eq!(p(r"[+-\-]"), Expr::Class(class(&[('+', '-')])));
+        assert_eq!(p(r"[a-a\--\xFF]"), Expr::Class(class(&[
+            ('-', '\u{FF}'),
+        ])));
+    }
+
     #[test]
     fn class_overlapping() {
         assert_eq!(p("[a-fd-h]"), Expr::Class(class(&[('a', 'h')])));
@@ -2759,6 +2797,19 @@ mod tests {
         test_err!(r"(?-u)[^\x00-\xFF]", 17, ErrorKind::EmptyClass, flags);
     }
 
+    #[test]
+    fn error_class_unsupported_char() {
+        // These tests ensure that some unescaped special characters are
+        // rejected in character classes. The intention is to use these
+        // characters to implement sets as described in UTS#18 RL1.3. Once
+        // that's done, these tests should be removed and replaced with others.
+        test_err!("[[]", 1, ErrorKind::UnsupportedClassChar('['));
+        test_err!("[&&]", 2, ErrorKind::UnsupportedClassChar('&'));
+        test_err!("[~~]", 2, ErrorKind::UnsupportedClassChar('~'));
+        test_err!("[+--]", 4, ErrorKind::UnsupportedClassChar('-'));
+        test_err!(r"[a-a--\xFF]", 5, ErrorKind::UnsupportedClassChar('-'));
+    }
+
     #[test]
     fn error_duplicate_capture_name() {
         test_err!("(?P<a>.)(?P<a>.)", 14,
diff --git a/tests/api.rs b/tests/api.rs
@@ -40,7 +40,7 @@ fn quoted_bracket_set() {
 
 #[test]
 fn first_range_starts_with_left_bracket() {
-    let re = regex!(r"([[-z])");
+    let re = regex!(r"([\[-z])");
     assert_eq!(vec![(0, 1), (1, 2)], findall!(re, "[]"));
 }
 
diff --git a/tests/bytes.rs b/tests/bytes.rs
@@ -47,7 +47,7 @@ matiter!(invalidutf8_anchor1,
          R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"),
          (0, 0));
 matiter!(invalidutf8_anchor2,
-         r"(?-u)^\xf7|4\xff\d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########[] d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########[] #####\x80\S7|$",
+         r"(?-u)^\xf7|4\xff\d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########[] d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########\[] #####\x80\S7|$",
          R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"),
          (22, 22));
 matiter!(invalidutf8_anchor3,
diff --git a/tests/regression.rs b/tests/regression.rs
@@ -23,7 +23,7 @@ mat!(regression_negated_char_class_1, r"(?i)[^x]", "x", None);
 mat!(regression_negated_char_class_2, r"(?i)[^x]", "X", None);
 
 // See: https://github.com/rust-lang/regex/issues/101
-mat!(regression_ascii_word_underscore, r"[:word:]", "_", Some((0, 1)));
+mat!(regression_ascii_word_underscore, r"[[:word:]]", "_", Some((0, 1)));
 
 // See: https://github.com/rust-lang-nursery/regex/issues/129
 #[test]

Original file line number	Diff line number	Diff line change
`@@ -40,7 +40,7 @@ fn quoted_bracket_set() {`
`40`	`40`
`41`	`41`	`#[test]`
`42`	`42`	`fn first_range_starts_with_left_bracket() {`
`43`		`- let re = regex!(r"([[-z])");`
	`43`	`+ let re = regex!(r"([\[-z])");`
`44`	`44`	`assert_eq!(vec![(0, 1), (1, 2)], findall!(re, "[]"));`
`45`	`45`	`}`
`46`	`46`