From 8dd0c77791206cd81250f099b9eafb599fcb6b38 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 2 May 2016 18:23:51 -0400 Subject: [PATCH 01/18] Switch bytes::Regex to using Unicode mode by default. --- src/lib.rs | 34 +++++++++------------- src/re_builder.rs | 7 ++--- tests/api_str.rs | 4 +-- tests/bytes.rs | 41 +++++++++++++------------- tests/crazy.rs | 2 +- tests/macros_bytes.rs | 2 +- tests/macros_str.rs | 2 +- tests/regression.rs | 2 +- tests/test_backtrack_bytes.rs | 2 -- tests/test_nfa_bytes.rs | 2 -- tests/unicode.rs | 54 +++++++++++++++++------------------ tests/word_boundary_ascii.rs | 12 ++++---- 12 files changed, 77 insertions(+), 87 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 95b70e0247..baaf6cfd8b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -244,16 +244,11 @@ //! # Opt out of Unicode support //! //! The `bytes` sub-module provides a `Regex` type that can be used to match -//! on `&[u8]`. By default, text is interpreted as ASCII compatible text with -//! all Unicode support disabled (e.g., `.` matches any byte instead of any -//! Unicode codepoint). Unicode support can be selectively enabled with the -//! `u` flag. See the `bytes` module documentation for more details. -//! -//! Unicode support can also be selectively *disabled* with the main `Regex` -//! type that matches on `&str`. For example, `(?-u:\b)` will match an ASCII -//! word boundary. Note though that invalid UTF-8 is not allowed to be matched -//! even when the `u` flag is disabled. For example, `(?-u:.)` will return an -//! error, since `.` matches *any byte* when Unicode support is disabled. +//! on `&[u8]`. By default, text is interpreted as UTF-8 just like it is with +//! the main `Regex` type. However, this behavior can be disabled by turning +//! off the `u` flag, even if doing so could result in matching invalid UTF-8. +//! For example, when the `u` flag is disabled, `.` will match any byte instead +//! of any Unicode codepoint. //! //! # Syntax //! 
@@ -480,11 +475,8 @@ top-level of this crate. There are two important differences: 1. Matching is done on `&[u8]` instead of `&str`. Additionally, `Vec` is used where `String` would have been used. -2. Regular expressions are compiled with Unicode support *disabled* by -default. This means that while Unicode regular expressions can only match valid -UTF-8, regular expressions in this module can match arbitrary bytes. Unicode -support can be selectively enabled via the `u` flag in regular expressions -provided by this sub-module. +2. Unicode support can be disabled even when disabling it would result in +matching invalid UTF-8 bytes. # Example: match null terminated string @@ -492,7 +484,7 @@ This shows how to find all null-terminated strings in a slice of bytes: ```rust # use regex::bytes::Regex; -let re = Regex::new(r"(?P[^\x00]+)\x00").unwrap(); +let re = Regex::new(r"(?-u)(?P[^\x00]+)\x00").unwrap(); let text = b"foo\x00bar\x00baz\x00"; // Extract all of the strings without the null terminator from each match. @@ -512,7 +504,9 @@ string (e.g., to extract a title from a Matroska file): ```rust # use std::str; # use regex::bytes::Regex; -let re = Regex::new(r"\x7b\xa9(?:[\x80-\xfe]|[\x40-\xff].)(?u:(.*))").unwrap(); +let re = Regex::new( + r"(?-u)\x7b\xa9(?:[\x80-\xfe]|[\x40-\xff].)(?u:(.*))" +).unwrap(); let text = b"\x12\xd0\x3b\x5f\x7b\xa9\x85\xe2\x98\x83\x80\x98\x54\x76\x68\x65"; let caps = re.captures(text).unwrap(); @@ -536,9 +530,9 @@ The supported syntax is pretty much the same as the syntax for Unicode regular expressions with a few changes that make sense for matching arbitrary bytes: -1. The `u` flag is *disabled* by default, but can be selectively enabled. (The -opposite is true for the main `Regex` type.) Disabling the `u` flag is said to -invoke "ASCII compatible" mode. +1. The `u` flag can be disabled even when disabling it might cause the regex to +match invalid UTF-8. 
When the `u` flag is disabled, the regex is said to be in +"ASCII compatible" mode. 2. In ASCII compatible mode, neither Unicode codepoints nor Unicode character classes are allowed. 3. In ASCII compatible mode, Perl character classes (`\w`, `\d` and `\s`) diff --git a/src/re_builder.rs b/src/re_builder.rs index ca030b3ef5..c769cc5d17 100644 --- a/src/re_builder.rs +++ b/src/re_builder.rs @@ -39,7 +39,7 @@ impl Default for RegexOptions { } macro_rules! define_builder { - ($name:ident, $regex_mod:ident, $unicode:expr, $only_utf8:expr) => { + ($name:ident, $regex_mod:ident, $only_utf8:expr) => { pub mod $name { use error::Error; use exec::ExecBuilder; @@ -62,7 +62,6 @@ impl RegexBuilder { pub fn new(pattern: &str) -> RegexBuilder { let mut builder = RegexBuilder(RegexOptions::default()); builder.0.pats.push(pattern.to_owned()); - builder.0.unicode = $unicode; builder } @@ -150,5 +149,5 @@ impl RegexBuilder { } } -define_builder!(bytes, re_bytes, false, false); -define_builder!(unicode, re_unicode, true, true); +define_builder!(bytes, re_bytes, false); +define_builder!(unicode, re_unicode, true); diff --git a/tests/api_str.rs b/tests/api_str.rs index 266b6455b2..e5e667863d 100644 --- a/tests/api_str.rs +++ b/tests/api_str.rs @@ -5,7 +5,7 @@ fn empty_match_unicode_find_iter() { // Tests that we still yield byte ranges at valid UTF-8 sequence boundaries // even when we're susceptible to empty width matches. - let re = regex!(u!(r".*?")); + let re = regex!(r".*?"); assert_eq!(vec![(0, 0), (3, 3), (4, 4), (7, 7), (8, 8)], findall!(re, "Ⅰ1Ⅱ2")); } @@ -13,7 +13,7 @@ fn empty_match_unicode_find_iter() { #[test] fn empty_match_unicode_captures_iter() { // Same as empty_match_unicode_find_iter, but tests capture iteration. 
- let re = regex!(u!(r".*?")); + let re = regex!(r".*?"); let ms: Vec<_> = re.captures_iter(text!("Ⅰ1Ⅱ2")) .map(|c| c.pos(0).unwrap()) .collect(); diff --git a/tests/bytes.rs b/tests/bytes.rs index a290630d8d..e7748e91c9 100644 --- a/tests/bytes.rs +++ b/tests/bytes.rs @@ -5,36 +5,37 @@ struct R<'a>(&'a [u8]); impl<'a> R<'a> { fn as_bytes(&self) -> &'a [u8] { &self.0 } } -mat!(word_boundary, r" \b", " δ", None); -mat!(word_boundary_unicode, r"(?u) \b", " δ", Some((0, 1))); -mat!(word_not_boundary, r" \B", " δ", Some((0, 1))); -mat!(word_not_boundary_unicode, r"(?u) \B", " δ", None); - -mat!(perl_w_ascii, r"\w+", "aδ", Some((0, 1))); -mat!(perl_w_unicode, r"(?u)\w+", "aδ", Some((0, 3))); -mat!(perl_d_ascii, r"\d+", "1२३9", Some((0, 1))); -mat!(perl_d_unicode, r"(?u)\d+", "1२३9", Some((0, 8))); -mat!(perl_s_ascii, r"\s+", " \u{1680}", Some((0, 1))); -mat!(perl_s_unicode, r"(?u)\s+", " \u{1680}", Some((0, 4))); +mat!(word_boundary, r"(?-u) \b", " δ", None); +mat!(word_boundary_unicode, r" \b", " δ", Some((0, 1))); +mat!(word_not_boundary, r"(?-u) \B", " δ", Some((0, 1))); +mat!(word_not_boundary_unicode, r" \B", " δ", None); + +mat!(perl_w_ascii, r"(?-u)\w+", "aδ", Some((0, 1))); +mat!(perl_w_unicode, r"\w+", "aδ", Some((0, 3))); +mat!(perl_d_ascii, r"(?-u)\d+", "1२३9", Some((0, 1))); +mat!(perl_d_unicode, r"\d+", "1२३9", Some((0, 8))); +mat!(perl_s_ascii, r"(?-u)\s+", " \u{1680}", Some((0, 1))); +mat!(perl_s_unicode, r"\s+", " \u{1680}", Some((0, 4))); // The first `(.+)` matches two Unicode codepoints, but can't match the 5th // byte, which isn't valid UTF-8. The second (byte based) `(.+)` takes over and // matches. 
-mat!(mixed1, r"(?u)(.+)(?-u)(.+)", R(b"\xCE\x93\xCE\x94\xFF"), +mat!(mixed1, r"(.+)(?-u)(.+)", R(b"\xCE\x93\xCE\x94\xFF"), Some((0, 5)), Some((0, 4)), Some((4, 5))); -mat!(case_ascii_one, r"(?i)a", "A", Some((0, 1))); -mat!(case_ascii_class, r"(?i)[a-z]+", "AaAaA", Some((0, 5))); -mat!(case_unicode, r"(?iu)[a-z]+", "aA\u{212A}aA", Some((0, 7))); -mat!(case_not_unicode, r"(?i)[a-z]+", "aA\u{212A}aA", Some((0, 2))); +mat!(case_ascii_one, r"(?i-u)a", "A", Some((0, 1))); +mat!(case_ascii_class, r"(?i-u)[a-z]+", "AaAaA", Some((0, 5))); +mat!(case_unicode, r"(?i)[a-z]+", "aA\u{212A}aA", Some((0, 7))); +mat!(case_not_unicode, r"(?i-u)[a-z]+", "aA\u{212A}aA", Some((0, 2))); -mat!(negate_unicode, r"(?u)[^a]", "δ", Some((0, 2))); -mat!(negate_not_unicode, r"[^a]", "δ", Some((0, 1))); +mat!(negate_unicode, r"[^a]", "δ", Some((0, 2))); +mat!(negate_not_unicode, r"(?-u)[^a]", "δ", Some((0, 1))); // This doesn't match in a normal Unicode regex because the implicit preceding // `.*?` is Unicode aware. -mat!(dotstar_prefix_not_unicode, r"a", R(b"\xFFa"), Some((1, 2))); +mat!(dotstar_prefix_not_unicode1, r"(?-u)a", R(b"\xFFa"), Some((1, 2))); +mat!(dotstar_prefix_not_unicode2, r"a", R(b"\xFFa"), Some((1, 2))); // Have fun with null bytes. -mat!(null_bytes, r"(?P[^\x00]+)\x00", +mat!(null_bytes, r"(?-u)(?P[^\x00]+)\x00", R(b"foo\x00"), Some((0, 4)), Some((0, 3))); diff --git a/tests/crazy.rs b/tests/crazy.rs index bed66277e5..ade839ade1 100644 --- a/tests/crazy.rs +++ b/tests/crazy.rs @@ -1,4 +1,4 @@ -mat!(ascii_literal, u!(r"a"), "a", Some((0, 1))); +mat!(ascii_literal, r"a", "a", Some((0, 1))); // Some crazy expressions from regular-expressions.info. mat!(match_ranges, diff --git a/tests/macros_bytes.rs b/tests/macros_bytes.rs index a68fada744..89c236ff31 100644 --- a/tests/macros_bytes.rs +++ b/tests/macros_bytes.rs @@ -5,7 +5,7 @@ macro_rules! t { ($re:expr) => { text!($re) } } macro_rules! bytes { ($text:expr) => { $text } } macro_rules! 
b { ($text:expr) => { bytes!($text) } } -macro_rules! u { ($re:expr) => { concat!("(?u)", $re) } } +// macro_rules! u { ($re:expr) => { concat!("(?u)", $re) } } macro_rules! no_expand { ($text:expr) => {{ diff --git a/tests/macros_str.rs b/tests/macros_str.rs index 7ea29335de..c419ee90dd 100644 --- a/tests/macros_str.rs +++ b/tests/macros_str.rs @@ -5,7 +5,7 @@ macro_rules! t { ($text:expr) => { text!($text) } } macro_rules! bytes { ($text:expr) => { $text.as_bytes() } } macro_rules! b { ($text:expr) => { bytes!($text) } } -macro_rules! u { ($re:expr) => { $re } } +// macro_rules! u { ($re:expr) => { $re } } macro_rules! no_expand { ($text:expr) => {{ diff --git a/tests/regression.rs b/tests/regression.rs index 3b7a1fe917..ccb4fab8ca 100644 --- a/tests/regression.rs +++ b/tests/regression.rs @@ -41,7 +41,7 @@ mat!(regression_alt_in_alt2, r"^(.*?)(\n|\r\n?|$)", "ab\rcd", Some((0, 3))); mat!(regression_leftmost_first_prefix, r"z*azb", "azb", Some((0, 3))); // See: https://github.com/rust-lang/regex/issues/76 -mat!(uni_case_lower_nocase_flag, u!(r"(?i)\p{Ll}+"), "ΛΘΓΔα", Some((0, 10))); +mat!(uni_case_lower_nocase_flag, r"(?i)\p{Ll}+", "ΛΘΓΔα", Some((0, 10))); // See: https://github.com/rust-lang-nursery/regex/issues/191 mat!(many_alternates, r"1|2|3|4|5|6|7|8|9|10|int", "int", Some((0, 3))); diff --git a/tests/test_backtrack_bytes.rs b/tests/test_backtrack_bytes.rs index 57074f1870..4ea60e7d0f 100644 --- a/tests/test_backtrack_bytes.rs +++ b/tests/test_backtrack_bytes.rs @@ -16,7 +16,6 @@ macro_rules! regex_new { use regex::internal::ExecBuilder; ExecBuilder::new($re) .bounded_backtracking() - .unicode(false) .only_utf8(false) .build() .map(|e| e.into_byte_regex()) @@ -34,7 +33,6 @@ macro_rules! 
regex_set_new { use regex::internal::ExecBuilder; ExecBuilder::new_many($re) .bounded_backtracking() - .unicode(false) .only_utf8(false) .build() .map(|e| e.into_byte_regex_set()) diff --git a/tests/test_nfa_bytes.rs b/tests/test_nfa_bytes.rs index 83eea01a2d..a084c804fe 100644 --- a/tests/test_nfa_bytes.rs +++ b/tests/test_nfa_bytes.rs @@ -17,7 +17,6 @@ macro_rules! regex_new { use regex::internal::ExecBuilder; ExecBuilder::new($re) .nfa() - .unicode(false) .only_utf8(false) .build() .map(|e| e.into_byte_regex()) @@ -35,7 +34,6 @@ macro_rules! regex_set_new { use regex::internal::ExecBuilder; ExecBuilder::new_many($re) .nfa() - .unicode(false) .only_utf8(false) .build() .map(|e| e.into_byte_regex_set()) diff --git a/tests/unicode.rs b/tests/unicode.rs index 5357a18c96..48e9a95aaf 100644 --- a/tests/unicode.rs +++ b/tests/unicode.rs @@ -1,31 +1,31 @@ -mat!(uni_literal, u!(r"☃"), "☃", Some((0, 3))); -mat!(uni_literal_plus, u!(r"☃+"), "☃", Some((0, 3))); -mat!(uni_literal_casei_plus, u!(r"(?i)☃+"), "☃", Some((0, 3))); -mat!(uni_class_plus, u!(r"[☃Ⅰ]+"), "☃", Some((0, 3))); -mat!(uni_one, u!(r"\pN"), "Ⅰ", Some((0, 3))); -mat!(uni_mixed, u!(r"\pN+"), "Ⅰ1Ⅱ2", Some((0, 8))); -mat!(uni_not, u!(r"\PN+"), "abⅠ", Some((0, 2))); -mat!(uni_not_class, u!(r"[\PN]+"), "abⅠ", Some((0, 2))); -mat!(uni_not_class_neg, u!(r"[^\PN]+"), "abⅠ", Some((2, 5))); -mat!(uni_case, u!(r"(?i)Δ"), "δ", Some((0, 2))); -mat!(uni_case_upper, u!(r"\p{Lu}+"), "ΛΘΓΔα", Some((0, 8))); -mat!(uni_case_upper_nocase_flag, u!(r"(?i)\p{Lu}+"), "ΛΘΓΔα", Some((0, 10))); -mat!(uni_case_upper_nocase, u!(r"\p{L}+"), "ΛΘΓΔα", Some((0, 10))); -mat!(uni_case_lower, u!(r"\p{Ll}+"), "ΛΘΓΔα", Some((8, 10))); +mat!(uni_literal, r"☃", "☃", Some((0, 3))); +mat!(uni_literal_plus, r"☃+", "☃", Some((0, 3))); +mat!(uni_literal_casei_plus, r"(?i)☃+", "☃", Some((0, 3))); +mat!(uni_class_plus, r"[☃Ⅰ]+", "☃", Some((0, 3))); +mat!(uni_one, r"\pN", "Ⅰ", Some((0, 3))); +mat!(uni_mixed, r"\pN+", "Ⅰ1Ⅱ2", Some((0, 8))); +mat!(uni_not, 
r"\PN+", "abⅠ", Some((0, 2))); +mat!(uni_not_class, r"[\PN]+", "abⅠ", Some((0, 2))); +mat!(uni_not_class_neg, r"[^\PN]+", "abⅠ", Some((2, 5))); +mat!(uni_case, r"(?i)Δ", "δ", Some((0, 2))); +mat!(uni_case_upper, r"\p{Lu}+", "ΛΘΓΔα", Some((0, 8))); +mat!(uni_case_upper_nocase_flag, r"(?i)\p{Lu}+", "ΛΘΓΔα", Some((0, 10))); +mat!(uni_case_upper_nocase, r"\p{L}+", "ΛΘΓΔα", Some((0, 10))); +mat!(uni_case_lower, r"\p{Ll}+", "ΛΘΓΔα", Some((8, 10))); // Test the Unicode friendliness of Perl character classes. -mat!(uni_perl_w, u!(r"\w+"), "dδd", Some((0, 4))); -mat!(uni_perl_w_not, u!(r"\w+"), "⥡", None); -mat!(uni_perl_w_neg, u!(r"\W+"), "⥡", Some((0, 3))); -mat!(uni_perl_d, u!(r"\d+"), "1२३9", Some((0, 8))); -mat!(uni_perl_d_not, u!(r"\d+"), "Ⅱ", None); -mat!(uni_perl_d_neg, u!(r"\D+"), "Ⅱ", Some((0, 3))); -mat!(uni_perl_s, u!(r"\s+"), " ", Some((0, 3))); -mat!(uni_perl_s_not, u!(r"\s+"), "☃", None); -mat!(uni_perl_s_neg, u!(r"\S+"), "☃", Some((0, 3))); +mat!(uni_perl_w, r"\w+", "dδd", Some((0, 4))); +mat!(uni_perl_w_not, r"\w+", "⥡", None); +mat!(uni_perl_w_neg, r"\W+", "⥡", Some((0, 3))); +mat!(uni_perl_d, r"\d+", "1२३9", Some((0, 8))); +mat!(uni_perl_d_not, r"\d+", "Ⅱ", None); +mat!(uni_perl_d_neg, r"\D+", "Ⅱ", Some((0, 3))); +mat!(uni_perl_s, r"\s+", " ", Some((0, 3))); +mat!(uni_perl_s_not, r"\s+", "☃", None); +mat!(uni_perl_s_neg, r"\S+", "☃", Some((0, 3))); // And do the same for word boundaries. 
-mat!(uni_boundary_none, u!(r"\d\b"), "6δ", None); -mat!(uni_boundary_ogham, u!(r"\d\b"), "6 ", Some((0, 1))); -mat!(uni_not_boundary_none, u!(r"\d\B"), "6δ", Some((0, 1))); -mat!(uni_not_boundary_ogham, u!(r"\d\B"), "6 ", None); +mat!(uni_boundary_none, r"\d\b", "6δ", None); +mat!(uni_boundary_ogham, r"\d\b", "6 ", Some((0, 1))); +mat!(uni_not_boundary_none, r"\d\B", "6δ", Some((0, 1))); +mat!(uni_not_boundary_ogham, r"\d\B", "6 ", None); diff --git a/tests/word_boundary_ascii.rs b/tests/word_boundary_ascii.rs index 9beb7c0cb1..5a3cf1166c 100644 --- a/tests/word_boundary_ascii.rs +++ b/tests/word_boundary_ascii.rs @@ -1,9 +1,9 @@ // ASCII word boundaries are completely oblivious to Unicode characters. // For Unicode word boundaries, the tests are precisely inverted. -matiter!(ascii1, r"\bx\b", "áxβ", (2, 3)); -matiter!(ascii2, r"\Bx\B", "áxβ"); -matiter!(ascii3, r"\B", "0\u{7EF5E}", (2, 2), (3, 3), (4, 4), (5, 5)); +matiter!(ascii1, r"(?-u:\b)x(?-u:\b)", "áxβ", (2, 3)); +matiter!(ascii2, r"(?-u:\B)x(?-u:\B)", "áxβ"); +matiter!(ascii3, r"(?-u:\B)", "0\u{7EF5E}", (2, 2), (3, 3), (4, 4), (5, 5)); -// We can still get Unicode mode in byte regexes. -matiter!(unicode1, r"(?u:\b)x(?u:\b)", "áxβ"); -matiter!(unicode2, r"(?u:\B)x(?u:\B)", "áxβ", (2, 3)); +// We still get Unicode word boundaries by default in byte regexes. +matiter!(unicode1, r"\bx\b", "áxβ"); +matiter!(unicode2, r"\Bx\B", "áxβ", (2, 3)); From 8b18b29eb2105b65663ba6973f4630cd3119bb62 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 2 May 2016 19:12:53 -0400 Subject: [PATCH 02/18] Update Replacer trait for Unicode regexes. This uses the new Replacer trait essentially as defined in the `bytes` sub-module and described in #151. 
Fixes #151 --- src/expand.rs | 142 +++++++++++++++++++++++++++++---- src/re_bytes.rs | 23 +++++- src/re_unicode.rs | 177 +++++++++++++++++++++++------------------- tests/macros_bytes.rs | 3 - tests/macros_str.rs | 3 +- 5 files changed, 248 insertions(+), 100 deletions(-) diff --git a/src/expand.rs b/src/expand.rs index 9bea703881..40c4c87152 100644 --- a/src/expand.rs +++ b/src/expand.rs @@ -2,9 +2,50 @@ use std::str; use memchr::memchr; -use bytes::Captures; +use re_bytes; +use re_unicode; -pub fn expand(caps: &Captures, mut replacement: &[u8], dst: &mut Vec) { +pub fn expand_str( + caps: &re_unicode::Captures, + mut replacement: &str, + dst: &mut String, +) { + while !replacement.is_empty() { + match memchr(b'$', replacement.as_bytes()) { + None => break, + Some(i) => { + dst.push_str(&replacement[..i]); + replacement = &replacement[i..]; + } + } + if replacement.as_bytes().get(1).map_or(false, |&b| b == b'$') { + dst.push_str("$"); + replacement = &replacement[2..]; + continue; + } + debug_assert!(!replacement.is_empty()); + let cap_ref = match find_cap_ref(replacement) { + Some(cap_ref) => cap_ref, + None => { + dst.push_str("$"); + replacement = &replacement[1..]; + continue; + } + }; + replacement = &replacement[cap_ref.end..]; + match cap_ref.cap { + Ref::Number(i) => dst.push_str(caps.at(i).unwrap_or("")), + Ref::Named(name) => dst.push_str(caps.name(name).unwrap_or("")), + } + } + dst.push_str(replacement); +} + +pub fn expand_bytes( + caps: &re_bytes::Captures, + mut replacement: &[u8], + dst: &mut Vec, +) { while !replacement.is_empty() { match memchr(b'$', replacement) { None => break, @@ -27,7 +68,7 @@ pub fn expand(caps: &Captures, mut replacement: &[u8], dst: &mut Vec) { continue; } }; - replacement = cap_ref.rest; + replacement = &replacement[cap_ref.end..]; match cap_ref.cap { Ref::Number(i) => dst.extend(caps.at(i).unwrap_or(b"")), Ref::Named(name) => dst.extend(caps.name(name).unwrap_or(b"")), @@ -36,56 +77,127 @@ pub fn expand(caps: &Captures, 
mut replacement: &[u8], dst: &mut Vec) { dst.extend(replacement); } +/// CaptureRef represents a reference to a capture group inside some text. The +/// reference is either a capture group name or a number. +/// +/// It is also tagged with the position in the text immediately following the +/// capture reference. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] struct CaptureRef<'a> { - rest: &'a [u8], cap: Ref<'a>, + end: usize, } +/// A reference to a capture group in some text. +/// +/// e.g., `$2`, `$foo`, `${foo}`. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] enum Ref<'a> { Named(&'a str), Number(usize), } -fn find_cap_ref(mut replacement: &[u8]) -> Option { - if replacement.len() <= 1 || replacement[0] != b'$' { +impl<'a> From<&'a str> for Ref<'a> { + fn from(x: &'a str) -> Ref<'a> { + Ref::Named(x) + } +} + +impl From for Ref<'static> { + fn from(x: usize) -> Ref<'static> { + Ref::Number(x) + } +} + +/// Parses a possible reference to a capture group name in the given text, +/// starting at the beginning of `replacement`. +/// +/// If no such valid reference could be found, None is returned. +fn find_cap_ref>( + replacement: &T, +) -> Option { + let mut i = 0; + let rep: &[u8] = replacement.as_ref(); + if rep.len() <= 1 || rep[0] != b'$' { return None; } let mut brace = false; - replacement = &replacement[1..]; - if replacement[0] == b'{' { + i += 1; + if rep[i] == b'{' { brace = true; - replacement = &replacement[1..]; + i += 1; } - let mut cap_end = 0; - while replacement.get(cap_end).map_or(false, is_valid_cap_letter) { + let mut cap_end = i; + while rep.get(cap_end).map_or(false, is_valid_cap_letter) { cap_end += 1; } - if cap_end == 0 { + if cap_end == i { return None; } // We just verified that the range 0..cap_end is valid ASCII, so it must // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8 // check with either unsafe or by parsing the number straight from &[u8]. 
- let cap = str::from_utf8(&replacement[..cap_end]) + let cap = str::from_utf8(&rep[i..cap_end]) .ok().expect("valid UTF-8 capture name"); if brace { - if !replacement.get(cap_end).map_or(false, |&b| b == b'}') { + if !rep.get(cap_end).map_or(false, |&b| b == b'}') { return None; } cap_end += 1; } Some(CaptureRef { - rest: &replacement[cap_end..], cap: match cap.parse::() { Ok(i) => Ref::Number(i as usize), Err(_) => Ref::Named(cap), }, + end: cap_end, }) } +/// Returns true if and only if the given byte is allowed in a capture name. fn is_valid_cap_letter(b: &u8) -> bool { match *b { b'0' ... b'9' | b'a' ... b'z' | b'A' ... b'Z' | b'_' => true, _ => false, } } + +#[cfg(test)] +mod tests { + use super::{CaptureRef, find_cap_ref}; + + macro_rules! find { + ($name:ident, $text:expr) => { + #[test] + fn $name() { + assert_eq!(None, find_cap_ref($text)); + } + }; + ($name:ident, $text:expr, $capref:expr) => { + #[test] + fn $name() { + assert_eq!(Some($capref), find_cap_ref($text)); + } + }; + } + + macro_rules! 
c { + ($name_or_number:expr, $pos:expr) => { + CaptureRef { cap: $name_or_number.into(), end: $pos } + }; + } + + find!(find_cap_ref1, "$foo", c!("foo", 4)); + find!(find_cap_ref2, "${foo}", c!("foo", 6)); + find!(find_cap_ref3, "$0", c!(0, 2)); + find!(find_cap_ref4, "$5", c!(5, 2)); + find!(find_cap_ref5, "$10", c!(10, 3)); + find!(find_cap_ref6, "$42a", c!("42a", 4)); + find!(find_cap_ref7, "${42}a", c!(42, 5)); + find!(find_cap_ref8, "${42"); + find!(find_cap_ref9, "${42 "); + find!(find_cap_ref10, " $0 "); + find!(find_cap_ref11, "$"); + find!(find_cap_ref12, " "); + find!(find_cap_ref13, ""); +} diff --git a/src/re_bytes.rs b/src/re_bytes.rs index 97ac5b923a..ed517364a7 100644 --- a/src/re_bytes.rs +++ b/src/re_bytes.rs @@ -19,7 +19,7 @@ use std::sync::Arc; use memchr::memchr; use exec::{Exec, ExecNoSync}; -use expand::expand; +use expand::expand_bytes; use error::Error; use re_builder::bytes::RegexBuilder; use re_trait::{self, RegularExpression, Slot}; @@ -375,6 +375,25 @@ impl Regex { /// If no match is found, then a copy of the byte string is returned /// unchanged. /// + /// # Replacement string syntax + /// + /// All instances of `$name` in the replacement text are replaced with the + /// corresponding capture group `name`. + /// + /// `name` may be an integer corresponding to the index of the + /// capture group (counted by order of opening parenthesis where `0` is the + /// entire match) or it can be a name (consisting of letters, digits or + /// underscores) corresponding to a named capture group. + /// + /// If `name` isn't a valid capture group (whether the name doesn't exist + /// or isn't a valid index), then it is replaced with the empty string. + /// + /// The longest possible name is used. e.g., `$1a` looks up the capture + /// group named `1a` and not the capture group at index `1`. To exert more + /// precise control over the name, use braces, e.g., `${1}a`. + /// + /// To write a literal `$` use `$$`. 
+ /// /// # Examples /// /// Note that this function is polymorphic with respect to the replacement. @@ -768,7 +787,7 @@ impl<'t> Captures<'t> { /// /// To write a literal `$` use `$$`. pub fn expand(&self, replacement: &[u8], dst: &mut Vec) { - expand(self, replacement, dst) + expand_bytes(self, replacement, dst) } /// Returns the number of captured groups. diff --git a/src/re_unicode.rs b/src/re_unicode.rs index ed3c6b5bde..359b6e0736 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -15,10 +15,12 @@ use std::ops::Index; use std::str::FromStr; use std::sync::Arc; +use memchr::memchr; use syntax; use error::Error; use exec::{Exec, ExecNoSyncStr}; +use expand::expand_str; use re_builder::unicode::RegexBuilder; use re_plugin::Plugin; use re_trait::{self, RegularExpression, Slot}; @@ -478,6 +480,25 @@ impl Regex { /// /// If no match is found, then a copy of the string is returned unchanged. /// + /// # Replacement string syntax + /// + /// All instances of `$name` in the replacement text are replaced with the + /// corresponding capture group `name`. + /// + /// `name` may be an integer corresponding to the index of the + /// capture group (counted by order of opening parenthesis where `0` is the + /// entire match) or it can be a name (consisting of letters, digits or + /// underscores) corresponding to a named capture group. + /// + /// If `name` isn't a valid capture group (whether the name doesn't exist + /// or isn't a valid index), then it is replaced with the empty string. + /// + /// The longest possible name is used. e.g., `$1a` looks up the capture + /// group named `1a` and not the capture group at index `1`. To exert more + /// precise control over the name, use braces, e.g., `${1}a`. + /// + /// To write a literal `$` use `$$`. + /// /// # Examples /// /// Note that this function is polymorphic with respect to the replacement. 
@@ -574,7 +595,7 @@ impl Regex { // 2) We don't need to look up all of the capture groups and do // replacements inside the replacement string. We just push it // at each match and be done with it. - if let Some(rep) = rep.no_expand() { + if let Some(rep) = rep.no_expansion() { let mut new = String::with_capacity(text.len()); let mut last_match = 0; for (i, (s, e)) in self.find_iter(text).enumerate() { @@ -600,7 +621,7 @@ impl Regex { // unwrap on 0 is OK because captures only reports matches let (s, e) = cap.pos(0).unwrap(); new.push_str(&text[last_match..s]); - new.push_str(&rep.reg_replace(&cap)); + rep.replace_append(&cap, &mut new); last_match = e; } new.push_str(&text[last_match..]); @@ -714,58 +735,6 @@ impl<'r> Iterator for CaptureNames<'r> { } } -/// NoExpand indicates literal string replacement. -/// -/// It can be used with `replace` and `replace_all` to do a literal -/// string replacement without expanding `$name` to their corresponding -/// capture groups. -/// -/// `'t` is the lifetime of the literal text. -pub struct NoExpand<'t>(pub &'t str); - -/// Replacer describes types that can be used to replace matches in a string. -pub trait Replacer { - /// Returns a possibly owned string that is used to replace the match - /// corresponding to the `caps` capture group. - /// - /// The `'a` lifetime refers to the lifetime of a borrowed string when - /// a new owned string isn't needed (e.g., for `NoExpand`). - fn reg_replace(&mut self, caps: &Captures) -> Cow; - - /// Returns a possibly owned string that never needs expansion. 
- fn no_expand(&mut self) -> Option> { None } -} - -impl<'t> Replacer for NoExpand<'t> { - fn reg_replace(&mut self, _: &Captures) -> Cow { - self.0.into() - } - - fn no_expand(&mut self) -> Option> { - Some(self.0.into()) - } -} - -impl<'t> Replacer for &'t str { - fn reg_replace<'a>(&'a mut self, caps: &Captures) -> Cow<'a, str> { - caps.expand(*self).into() - } - - fn no_expand(&mut self) -> Option> { - // if there is a $ there may be an expansion - match self.find('$') { - Some(_) => None, - None => Some((*self).into()), - } - } -} - -impl Replacer for F where F: FnMut(&Captures) -> String { - fn reg_replace<'a>(&'a mut self, caps: &Captures) -> Cow<'a, str> { - (*self)(caps).into() - } -} - /// Yields all substrings delimited by a regular expression match. /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the @@ -951,39 +920,23 @@ impl<'t> Captures<'t> { } /// Expands all instances of `$name` in `text` to the corresponding capture - /// group `name`. + /// group `name`, and writes them to the `dst` buffer given. /// /// `name` may be an integer corresponding to the index of the /// capture group (counted by order of opening parenthesis where `0` is the /// entire match) or it can be a name (consisting of letters, digits or /// underscores) corresponding to a named capture group. /// - /// If `name` isn't a valid capture group (whether the name doesn't exist or - /// isn't a valid index), then it is replaced with the empty string. + /// If `name` isn't a valid capture group (whether the name doesn't exist + /// or isn't a valid index), then it is replaced with the empty string. + /// + /// The longest possible name is used. e.g., `$1a` looks up the capture + /// group named `1a` and not the capture group at index `1`. To exert more + /// precise control over the name, use braces, e.g., `${1}a`. /// /// To write a literal `$` use `$$`. 
- pub fn expand(&self, text: &str) -> String { - const REPLACE_EXPAND: &'static str = r"(?x) - (?P^|\b|[^$]) # Ignore `$$name`. - \$ - (?P # Match the actual capture name. Can be... - [0-9]+ # A sequence of digits (for indexed captures), or... - | - [_a-zA-Z][_0-9a-zA-Z]* # A name for named captures. - ) - "; - // How evil can you get? - let re = Regex::new(REPLACE_EXPAND).unwrap(); - let text = re.replace_all(text, |refs: &Captures| -> String { - let before = refs.name("before").unwrap_or(""); - let name = refs.name("name").unwrap_or(""); - format!("{}{}", before, match name.parse::() { - Err(_) => self.name(name).unwrap_or("").to_owned(), - Ok(i) => self.at(i).unwrap_or("").to_owned(), - }) - }); - let re = Regex::new(r"\$\$").unwrap(); - re.replace_all(&text, NoExpand("$")) + pub fn expand(&self, replacement: &str, dst: &mut String) { + expand_str(self, replacement, dst) } /// Returns the number of captured groups. @@ -1204,3 +1157,69 @@ impl<'r, 't> Iterator for FindMatches<'r, 't> { } } } + +/// Replacer describes types that can be used to replace matches in a string. +/// +/// In general, users of this crate shouldn't need to implement this trait, +/// since implementations are already provided for `&str` and +/// `FnMut(&Captures) -> String`, which covers most use cases. +pub trait Replacer { + /// Appends text to `dst` to replace the current match. + /// + /// The current match is represented by `caps`, which is guaranteed to + /// have a match at capture group `0`. + /// + /// For example, a no-op replacement would be + /// `dst.push_str(caps.at(0).unwrap())`. + fn replace_append(&mut self, caps: &Captures, dst: &mut String); + + /// Return a fixed unchanging replacement string. + /// + /// When doing replacements, if access to `Captures` is not needed (e.g., + /// the replacement string does not need `$` expansion), then it can + /// be beneficial to avoid finding sub-captures. + /// + /// In general, this is called once for every call to `replacen`. 
+ fn no_expansion<'r>(&'r mut self) -> Option> { + None + } +} + +impl<'a> Replacer for &'a str { + fn replace_append(&mut self, caps: &Captures, dst: &mut String) { + caps.expand(*self, dst); + } + + fn no_expansion<'r>(&'r mut self) -> Option> { + match memchr(b'$', self.as_bytes()) { + Some(_) => None, + None => Some(Cow::Borrowed(*self)), + } + } +} + +impl Replacer for F where F: FnMut(&Captures) -> String { + fn replace_append(&mut self, caps: &Captures, dst: &mut String) { + dst.push_str(&(*self)(caps)); + } +} + +/// NoExpand indicates literal string replacement. +/// +/// It can be used with `replace` and `replace_all` to do a literal string +/// replacement without expanding `$name` to their corresponding capture +/// groups. This can be both convenient (to avoid escaping `$`, for example) +/// and performant (since capture groups don't need to be found). +/// +/// `'r` is the lifetime of the literal text. +pub struct NoExpand<'r>(pub &'r str); + +impl<'a> Replacer for NoExpand<'a> { + fn replace_append(&mut self, _: &Captures, dst: &mut String) { + dst.push_str(self.0); + } + + fn no_expansion<'r>(&'r mut self) -> Option> { + Some(Cow::Borrowed(self.0)) + } +} diff --git a/tests/macros_bytes.rs b/tests/macros_bytes.rs index 89c236ff31..c0875ab074 100644 --- a/tests/macros_bytes.rs +++ b/tests/macros_bytes.rs @@ -25,9 +25,6 @@ macro_rules! show { }} } -// N.B. The expansion API for &str and &[u8] APIs differs slightly for now, -// but they should be unified in 1.0. Then we can move this macro back into -// tests/api.rs where it is used. ---AG macro_rules! expand { ($name:ident, $re:expr, $text:expr, $expand:expr, $expected:expr) => { #[test] diff --git a/tests/macros_str.rs b/tests/macros_str.rs index 7ea29335de..c419ee90dd 100644 --- a/tests/macros_str.rs +++ b/tests/macros_str.rs @@ -5,7 +5,7 @@ macro_rules! t { ($text:expr) => { text!($text) } } macro_rules! bytes { ($text:expr) => { $text.as_bytes() } } macro_rules! 
b { ($text:expr) => { bytes!($text) } } -macro_rules! u { ($re:expr) => { $re } } +// macro_rules! u { ($re:expr) => { $re } } macro_rules! no_expand { ($text:expr) => {{ diff --git a/tests/macros_str.rs b/tests/macros_str.rs index c419ee90dd..5acbe282b6 100644 --- a/tests/macros_str.rs +++ b/tests/macros_str.rs @@ -26,7 +26,8 @@ macro_rules! 
expand { let re = regex!($re); let cap = re.captures(t!($text)).unwrap(); - let got = cap.expand(t!($expand)); + let mut got = String::new(); + cap.expand(t!($expand), &mut got); assert_eq!(show!(t!($expected)), show!(&*got)); } } From be0d1910c9e0937cebf21a09bf241d0bb2e51ee1 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 6 May 2016 21:58:49 -0400 Subject: [PATCH 03/18] Remove the is_empty method on Captures. It is useless because it will always return false (since every regex has at least one capture group corresponding to the full match). Fixes #179 --- src/re_bytes.rs | 9 +++------ src/re_unicode.rs | 9 +++------ 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/src/re_bytes.rs b/src/re_bytes.rs index ed517364a7..fb9d0ded3b 100644 --- a/src/re_bytes.rs +++ b/src/re_bytes.rs @@ -791,16 +791,13 @@ impl<'t> Captures<'t> { } /// Returns the number of captured groups. + /// + /// This is always at least `1`, since every regex has at least one capture + /// group that corresponds to the full match. #[inline] pub fn len(&self) -> usize { self.slots.len() / 2 } - - /// Returns true if and only if there are no captured groups. - #[inline] - pub fn is_empty(&self) -> bool { - self.len() == 0 - } } impl<'t> fmt::Debug for Captures<'t> { diff --git a/src/re_unicode.rs b/src/re_unicode.rs index 359b6e0736..5a3ede915d 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -940,16 +940,13 @@ impl<'t> Captures<'t> { } /// Returns the number of captured groups. + /// + /// This is always at least `1`, since every regex has at least one capture + /// group that corresponds to the full match. #[inline] pub fn len(&self) -> usize { self.slots.len() / 2 } - - /// Returns true if and only if there are no captured groups. 
- #[inline] - pub fn is_empty(&self) -> bool { - self.len() == 0 - } } impl<'t> fmt::Debug for Captures<'t> { From d4680165afa4eedb3df00aae4b50d2f6584dfe96 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 6 May 2016 22:05:32 -0400 Subject: [PATCH 04/18] Drop the PartialEq and Eq impls on Regex. It is misleading to suggest that Regex implements equality, since equality is a well defined operation on regular expressions and this particular implementation doesn't correspond to that definition at all. Moreover, I suspect the actual use cases for such an impl are rather niche. A simple newtype+deref should resolve any such use cases. Fixes #178 --- src/re_unicode.rs | 12 ------------ tests/api_str.rs | 6 ------ tests/misc.rs | 7 ------- 3 files changed, 25 deletions(-) diff --git a/src/re_unicode.rs b/src/re_unicode.rs index 5a3ede915d..adeba1cdd5 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -136,18 +136,6 @@ impl From for Regex { } } -/// Equality comparison is based on the original string. It is possible that -/// different regular expressions have the same matching behavior, but are -/// still compared unequal. For example, `\d+` and `\d\d*` match the same set -/// of strings, but are not considered equal. -impl PartialEq for Regex { - fn eq(&self, other: &Regex) -> bool { - self.as_str() == other.as_str() - } -} - -impl Eq for Regex {} - impl FromStr for Regex { type Err = Error; diff --git a/tests/api_str.rs b/tests/api_str.rs index e5e667863d..c6d392876b 100644 --- a/tests/api_str.rs +++ b/tests/api_str.rs @@ -19,9 +19,3 @@ fn empty_match_unicode_captures_iter() { .collect(); assert_eq!(vec![(0, 0), (3, 3), (4, 4), (7, 7), (8, 8)], ms); } - -#[test] -fn eq() { - use regex::Regex; - assert_eq!(regex!(r"[a-z]+"), Regex::new("[a-z]+").unwrap()); -} diff --git a/tests/misc.rs b/tests/misc.rs index 293cddb322..dfe28c9707 100644 --- a/tests/misc.rs +++ b/tests/misc.rs @@ -8,14 +8,7 @@ // option. 
This file may not be copied, modified, or distributed // except according to those terms. -use regex::Regex; - mat!(prefix_literal_match, r"^abc", r"abc", Some((0, 3))); mat!(prefix_literal_nomatch, r"^abc", r"zabc", None); mat!(one_literal_edge, r"abc", r"xxxxxab", None); matiter!(terminates, r"a$", r"a", (0, 1)); - -#[test] -fn eq() { - assert_eq!(regex!(r"[a-z]+"), Regex::new("[a-z]+").unwrap()); -} From 47bd416e1d380d672f267d13b161961334f04505 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 6 May 2016 22:31:03 -0400 Subject: [PATCH 05/18] Use correct lifetimes for SubCaptures and SubCapturesNamed types. This corrects a gaffe of mine. In particular, both types contain references to a `Captures` *and* the text that was searched, but only names one lifetime. In practice, this means that the shortest lifetime is used, which can be problematic for when one is trying to extract submatch text. This also fixes the lifetime annotation on `iter_pos`, which should be tied to the Captures and not the text. It was always possible to work around this by using indices. Fixes #168 --- src/re_bytes.rs | 6 +++--- src/re_unicode.rs | 26 +++++++++++++------------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/re_bytes.rs b/src/re_bytes.rs index fb9d0ded3b..3a9712325e 100644 --- a/src/re_bytes.rs +++ b/src/re_bytes.rs @@ -749,21 +749,21 @@ impl<'t> Captures<'t> { /// Creates an iterator of all the capture groups in order of appearance /// in the regular expression. - pub fn iter<'a>(&'a self) -> SubCaptures<'a, 't> { + pub fn iter<'c>(&'c self) -> SubCaptures<'c, 't> { SubCaptures { idx: 0, caps: self } } /// Creates an iterator of all the capture group positions in order of /// appearance in the regular expression. Positions are byte indices /// in terms of the original string matched. 
- pub fn iter_pos(&'t self) -> SubCapturesPos<'t> { + pub fn iter_pos<'c>(&'c self) -> SubCapturesPos<'c> { SubCapturesPos { idx: 0, slots: &self.slots } } /// Creates an iterator of all named groups as an tuple with the group /// name and the value. The iterator returns these values in arbitrary /// order. - pub fn iter_named<'a>(&'a self) -> SubCapturesNamed<'a, 't> { + pub fn iter_named<'c>(&'c self) -> SubCapturesNamed<'c, 't> { SubCapturesNamed { caps: self, names: self.named_groups.iter() diff --git a/src/re_unicode.rs b/src/re_unicode.rs index adeba1cdd5..c22b0c54c6 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -886,21 +886,21 @@ impl<'t> Captures<'t> { /// Creates an iterator of all the capture groups in order of appearance /// in the regular expression. - pub fn iter(&'t self) -> SubCaptures<'t> { + pub fn iter<'c>(&'c self) -> SubCaptures<'c, 't> { SubCaptures { idx: 0, caps: self, } } /// Creates an iterator of all the capture group positions in order of /// appearance in the regular expression. Positions are byte indices /// in terms of the original string matched. - pub fn iter_pos(&'t self) -> SubCapturesPos<'t> { + pub fn iter_pos<'c>(&'c self) -> SubCapturesPos<'c> { SubCapturesPos { idx: 0, slots: &self.slots } } /// Creates an iterator of all named groups as an tuple with the group /// name and the value. The iterator returns these values in arbitrary /// order. - pub fn iter_named(&'t self) -> SubCapturesNamed<'t> { + pub fn iter_named<'c>(&'c self) -> SubCapturesNamed<'c, 't> { SubCapturesNamed { caps: self, names: self.named_groups.iter() @@ -1007,15 +1007,15 @@ impl<'t, 'i> Index<&'i str> for Captures<'t> { /// expression. /// /// `'c` is the lifetime of the captures. 
-pub struct SubCaptures<'c> { +pub struct SubCaptures<'c, 't: 'c> { idx: usize, - caps: &'c Captures<'c>, + caps: &'c Captures<'t>, } -impl<'c> Iterator for SubCaptures<'c> { - type Item = Option<&'c str>; +impl<'c, 't> Iterator for SubCaptures<'c, 't> { + type Item = Option<&'t str>; - fn next(&mut self) -> Option> { + fn next(&mut self) -> Option> { if self.idx < self.caps.len() { self.idx += 1; Some(self.caps.at(self.idx - 1)) @@ -1057,15 +1057,15 @@ impl<'c> Iterator for SubCapturesPos<'c> { /// name and the value. /// /// `'c` is the lifetime of the captures. -pub struct SubCapturesNamed<'c> { - caps: &'c Captures<'c>, +pub struct SubCapturesNamed<'c, 't: 'c> { + caps: &'c Captures<'t>, names: NamedGroupsIter<'c>, } -impl<'c> Iterator for SubCapturesNamed<'c> { - type Item = (&'c str, Option<&'c str>); +impl<'c, 't> Iterator for SubCapturesNamed<'c, 't> { + type Item = (&'c str, Option<&'t str>); - fn next(&mut self) -> Option<(&'c str, Option<&'c str>)> { + fn next(&mut self) -> Option<(&'c str, Option<&'t str>)> { self.names.next().map(|(name, pos)| (name, self.caps.at(pos))) } } From 16bb79a2e74fdee9f6cdacf4312987867f4e135c Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 6 May 2016 22:10:24 -0400 Subject: [PATCH 06/18] Remove Regex::with_size_limit. This is replaced by using RegexBuilder. Fixes #166 --- src/re_bytes.rs | 11 +---------- src/re_unicode.rs | 11 +---------- 2 files changed, 2 insertions(+), 20 deletions(-) diff --git a/src/re_bytes.rs b/src/re_bytes.rs index 3a9712325e..d11d96b78c 100644 --- a/src/re_bytes.rs +++ b/src/re_bytes.rs @@ -77,16 +77,7 @@ impl Regex { /// /// If an invalid expression is given, then an error is returned. pub fn new(re: &str) -> Result { - Regex::with_size_limit(10 * (1 << 20), re) - } - - /// Compiles a regular expression with the given size limit. - /// - /// The size limit is applied to the size of the *compiled* data structure. 
- /// If the data structure exceeds the size given, then an error is - /// returned. - pub fn with_size_limit(size: usize, re: &str) -> Result { - RegexBuilder::new(re).size_limit(size).compile() + RegexBuilder::new(re).compile() } /// Returns true if and only if the regex matches the string given. diff --git a/src/re_unicode.rs b/src/re_unicode.rs index c22b0c54c6..036893c755 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -151,16 +151,7 @@ impl Regex { /// /// If an invalid expression is given, then an error is returned. pub fn new(re: &str) -> Result { - Regex::with_size_limit(10 * (1 << 20), re) - } - - /// Compiles a regular expression with the given size limit. - /// - /// The size limit is applied to the size of the *compiled* data structure. - /// If the data structure exceeds the size given, then an error is - /// returned. - pub fn with_size_limit(size: usize, re: &str) -> Result { - RegexBuilder::new(re).size_limit(size).compile() + RegexBuilder::new(re).compile() } /// Returns true if and only if the regex matches the string given. From a0498dbfc2077a525c431dbcb02331723758e2d4 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 6 May 2016 22:13:19 -0400 Subject: [PATCH 07/18] Remove free is_match function. It encourages compiling a regex for every use, which can be convenient in some circumstances but deadly for performance. 
Fixes #165 --- src/lib.rs | 2 +- src/re_unicode.rs | 11 ----------- 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index baaf6cfd8b..54e3bdb655 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -464,7 +464,7 @@ pub use re_unicode::{ Regex, Captures, SubCaptures, SubCapturesPos, SubCapturesNamed, CaptureNames, FindCaptures, FindMatches, Replacer, NoExpand, RegexSplits, RegexSplitsN, - quote, is_match, + quote, }; /** diff --git a/src/re_unicode.rs b/src/re_unicode.rs index 036893c755..943ff733dc 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -33,17 +33,6 @@ pub fn quote(text: &str) -> String { syntax::quote(text) } -/// Tests if the given regular expression matches somewhere in the text given. -/// -/// If there was a problem compiling the regular expression, an error is -/// returned. -/// -/// To find submatches, split or replace text, you'll need to compile an -/// expression first. -pub fn is_match(regex: &str, text: &str) -> Result { - Regex::new(regex).map(|r| r.is_match(text)) -} - /// A compiled regular expression for matching Unicode strings. /// /// It is represented as either a sequence of bytecode instructions (dynamic) From fd11ea0e9411f79a9c045d617950da9f5e54e8a2 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 6 May 2016 22:15:02 -0400 Subject: [PATCH 08/18] Rename RegexSplits to Splits. Similarly, rename RegexSplitsN to SplitsN. This follows the convention of all other iterator types. In general, we shouldn't namespace our type names. 
--- src/lib.rs | 2 +- src/re_unicode.rs | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 54e3bdb655..d2a887b063 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -463,7 +463,7 @@ pub use re_set::unicode::*; pub use re_unicode::{ Regex, Captures, SubCaptures, SubCapturesPos, SubCapturesNamed, CaptureNames, FindCaptures, FindMatches, - Replacer, NoExpand, RegexSplits, RegexSplitsN, + Replacer, NoExpand, Splits, SplitsN, quote, }; diff --git a/src/re_unicode.rs b/src/re_unicode.rs index 943ff733dc..8916f5f4a7 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -406,8 +406,8 @@ impl Regex { /// assert_eq!(fields, vec!["a", "b", "c", "d", "e"]); /// # } /// ``` - pub fn split<'r, 't>(&'r self, text: &'t str) -> RegexSplits<'r, 't> { - RegexSplits { + pub fn split<'r, 't>(&'r self, text: &'t str) -> Splits<'r, 't> { + Splits { finder: self.find_iter(text), last: 0, } @@ -434,8 +434,8 @@ impl Regex { /// # } /// ``` pub fn splitn<'r, 't>(&'r self, text: &'t str, limit: usize) - -> RegexSplitsN<'r, 't> { - RegexSplitsN { + -> SplitsN<'r, 't> { + SplitsN { splits: self.split(text), n: limit, } @@ -707,12 +707,12 @@ impl<'r> Iterator for CaptureNames<'r> { /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the string being split. -pub struct RegexSplits<'r, 't> { +pub struct Splits<'r, 't> { finder: FindMatches<'r, 't>, last: usize, } -impl<'r, 't> Iterator for RegexSplits<'r, 't> { +impl<'r, 't> Iterator for Splits<'r, 't> { type Item = &'t str; fn next(&mut self) -> Option<&'t str> { @@ -742,12 +742,12 @@ impl<'r, 't> Iterator for RegexSplits<'r, 't> { /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the string being split. 
-pub struct RegexSplitsN<'r, 't> { - splits: RegexSplits<'r, 't>, +pub struct SplitsN<'r, 't> { + splits: Splits<'r, 't>, n: usize, } -impl<'r, 't> Iterator for RegexSplitsN<'r, 't> { +impl<'r, 't> Iterator for SplitsN<'r, 't> { type Item = &'t str; fn next(&mut self) -> Option<&'t str> { From 6b5f3db86f165d3cb4a33dc181abb266f5d3fc0a Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 6 May 2016 23:43:04 -0400 Subject: [PATCH 09/18] Reorganize capture slot handling, but don't make any public API changes. --- regex-capi/src/rure.rs | 10 +-- src/exec.rs | 9 ++- src/lib.rs | 6 +- src/re_bytes.rs | 151 ++++++++++++++++------------------- src/re_plugin.rs | 11 ++- src/re_trait.rs | 88 ++++++++++++++++++-- src/re_unicode.rs | 177 +++++++++++++++++++---------------------- 7 files changed, 253 insertions(+), 199 deletions(-) diff --git a/regex-capi/src/rure.rs b/regex-capi/src/rure.rs index 4e2b65924e..874811ea13 100644 --- a/regex-capi/src/rure.rs +++ b/regex-capi/src/rure.rs @@ -36,7 +36,7 @@ pub struct rure_match { pub end: size_t, } -pub struct Captures(Vec>); +pub struct Captures(bytes::Locations); pub struct Iter { re: *const Regex, @@ -323,7 +323,7 @@ ffi_fn! { ffi_fn! { fn rure_captures_new(re: *const Regex) -> *mut Captures { let re = unsafe { &*re }; - let captures = Captures(vec![None; 2 * re.captures_len()]); + let captures = Captures(re.locations()); Box::into_raw(Box::new(captures)) } } @@ -340,9 +340,9 @@ ffi_fn! 
{ i: size_t, match_info: *mut rure_match, ) -> bool { - let captures = unsafe { &(*captures).0 }; - match (captures[i * 2], captures[i * 2 + 1]) { - (Some(start), Some(end)) => { + let locs = unsafe { &(*captures).0 }; + match locs.pos(i) { + Some((start, end)) => { if !match_info.is_null() { unsafe { (*match_info).start = start; diff --git a/src/exec.rs b/src/exec.rs index 5d0541a13c..18df740140 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -27,7 +27,7 @@ use prog::Program; use re_builder::RegexOptions; use re_bytes; use re_set; -use re_trait::{RegularExpression, Slot}; +use re_trait::{RegularExpression, Slot, Locations, as_slots}; use re_unicode; use utf8::next_utf8; @@ -332,11 +332,11 @@ impl<'c> RegularExpression for ExecNoSyncStr<'c> { #[inline(always)] // reduces constant overhead fn read_captures_at( &self, - slots: &mut [Slot], + locs: &mut Locations, text: &str, start: usize, ) -> Option<(usize, usize)> { - self.0.read_captures_at(slots, text.as_bytes(), start) + self.0.read_captures_at(locs, text.as_bytes(), start) } } @@ -501,10 +501,11 @@ impl<'c> RegularExpression for ExecNoSync<'c> { /// locations of the overall match. fn read_captures_at( &self, - slots: &mut [Slot], + locs: &mut Locations, text: &[u8], start: usize, ) -> Option<(usize, usize)> { + let slots = as_slots(locs); for slot in slots.iter_mut() { *slot = None; } diff --git a/src/lib.rs b/src/lib.rs index d2a887b063..906d70f6b8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -460,8 +460,9 @@ extern crate utf8_ranges; pub use error::Error; pub use re_builder::unicode::*; pub use re_set::unicode::*; +pub use re_trait::{Locations, SubCapturesPos}; pub use re_unicode::{ - Regex, Captures, SubCaptures, SubCapturesPos, SubCapturesNamed, + Regex, Captures, SubCaptures, SubCapturesNamed, CaptureNames, FindCaptures, FindMatches, Replacer, NoExpand, Splits, SplitsN, quote, @@ -554,8 +555,9 @@ performance on `&str`. 
*/ pub mod bytes { pub use re_builder::bytes::*; - pub use re_set::bytes::*; pub use re_bytes::*; + pub use re_set::bytes::*; + pub use re_trait::{Locations, SubCapturesPos}; } mod backtrack; diff --git a/src/re_bytes.rs b/src/re_bytes.rs index d11d96b78c..55d4469113 100644 --- a/src/re_bytes.rs +++ b/src/re_bytes.rs @@ -22,7 +22,7 @@ use exec::{Exec, ExecNoSync}; use expand::expand_bytes; use error::Error; use re_builder::bytes::RegexBuilder; -use re_trait::{self, RegularExpression, Slot}; +use re_trait::{self, RegularExpression, Locations, SubCapturesPos}; /// A compiled regular expression for matching arbitrary bytes. /// @@ -71,6 +71,7 @@ impl FromStr for Regex { } } +/// Core regular expression methods. impl Regex { /// Compiles a regular expression. Once compiled, it can be used repeatedly /// to search, split or replace text in a string. @@ -102,17 +103,6 @@ impl Regex { self.is_match_at(text, 0) } - /// Returns the same as is_match, but starts the search at the given - /// offset. - /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. - #[doc(hidden)] - pub fn is_match_at(&self, text: &[u8], start: usize) -> bool { - self.shortest_match_at(text, start).is_some() - } - /// Returns the start and end byte range of the leftmost-first match in /// `text`. If no match exists, then `None` is returned. /// @@ -137,21 +127,6 @@ impl Regex { self.find_at(text, 0) } - /// Returns the same as find, but starts the search at the given - /// offset. - /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. 
- #[doc(hidden)] - pub fn find_at( - &self, - text: &[u8], - start: usize, - ) -> Option<(usize, usize)> { - self.0.searcher().find_at(text, start) - } - /// Returns an iterator for each successive non-overlapping match in /// `text`, returning the start and end byte indices with respect to /// `text`. @@ -243,30 +218,14 @@ impl Regex { /// The `0`th capture group is always unnamed, so it must always be /// accessed with `at(0)` or `[0]`. pub fn captures<'t>(&self, text: &'t [u8]) -> Option> { - let mut slots = vec![None; 2 * self.captures_len()]; - self.read_captures_at(&mut slots, text, 0).map(|_| Captures { + let mut locs = self.locations(); + self.read_captures_at(&mut locs, text, 0).map(|_| Captures { text: text, - slots: slots, + locs: locs, named_groups: self.0.capture_name_idx().clone(), }) } - /// Returns the same as captures, but starts the search at the given - /// offset and populates the capture locations given. - /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. - #[doc(hidden)] - pub fn read_captures_at( - &self, - slots: &mut [Slot], - text: &[u8], - start: usize, - ) -> Option<(usize, usize)> { - self.0.searcher().read_captures_at(slots, text, start) - } - /// Returns an iterator over all the non-overlapping capture groups matched /// in `text`. This is operationally the same as `find_iter`, except it /// yields information about submatches. @@ -513,7 +472,10 @@ impl Regex { extend_from_slice(&mut new, &text[last_match..]); new } +} +/// Advanced or "lower level" search methods. +impl Regex { /// Returns the end location of a match in the text given. /// /// This method may have the same performance characteristics as @@ -554,6 +516,51 @@ impl Regex { self.0.searcher().shortest_match_at(text, start) } + /// Returns the same as is_match, but starts the search at the given + /// offset. 
+ /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + #[doc(hidden)] + pub fn is_match_at(&self, text: &[u8], start: usize) -> bool { + self.shortest_match_at(text, start).is_some() + } + + /// Returns the same as find, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + #[doc(hidden)] + pub fn find_at( + &self, + text: &[u8], + start: usize, + ) -> Option<(usize, usize)> { + self.0.searcher().find_at(text, start) + } + + /// Returns the same as captures, but starts the search at the given + /// offset and populates the capture locations given. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + #[doc(hidden)] + pub fn read_captures_at( + &self, + locs: &mut Locations, + text: &[u8], + start: usize, + ) -> Option<(usize, usize)> { + self.0.searcher().read_captures_at(locs, text, start) + } +} + +/// Auxiliary methods. +impl Regex { /// Returns the original string of this regex. pub fn as_str(&self) -> &str { &self.0.regex_strings()[0] @@ -568,6 +575,13 @@ impl Regex { pub fn captures_len(&self) -> usize { self.0.capture_names().len() } + + /// Returns an empty set of locations that can be reused in multiple calls + /// to `read_captures`. + #[doc(hidden)] + pub fn locations(&self) -> Locations { + self.0.searcher().locations() + } } /// An iterator over all non-overlapping matches for a particular string. 
@@ -601,9 +615,9 @@ impl<'r, 't> Iterator for FindCaptures<'r, 't> { type Item = Captures<'t>; fn next(&mut self) -> Option> { - self.0.next().map(|slots| Captures { + self.0.next().map(|locs| Captures { text: self.0.text(), - slots: slots, + locs: locs, named_groups: self.0.regex().capture_name_idx().clone(), }) } @@ -704,7 +718,7 @@ impl<'r> Iterator for CaptureNames<'r> { /// `'t` is the lifetime of the matched text. pub struct Captures<'t> { text: &'t [u8], - slots: Vec>, + locs: Locations, named_groups: Arc>, } @@ -714,11 +728,7 @@ impl<'t> Captures<'t> { /// not match anything. The positions returned are *always* byte indices /// with respect to the original byte string matched. pub fn pos(&self, i: usize) -> Option<(usize, usize)> { - let (s, e) = (i * 2, i * 2 + 1); - match (self.slots.get(s), self.slots.get(e)) { - (Some(&Some(s)), Some(&Some(e))) => Some((s, e)), - _ => None, - } + self.locs.pos(i) } /// Returns the matched string for the capture group `i`. If `i` isn't @@ -747,8 +757,8 @@ impl<'t> Captures<'t> { /// Creates an iterator of all the capture group positions in order of /// appearance in the regular expression. Positions are byte indices /// in terms of the original string matched. - pub fn iter_pos<'c>(&'c self) -> SubCapturesPos<'c> { - SubCapturesPos { idx: 0, slots: &self.slots } + pub fn iter_pos(&self) -> SubCapturesPos { + self.locs.iter() } /// Creates an iterator of all named groups as an tuple with the group @@ -787,7 +797,7 @@ impl<'t> Captures<'t> { /// group that corresponds to the full match. #[inline] pub fn len(&self) -> usize { - self.slots.len() / 2 + self.locs.len() } } @@ -895,33 +905,6 @@ impl<'c, 't> Iterator for SubCaptures<'c, 't> { } } -/// An iterator over capture group positions for a particular match of a -/// regular expression. -/// -/// Positions are byte indices in terms of the original byte string matched. -/// -/// `'c` is the lifetime of the captures. 
-pub struct SubCapturesPos<'c> { - idx: usize, - slots: &'c [Option] -} - -impl<'c> Iterator for SubCapturesPos<'c> { - type Item = Option<(usize, usize)>; - - fn next(&mut self) -> Option> { - if self.idx >= self.slots.len() { - return None - } - let r = match (self.slots[self.idx], self.slots[self.idx + 1]) { - (Some(s), Some(e)) => Some((s, e)), - _ => None, - }; - self.idx += 2; - Some(r) - } -} - /// An Iterator over named capture groups as a tuple with the group name and /// the value. /// diff --git a/src/re_plugin.rs b/src/re_plugin.rs index d453ef7e7e..afd828921b 100644 --- a/src/re_plugin.rs +++ b/src/re_plugin.rs @@ -8,7 +8,7 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -use re_trait::{RegularExpression, Slot}; +use re_trait::{RegularExpression, Slot, Locations, as_slots}; /// Plugin is the compiler plugin's data structure. It declare some static /// data (like capture groups and the original regex string), but defines its @@ -67,15 +67,20 @@ impl RegularExpression for Plugin { fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> { let mut slots = [None, None]; - self.read_captures_at(&mut slots, text, start) + (self.prog)(&mut slots, text, start); + match (slots[0], slots[1]) { + (Some(s), Some(e)) => Some((s, e)), + _ => None, + } } fn read_captures_at<'t>( &self, - slots: &mut [Slot], + locs: &mut Locations, text: &'t str, start: usize, ) -> Option<(usize, usize)> { + let slots = as_slots(locs); for slot in slots.iter_mut() { *slot = None; } diff --git a/src/re_trait.rs b/src/re_trait.rs index 1841efb6a8..92ceef1542 100644 --- a/src/re_trait.rs +++ b/src/re_trait.rs @@ -13,6 +13,77 @@ /// of the capture). pub type Slot = Option; +/// Locations represents the offsets of each capturing group in a regex for +/// a single match. +/// +/// Unlike `Captures`, a `Locations` value only stores offsets. 
+#[doc(hidden)] +pub struct Locations(Vec); + +impl Locations { + /// Returns the start and end positions of the Nth capture group. Returns + /// `None` if `i` is not a valid capture group or if the capture group did + /// not match anything. The positions returned are *always* byte indices + /// with respect to the original string matched. + pub fn pos(&self, i: usize) -> Option<(usize, usize)> { + let (s, e) = (i * 2, i * 2 + 1); + match (self.0.get(s), self.0.get(e)) { + (Some(&Some(s)), Some(&Some(e))) => Some((s, e)), + _ => None, + } + } + + /// Creates an iterator of all the capture group positions in order of + /// appearance in the regular expression. Positions are byte indices + /// in terms of the original string matched. + pub fn iter(&self) -> SubCapturesPos { + SubCapturesPos { idx: 0, locs: &self } + } + + /// Returns the total number of capturing groups. + /// + /// This is always at least `1` since every regex has at least `1` + /// capturing group that corresponds to the entire match. + pub fn len(&self) -> usize { + self.0.len() / 2 + } +} + +/// This is a hack to make Locations -> &mut [Slot] be available internally +/// without exposing it in the public API. +pub fn as_slots(locs: &mut Locations) -> &mut [Slot] { + &mut locs.0 +} + +/// An iterator over capture group positions for a particular match of a +/// regular expression. +/// +/// Positions are byte indices in terms of the original string matched. +/// +/// `'c` is the lifetime of the captures. +pub struct SubCapturesPos<'c> { + idx: usize, + locs: &'c Locations, +} + +impl<'c> Iterator for SubCapturesPos<'c> { + type Item = Option<(usize, usize)>; + + fn next(&mut self) -> Option> { + if self.idx >= self.locs.len() { + return None; + } + let x = match self.locs.pos(self.idx) { + None => Some(None), + Some((s, e)) => { + Some(Some((s, e))) + } + }; + self.idx += 1; + x + } +} + /// RegularExpression describes types that can implement regex searching. 
/// /// This trait is my attempt at reducing code duplication and to standardize @@ -33,6 +104,11 @@ pub trait RegularExpression: Sized { /// always two times the number of capture groups (two slots per group). fn slots_len(&self) -> usize; + /// Allocates fresh space for all capturing groups in this regex. + fn locations(&self) -> Locations { + Locations(vec![None; self.slots_len()]) + } + /// Returns the position of the next character after `i`. /// /// For example, a haystack with type `&[u8]` probably returns `i+1`, @@ -65,7 +141,7 @@ pub trait RegularExpression: Sized { /// fills in any matching capture slot locations. fn read_captures_at( &self, - slots: &mut [Slot], + locs: &mut Locations, text: &Self::Text, start: usize, ) -> Option<(usize, usize)>; @@ -163,15 +239,15 @@ impl<'t, R> FindCaptures<'t, R> where R: RegularExpression, R::Text: 't { impl<'t, R> Iterator for FindCaptures<'t, R> where R: RegularExpression, R::Text: 't + AsRef<[u8]> { - type Item = Vec; + type Item = Locations; - fn next(&mut self) -> Option> { + fn next(&mut self) -> Option { if self.0.last_end > self.0.text.as_ref().len() { return None } - let mut slots = vec![None; self.0.re.slots_len()]; + let mut locs = self.0.re.locations(); let (s, e) = match self.0.re.read_captures_at( - &mut slots, + &mut locs, self.0.text, self.0.last_end, ) { @@ -187,6 +263,6 @@ impl<'t, R> Iterator for FindCaptures<'t, R> self.0.last_end = e; } self.0.last_match = Some(e); - Some(slots) + Some(locs) } } diff --git a/src/re_unicode.rs b/src/re_unicode.rs index 8916f5f4a7..ca954b69b8 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -23,7 +23,7 @@ use exec::{Exec, ExecNoSyncStr}; use expand::expand_str; use re_builder::unicode::RegexBuilder; use re_plugin::Plugin; -use re_trait::{self, RegularExpression, Slot}; +use re_trait::{self, RegularExpression, Locations, SubCapturesPos}; /// Escapes all regular expression meta characters in `text`. 
/// @@ -134,6 +134,7 @@ impl FromStr for Regex { } } +/// Core regular expression methods. impl Regex { /// Compiles a regular expression. Once compiled, it can be used repeatedly /// to search, split or replace text in a string. @@ -165,17 +166,6 @@ impl Regex { self.is_match_at(text, 0) } - /// Returns the same as is_match, but starts the search at the given - /// offset. - /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. - #[doc(hidden)] - pub fn is_match_at(&self, text: &str, start: usize) -> bool { - self.shortest_match_at(text, start).is_some() - } - /// Returns the start and end byte range of the leftmost-first match in /// `text`. If no match exists, then `None` is returned. /// @@ -200,22 +190,6 @@ impl Regex { self.find_at(text, 0) } - /// Returns the same as find, but starts the search at the given - /// offset. - /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. - #[doc(hidden)] - pub fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> { - match self.0 { - _Regex::Dynamic(ref exec) => { - exec.searcher_str().find_at(text, start) - } - _Regex::Plugin(ref plug) => plug.find_at(text, start), - } - } - /// Returns an iterator for each successive non-overlapping match in /// `text`, returning the start and end byte indices with respect to /// `text`. @@ -316,37 +290,14 @@ impl Regex { /// The `0`th capture group is always unnamed, so it must always be /// accessed with `at(0)` or `[0]`. 
pub fn captures<'t>(&self, text: &'t str) -> Option> { - let mut slots = vec![None; 2 * self.captures_len()]; - self.read_captures_at(&mut slots, text, 0).map(|_| Captures { + let mut locs = self.locations(); + self.read_captures_at(&mut locs, text, 0).map(|_| Captures { text: text, - slots: slots, + locs: locs, named_groups: NamedGroups::from_regex(self) }) } - /// Returns the same as captures, but starts the search at the given - /// offset and populates the capture locations given. - /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. - #[doc(hidden)] - pub fn read_captures_at( - &self, - slots: &mut [Slot], - text: &str, - start: usize, - ) -> Option<(usize, usize)> { - match self.0 { - _Regex::Dynamic(ref exec) => { - exec.searcher_str().read_captures_at(slots, text, start) - } - _Regex::Plugin(ref plug) => { - plug.read_captures_at(slots, text, start) - } - } - } - /// Returns an iterator over all the non-overlapping capture groups matched /// in `text`. This is operationally the same as `find_iter`, except it /// yields information about submatches. @@ -595,7 +546,10 @@ impl Regex { new.push_str(&text[last_match..]); new } +} +/// Advanced or "lower level" search methods. +impl Regex { /// Returns the end location of a match in the text given. /// /// This method may have the same performance characteristics as @@ -641,6 +595,59 @@ impl Regex { } } + /// Returns the same as is_match, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + #[doc(hidden)] + pub fn is_match_at(&self, text: &str, start: usize) -> bool { + self.shortest_match_at(text, start).is_some() + } + + /// Returns the same as find, but starts the search at the given + /// offset. 
+ /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + #[doc(hidden)] + pub fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> { + match self.0 { + _Regex::Dynamic(ref exec) => { + exec.searcher_str().find_at(text, start) + } + _Regex::Plugin(ref plug) => plug.find_at(text, start), + } + } + + /// Returns the same as captures, but starts the search at the given + /// offset and populates the capture locations given. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + #[doc(hidden)] + pub fn read_captures_at( + &self, + locs: &mut Locations, + text: &str, + start: usize, + ) -> Option<(usize, usize)> { + match self.0 { + _Regex::Dynamic(ref exec) => { + exec.searcher_str().read_captures_at(locs, text, start) + } + _Regex::Plugin(ref plug) => { + plug.read_captures_at(locs, text, start) + } + } + } +} + +/// Auxiliary methods. +impl Regex { /// Returns the original string of this regex. pub fn as_str(&self) -> &str { match self.0 { @@ -666,6 +673,18 @@ impl Regex { _Regex::Dynamic(ref d) => d.capture_names().len() } } + + /// Returns an empty set of locations that can be reused in multiple calls + /// to `read_captures`. + #[doc(hidden)] + pub fn locations(&self) -> Locations { + match self.0 { + _Regex::Dynamic(ref exec) => { + exec.searcher_str().locations() + } + _Regex::Plugin(ref plug) => plug.locations(), + } + } } /// An iterator over the names of all possible captures. @@ -830,7 +849,7 @@ impl<'n> Iterator for NamedGroupsIter<'n> { /// `'t` is the lifetime of the matched text. pub struct Captures<'t> { text: &'t str, - slots: Vec>, + locs: Locations, named_groups: NamedGroups, } @@ -840,11 +859,7 @@ impl<'t> Captures<'t> { /// not match anything. 
The positions returned are *always* byte indices /// with respect to the original string matched. pub fn pos(&self, i: usize) -> Option<(usize, usize)> { - let (s, e) = (i * 2, i * 2 + 1); - match (self.slots.get(s), self.slots.get(e)) { - (Some(&Some(s)), Some(&Some(e))) => Some((s, e)), - _ => None, - } + self.locs.pos(i) } /// Returns the matched string for the capture group `i`. If `i` isn't @@ -873,8 +888,8 @@ impl<'t> Captures<'t> { /// Creates an iterator of all the capture group positions in order of /// appearance in the regular expression. Positions are byte indices /// in terms of the original string matched. - pub fn iter_pos<'c>(&'c self) -> SubCapturesPos<'c> { - SubCapturesPos { idx: 0, slots: &self.slots } + pub fn iter_pos(&self) -> SubCapturesPos { + self.locs.iter() } /// Creates an iterator of all named groups as an tuple with the group @@ -913,7 +928,7 @@ impl<'t> Captures<'t> { /// group that corresponds to the full match. #[inline] pub fn len(&self) -> usize { - self.slots.len() / 2 + self.locs.len() } } @@ -1005,34 +1020,6 @@ impl<'c, 't> Iterator for SubCaptures<'c, 't> { } } -/// An iterator over capture group positions for a particular match of a -/// regular expression. -/// -/// Positions are byte indices in terms of the original string matched. -/// -/// `'c` is the lifetime of the captures. -pub struct SubCapturesPos<'c> { - idx: usize, - slots: &'c [Option] -} - -impl<'c> Iterator for SubCapturesPos<'c> { - type Item = Option<(usize, usize)>; - - fn next(&mut self) -> Option> { - if self.idx >= self.slots.len() { - return None - } - let r = match (self.slots[self.idx], self.slots[self.idx + 1]) { - (Some(s), Some(e)) => Some((s, e)), - (None, None) => None, - _ => unreachable!() - }; - self.idx += 2; - Some(r) - } -} - /// An Iterator over named capture groups as a tuple with the group /// name and the value. 
/// @@ -1071,16 +1058,16 @@ impl<'r, 't> Iterator for FindCaptures<'r, 't> { match self.0 { FindCapturesInner::Dynamic(ref mut it) => { let named = it.regex().capture_name_idx().clone(); - it.next().map(|slots| Captures { + it.next().map(|locs| Captures { text: it.text(), - slots: slots, + locs: locs, named_groups: NamedGroups::Dynamic(named), }) } FindCapturesInner::Plugin(ref mut it) => { - it.next().map(|slots| Captures { + it.next().map(|locs| Captures { text: it.text(), - slots: slots, + locs: locs, named_groups: NamedGroups::Plugin(it.regex().groups), }) } From 404f9a2b8fe18215b343f4ee8505bca562832ab3 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Tue, 17 May 2016 19:18:46 -0400 Subject: [PATCH 10/18] Rename many of the iterator types. Mostly, this adds an `Iter` suffix to all of the names. --- src/lib.rs | 10 +-- src/pattern.rs | 8 +-- src/re_bytes.rs | 56 +++++++-------- src/re_trait.rs | 28 ++++---- src/re_unicode.rs | 110 +++++++++++++++--------------- tests/macros_str.rs | 5 ++ tests/test_backtrack.rs | 3 +- tests/test_backtrack_utf8bytes.rs | 3 +- tests/test_default.rs | 7 -- tests/test_nfa.rs | 3 +- tests/test_nfa_utf8bytes.rs | 3 +- 11 files changed, 112 insertions(+), 124 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 906d70f6b8..6e39bf8e61 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -460,11 +460,11 @@ extern crate utf8_ranges; pub use error::Error; pub use re_builder::unicode::*; pub use re_set::unicode::*; -pub use re_trait::{Locations, SubCapturesPos}; +pub use re_trait::{Locations, SubCapturesPosIter}; pub use re_unicode::{ - Regex, Captures, SubCaptures, SubCapturesNamed, - CaptureNames, FindCaptures, FindMatches, - Replacer, NoExpand, Splits, SplitsN, + Regex, Captures, SubCapturesIter, SubCapturesNamedIter, + CaptureNamesIter, CapturesIter, FindIter, + Replacer, NoExpand, SplitsIter, SplitsNIter, quote, }; @@ -557,7 +557,7 @@ pub mod bytes { pub use re_builder::bytes::*; pub use re_bytes::*; pub use re_set::bytes::*; - pub use 
re_trait::{Locations, SubCapturesPos}; + pub use re_trait::{Locations, SubCapturesPosIter}; } mod backtrack; diff --git a/src/pattern.rs b/src/pattern.rs index 3de377ad07..a6037d0ea1 100644 --- a/src/pattern.rs +++ b/src/pattern.rs @@ -1,17 +1,14 @@ -#[cfg(feature = "pattern")] use std::str::pattern::{Pattern, Searcher, SearchStep}; -use re_unicode::{Regex, FindMatches}; +use re_unicode::{Regex, FindIter}; -#[cfg(feature = "pattern")] pub struct RegexSearcher<'r, 't> { haystack: &'t str, - it: FindMatches<'r, 't>, + it: FindIter<'r, 't>, last_step_end: usize, next_match: Option<(usize, usize)>, } -#[cfg(feature = "pattern")] impl<'r, 't> Pattern<'t> for &'r Regex { type Searcher = RegexSearcher<'r, 't>; @@ -25,7 +22,6 @@ impl<'r, 't> Pattern<'t> for &'r Regex { } } -#[cfg(feature = "pattern")] unsafe impl<'r, 't> Searcher<'t> for RegexSearcher<'r, 't> { #[inline] fn haystack(&self) -> &'t str { diff --git a/src/re_bytes.rs b/src/re_bytes.rs index 55d4469113..38c7b43c13 100644 --- a/src/re_bytes.rs +++ b/src/re_bytes.rs @@ -22,7 +22,7 @@ use exec::{Exec, ExecNoSync}; use expand::expand_bytes; use error::Error; use re_builder::bytes::RegexBuilder; -use re_trait::{self, RegularExpression, Locations, SubCapturesPos}; +use re_trait::{self, RegularExpression, Locations, SubCapturesPosIter}; /// A compiled regular expression for matching arbitrary bytes. 
/// @@ -150,8 +150,8 @@ impl Regex { /// // (45, 58) /// # } /// ``` - pub fn find_iter<'r, 't>(&'r self, text: &'t [u8]) -> FindMatches<'r, 't> { - FindMatches(self.0.searcher().find_iter(text)) + pub fn find_iter<'r, 't>(&'r self, text: &'t [u8]) -> FindIter<'r, 't> { + FindIter(self.0.searcher().find_iter(text)) } /// Returns the capture groups corresponding to the leftmost-first @@ -255,8 +255,8 @@ impl Regex { pub fn captures_iter<'r, 't>( &'r self, text: &'t [u8], - ) -> FindCaptures<'r, 't> { - FindCaptures(self.0.searcher().captures_iter(text)) + ) -> CapturesIter<'r, 't> { + CapturesIter(self.0.searcher().captures_iter(text)) } /// Returns an iterator of substrings of `text` delimited by a match of the @@ -279,8 +279,8 @@ impl Regex { /// ]); /// # } /// ``` - pub fn split<'r, 't>(&'r self, text: &'t [u8]) -> Splits<'r, 't> { - Splits { + pub fn split<'r, 't>(&'r self, text: &'t [u8]) -> SplitsIter<'r, 't> { + SplitsIter { finder: self.find_iter(text), last: 0, } @@ -310,8 +310,8 @@ impl Regex { &'r self, text: &'t [u8], limit: usize, - ) -> SplitsN<'r, 't> { - SplitsN { + ) -> SplitsNIter<'r, 't> { + SplitsNIter { splits: self.split(text), n: limit, } @@ -592,9 +592,9 @@ impl Regex { /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the matched byte string. -pub struct FindMatches<'r, 't>(re_trait::FindMatches<'t, ExecNoSync<'r>>); +pub struct FindIter<'r, 't>(re_trait::FindIter<'t, ExecNoSync<'r>>); -impl<'r, 't> Iterator for FindMatches<'r, 't> { +impl<'r, 't> Iterator for FindIter<'r, 't> { type Item = (usize, usize); fn next(&mut self) -> Option<(usize, usize)> { @@ -609,9 +609,9 @@ impl<'r, 't> Iterator for FindMatches<'r, 't> { /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the matched byte string. 
-pub struct FindCaptures<'r, 't>(re_trait::FindCaptures<'t, ExecNoSync<'r>>); +pub struct CapturesIter<'r, 't>(re_trait::CapturesIter<'t, ExecNoSync<'r>>); -impl<'r, 't> Iterator for FindCaptures<'r, 't> { +impl<'r, 't> Iterator for CapturesIter<'r, 't> { type Item = Captures<'t>; fn next(&mut self) -> Option> { @@ -627,12 +627,12 @@ impl<'r, 't> Iterator for FindCaptures<'r, 't> { /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the byte string being split. -pub struct Splits<'r, 't> { - finder: FindMatches<'r, 't>, +pub struct SplitsIter<'r, 't> { + finder: FindIter<'r, 't>, last: usize, } -impl<'r, 't> Iterator for Splits<'r, 't> { +impl<'r, 't> Iterator for SplitsIter<'r, 't> { type Item = &'t [u8]; fn next(&mut self) -> Option<&'t [u8]> { @@ -662,12 +662,12 @@ impl<'r, 't> Iterator for Splits<'r, 't> { /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the byte string being split. -pub struct SplitsN<'r, 't> { - splits: Splits<'r, 't>, +pub struct SplitsNIter<'r, 't> { + splits: SplitsIter<'r, 't>, n: usize, } -impl<'r, 't> Iterator for SplitsN<'r, 't> { +impl<'r, 't> Iterator for SplitsNIter<'r, 't> { type Item = &'t [u8]; fn next(&mut self) -> Option<&'t [u8]> { @@ -750,22 +750,22 @@ impl<'t> Captures<'t> { /// Creates an iterator of all the capture groups in order of appearance /// in the regular expression. - pub fn iter<'c>(&'c self) -> SubCaptures<'c, 't> { - SubCaptures { idx: 0, caps: self } + pub fn iter<'c>(&'c self) -> SubCapturesIter<'c, 't> { + SubCapturesIter { idx: 0, caps: self } } /// Creates an iterator of all the capture group positions in order of /// appearance in the regular expression. Positions are byte indices /// in terms of the original string matched. 
- pub fn iter_pos(&self) -> SubCapturesPos { + pub fn iter_pos(&self) -> SubCapturesPosIter { self.locs.iter() } /// Creates an iterator of all named groups as an tuple with the group /// name and the value. The iterator returns these values in arbitrary /// order. - pub fn iter_named<'c>(&'c self) -> SubCapturesNamed<'c, 't> { - SubCapturesNamed { + pub fn iter_named<'c>(&'c self) -> SubCapturesNamedIter<'c, 't> { + SubCapturesNamedIter { caps: self, names: self.named_groups.iter() } @@ -887,12 +887,12 @@ impl<'t, 'i> Index<&'i str> for Captures<'t> { /// /// `'c` is the lifetime of the captures and `'t` is the lifetime of the /// matched text. -pub struct SubCaptures<'c, 't: 'c> { +pub struct SubCapturesIter<'c, 't: 'c> { idx: usize, caps: &'c Captures<'t>, } -impl<'c, 't> Iterator for SubCaptures<'c, 't> { +impl<'c, 't> Iterator for SubCapturesIter<'c, 't> { type Item = Option<&'t [u8]>; fn next(&mut self) -> Option> { @@ -910,12 +910,12 @@ impl<'c, 't> Iterator for SubCaptures<'c, 't> { /// /// `'c` is the lifetime of the captures and `'t` is the lifetime of the /// matched text. -pub struct SubCapturesNamed<'c, 't: 'c> { +pub struct SubCapturesNamedIter<'c, 't: 'c> { caps: &'c Captures<'t>, names: hash_map::Iter<'c, String, usize>, } -impl<'c, 't> Iterator for SubCapturesNamed<'c, 't> { +impl<'c, 't> Iterator for SubCapturesNamedIter<'c, 't> { type Item = (&'c str, Option<&'t [u8]>); fn next(&mut self) -> Option<(&'c str, Option<&'t [u8]>)> { diff --git a/src/re_trait.rs b/src/re_trait.rs index 92ceef1542..81bfbc0d6b 100644 --- a/src/re_trait.rs +++ b/src/re_trait.rs @@ -36,8 +36,8 @@ impl Locations { /// Creates an iterator of all the capture group positions in order of /// appearance in the regular expression. Positions are byte indices /// in terms of the original string matched. 
- pub fn iter(&self) -> SubCapturesPos { - SubCapturesPos { idx: 0, locs: &self } + pub fn iter(&self) -> SubCapturesPosIter { + SubCapturesPosIter { idx: 0, locs: &self } } /// Returns the total number of capturing groups. @@ -61,12 +61,12 @@ pub fn as_slots(locs: &mut Locations) -> &mut [Slot] { /// Positions are byte indices in terms of the original string matched. /// /// `'c` is the lifetime of the captures. -pub struct SubCapturesPos<'c> { +pub struct SubCapturesPosIter<'c> { idx: usize, locs: &'c Locations, } -impl<'c> Iterator for SubCapturesPos<'c> { +impl<'c> Iterator for SubCapturesPosIter<'c> { type Item = Option<(usize, usize)>; fn next(&mut self) -> Option> { @@ -151,8 +151,8 @@ pub trait RegularExpression: Sized { fn find_iter<'t>( self, text: &'t Self::Text, - ) -> FindMatches<'t, Self> { - FindMatches { + ) -> FindIter<'t, Self> { + FindIter { re: self, text: text, last_end: 0, @@ -165,20 +165,20 @@ pub trait RegularExpression: Sized { fn captures_iter<'t>( self, text: &'t Self::Text, - ) -> FindCaptures<'t, Self> { - FindCaptures(self.find_iter(text)) + ) -> CapturesIter<'t, Self> { + CapturesIter(self.find_iter(text)) } } /// An iterator over all non-overlapping successive leftmost-first matches. -pub struct FindMatches<'t, R> where R: RegularExpression, R::Text: 't { +pub struct FindIter<'t, R> where R: RegularExpression, R::Text: 't { re: R, text: &'t R::Text, last_end: usize, last_match: Option, } -impl<'t, R> FindMatches<'t, R> where R: RegularExpression, R::Text: 't { +impl<'t, R> FindIter<'t, R> where R: RegularExpression, R::Text: 't { /// Return the text being searched. 
pub fn text(&self) -> &'t R::Text { self.text @@ -190,7 +190,7 @@ impl<'t, R> FindMatches<'t, R> where R: RegularExpression, R::Text: 't { } } -impl<'t, R> Iterator for FindMatches<'t, R> +impl<'t, R> Iterator for FindIter<'t, R> where R: RegularExpression, R::Text: 't + AsRef<[u8]> { type Item = (usize, usize); @@ -222,10 +222,10 @@ impl<'t, R> Iterator for FindMatches<'t, R> /// An iterator over all non-overlapping successive leftmost-first matches with /// captures. -pub struct FindCaptures<'t, R>(FindMatches<'t, R>) +pub struct CapturesIter<'t, R>(FindIter<'t, R>) where R: RegularExpression, R::Text: 't; -impl<'t, R> FindCaptures<'t, R> where R: RegularExpression, R::Text: 't { +impl<'t, R> CapturesIter<'t, R> where R: RegularExpression, R::Text: 't { /// Return the text being searched. pub fn text(&self) -> &'t R::Text { self.0.text() @@ -237,7 +237,7 @@ impl<'t, R> FindCaptures<'t, R> where R: RegularExpression, R::Text: 't { } } -impl<'t, R> Iterator for FindCaptures<'t, R> +impl<'t, R> Iterator for CapturesIter<'t, R> where R: RegularExpression, R::Text: 't + AsRef<[u8]> { type Item = Locations; diff --git a/src/re_unicode.rs b/src/re_unicode.rs index ca954b69b8..a6a443b4b9 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -23,7 +23,7 @@ use exec::{Exec, ExecNoSyncStr}; use expand::expand_str; use re_builder::unicode::RegexBuilder; use re_plugin::Plugin; -use re_trait::{self, RegularExpression, Locations, SubCapturesPos}; +use re_trait::{self, RegularExpression, Locations, SubCapturesPosIter}; /// Escapes all regular expression meta characters in `text`. 
/// @@ -213,15 +213,15 @@ impl Regex { /// // (45, 58) /// # } /// ``` - pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> { + pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindIter<'r, 't> { match self.0 { _Regex::Dynamic(ref exec) => { let it = exec.searcher_str().find_iter(text); - FindMatches(FindMatchesInner::Dynamic(it)) + FindIter(FindIterInner::Dynamic(it)) } _Regex::Plugin(ref plug) => { let it = plug.find_iter(text); - FindMatches(FindMatchesInner::Plugin(it)) + FindIter(FindIterInner::Plugin(it)) } } } @@ -326,15 +326,15 @@ impl Regex { pub fn captures_iter<'r, 't>( &'r self, text: &'t str, - ) -> FindCaptures<'r, 't> { + ) -> CapturesIter<'r, 't> { match self.0 { _Regex::Dynamic(ref exec) => { let it = exec.searcher_str().captures_iter(text); - FindCaptures(FindCapturesInner::Dynamic(it)) + CapturesIter(CapturesIterInner::Dynamic(it)) } _Regex::Plugin(ref plug) => { let it = plug.captures_iter(text); - FindCaptures(FindCapturesInner::Plugin(it)) + CapturesIter(CapturesIterInner::Plugin(it)) } } } @@ -357,8 +357,8 @@ impl Regex { /// assert_eq!(fields, vec!["a", "b", "c", "d", "e"]); /// # } /// ``` - pub fn split<'r, 't>(&'r self, text: &'t str) -> Splits<'r, 't> { - Splits { + pub fn split<'r, 't>(&'r self, text: &'t str) -> SplitsIter<'r, 't> { + SplitsIter { finder: self.find_iter(text), last: 0, } @@ -385,8 +385,8 @@ impl Regex { /// # } /// ``` pub fn splitn<'r, 't>(&'r self, text: &'t str, limit: usize) - -> SplitsN<'r, 't> { - SplitsN { + -> SplitsNIter<'r, 't> { + SplitsNIter { splits: self.split(text), n: limit, } @@ -657,11 +657,11 @@ impl Regex { } /// Returns an iterator over the capture names. 
- pub fn capture_names(&self) -> CaptureNames { - CaptureNames(match self.0 { - _Regex::Plugin(ref n) => _CaptureNames::Plugin(n.names.iter()), + pub fn capture_names(&self) -> CaptureNamesIter { + CaptureNamesIter(match self.0 { + _Regex::Plugin(ref n) => _CaptureNamesIter::Plugin(n.names.iter()), _Regex::Dynamic(ref d) => { - _CaptureNames::Dynamic(d.capture_names().iter()) + _CaptureNamesIter::Dynamic(d.capture_names().iter()) } }) } @@ -693,22 +693,20 @@ impl Regex { /// whole matched region) is always unnamed. /// /// `'r` is the lifetime of the compiled regular expression. -pub struct CaptureNames<'r>(_CaptureNames<'r>); +pub struct CaptureNamesIter<'r>(_CaptureNamesIter<'r>); -enum _CaptureNames<'r> { - #[doc(hidden)] +enum _CaptureNamesIter<'r> { Plugin(::std::slice::Iter<'r, Option<&'static str>>), - #[doc(hidden)] Dynamic(::std::slice::Iter<'r, Option>) } -impl<'r> Iterator for CaptureNames<'r> { +impl<'r> Iterator for CaptureNamesIter<'r> { type Item = Option<&'r str>; fn next(&mut self) -> Option> { match self.0 { - _CaptureNames::Plugin(ref mut i) => i.next().cloned(), - _CaptureNames::Dynamic(ref mut i) => { + _CaptureNamesIter::Plugin(ref mut i) => i.next().cloned(), + _CaptureNamesIter::Dynamic(ref mut i) => { i.next().as_ref().map(|o| o.as_ref().map(|s| s.as_ref())) } } @@ -716,8 +714,8 @@ impl<'r> Iterator for CaptureNames<'r> { fn size_hint(&self) -> (usize, Option) { match self.0 { - _CaptureNames::Plugin(ref i) => i.size_hint(), - _CaptureNames::Dynamic(ref i) => i.size_hint(), + _CaptureNamesIter::Plugin(ref i) => i.size_hint(), + _CaptureNamesIter::Dynamic(ref i) => i.size_hint(), } } } @@ -726,12 +724,12 @@ impl<'r> Iterator for CaptureNames<'r> { /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the string being split. 
-pub struct Splits<'r, 't> { - finder: FindMatches<'r, 't>, +pub struct SplitsIter<'r, 't> { + finder: FindIter<'r, 't>, last: usize, } -impl<'r, 't> Iterator for Splits<'r, 't> { +impl<'r, 't> Iterator for SplitsIter<'r, 't> { type Item = &'t str; fn next(&mut self) -> Option<&'t str> { @@ -761,12 +759,12 @@ impl<'r, 't> Iterator for Splits<'r, 't> { /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the string being split. -pub struct SplitsN<'r, 't> { - splits: Splits<'r, 't>, +pub struct SplitsNIter<'r, 't> { + splits: SplitsIter<'r, 't>, n: usize, } -impl<'r, 't> Iterator for SplitsN<'r, 't> { +impl<'r, 't> Iterator for SplitsNIter<'r, 't> { type Item = &'t str; fn next(&mut self) -> Option<&'t str> { @@ -881,22 +879,22 @@ impl<'t> Captures<'t> { /// Creates an iterator of all the capture groups in order of appearance /// in the regular expression. - pub fn iter<'c>(&'c self) -> SubCaptures<'c, 't> { - SubCaptures { idx: 0, caps: self, } + pub fn iter<'c>(&'c self) -> SubCapturesIter<'c, 't> { + SubCapturesIter { idx: 0, caps: self, } } /// Creates an iterator of all the capture group positions in order of /// appearance in the regular expression. Positions are byte indices /// in terms of the original string matched. - pub fn iter_pos(&self) -> SubCapturesPos { + pub fn iter_pos(&self) -> SubCapturesPosIter { self.locs.iter() } /// Creates an iterator of all named groups as an tuple with the group /// name and the value. The iterator returns these values in arbitrary /// order. - pub fn iter_named<'c>(&'c self) -> SubCapturesNamed<'c, 't> { - SubCapturesNamed { + pub fn iter_named<'c>(&'c self) -> SubCapturesNamedIter<'c, 't> { + SubCapturesNamedIter { caps: self, names: self.named_groups.iter() } @@ -1002,12 +1000,12 @@ impl<'t, 'i> Index<&'i str> for Captures<'t> { /// expression. /// /// `'c` is the lifetime of the captures. 
-pub struct SubCaptures<'c, 't: 'c> { +pub struct SubCapturesIter<'c, 't: 'c> { idx: usize, caps: &'c Captures<'t>, } -impl<'c, 't> Iterator for SubCaptures<'c, 't> { +impl<'c, 't> Iterator for SubCapturesIter<'c, 't> { type Item = Option<&'t str>; fn next(&mut self) -> Option> { @@ -1024,12 +1022,12 @@ impl<'c, 't> Iterator for SubCaptures<'c, 't> { /// name and the value. /// /// `'c` is the lifetime of the captures. -pub struct SubCapturesNamed<'c, 't: 'c> { +pub struct SubCapturesNamedIter<'c, 't: 'c> { caps: &'c Captures<'t>, names: NamedGroupsIter<'c>, } -impl<'c, 't> Iterator for SubCapturesNamed<'c, 't> { +impl<'c, 't> Iterator for SubCapturesNamedIter<'c, 't> { type Item = (&'c str, Option<&'t str>); fn next(&mut self) -> Option<(&'c str, Option<&'t str>)> { @@ -1044,19 +1042,19 @@ impl<'c, 't> Iterator for SubCapturesNamed<'c, 't> { /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the matched string. -pub struct FindCaptures<'r, 't>(FindCapturesInner<'r, 't>); +pub struct CapturesIter<'r, 't>(CapturesIterInner<'r, 't>); -enum FindCapturesInner<'r, 't> { - Dynamic(re_trait::FindCaptures<'t, ExecNoSyncStr<'r>>), - Plugin(re_trait::FindCaptures<'t, Plugin>), +enum CapturesIterInner<'r, 't> { + Dynamic(re_trait::CapturesIter<'t, ExecNoSyncStr<'r>>), + Plugin(re_trait::CapturesIter<'t, Plugin>), } -impl<'r, 't> Iterator for FindCaptures<'r, 't> { +impl<'r, 't> Iterator for CapturesIter<'r, 't> { type Item = Captures<'t>; fn next(&mut self) -> Option> { match self.0 { - FindCapturesInner::Dynamic(ref mut it) => { + CapturesIterInner::Dynamic(ref mut it) => { let named = it.regex().capture_name_idx().clone(); it.next().map(|locs| Captures { text: it.text(), @@ -1064,7 +1062,7 @@ impl<'r, 't> Iterator for FindCaptures<'r, 't> { named_groups: NamedGroups::Dynamic(named), }) } - FindCapturesInner::Plugin(ref mut it) => { + CapturesIterInner::Plugin(ref mut it) => { it.next().map(|locs| Captures { text: it.text(), locs: 
locs, @@ -1083,29 +1081,29 @@ impl<'r, 't> Iterator for FindCaptures<'r, 't> { /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the matched string. -pub struct FindMatches<'r, 't>(FindMatchesInner<'r, 't>); +pub struct FindIter<'r, 't>(FindIterInner<'r, 't>); -enum FindMatchesInner<'r, 't> { - Dynamic(re_trait::FindMatches<'t, ExecNoSyncStr<'r>>), - Plugin(re_trait::FindMatches<'t, Plugin>), +enum FindIterInner<'r, 't> { + Dynamic(re_trait::FindIter<'t, ExecNoSyncStr<'r>>), + Plugin(re_trait::FindIter<'t, Plugin>), } -impl<'r, 't> FindMatches<'r, 't> { +impl<'r, 't> FindIter<'r, 't> { fn text(&self) -> &'t str { match self.0 { - FindMatchesInner::Dynamic(ref it) => it.text(), - FindMatchesInner::Plugin(ref it) => it.text(), + FindIterInner::Dynamic(ref it) => it.text(), + FindIterInner::Plugin(ref it) => it.text(), } } } -impl<'r, 't> Iterator for FindMatches<'r, 't> { +impl<'r, 't> Iterator for FindIter<'r, 't> { type Item = (usize, usize); fn next(&mut self) -> Option<(usize, usize)> { match self.0 { - FindMatchesInner::Dynamic(ref mut it) => it.next(), - FindMatchesInner::Plugin(ref mut it) => it.next(), + FindIterInner::Dynamic(ref mut it) => it.next(), + FindIterInner::Plugin(ref mut it) => it.next(), } } } diff --git a/tests/macros_str.rs b/tests/macros_str.rs index 5acbe282b6..c654c2fc03 100644 --- a/tests/macros_str.rs +++ b/tests/macros_str.rs @@ -32,3 +32,8 @@ macro_rules! expand { } } } + +#[cfg(feature = "pattern")] +macro_rules! searcher_expr { ($e:expr) => ($e) } +#[cfg(not(feature = "pattern"))] +macro_rules! searcher_expr { ($e:expr) => ({}) } diff --git a/tests/test_backtrack.rs b/tests/test_backtrack.rs index b6726167ff..7861a3db88 100644 --- a/tests/test_backtrack.rs +++ b/tests/test_backtrack.rs @@ -8,8 +8,6 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. 
-#![cfg_attr(feature = "pattern", feature(pattern))] - extern crate rand; extern crate regex; @@ -56,6 +54,7 @@ mod multiline; mod noparse; mod regression; mod replace; +mod searcher; mod set; mod suffix_reverse; mod unicode; diff --git a/tests/test_backtrack_utf8bytes.rs b/tests/test_backtrack_utf8bytes.rs index dd0ebbd7f5..2bf9456292 100644 --- a/tests/test_backtrack_utf8bytes.rs +++ b/tests/test_backtrack_utf8bytes.rs @@ -8,8 +8,6 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -#![cfg_attr(feature = "pattern", feature(pattern))] - extern crate rand; extern crate regex; @@ -57,6 +55,7 @@ mod multiline; mod noparse; mod regression; mod replace; +mod searcher; mod set; mod suffix_reverse; mod unicode; diff --git a/tests/test_default.rs b/tests/test_default.rs index e873cb0640..452872d35d 100644 --- a/tests/test_default.rs +++ b/tests/test_default.rs @@ -8,8 +8,6 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -#![cfg_attr(feature = "pattern", feature(pattern))] - extern crate rand; extern crate regex; @@ -46,11 +44,6 @@ macro_rules! regex_set { } } -#[cfg(feature = "pattern")] -macro_rules! searcher_expr { ($e:expr) => ($e) } -#[cfg(not(feature = "pattern"))] -macro_rules! searcher_expr { ($e:expr) => ({}) } - // Must come before other module definitions. include!("macros_str.rs"); include!("macros.rs"); diff --git a/tests/test_nfa.rs b/tests/test_nfa.rs index 12cb1606f3..abf24561fd 100644 --- a/tests/test_nfa.rs +++ b/tests/test_nfa.rs @@ -8,8 +8,6 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. 
-#![cfg_attr(feature = "pattern", feature(pattern))] - extern crate rand; extern crate regex; @@ -52,6 +50,7 @@ mod multiline; mod noparse; mod regression; mod replace; +mod searcher; mod set; mod suffix_reverse; mod unicode; diff --git a/tests/test_nfa_utf8bytes.rs b/tests/test_nfa_utf8bytes.rs index e6dd1907e6..5926de7bd6 100644 --- a/tests/test_nfa_utf8bytes.rs +++ b/tests/test_nfa_utf8bytes.rs @@ -8,8 +8,6 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -#![cfg_attr(feature = "pattern", feature(pattern))] - extern crate rand; extern crate regex; @@ -53,6 +51,7 @@ mod multiline; mod noparse; mod regression; mod replace; +mod searcher; mod set; mod suffix_reverse; mod unicode; From ef39985f51795c32e5a84812bc12663ad97698f6 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Tue, 17 May 2016 19:19:03 -0400 Subject: [PATCH 11/18] Remove the inline test block. These are tested automatically. --- Cargo.toml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 5054f4c8c3..bd947addf8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,12 +44,6 @@ simd-accel = ["simd"] # There are no benchmarks in the library code itself bench = false -# Runs unit tests defined inside the regex package. -# Generally these tests specific pieces of the regex implementation. -[[test]] -path = "src/lib.rs" -name = "regex-inline" - # Run the test suite on the default behavior of Regex::new. # This includes a mish mash of NFAs and DFAs, which are chosen automatically # based on the regex. We test both of the NFA implementations by forcing their From 17c0f3c232b306e76bc69b2215db628ef52ca2ee Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Tue, 17 May 2016 19:57:52 -0400 Subject: [PATCH 12/18] Use `Cow` for replacements. If `replace` doesn't find any matches, then it can return the original string unchanged. 
--- examples/shootout-regex-dna-bytes.rs | 4 +- examples/shootout-regex-dna-cheat.rs | 2 +- examples/shootout-regex-dna-replace.rs | 2 +- examples/shootout-regex-dna-single-cheat.rs | 2 +- examples/shootout-regex-dna-single.rs | 4 +- examples/shootout-regex-dna.rs | 4 +- src/re_bytes.rs | 42 ++++++++++++++------- src/re_unicode.rs | 35 ++++++++++++----- 8 files changed, 63 insertions(+), 32 deletions(-) diff --git a/examples/shootout-regex-dna-bytes.rs b/examples/shootout-regex-dna-bytes.rs index 3b120260c0..ec57157c8e 100644 --- a/examples/shootout-regex-dna-bytes.rs +++ b/examples/shootout-regex-dna-bytes.rs @@ -18,7 +18,7 @@ fn main() { io::stdin().read_to_end(&mut seq).unwrap(); let ilen = seq.len(); - seq = regex!(">[^\n]*\n|\n").replace_all(&seq, &b""[..]); + seq = regex!(">[^\n]*\n|\n").replace_all(&seq, &b""[..]).into_owned(); let clen = seq.len(); let seq_arc = Arc::new(seq.clone()); @@ -56,7 +56,7 @@ fn main() { ]; let mut seq = seq; for (re, replacement) in substs.into_iter() { - seq = re.replace_all(&seq, replacement); + seq = re.replace_all(&seq, replacement).into_owned(); } for (variant, count) in counts { diff --git a/examples/shootout-regex-dna-cheat.rs b/examples/shootout-regex-dna-cheat.rs index 57583218ba..8d88acc2d8 100644 --- a/examples/shootout-regex-dna-cheat.rs +++ b/examples/shootout-regex-dna-cheat.rs @@ -23,7 +23,7 @@ fn main() { io::stdin().read_to_string(&mut seq).unwrap(); let ilen = seq.len(); - seq = regex!(">[^\n]*\n|\n").replace_all(&seq, ""); + seq = regex!(">[^\n]*\n|\n").replace_all(&seq, "").into_owned(); let clen = seq.len(); let seq_arc = Arc::new(seq.clone()); diff --git a/examples/shootout-regex-dna-replace.rs b/examples/shootout-regex-dna-replace.rs index a3319ad29d..857d8bfcd7 100644 --- a/examples/shootout-regex-dna-replace.rs +++ b/examples/shootout-regex-dna-replace.rs @@ -14,6 +14,6 @@ fn main() { io::stdin().read_to_string(&mut seq).unwrap(); let ilen = seq.len(); - seq = regex!(">[^\n]*\n|\n").replace_all(&seq, ""); 
+ seq = regex!(">[^\n]*\n|\n").replace_all(&seq, "").into_owned(); println!("original: {}, replaced: {}", ilen, seq.len()); } diff --git a/examples/shootout-regex-dna-single-cheat.rs b/examples/shootout-regex-dna-single-cheat.rs index fbf464202f..86c1c3f661 100644 --- a/examples/shootout-regex-dna-single-cheat.rs +++ b/examples/shootout-regex-dna-single-cheat.rs @@ -16,7 +16,7 @@ fn main() { io::stdin().read_to_string(&mut seq).unwrap(); let ilen = seq.len(); - seq = regex!(">[^\n]*\n|\n").replace_all(&seq, ""); + seq = regex!(">[^\n]*\n|\n").replace_all(&seq, "").into_owned(); let clen = seq.len(); let variants = vec![ diff --git a/examples/shootout-regex-dna-single.rs b/examples/shootout-regex-dna-single.rs index 58eada712f..a84bc63c12 100644 --- a/examples/shootout-regex-dna-single.rs +++ b/examples/shootout-regex-dna-single.rs @@ -16,7 +16,7 @@ fn main() { io::stdin().read_to_string(&mut seq).unwrap(); let ilen = seq.len(); - seq = regex!(">[^\n]*\n|\n").replace_all(&seq, ""); + seq = regex!(">[^\n]*\n|\n").replace_all(&seq, "").into_owned(); let clen = seq.len(); let variants = vec![ @@ -49,7 +49,7 @@ fn main() { ]; let mut seq = seq; for (re, replacement) in substs.into_iter() { - seq = re.replace_all(&seq, replacement); + seq = re.replace_all(&seq, replacement).into_owned(); } println!("\n{}\n{}\n{}", ilen, clen, seq.len()); } diff --git a/examples/shootout-regex-dna.rs b/examples/shootout-regex-dna.rs index d66b4fdf06..ec0060d7f4 100644 --- a/examples/shootout-regex-dna.rs +++ b/examples/shootout-regex-dna.rs @@ -18,7 +18,7 @@ fn main() { io::stdin().read_to_string(&mut seq).unwrap(); let ilen = seq.len(); - seq = regex!(">[^\n]*\n|\n").replace_all(&seq, ""); + seq = regex!(">[^\n]*\n|\n").replace_all(&seq, "").into_owned(); let clen = seq.len(); let seq_arc = Arc::new(seq.clone()); @@ -56,7 +56,7 @@ fn main() { ]; let mut seq = seq; for (re, replacement) in substs.into_iter() { - seq = re.replace_all(&seq, replacement); + seq = re.replace_all(&seq, 
replacement).into_owned(); } for (variant, count) in counts { diff --git a/src/re_bytes.rs b/src/re_bytes.rs index 38c7b43c13..60d67cbbb5 100644 --- a/src/re_bytes.rs +++ b/src/re_bytes.rs @@ -353,7 +353,7 @@ impl Regex { /// # extern crate regex; use regex::bytes::Regex; /// # fn main() { /// let re = Regex::new("[^01]+").unwrap(); - /// assert_eq!(re.replace(b"1078910", &b""[..]), b"1010"); + /// assert_eq!(re.replace(b"1078910", &b""[..]), &b"1010"[..]); /// # } /// ``` /// @@ -372,7 +372,7 @@ impl Regex { /// replacement.extend(&caps[1]); /// replacement /// }); - /// assert_eq!(result, b"Bruce Springsteen"); + /// assert_eq!(result, &b"Bruce Springsteen"[..]); /// # } /// ``` /// @@ -386,7 +386,7 @@ impl Regex { /// # fn main() { /// let re = Regex::new(r"(?P[^,\s]+),\s+(?P\S+)").unwrap(); /// let result = re.replace(b"Springsteen, Bruce", &b"$first $last"[..]); - /// assert_eq!(result, b"Bruce Springsteen"); + /// assert_eq!(result, &b"Bruce Springsteen"[..]); /// # } /// ``` /// @@ -411,10 +411,14 @@ impl Regex { /// /// let re = Regex::new(r"(?P[^,\s]+),\s+(\S+)").unwrap(); /// let result = re.replace(b"Springsteen, Bruce", NoExpand(b"$2 $last")); - /// assert_eq!(result, b"$2 $last"); + /// assert_eq!(result, &b"$2 $last"[..]); /// # } /// ``` - pub fn replace(&self, text: &[u8], rep: R) -> Vec { + pub fn replace<'t, R: Replacer>( + &self, + text: &'t [u8], + rep: R, + ) -> Cow<'t, [u8]> { self.replacen(text, 1, rep) } @@ -424,7 +428,11 @@ impl Regex { /// /// See the documentation for `replace` for details on how to access /// submatches in the replacement text. - pub fn replace_all(&self, text: &[u8], rep: R) -> Vec { + pub fn replace_all<'t, R: Replacer>( + &self, + text: &'t [u8], + rep: R, + ) -> Cow<'t, [u8]> { self.replacen(text, 0, rep) } @@ -434,16 +442,20 @@ impl Regex { /// /// See the documentation for `replace` for details on how to access /// submatches in the replacement text. 
- pub fn replacen( + pub fn replacen<'t, R: Replacer>( &self, - text: &[u8], + text: &'t [u8], limit: usize, mut rep: R, - ) -> Vec { + ) -> Cow<'t, [u8]> { if let Some(rep) = rep.no_expansion() { + let mut it = self.find_iter(text).enumerate().peekable(); + if it.peek().is_none() { + return Cow::Borrowed(text); + } let mut new = Vec::with_capacity(text.len()); let mut last_match = 0; - for (i, (s, e)) in self.find_iter(text).enumerate() { + for (i, (s, e)) in it { if limit > 0 && i >= limit { break } @@ -452,14 +464,18 @@ impl Regex { last_match = e; } extend_from_slice(&mut new, &text[last_match..]); - return new; + return Cow::Owned(new); } // The slower path, which we use if the replacement needs access to // capture groups. + let mut it = self.captures_iter(text).enumerate().peekable(); + if it.peek().is_none() { + return Cow::Borrowed(text); + } let mut new = Vec::with_capacity(text.len()); let mut last_match = 0; - for (i, cap) in self.captures_iter(text).enumerate() { + for (i, cap) in it { if limit > 0 && i >= limit { break } @@ -470,7 +486,7 @@ impl Regex { last_match = e; } extend_from_slice(&mut new, &text[last_match..]); - new + Cow::Owned(new) } } diff --git a/src/re_unicode.rs b/src/re_unicode.rs index a6a443b4b9..b12dbd20a4 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -478,7 +478,11 @@ impl Regex { /// assert_eq!(result, "$2 $last"); /// # } /// ``` - pub fn replace(&self, text: &str, rep: R) -> String { + pub fn replace<'t, R: Replacer>( + &self, + text: &'t str, + rep: R, + ) -> Cow<'t, str> { self.replacen(text, 1, rep) } @@ -488,7 +492,11 @@ impl Regex { /// /// See the documentation for `replace` for details on how to access /// submatches in the replacement string. 
- pub fn replace_all(&self, text: &str, rep: R) -> String { + pub fn replace_all<'t, R: Replacer>( + &self, + text: &'t str, + rep: R, + ) -> Cow<'t, str> { self.replacen(text, 0, rep) } @@ -498,13 +506,12 @@ impl Regex { /// /// See the documentation for `replace` for details on how to access /// submatches in the replacement string. - pub fn replacen( + pub fn replacen<'t, R: Replacer>( &self, - text: &str, + text: &'t str, limit: usize, mut rep: R, - ) -> String { - + ) -> Cow<'t, str> { // If we know that the replacement doesn't have any capture expansions, // then we can fast path. The fast path can make a tremendous // difference: @@ -515,9 +522,13 @@ impl Regex { // replacements inside the replacement string. We just push it // at each match and be done with it. if let Some(rep) = rep.no_expansion() { + let mut it = self.find_iter(text).enumerate().peekable(); + if it.peek().is_none() { + return Cow::Borrowed(text); + } let mut new = String::with_capacity(text.len()); let mut last_match = 0; - for (i, (s, e)) in self.find_iter(text).enumerate() { + for (i, (s, e)) in it { if limit > 0 && i >= limit { break } @@ -526,14 +537,18 @@ impl Regex { last_match = e; } new.push_str(&text[last_match..]); - return new; + return Cow::Owned(new); } // The slower path, which we use if the replacement needs access to // capture groups. + let mut it = self.captures_iter(text).enumerate().peekable(); + if it.peek().is_none() { + return Cow::Borrowed(text); + } let mut new = String::with_capacity(text.len()); let mut last_match = 0; - for (i, cap) in self.captures_iter(text).enumerate() { + for (i, cap) in it { if limit > 0 && i >= limit { break } @@ -544,7 +559,7 @@ impl Regex { last_match = e; } new.push_str(&text[last_match..]); - new + Cow::Owned(new) } } From 553fbad5e8b19f02b4a35e8b1e25d14d08b8a4cb Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 18 May 2016 08:11:14 -0400 Subject: [PATCH 13/18] Update the Error type. 
This removes the InvalidSet variant, which is no longer used, and no longer exposes the `regex_syntax::Error` type, instead exposing it as a string. --- src/error.rs | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/src/error.rs b/src/error.rs index e014a37aba..c95d67acdd 100644 --- a/src/error.rs +++ b/src/error.rs @@ -16,15 +16,10 @@ use syntax; #[derive(Debug)] pub enum Error { /// A syntax error. - Syntax(syntax::Error), + Syntax(String), /// The compiled program exceeded the set size limit. /// The argument is the size limit imposed. CompiledTooBig(usize), - /// **DEPRECATED:** Will be removed on next major version bump. - /// - /// This error is no longer used. (A `RegexSet` can now contain zero or - /// more regular expressions.) - InvalidSet, /// Hints that destructuring should not be exhaustive. /// /// This enum may grow additional variants, so this makes sure clients @@ -37,20 +32,14 @@ pub enum Error { impl ::std::error::Error for Error { fn description(&self) -> &str { match *self { - Error::Syntax(ref err) => err.description(), + Error::Syntax(ref err) => err, Error::CompiledTooBig(_) => "compiled program too big", - Error::InvalidSet => { - "sets must contain 2 or more regular expressions" - } Error::__Nonexhaustive => unreachable!(), } } fn cause(&self) -> Option<&::std::error::Error> { - match *self { - Error::Syntax(ref err) => Some(err), - _ => None, - } + None } } @@ -62,9 +51,6 @@ impl fmt::Display for Error { write!(f, "Compiled regex exceeds size limit of {} bytes.", limit) } - Error::InvalidSet => { - write!(f, "Sets must contain 2 or more regular expressions.") - } Error::__Nonexhaustive => unreachable!(), } } @@ -72,6 +58,6 @@ impl fmt::Display for Error { impl From for Error { fn from(err: syntax::Error) -> Error { - Error::Syntax(err) + Error::Syntax(err.to_string()) } } From faadb9520d2bea49e248b7c65cc01cab2003d7fa Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 4 Aug 2016 22:50:42 -0400
Subject: [PATCH 14/18] find/find_iter now return a Match instead of (usize, usize). This also removes Captures.{at,pos} and replaces it with Captures.get, which now returns a Match. Similarly, Captures.name returns a Match as well. --- examples/shootout-regex-dna-cheat.rs | 8 +- examples/shootout-regex-dna-single-cheat.rs | 8 +- src/expand.rs | 20 ++- src/lib.rs | 23 +-- src/re_bytes.rs | 150 +++++++++------- src/re_unicode.rs | 179 ++++++++++++-------- tests/api.rs | 17 +- tests/api_str.rs | 3 +- tests/macros.rs | 19 ++- tests/macros_bytes.rs | 1 + tests/macros_str.rs | 1 + tests/regression.rs | 8 +- 12 files changed, 268 insertions(+), 169 deletions(-) diff --git a/examples/shootout-regex-dna-cheat.rs b/examples/shootout-regex-dna-cheat.rs index 8d88acc2d8..a421d20853 100644 --- a/examples/shootout-regex-dna-cheat.rs +++ b/examples/shootout-regex-dna-cheat.rs @@ -78,10 +78,10 @@ fn replace_all(text: &str, substs: Vec<(u8, &str)>) -> String { let re = regex!(&alternates.join("|")); let mut new = String::with_capacity(text.len()); let mut last_match = 0; - for (s, e) in re.find_iter(text) { - new.push_str(&text[last_match..s]); - new.push_str(replacements[text.as_bytes()[s] as usize]); - last_match = e; + for m in re.find_iter(text) { + new.push_str(&text[last_match..m.start()]); + new.push_str(replacements[text.as_bytes()[m.start()] as usize]); + last_match = m.end(); } new.push_str(&text[last_match..]); new diff --git a/examples/shootout-regex-dna-single-cheat.rs b/examples/shootout-regex-dna-single-cheat.rs index 86c1c3f661..64d210499d 100644 --- a/examples/shootout-regex-dna-single-cheat.rs +++ b/examples/shootout-regex-dna-single-cheat.rs @@ -63,10 +63,10 @@ fn replace_all(text: &str, substs: Vec<(u8, &str)>) -> String { let re = regex!(&alternates.join("|")); let mut new = String::with_capacity(text.len()); let mut last_match = 0; - for (s, e) in re.find_iter(text) { - new.push_str(&text[last_match..s]); - new.push_str(replacements[text.as_bytes()[s] as 
usize]); - last_match = e; + for m in re.find_iter(text) { + new.push_str(&text[last_match..m.start()]); + new.push_str(replacements[text.as_bytes()[m.start()] as usize]); + last_match = m.end(); } new.push_str(&text[last_match..]); new diff --git a/src/expand.rs b/src/expand.rs index 40c4c87152..55873f88bb 100644 --- a/src/expand.rs +++ b/src/expand.rs @@ -34,8 +34,14 @@ pub fn expand_str( }; replacement = &replacement[cap_ref.end..]; match cap_ref.cap { - Ref::Number(i) => dst.push_str(caps.at(i).unwrap_or("")), - Ref::Named(name) => dst.push_str(caps.name(name).unwrap_or("")), + Ref::Number(i) => { + dst.push_str( + caps.get(i).map(|m| m.as_str()).unwrap_or("")); + } + Ref::Named(name) => { + dst.push_str( + caps.name(name).map(|m| m.as_str()).unwrap_or("")); + } } } dst.push_str(replacement); @@ -70,8 +76,14 @@ pub fn expand_bytes( }; replacement = &replacement[cap_ref.end..]; match cap_ref.cap { - Ref::Number(i) => dst.extend(caps.at(i).unwrap_or(b"")), - Ref::Named(name) => dst.extend(caps.name(name).unwrap_or(b"")), + Ref::Number(i) => { + dst.extend( + caps.get(i).map(|m| m.as_bytes()).unwrap_or(b"")); + } + Ref::Named(name) => { + dst.extend( + caps.name(name).map(|m| m.as_bytes()).unwrap_or(b"")); + } } } dst.extend(replacement); diff --git a/src/lib.rs b/src/lib.rs index 6e39bf8e61..e2acd6de4a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -107,9 +107,7 @@ //! let re = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap(); //! let text = "2012-03-14, 2013-01-01 and 2014-07-05"; //! for cap in re.captures_iter(text) { -//! println!("Month: {} Day: {} Year: {}", -//! cap.at(2).unwrap_or(""), cap.at(3).unwrap_or(""), -//! cap.at(1).unwrap_or("")); +//! println!("Month: {} Day: {} Year: {}", &cap[2], &cap[3], &cap[1]); //! } //! // Output: //! // Month: 03 Day: 14 Year: 2012 @@ -225,7 +223,8 @@ //! # extern crate regex; use regex::Regex; //! # fn main() { //! let re = Regex::new(r"(?i)Δ+").unwrap(); -//! assert_eq!(re.find("ΔδΔ"), Some((0, 6))); +//! 
let mat = re.find("ΔδΔ").unwrap(); +//! assert_eq!((mat.start(), mat.end()), (0, 6)); //! # } //! ``` //! @@ -237,7 +236,8 @@ //! # extern crate regex; use regex::Regex; //! # fn main() { //! let re = Regex::new(r"[\pN\p{Greek}\p{Cherokee}]+").unwrap(); -//! assert_eq!(re.find("abcΔᎠβⅠᏴγδⅡxyz"), Some((3, 23))); +//! let mat = re.find("abcΔᎠβⅠᏴγδⅡxyz").unwrap(); +//! assert_eq!((mat.start(), mat.end()), (3, 23)); //! # } //! ``` //! @@ -348,7 +348,7 @@ //! # fn main() { //! let re = Regex::new(r"(?i)a+(?-i)b+").unwrap(); //! let cap = re.captures("AaAaAbbBBBb").unwrap(); -//! assert_eq!(cap.at(0), Some("AaAaAbb")); +//! assert_eq!(&cap[0], "AaAaAbb"); //! # } //! ``` //! @@ -363,7 +363,7 @@ //! # fn main() { //! let re = Regex::new(r"(?-u:\b).+(?-u:\b)").unwrap(); //! let cap = re.captures("$$abc$$").unwrap(); -//! assert_eq!(cap.at(0), Some("abc")); +//! assert_eq!(&cap[0], "abc"); //! # } //! ``` //! @@ -462,7 +462,7 @@ pub use re_builder::unicode::*; pub use re_set::unicode::*; pub use re_trait::{Locations, SubCapturesPosIter}; pub use re_unicode::{ - Regex, Captures, SubCapturesIter, SubCapturesNamedIter, + Regex, Match, Captures, SubCapturesIter, SubCapturesNamedIter, CaptureNamesIter, CapturesIter, FindIter, Replacer, NoExpand, SplitsIter, SplitsNIter, quote, @@ -492,7 +492,7 @@ let text = b"foo\x00bar\x00baz\x00"; // The unwrap is OK here since a match requires the `cstr` capture to match. let cstrs: Vec<&[u8]> = re.captures_iter(text) - .map(|c| c.name("cstr").unwrap()) + .map(|c| c.name("cstr").unwrap().as_bytes()) .collect(); assert_eq!(vec![&b"foo"[..], &b"bar"[..], &b"baz"[..]], cstrs); ``` @@ -514,10 +514,11 @@ let caps = re.captures(text).unwrap(); // Notice that despite the `.*` at the end, it will only match valid UTF-8 // because Unicode mode was enabled with the `u` flag. Without the `u` flag, // the `.*` would match the rest of the bytes. 
-assert_eq!((7, 10), caps.pos(1).unwrap()); +let mat = caps.get(1).unwrap(); +assert_eq!((7, 10), (mat.start(), mat.end())); // If there was a match, Unicode mode guarantees that `title` is valid UTF-8. -let title = str::from_utf8(caps.at(1).unwrap()).unwrap(); +let title = str::from_utf8(&caps[1]).unwrap(); assert_eq!("☃", title); ``` diff --git a/src/re_bytes.rs b/src/re_bytes.rs index 60d67cbbb5..e752e4ba29 100644 --- a/src/re_bytes.rs +++ b/src/re_bytes.rs @@ -24,6 +24,46 @@ use error::Error; use re_builder::bytes::RegexBuilder; use re_trait::{self, RegularExpression, Locations, SubCapturesPosIter}; +/// Match represents a single match of a regex in a haystack. +/// +/// The lifetime parameter `'t` refers to the lifetime of the matched text. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub struct Match<'t> { + text: &'t [u8], + start: usize, + end: usize, +} + +impl<'t> Match<'t> { + /// Returns the starting byte offset of the match in the haystack. + #[inline] + pub fn start(&self) -> usize { + self.start + } + + /// Returns the ending byte offset of the match in the haystack. + #[inline] + pub fn end(&self) -> usize { + self.end + } + + /// Returns the matched text. + #[inline] + pub fn as_bytes(&self) -> &'t [u8] { + self.text + } + + /// Creates a new match from the given haystack and byte offsets. + #[inline] + fn new(haystack: &'t [u8], start: usize, end: usize) -> Match<'t> { + Match { + text: &haystack[start..end], + start: start, + end: end, + } + } +} + /// A compiled regular expression for matching arbitrary bytes. /// /// It can be used to search, split or replace text. 
All searching is done with @@ -119,11 +159,11 @@ impl Regex { /// # extern crate regex; use regex::bytes::Regex; /// # fn main() { /// let text = b"I categorically deny having triskaidekaphobia."; - /// let pos = Regex::new(r"\b\w{13}\b").unwrap().find(text); - /// assert_eq!(pos, Some((2, 15))); + /// let mat = Regex::new(r"\b\w{13}\b").unwrap().find(text).unwrap(); + /// assert_eq!((mat.start(), mat.end()), (2, 15)); /// # } /// ``` - pub fn find(&self, text: &[u8]) -> Option<(usize, usize)> { + pub fn find<'t>(&self, text: &'t [u8]) -> Option> { self.find_at(text, 0) } @@ -140,14 +180,9 @@ impl Regex { /// # extern crate regex; use regex::bytes::Regex; /// # fn main() { /// let text = b"Retroactively relinquishing remunerations is reprehensible."; - /// for pos in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) { - /// println!("{:?}", pos); + /// for mat in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) { + /// println!("{:?}", mat); /// } - /// // Output: - /// // (0, 13) - /// // (14, 27) - /// // (28, 41) - /// // (45, 58) /// # } /// ``` pub fn find_iter<'r, 't>(&'r self, text: &'t [u8]) -> FindIter<'r, 't> { @@ -175,9 +210,9 @@ impl Regex { /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap(); /// let text = b"Not my favorite movie: 'Citizen Kane' (1941)."; /// let caps = re.captures(text).unwrap(); - /// assert_eq!(caps.at(1), Some(&b"Citizen Kane"[..])); - /// assert_eq!(caps.at(2), Some(&b"1941"[..])); - /// assert_eq!(caps.at(0), Some(&b"'Citizen Kane' (1941)"[..])); + /// assert_eq!(&caps[1], &b"Citizen Kane"[..]); + /// assert_eq!(&caps[2], &b"1941"[..]); + /// assert_eq!(&caps[0], &b"'Citizen Kane' (1941)"[..]); /// // You can also access the groups by index using the Index notation. /// // Note that this will panic on an invalid index. 
/// assert_eq!(&caps[1], b"Citizen Kane"); @@ -198,9 +233,9 @@ impl Regex { /// .unwrap(); /// let text = b"Not my favorite movie: 'Citizen Kane' (1941)."; /// let caps = re.captures(text).unwrap(); - /// assert_eq!(caps.name("title"), Some(&b"Citizen Kane"[..])); - /// assert_eq!(caps.name("year"), Some(&b"1941"[..])); - /// assert_eq!(caps.at(0), Some(&b"'Citizen Kane' (1941)"[..])); + /// assert_eq!(&caps["title"], &b"Citizen Kane"[..]); + /// assert_eq!(&caps["year"], &b"1941"[..]); + /// assert_eq!(&caps[0], &b"'Citizen Kane' (1941)"[..]); /// // You can also access the groups by name using the Index notation. /// // Note that this will panic on an invalid group name. /// assert_eq!(&caps["title"], b"Citizen Kane"); @@ -455,13 +490,13 @@ impl Regex { } let mut new = Vec::with_capacity(text.len()); let mut last_match = 0; - for (i, (s, e)) in it { + for (i, m) in it { if limit > 0 && i >= limit { break } - extend_from_slice(&mut new, &text[last_match..s]); + extend_from_slice(&mut new, &text[last_match..m.start()]); extend_from_slice(&mut new, &*rep); - last_match = e; + last_match = m.end(); } extend_from_slice(&mut new, &text[last_match..]); return Cow::Owned(new); @@ -480,10 +515,10 @@ impl Regex { break } // unwrap on 0 is OK because captures only reports matches - let (s, e) = cap.pos(0).unwrap(); - extend_from_slice(&mut new, &text[last_match..s]); + let m = cap.get(0).unwrap(); + extend_from_slice(&mut new, &text[last_match..m.start()]); rep.replace_append(&cap, &mut new); - last_match = e; + last_match = m.end(); } extend_from_slice(&mut new, &text[last_match..]); Cow::Owned(new) @@ -550,12 +585,13 @@ impl Regex { /// context into consideration. For example, the `\A` anchor can only /// match when `start == 0`. 
#[doc(hidden)] - pub fn find_at( + pub fn find_at<'t>( &self, - text: &[u8], + text: &'t [u8], start: usize, - ) -> Option<(usize, usize)> { + ) -> Option> { self.0.searcher().find_at(text, start) + .map(|(s, e)| Match::new(text, s, e)) } /// Returns the same as captures, but starts the search at the given @@ -565,13 +601,14 @@ impl Regex { /// context into consideration. For example, the `\A` anchor can only /// match when `start == 0`. #[doc(hidden)] - pub fn read_captures_at( + pub fn read_captures_at<'t>( &self, locs: &mut Locations, - text: &[u8], + text: &'t [u8], start: usize, - ) -> Option<(usize, usize)> { + ) -> Option> { self.0.searcher().read_captures_at(locs, text, start) + .map(|(s, e)| Match::new(text, s, e)) } } @@ -611,10 +648,11 @@ impl Regex { pub struct FindIter<'r, 't>(re_trait::FindIter<'t, ExecNoSync<'r>>); impl<'r, 't> Iterator for FindIter<'r, 't> { - type Item = (usize, usize); + type Item = Match<'t>; - fn next(&mut self) -> Option<(usize, usize)> { - self.0.next() + fn next(&mut self) -> Option> { + let text = self.0.text(); + self.0.next().map(|(s, e)| Match::new(text, s, e)) } } @@ -663,9 +701,9 @@ impl<'r, 't> Iterator for SplitsIter<'r, 't> { Some(s) } } - Some((s, e)) => { - let matched = &text[self.last..s]; - self.last = e; + Some(m) => { + let matched = &text[self.last..m.start()]; + self.last = m.end(); Some(matched) } } @@ -739,29 +777,17 @@ pub struct Captures<'t> { } impl<'t> Captures<'t> { - /// Returns the start and end positions of the Nth capture group. Returns - /// `None` if `i` is not a valid capture group or if the capture group did - /// not match anything. The positions returned are *always* byte indices - /// with respect to the original byte string matched. - pub fn pos(&self, i: usize) -> Option<(usize, usize)> { - self.locs.pos(i) - } - - /// Returns the matched string for the capture group `i`. If `i` isn't - /// a valid capture group or didn't match anything, then `None` is - /// returned. 
- pub fn at(&self, i: usize) -> Option<&'t [u8]> { - match self.pos(i) { - None => None, - Some((s, e)) => Some(&self.text[s..e]) - } + /// Returns the match associated with the capture group at index `i`. If + /// `i` does not correspond to a capture group, or if the capture group + /// did not participate in the match, then `None` is returned. + pub fn get(&self, i: usize) -> Option> { + self.locs.pos(i).map(|(s, e)| Match::new(self.text, s, e)) } - /// Returns the matched string for the capture group named `name`. If - /// `name` isn't a valid capture group or didn't match anything, then - /// `None` is returned. - pub fn name(&self, name: &str) -> Option<&'t [u8]> { - self.named_groups.get(name).and_then(|&i| self.at(i)) + /// Returns the match for the capture group named `name`. If `name` isn't a + /// valid capture group or didn't match anything, then `None` is returned. + pub fn name(&self, name: &str) -> Option> { + self.named_groups.get(name).and_then(|&i| self.get(i)) } /// Creates an iterator of all the capture groups in order of appearance @@ -874,7 +900,8 @@ impl<'t> Index for Captures<'t> { type Output = [u8]; fn index(&self, i: usize) -> &[u8] { - self.at(i).unwrap_or_else(|| panic!("no group at index '{}'", i)) + self.get(i).map(|m| m.as_bytes()) + .unwrap_or_else(|| panic!("no group at index '{}'", i)) } } @@ -894,7 +921,8 @@ impl<'t, 'i> Index<&'i str> for Captures<'t> { type Output = [u8]; fn index<'a>(&'a self, name: &'i str) -> &'a [u8] { - self.name(name).unwrap_or_else(|| panic!("no group named '{}'", name)) + self.name(name).map(|m| m.as_bytes()) + .unwrap_or_else(|| panic!("no group named '{}'", name)) } } @@ -914,7 +942,7 @@ impl<'c, 't> Iterator for SubCapturesIter<'c, 't> { fn next(&mut self) -> Option> { if self.idx < self.caps.len() { self.idx += 1; - Some(self.caps.at(self.idx - 1)) + Some(self.caps.get(self.idx - 1).map(|m| m.as_bytes())) } else { None } @@ -935,7 +963,9 @@ impl<'c, 't> Iterator for SubCapturesNamedIter<'c, 't> { 
type Item = (&'c str, Option<&'t [u8]>); fn next(&mut self) -> Option<(&'c str, Option<&'t [u8]>)> { - self.names.next().map(|(name, &pos)| (&**name, self.caps.at(pos))) + self.names.next().map(|(name, &pos)| { + (&**name, self.caps.get(pos).map(|m| m.as_bytes())) + }) } } @@ -952,7 +982,7 @@ pub trait Replacer { /// have a match at capture group `0`. /// /// For example, a no-op replacement would be - /// `dst.extend(caps.at(0).unwrap())`. + /// `dst.extend(&caps[0])`. fn replace_append(&mut self, caps: &Captures, dst: &mut Vec); /// Return a fixed unchanging replacement byte string. diff --git a/src/re_unicode.rs b/src/re_unicode.rs index b12dbd20a4..a0cd4ce32c 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -33,6 +33,46 @@ pub fn quote(text: &str) -> String { syntax::quote(text) } +/// Match represents a single match of a regex in a haystack. +/// +/// The lifetime parameter `'t` refers to the lifetime of the matched text. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub struct Match<'t> { + text: &'t str, + start: usize, + end: usize, +} + +impl<'t> Match<'t> { + /// Returns the starting byte offset of the match in the haystack. + #[inline] + pub fn start(&self) -> usize { + self.start + } + + /// Returns the ending byte offset of the match in the haystack. + #[inline] + pub fn end(&self) -> usize { + self.end + } + + /// Returns the matched text. + #[inline] + pub fn as_str(&self) -> &'t str { + self.text + } + + /// Creates a new match from the given haystack and byte offsets. + #[inline] + fn new(haystack: &'t str, start: usize, end: usize) -> Match<'t> { + Match { + text: &haystack[start..end], + start: start, + end: end, + } + } +} + /// A compiled regular expression for matching Unicode strings. 
/// /// It is represented as either a sequence of bytecode instructions (dynamic) @@ -61,13 +101,14 @@ pub fn quote(text: &str) -> String { /// ```rust /// # use regex::Regex; /// let re = Regex::new("[0-9]{3}-[0-9]{3}-[0-9]{4}").unwrap(); -/// assert_eq!(re.find("phone: 111-222-3333"), Some((7, 19))); +/// let mat = re.find("phone: 111-222-3333").unwrap(); +/// assert_eq!((mat.start(), mat.end()), (7, 19)); /// ``` /// /// # Using the `std::str::StrExt` methods with `Regex` /// -/// > **Note**: This section requires that this crate is currently compiled with -/// > the `pattern` Cargo feature enabled. +/// > **Note**: This section requires that this crate is currently compiled +/// > with the `pattern` Cargo feature enabled. /// /// Since `Regex` implements `Pattern`, you can use regexes with methods /// defined on `std::str::StrExt`. For example, `is_match`, `find`, `find_iter` @@ -182,11 +223,12 @@ impl Regex { /// # extern crate regex; use regex::Regex; /// # fn main() { /// let text = "I categorically deny having triskaidekaphobia."; - /// let pos = Regex::new(r"\b\w{13}\b").unwrap().find(text); - /// assert_eq!(pos, Some((2, 15))); + /// let mat = Regex::new(r"\b\w{13}\b").unwrap().find(text).unwrap(); + /// assert_eq!(mat.start(), 2); + /// assert_eq!(mat.end(), 15); /// # } /// ``` - pub fn find(&self, text: &str) -> Option<(usize, usize)> { + pub fn find<'t>(&self, text: &'t str) -> Option> { self.find_at(text, 0) } @@ -203,14 +245,9 @@ impl Regex { /// # extern crate regex; use regex::Regex; /// # fn main() { /// let text = "Retroactively relinquishing remunerations is reprehensible."; - /// for pos in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) { - /// println!("{:?}", pos); + /// for mat in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) { + /// println!("{:?}", mat); /// } - /// // Output: - /// // (0, 13) - /// // (14, 27) - /// // (28, 41) - /// // (45, 58) /// # } /// ``` pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindIter<'r, 
't> { @@ -247,9 +284,9 @@ impl Regex { /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap(); /// let text = "Not my favorite movie: 'Citizen Kane' (1941)."; /// let caps = re.captures(text).unwrap(); - /// assert_eq!(caps.at(1), Some("Citizen Kane")); - /// assert_eq!(caps.at(2), Some("1941")); - /// assert_eq!(caps.at(0), Some("'Citizen Kane' (1941)")); + /// assert_eq!(caps.get(1).unwrap().as_str(), "Citizen Kane"); + /// assert_eq!(caps.get(2).unwrap().as_str(), "1941"); + /// assert_eq!(caps.get(0).unwrap().as_str(), "'Citizen Kane' (1941)"); /// // You can also access the groups by index using the Index notation. /// // Note that this will panic on an invalid index. /// assert_eq!(&caps[1], "Citizen Kane"); @@ -270,9 +307,9 @@ impl Regex { /// .unwrap(); /// let text = "Not my favorite movie: 'Citizen Kane' (1941)."; /// let caps = re.captures(text).unwrap(); - /// assert_eq!(caps.name("title"), Some("Citizen Kane")); - /// assert_eq!(caps.name("year"), Some("1941")); - /// assert_eq!(caps.at(0), Some("'Citizen Kane' (1941)")); + /// assert_eq!(&caps["title"], "Citizen Kane"); + /// assert_eq!(&caps["year"], "1941"); + /// assert_eq!(caps.get(0).unwrap().as_str(), "'Citizen Kane' (1941)"); /// // You can also access the groups by name using the Index notation. /// // Note that this will panic on an invalid group name. 
/// assert_eq!(&caps["title"], "Citizen Kane"); @@ -315,7 +352,7 @@ impl Regex { /// let text = "'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931)."; /// for caps in re.captures_iter(text) { /// println!("Movie: {:?}, Released: {:?}", - /// caps.name("title"), caps.name("year")); + /// &caps["title"], &caps["year"]); /// } /// // Output: /// // Movie: Citizen Kane, Released: 1941 @@ -441,7 +478,7 @@ impl Regex { /// # use regex::Captures; fn main() { /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap(); /// let result = re.replace("Springsteen, Bruce", |caps: &Captures| { - /// format!("{} {}", caps.at(2).unwrap_or(""), caps.at(1).unwrap_or("")) + /// format!("{} {}", &caps[2], &caps[1]) /// }); /// assert_eq!(result, "Bruce Springsteen"); /// # } @@ -528,13 +565,13 @@ impl Regex { } let mut new = String::with_capacity(text.len()); let mut last_match = 0; - for (i, (s, e)) in it { + for (i, m) in it { if limit > 0 && i >= limit { break } - new.push_str(&text[last_match..s]); + new.push_str(&text[last_match..m.start()]); new.push_str(&rep); - last_match = e; + last_match = m.end(); } new.push_str(&text[last_match..]); return Cow::Owned(new); @@ -553,10 +590,10 @@ impl Regex { break } // unwrap on 0 is OK because captures only reports matches - let (s, e) = cap.pos(0).unwrap(); - new.push_str(&text[last_match..s]); + let m = cap.get(0).unwrap(); + new.push_str(&text[last_match..m.start()]); rep.replace_append(&cap, &mut new); - last_match = e; + last_match = m.end(); } new.push_str(&text[last_match..]); Cow::Owned(new) @@ -628,12 +665,20 @@ impl Regex { /// context into consideration. For example, the `\A` anchor can only /// match when `start == 0`. 
#[doc(hidden)] - pub fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> { + pub fn find_at<'t>( + &self, + text: &'t str, + start: usize, + ) -> Option> { match self.0 { _Regex::Dynamic(ref exec) => { - exec.searcher_str().find_at(text, start) + exec.searcher_str().find_at(text, start).map(|(s, e)| { + Match::new(text, s, e) + }) + } + _Regex::Plugin(ref plug) => { + plug.find_at(text, start).map(|(s, e)| Match::new(text, s, e)) } - _Regex::Plugin(ref plug) => plug.find_at(text, start), } } @@ -644,18 +689,20 @@ impl Regex { /// context into consideration. For example, the `\A` anchor can only /// match when `start == 0`. #[doc(hidden)] - pub fn read_captures_at( + pub fn read_captures_at<'t>( &self, locs: &mut Locations, - text: &str, + text: &'t str, start: usize, - ) -> Option<(usize, usize)> { + ) -> Option> { match self.0 { _Regex::Dynamic(ref exec) => { exec.searcher_str().read_captures_at(locs, text, start) + .map(|(s, e)| Match::new(text, s, e)) } _Regex::Plugin(ref plug) => { plug.read_captures_at(locs, text, start) + .map(|(s, e)| Match::new(text, s, e)) } } } @@ -759,9 +806,9 @@ impl<'r, 't> Iterator for SplitsIter<'r, 't> { Some(s) } } - Some((s, e)) => { - let matched = &text[self.last..s]; - self.last = e; + Some(m) => { + let matched = &text[self.last..m.start()]; + self.last = m.end(); Some(matched) } } @@ -867,29 +914,17 @@ pub struct Captures<'t> { } impl<'t> Captures<'t> { - /// Returns the start and end positions of the Nth capture group. Returns - /// `None` if `i` is not a valid capture group or if the capture group did - /// not match anything. The positions returned are *always* byte indices - /// with respect to the original string matched. - pub fn pos(&self, i: usize) -> Option<(usize, usize)> { - self.locs.pos(i) - } - - /// Returns the matched string for the capture group `i`. If `i` isn't - /// a valid capture group or didn't match anything, then `None` is - /// returned. 
- pub fn at(&self, i: usize) -> Option<&'t str> { - match self.pos(i) { - None => None, - Some((s, e)) => Some(&self.text[s..e]) - } + /// Returns the match associated with the capture group at index `i`. If + /// `i` does not correspond to a capture group, or if the capture group + /// did not participate in the match, then `None` is returned. + pub fn get(&self, i: usize) -> Option> { + self.locs.pos(i).map(|(s, e)| Match::new(self.text, s, e)) } - /// Returns the matched string for the capture group named `name`. If - /// `name` isn't a valid capture group or didn't match anything, then - /// `None` is returned. - pub fn name(&self, name: &str) -> Option<&'t str> { - self.named_groups.pos(name).and_then(|i| self.at(i)) + /// Returns the match for the capture group named `name`. If `name` isn't a + /// valid capture group or didn't match anything, then `None` is returned. + pub fn name(&self, name: &str) -> Option> { + self.named_groups.pos(name).and_then(|i| self.get(i)) } /// Creates an iterator of all the capture groups in order of appearance @@ -987,7 +1022,8 @@ impl<'t> Index for Captures<'t> { type Output = str; fn index(&self, i: usize) -> &str { - self.at(i).unwrap_or_else(|| panic!("no group at index '{}'", i)) + self.get(i).map(|m| m.as_str()) + .unwrap_or_else(|| panic!("no group at index '{}'", i)) } } @@ -1007,7 +1043,8 @@ impl<'t, 'i> Index<&'i str> for Captures<'t> { type Output = str; fn index<'a>(&'a self, name: &'i str) -> &'a str { - self.name(name).unwrap_or_else(|| panic!("no group named '{}'", name)) + self.name(name).map(|m| m.as_str()) + .unwrap_or_else(|| panic!("no group named '{}'", name)) } } @@ -1026,7 +1063,7 @@ impl<'c, 't> Iterator for SubCapturesIter<'c, 't> { fn next(&mut self) -> Option> { if self.idx < self.caps.len() { self.idx += 1; - Some(self.caps.at(self.idx - 1)) + Some(self.caps.get(self.idx - 1).map(|m| m.as_str())) } else { None } @@ -1046,7 +1083,9 @@ impl<'c, 't> Iterator for SubCapturesNamedIter<'c, 't> { type Item 
= (&'c str, Option<&'t str>); fn next(&mut self) -> Option<(&'c str, Option<&'t str>)> { - self.names.next().map(|(name, pos)| (name, self.caps.at(pos))) + self.names.next().map(|(name, pos)| { + (name, self.caps.get(pos).map(|m| m.as_str())) + }) } } @@ -1090,8 +1129,7 @@ impl<'r, 't> Iterator for CapturesIter<'r, 't> { /// An iterator over all non-overlapping matches for a particular string. /// -/// The iterator yields a tuple of integers corresponding to the start and end -/// of the match. The indices are byte offsets. The iterator stops when no more +/// The iterator yields a `Match` value. The iterator stops when no more /// matches can be found. /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the @@ -1113,12 +1151,17 @@ impl<'r, 't> FindIter<'r, 't> { } impl<'r, 't> Iterator for FindIter<'r, 't> { - type Item = (usize, usize); + type Item = Match<'t>; - fn next(&mut self) -> Option<(usize, usize)> { + fn next(&mut self) -> Option> { + let text = self.text(); match self.0 { - FindIterInner::Dynamic(ref mut it) => it.next(), - FindIterInner::Plugin(ref mut it) => it.next(), + FindIterInner::Dynamic(ref mut it) => { + it.next().map(|(s, e)| Match::new(text, s, e)) + } + FindIterInner::Plugin(ref mut it) => { + it.next().map(|(s, e)| Match::new(text, s, e)) + } } } } @@ -1135,7 +1178,7 @@ pub trait Replacer { /// have a match at capture group `0`. /// /// For example, a no-op replacement would be - /// `dst.extend(caps.at(0).unwrap())`. + /// `dst.extend(caps.get(0).unwrap().as_str())`. fn replace_append(&mut self, caps: &Captures, dst: &mut String); /// Return a fixed unchanging replacement string. 
diff --git a/tests/api.rs b/tests/api.rs index 0be032949a..afc6b16600 100644 --- a/tests/api.rs +++ b/tests/api.rs @@ -60,7 +60,8 @@ fn empty_match_find_iter() { fn empty_match_captures_iter() { let re = regex!(r".*?"); let ms: Vec<_> = re.captures_iter(text!("abc")) - .map(|c| c.pos(0).unwrap()) + .map(|c| c.get(0).unwrap()) + .map(|m| (m.start(), m.end())) .collect(); assert_eq!(ms, vec![(0, 0), (1, 1), (2, 2), (3, 3)]); } @@ -127,16 +128,16 @@ fn capture_misc() { assert_eq!(5, cap.len()); - assert_eq!(Some((0, 3)), cap.pos(0)); - assert_eq!(None, cap.pos(2)); - assert_eq!(Some((2, 3)), cap.pos(4)); + assert_eq!((0, 3), { let m = cap.get(0).unwrap(); (m.start(), m.end()) }); + assert_eq!(None, cap.get(2)); + assert_eq!((2, 3), { let m = cap.get(4).unwrap(); (m.start(), m.end()) }); - assert_eq!(Some(t!("abc")), cap.at(0)); - assert_eq!(None, cap.at(2)); - assert_eq!(Some(t!("c")), cap.at(4)); + assert_eq!(t!("abc"), match_text!(cap.get(0).unwrap())); + assert_eq!(None, cap.get(2)); + assert_eq!(t!("c"), match_text!(cap.get(4).unwrap())); assert_eq!(None, cap.name("a")); - assert_eq!(Some(t!("c")), cap.name("b")); + assert_eq!(t!("c"), match_text!(cap.name("b").unwrap())); } #[test] diff --git a/tests/api_str.rs b/tests/api_str.rs index c6d392876b..5bdca8426a 100644 --- a/tests/api_str.rs +++ b/tests/api_str.rs @@ -15,7 +15,8 @@ fn empty_match_unicode_captures_iter() { // Same as empty_match_unicode_find_iter, but tests capture iteration. let re = regex!(r".*?"); let ms: Vec<_> = re.captures_iter(text!("Ⅰ1Ⅱ2")) - .map(|c| c.pos(0).unwrap()) + .map(|c| c.get(0).unwrap()) + .map(|m| (m.start(), m.end())) .collect(); assert_eq!(vec![(0, 0), (3, 3), (4, 4), (7, 7), (8, 8)], ms); } diff --git a/tests/macros.rs b/tests/macros.rs index f9e8912630..9d30dd16e2 100644 --- a/tests/macros.rs +++ b/tests/macros.rs @@ -2,7 +2,8 @@ macro_rules! 
findall { ($re:expr, $text:expr) => {{ - $re.find_iter(text!($text)).collect::>() + $re.find_iter(text!($text)) + .map(|m| (m.start(), m.end())).collect::>() }} } @@ -45,14 +46,18 @@ macro_rules! matiter( let text = text!($text); let expected: Vec<(usize, usize)> = vec![]; let r = regex!($re); - let got: Vec<_> = r.find_iter(text).collect(); + let got: Vec<_> = + r.find_iter(text).map(|m| (m.start(), m.end())).collect(); if expected != got { panic!("For RE '{}' against '{:?}', \ expected '{:?}' but got '{:?}'", $re, text, expected, got); } let captures_got: Vec<_> = - r.captures_iter(text).map(|c| c.pos(0).unwrap()).collect(); + r.captures_iter(text) + .map(|c| c.get(0).unwrap()) + .map(|m| (m.start(), m.end())) + .collect(); if captures_got != got { panic!("For RE '{}' against '{:?}', \ got '{:?}' using find_iter but got '{:?}' \ @@ -67,14 +72,18 @@ macro_rules! matiter( let text = text!($text); let expected: Vec<_> = vec![$($loc)+]; let r = regex!($re); - let got: Vec<_> = r.find_iter(text).collect(); + let got: Vec<_> = + r.find_iter(text).map(|m| (m.start(), m.end())).collect(); if expected != got { panic!("For RE '{}' against '{:?}', \ expected '{:?}' but got '{:?}'", $re, text, expected, got); } let captures_got: Vec<_> = - r.captures_iter(text).map(|c| c.pos(0).unwrap()).collect(); + r.captures_iter(text) + .map(|c| c.get(0).unwrap()) + .map(|m| (m.start(), m.end())) + .collect(); if captures_got != got { panic!("For RE '{}' against '{:?}', \ got '{:?}' using find_iter but got '{:?}' \ diff --git a/tests/macros_bytes.rs b/tests/macros_bytes.rs index c0875ab074..4a382c78dd 100644 --- a/tests/macros_bytes.rs +++ b/tests/macros_bytes.rs @@ -1,6 +1,7 @@ // Macros for use in writing tests generic over &str/&[u8]. macro_rules! text { ($text:expr) => { $text.as_bytes() } } macro_rules! t { ($re:expr) => { text!($re) } } +macro_rules! match_text { ($text:expr) => { $text.as_bytes() } } macro_rules! bytes { ($text:expr) => { $text } } macro_rules! 
b { ($text:expr) => { bytes!($text) } } diff --git a/tests/macros_str.rs b/tests/macros_str.rs index c654c2fc03..e5b0e219da 100644 --- a/tests/macros_str.rs +++ b/tests/macros_str.rs @@ -1,6 +1,7 @@ // Macros for use in writing tests generic over &str/&[u8]. macro_rules! text { ($text:expr) => { $text } } macro_rules! t { ($text:expr) => { text!($text) } } +macro_rules! match_text { ($text:expr) => { $text.as_str() } } macro_rules! bytes { ($text:expr) => { $text.as_bytes() } } macro_rules! b { ($text:expr) => { bytes!($text) } } diff --git a/tests/regression.rs b/tests/regression.rs index ccb4fab8ca..108cdb9565 100644 --- a/tests/regression.rs +++ b/tests/regression.rs @@ -30,7 +30,7 @@ mat!(regression_ascii_word_underscore, r"[:word:]", "_", Some((0, 1))); fn regression_captures_rep() { let re = regex!(r"([a-f]){2}(?P[x-z])"); let caps = re.captures(text!("abx")).unwrap(); - assert_eq!(caps.name("foo").unwrap(), text!("x")); + assert_eq!(match_text!(caps.name("foo").unwrap()), text!("x")); } // See: https://github.com/rust-lang-nursery/regex/issues/153 @@ -59,8 +59,8 @@ matiter!(word_boundary_dfa, r"\b", "a b c", (0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)); // See: https://github.com/rust-lang-nursery/regex/issues/268 -matiter!(partial_anchor, u!(r"^a|b"), "ba", (0, 1)); +matiter!(partial_anchor, r"^a|b", "ba", (0, 1)); // See: https://github.com/rust-lang-nursery/regex/issues/264 -mat!(ascii_boundary_no_capture, u!(r"(?-u)\B"), "\u{28f3e}", Some((0, 0))); -mat!(ascii_boundary_capture, u!(r"(?-u)(\B)"), "\u{28f3e}", Some((0, 0))); +mat!(ascii_boundary_no_capture, r"(?-u)\B", "\u{28f3e}", Some((0, 0))); +mat!(ascii_boundary_capture, r"(?-u)(\B)", "\u{28f3e}", Some((0, 0))); From 5562e0e6c3467f8f1c8fef95e73fb5d0f1c721f8 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 5 Aug 2016 00:10:39 -0400 Subject: [PATCH 15/18] Remove the submatch iterators. All use cases can be replaced with Regex::capture_names. 
--- regex-capi/src/rure.rs | 10 +++--- src/lib.rs | 6 ++-- src/pattern.rs | 3 +- src/re_bytes.rs | 71 ++----------------------------------- src/re_unicode.rs | 68 ++--------------------------------- tests/api.rs | 80 ------------------------------------------ tests/macros.rs | 5 ++- 7 files changed, 18 insertions(+), 225 deletions(-) diff --git a/regex-capi/src/rure.rs b/regex-capi/src/rure.rs index 874811ea13..29e47dfb35 100644 --- a/regex-capi/src/rure.rs +++ b/regex-capi/src/rure.rs @@ -162,10 +162,10 @@ ffi_fn! { ) -> bool { let re = unsafe { &*re }; let haystack = unsafe { slice::from_raw_parts(haystack, len) }; - re.find_at(haystack, start).map(|(s, e)| unsafe { + re.find_at(haystack, start).map(|m| unsafe { if !match_info.is_null() { - (*match_info).start = s; - (*match_info).end = e; + (*match_info).start = m.start(); + (*match_info).end = m.end(); } }).is_some() } @@ -258,7 +258,7 @@ ffi_fn! { } let (s, e) = match re.find_at(text, it.last_end) { None => return false, - Some((s, e)) => (s, e), + Some(m) => (m.start(), m.end()), }; if s == e { // This is an empty match. To ensure we make progress, start @@ -300,7 +300,7 @@ ffi_fn! { } let (s, e) = match re.read_captures_at(slots, text, it.last_end) { None => return false, - Some((s, e)) => (s, e), + Some(m) => (m.start(), m.end()), }; if s == e { // This is an empty match. 
To ensure we make progress, start diff --git a/src/lib.rs b/src/lib.rs index e2acd6de4a..1729ed014a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -460,9 +460,9 @@ extern crate utf8_ranges; pub use error::Error; pub use re_builder::unicode::*; pub use re_set::unicode::*; -pub use re_trait::{Locations, SubCapturesPosIter}; +pub use re_trait::Locations; pub use re_unicode::{ - Regex, Match, Captures, SubCapturesIter, SubCapturesNamedIter, + Regex, Match, Captures, CaptureNamesIter, CapturesIter, FindIter, Replacer, NoExpand, SplitsIter, SplitsNIter, quote, @@ -558,7 +558,7 @@ pub mod bytes { pub use re_builder::bytes::*; pub use re_bytes::*; pub use re_set::bytes::*; - pub use re_trait::{Locations, SubCapturesPosIter}; + pub use re_trait::Locations; } mod backtrack; diff --git a/src/pattern.rs b/src/pattern.rs index a6037d0ea1..f796bc5641 100644 --- a/src/pattern.rs +++ b/src/pattern.rs @@ -45,7 +45,8 @@ unsafe impl<'r, 't> Searcher<'t> for RegexSearcher<'r, 't> { SearchStep::Done } } - Some((s, e)) => { + Some(m) => { + let (s, e) = (m.start(), m.end()); if s == self.last_step_end { self.last_step_end = e; SearchStep::Match(s, e) diff --git a/src/re_bytes.rs b/src/re_bytes.rs index e752e4ba29..205f59870c 100644 --- a/src/re_bytes.rs +++ b/src/re_bytes.rs @@ -10,7 +10,6 @@ use std::borrow::Cow; use std::collections::HashMap; -use std::collections::hash_map; use std::fmt; use std::ops::Index; use std::str::FromStr; @@ -22,7 +21,7 @@ use exec::{Exec, ExecNoSync}; use expand::expand_bytes; use error::Error; use re_builder::bytes::RegexBuilder; -use re_trait::{self, RegularExpression, Locations, SubCapturesPosIter}; +use re_trait::{self, RegularExpression, Locations}; /// Match represents a single match of a regex in a haystack. /// @@ -790,29 +789,6 @@ impl<'t> Captures<'t> { self.named_groups.get(name).and_then(|&i| self.get(i)) } - /// Creates an iterator of all the capture groups in order of appearance - /// in the regular expression. 
- pub fn iter<'c>(&'c self) -> SubCapturesIter<'c, 't> { - SubCapturesIter { idx: 0, caps: self } - } - - /// Creates an iterator of all the capture group positions in order of - /// appearance in the regular expression. Positions are byte indices - /// in terms of the original string matched. - pub fn iter_pos(&self) -> SubCapturesPosIter { - self.locs.iter() - } - - /// Creates an iterator of all named groups as an tuple with the group - /// name and the value. The iterator returns these values in arbitrary - /// order. - pub fn iter_named<'c>(&'c self) -> SubCapturesNamedIter<'c, 't> { - SubCapturesNamedIter { - caps: self, - names: self.named_groups.iter() - } - } - /// Expands all instances of `$name` in `text` to the corresponding capture /// group `name`, and writes them to the `dst` buffer given. /// @@ -873,7 +849,7 @@ impl<'c, 't> fmt::Debug for CapturesDebug<'c, 't> { let slot_to_name: HashMap<&usize, &String> = self.0.named_groups.iter().map(|(a, b)| (b, a)).collect(); let mut map = f.debug_map(); - for (slot, m) in self.0.iter_pos().enumerate() { + for (slot, m) in self.0.locs.iter().enumerate() { let m = m.map(|(s, e)| escape_bytes(&self.0.text[s..e])); if let Some(ref name) = slot_to_name.get(&slot) { map.entry(&name, &m); @@ -926,49 +902,6 @@ impl<'t, 'i> Index<&'i str> for Captures<'t> { } } -/// An iterator over capture groups for a particular match of a regular -/// expression. -/// -/// `'c` is the lifetime of the captures and `'t` is the lifetime of the -/// matched text. -pub struct SubCapturesIter<'c, 't: 'c> { - idx: usize, - caps: &'c Captures<'t>, -} - -impl<'c, 't> Iterator for SubCapturesIter<'c, 't> { - type Item = Option<&'t [u8]>; - - fn next(&mut self) -> Option> { - if self.idx < self.caps.len() { - self.idx += 1; - Some(self.caps.get(self.idx - 1).map(|m| m.as_bytes())) - } else { - None - } - } -} - -/// An Iterator over named capture groups as a tuple with the group name and -/// the value. 
-/// -/// `'c` is the lifetime of the captures and `'t` is the lifetime of the -/// matched text. -pub struct SubCapturesNamedIter<'c, 't: 'c> { - caps: &'c Captures<'t>, - names: hash_map::Iter<'c, String, usize>, -} - -impl<'c, 't> Iterator for SubCapturesNamedIter<'c, 't> { - type Item = (&'c str, Option<&'t [u8]>); - - fn next(&mut self) -> Option<(&'c str, Option<&'t [u8]>)> { - self.names.next().map(|(name, &pos)| { - (&**name, self.caps.get(pos).map(|m| m.as_bytes())) - }) - } -} - /// Replacer describes types that can be used to replace matches in a byte /// string. /// diff --git a/src/re_unicode.rs b/src/re_unicode.rs index a0cd4ce32c..7e712d26a2 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -23,7 +23,7 @@ use exec::{Exec, ExecNoSyncStr}; use expand::expand_str; use re_builder::unicode::RegexBuilder; use re_plugin::Plugin; -use re_trait::{self, RegularExpression, Locations, SubCapturesPosIter}; +use re_trait::{self, RegularExpression, Locations}; /// Escapes all regular expression meta characters in `text`. /// @@ -927,29 +927,6 @@ impl<'t> Captures<'t> { self.named_groups.pos(name).and_then(|i| self.get(i)) } - /// Creates an iterator of all the capture groups in order of appearance - /// in the regular expression. - pub fn iter<'c>(&'c self) -> SubCapturesIter<'c, 't> { - SubCapturesIter { idx: 0, caps: self, } - } - - /// Creates an iterator of all the capture group positions in order of - /// appearance in the regular expression. Positions are byte indices - /// in terms of the original string matched. - pub fn iter_pos(&self) -> SubCapturesPosIter { - self.locs.iter() - } - - /// Creates an iterator of all named groups as an tuple with the group - /// name and the value. The iterator returns these values in arbitrary - /// order. 
- pub fn iter_named<'c>(&'c self) -> SubCapturesNamedIter<'c, 't> { - SubCapturesNamedIter { - caps: self, - names: self.named_groups.iter() - } - } - /// Expands all instances of `$name` in `text` to the corresponding capture /// group `name`, and writes them to the `dst` buffer given. /// @@ -995,7 +972,7 @@ impl<'c, 't> fmt::Debug for CapturesDebug<'c, 't> { let slot_to_name: HashMap = self.0.named_groups.iter().map(|(a, b)| (b, a)).collect(); let mut map = f.debug_map(); - for (slot, m) in self.0.iter_pos().enumerate() { + for (slot, m) in self.0.locs.iter().enumerate() { let m = m.map(|(s, e)| &self.0.text[s..e]); if let Some(ref name) = slot_to_name.get(&slot) { map.entry(&name, &m); @@ -1048,47 +1025,6 @@ impl<'t, 'i> Index<&'i str> for Captures<'t> { } } -/// An iterator over capture groups for a particular match of a regular -/// expression. -/// -/// `'c` is the lifetime of the captures. -pub struct SubCapturesIter<'c, 't: 'c> { - idx: usize, - caps: &'c Captures<'t>, -} - -impl<'c, 't> Iterator for SubCapturesIter<'c, 't> { - type Item = Option<&'t str>; - - fn next(&mut self) -> Option> { - if self.idx < self.caps.len() { - self.idx += 1; - Some(self.caps.get(self.idx - 1).map(|m| m.as_str())) - } else { - None - } - } -} - -/// An Iterator over named capture groups as a tuple with the group -/// name and the value. -/// -/// `'c` is the lifetime of the captures. -pub struct SubCapturesNamedIter<'c, 't: 'c> { - caps: &'c Captures<'t>, - names: NamedGroupsIter<'c>, -} - -impl<'c, 't> Iterator for SubCapturesNamedIter<'c, 't> { - type Item = (&'c str, Option<&'t str>); - - fn next(&mut self) -> Option<(&'c str, Option<&'t str>)> { - self.names.next().map(|(name, pos)| { - (name, self.caps.get(pos).map(|m| m.as_str())) - }) - } -} - /// An iterator that yields all non-overlapping capture groups matching a /// particular regular expression. 
/// diff --git a/tests/api.rs b/tests/api.rs index afc6b16600..7221995b0e 100644 --- a/tests/api.rs +++ b/tests/api.rs @@ -140,86 +140,6 @@ fn capture_misc() { assert_eq!(t!("c"), match_text!(cap.name("b").unwrap())); } -#[test] -fn capture_iter() { - let re = regex!(r"(.)(?P.)(.)(?P.)"); - let cap = re.captures(t!("abcd")).unwrap(); - assert_eq!(5, cap.len()); - - let expected = vec![ - t!("abcd"), t!("a"), t!("b"), t!("c"), t!("d"), - ].into_iter().map(Some).collect::>(); - let got = cap.iter().collect::>(); - assert_eq!(expected, got); -} - -#[test] -fn capture_iter_missing() { - let re = regex!(r"(.)(?Pa)?(.)(?P.)"); - let cap = re.captures(t!("abc")).unwrap(); - assert_eq!(5, cap.len()); - - let expected = vec![ - Some(t!("abc")), Some(t!("a")), None, Some(t!("b")), Some(t!("c")), - ]; - let got = cap.iter().collect::>(); - assert_eq!(expected, got); -} - -#[test] -fn capture_iter_pos() { - let re = regex!(r"(.)(?P.)(.)(?P.)"); - let cap = re.captures(t!("abcd")).unwrap(); - - let expected = vec![ - (0, 4), (0, 1), (1, 2), (2, 3), (3, 4), - ].into_iter().map(Some).collect::>(); - let got = cap.iter_pos().collect::>(); - assert_eq!(expected, got); -} - -#[test] -fn capture_iter_pos_missing() { - let re = regex!(r"(.)(?Pa)?(.)(?P.)"); - let cap = re.captures(t!("abc")).unwrap(); - - let expected = vec![ - Some((0, 3)), Some((0, 1)), None, Some((1, 2)), Some((2, 3)), - ]; - let got = cap.iter_pos().collect::>(); - assert_eq!(expected, got); -} - -#[test] -fn capture_iter_named() { - let re = regex!(r"(.)(?P.)(.)(?P.)"); - let cap = re.captures(t!("abcd")).unwrap(); - - let expected1 = vec![ - ("a", Some(t!("b"))), ("b", Some(t!("d"))), - ]; - let expected2 = vec![ - ("b", Some(t!("d"))), ("a", Some(t!("b"))), - ]; - let got = cap.iter_named().collect::>(); - assert!(got == expected1 || got == expected2); -} - -#[test] -fn capture_iter_named_missing() { - let re = regex!(r"(.)(?P.)?(.)(?P.)"); - let cap = re.captures(t!("abc")).unwrap(); - - let expected1 = vec![ 
- ("a", None), ("b", Some(t!("c"))), - ]; - let expected2 = vec![ - ("b", Some(t!("c"))), ("a", None), - ]; - let got = cap.iter_named().collect::>(); - assert!(got == expected1 || got == expected2); -} - expand!(expand1, r"(?P\w+)", "abc", "$foo", "abc"); expand!(expand2, r"(?P\w+)", "abc", "$0", "abc"); expand!(expand3, r"(?P\w+)", "abc", "$1", "abc"); diff --git a/tests/macros.rs b/tests/macros.rs index 9d30dd16e2..5badc89b53 100644 --- a/tests/macros.rs +++ b/tests/macros.rs @@ -20,7 +20,10 @@ macro_rules! mat( Some(c) => { assert!(r.is_match(text)); assert!(r.shortest_match(text).is_some()); - c.iter_pos().collect() + r.capture_names() + .enumerate() + .map(|(i, _)| c.get(i).map(|m| (m.start(), m.end()))) + .collect() } None => vec![None], }; From 9d208ca44d0f4ab5e5d183af01bee427a476f938 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 5 Aug 2016 08:51:22 -0400 Subject: [PATCH 16/18] Fix tests. Weird that this cfg_attrs didn't get pulled in during the rebase... --- tests/test_backtrack.rs | 2 ++ tests/test_backtrack_utf8bytes.rs | 2 ++ tests/test_default.rs | 2 ++ tests/test_nfa.rs | 2 ++ tests/test_nfa_bytes.rs | 1 - tests/test_nfa_utf8bytes.rs | 2 ++ 6 files changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/test_backtrack.rs b/tests/test_backtrack.rs index 7861a3db88..5516c840e7 100644 --- a/tests/test_backtrack.rs +++ b/tests/test_backtrack.rs @@ -8,6 +8,8 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. +#![cfg_attr(feature = "pattern", feature(pattern))] + extern crate rand; extern crate regex; diff --git a/tests/test_backtrack_utf8bytes.rs b/tests/test_backtrack_utf8bytes.rs index 2bf9456292..a170d19324 100644 --- a/tests/test_backtrack_utf8bytes.rs +++ b/tests/test_backtrack_utf8bytes.rs @@ -8,6 +8,8 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. 
+#![cfg_attr(feature = "pattern", feature(pattern))] + extern crate rand; extern crate regex; diff --git a/tests/test_default.rs b/tests/test_default.rs index 452872d35d..e6cf92fa2e 100644 --- a/tests/test_default.rs +++ b/tests/test_default.rs @@ -8,6 +8,8 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. +#![cfg_attr(feature = "pattern", feature(pattern))] + extern crate rand; extern crate regex; diff --git a/tests/test_nfa.rs b/tests/test_nfa.rs index abf24561fd..8a831c47d3 100644 --- a/tests/test_nfa.rs +++ b/tests/test_nfa.rs @@ -8,6 +8,8 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. +#![cfg_attr(feature = "pattern", feature(pattern))] + extern crate rand; extern crate regex; diff --git a/tests/test_nfa_bytes.rs b/tests/test_nfa_bytes.rs index a084c804fe..f376cefe1f 100644 --- a/tests/test_nfa_bytes.rs +++ b/tests/test_nfa_bytes.rs @@ -1,4 +1,3 @@ - // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT // file at the top-level directory of this distribution and at // http://rust-lang.org/COPYRIGHT. diff --git a/tests/test_nfa_utf8bytes.rs b/tests/test_nfa_utf8bytes.rs index 5926de7bd6..5d13685aab 100644 --- a/tests/test_nfa_utf8bytes.rs +++ b/tests/test_nfa_utf8bytes.rs @@ -8,6 +8,8 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. +#![cfg_attr(feature = "pattern", feature(pattern))] + extern crate rand; extern crate regex; From 3fe0783860d7a6a04c187d8eccbb09de0cc278c3 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 21 Aug 2016 16:52:41 -0400 Subject: [PATCH 17/18] Switch to more idiomatic builder definition. Specifically, use mutable references instead of passing ownership. 
--- bench/src/bench.rs | 2 +- regex-capi/src/rure.rs | 18 +++++++++--------- src/re_builder.rs | 20 ++++++++++---------- src/re_bytes.rs | 2 +- src/re_unicode.rs | 2 +- 5 files changed, 22 insertions(+), 22 deletions(-) diff --git a/bench/src/bench.rs b/bench/src/bench.rs index 9c8a924746..db9e3a5f61 100644 --- a/bench/src/bench.rs +++ b/bench/src/bench.rs @@ -71,7 +71,7 @@ macro_rules! regex { // Always enable the Unicode flag for byte based regexes. // Really, this should have been enabled by default. *sigh* use regex::bytes::RegexBuilder; - RegexBuilder::new(&$re.to_owned()).unicode(true).compile().unwrap() + RegexBuilder::new(&$re.to_owned()).unicode(true).build().unwrap() }} } diff --git a/regex-capi/src/rure.rs b/regex-capi/src/rure.rs index 29e47dfb35..832bab6c0d 100644 --- a/regex-capi/src/rure.rs +++ b/regex-capi/src/rure.rs @@ -98,16 +98,16 @@ ffi_fn! { let mut builder = bytes::RegexBuilder::new(pat); if !options.is_null() { let options = unsafe { &*options }; - builder = builder.size_limit(options.size_limit); - builder = builder.dfa_size_limit(options.dfa_size_limit); + builder.size_limit(options.size_limit); + builder.dfa_size_limit(options.dfa_size_limit); } - builder = builder.case_insensitive(flags & RURE_FLAG_CASEI > 0); - builder = builder.multi_line(flags & RURE_FLAG_MULTI > 0); - builder = builder.dot_matches_new_line(flags & RURE_FLAG_DOTNL > 0); - builder = builder.swap_greed(flags & RURE_FLAG_SWAP_GREED > 0); - builder = builder.ignore_whitespace(flags & RURE_FLAG_SPACE > 0); - builder = builder.unicode(flags & RURE_FLAG_UNICODE > 0); - match builder.compile() { + builder.case_insensitive(flags & RURE_FLAG_CASEI > 0); + builder.multi_line(flags & RURE_FLAG_MULTI > 0); + builder.dot_matches_new_line(flags & RURE_FLAG_DOTNL > 0); + builder.swap_greed(flags & RURE_FLAG_SWAP_GREED > 0); + builder.ignore_whitespace(flags & RURE_FLAG_SPACE > 0); + builder.unicode(flags & RURE_FLAG_UNICODE > 0); + match builder.build() { Ok(re) => { let mut 
capture_names = HashMap::new(); for (i, name) in re.capture_names().enumerate() { diff --git a/src/re_builder.rs b/src/re_builder.rs index c769cc5d17..e770dcb7bf 100644 --- a/src/re_builder.rs +++ b/src/re_builder.rs @@ -70,21 +70,21 @@ impl RegexBuilder { /// Note that calling `as_str` on the resulting `Regex` will produce the /// pattern given to `new` verbatim. Notably, it will not incorporate any /// of the flags set on this builder. - pub fn compile(self) -> Result { - ExecBuilder::new_options(self.0) + pub fn build(&self) -> Result { + ExecBuilder::new_options(self.0.clone()) .only_utf8($only_utf8) .build() .map(Regex::from) } /// Set the value for the case insensitive (`i`) flag. - pub fn case_insensitive(mut self, yes: bool) -> RegexBuilder { + pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder { self.0.case_insensitive = yes; self } /// Set the value for the multi-line matching (`m`) flag. - pub fn multi_line(mut self, yes: bool) -> RegexBuilder { + pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder { self.0.multi_line = yes; self } @@ -96,19 +96,19 @@ impl RegexBuilder { /// N.B. "matches anything" means "any byte" for `regex::bytes::Regex` /// expressions and means "any Unicode codepoint" for `regex::Regex` /// expressions. - pub fn dot_matches_new_line(mut self, yes: bool) -> RegexBuilder { + pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut RegexBuilder { self.0.dot_matches_new_line = yes; self } /// Set the value for the greedy swap (`U`) flag. - pub fn swap_greed(mut self, yes: bool) -> RegexBuilder { + pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder { self.0.swap_greed = yes; self } /// Set the value for the ignore whitespace (`x`) flag. - pub fn ignore_whitespace(mut self, yes: bool) -> RegexBuilder { + pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder { self.0.ignore_whitespace = yes; self } @@ -116,7 +116,7 @@ impl RegexBuilder { /// Set the value for the Unicode (`u`) flag. 
/// /// For byte based regular expressions, this is disabled by default. - pub fn unicode(mut self, yes: bool) -> RegexBuilder { + pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder { self.0.unicode = yes; self } @@ -126,7 +126,7 @@ impl RegexBuilder { /// This roughly corresponds to the number of bytes occupied by a single /// compiled program. If the program exceeds this number, then a /// compilation error is returned. - pub fn size_limit(mut self, limit: usize) -> RegexBuilder { + pub fn size_limit(&mut self, limit: usize) -> &mut RegexBuilder { self.0.size_limit = limit; self } @@ -140,7 +140,7 @@ impl RegexBuilder { /// limit. In particular, if a regex is used from multiple threads /// simulanteously, then each thread may use up to the number of bytes /// specified here. - pub fn dfa_size_limit(mut self, limit: usize) -> RegexBuilder { + pub fn dfa_size_limit(&mut self, limit: usize) -> &mut RegexBuilder { self.0.dfa_size_limit = limit; self } diff --git a/src/re_bytes.rs b/src/re_bytes.rs index 205f59870c..80b63d7628 100644 --- a/src/re_bytes.rs +++ b/src/re_bytes.rs @@ -117,7 +117,7 @@ impl Regex { /// /// If an invalid expression is given, then an error is returned. pub fn new(re: &str) -> Result { - RegexBuilder::new(re).compile() + RegexBuilder::new(re).build() } /// Returns true if and only if the regex matches the string given. diff --git a/src/re_unicode.rs b/src/re_unicode.rs index 7e712d26a2..4fefef3187 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -182,7 +182,7 @@ impl Regex { /// /// If an invalid expression is given, then an error is returned. pub fn new(re: &str) -> Result { - RegexBuilder::new(re).compile() + RegexBuilder::new(re).build() } /// Returns true if and only if the regex matches the string given. From 255005cf8968e701083322537bca4965389dca8c Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 21 Aug 2016 17:05:36 -0400 Subject: [PATCH 18/18] Rename iterator types to match `std` conventions. 
--- src/lib.rs | 4 +-- src/pattern.rs | 4 +-- src/re_bytes.rs | 36 +++++++++---------- src/re_trait.rs | 20 +++++------ src/re_unicode.rs | 88 +++++++++++++++++++++++------------------------ 5 files changed, 76 insertions(+), 76 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 1729ed014a..d2d9b18526 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -463,8 +463,8 @@ pub use re_set::unicode::*; pub use re_trait::Locations; pub use re_unicode::{ Regex, Match, Captures, - CaptureNamesIter, CapturesIter, FindIter, - Replacer, NoExpand, SplitsIter, SplitsNIter, + CaptureNames, Matches, CaptureMatches, + Replacer, NoExpand, Split, SplitN, quote, }; diff --git a/src/pattern.rs b/src/pattern.rs index f796bc5641..37183c24e3 100644 --- a/src/pattern.rs +++ b/src/pattern.rs @@ -1,10 +1,10 @@ use std::str::pattern::{Pattern, Searcher, SearchStep}; -use re_unicode::{Regex, FindIter}; +use re_unicode::{Regex, Matches}; pub struct RegexSearcher<'r, 't> { haystack: &'t str, - it: FindIter<'r, 't>, + it: Matches<'r, 't>, last_step_end: usize, next_match: Option<(usize, usize)>, } diff --git a/src/re_bytes.rs b/src/re_bytes.rs index 80b63d7628..a625fe2aa9 100644 --- a/src/re_bytes.rs +++ b/src/re_bytes.rs @@ -184,8 +184,8 @@ impl Regex { /// } /// # } /// ``` - pub fn find_iter<'r, 't>(&'r self, text: &'t [u8]) -> FindIter<'r, 't> { - FindIter(self.0.searcher().find_iter(text)) + pub fn find_iter<'r, 't>(&'r self, text: &'t [u8]) -> Matches<'r, 't> { + Matches(self.0.searcher().find_iter(text)) } /// Returns the capture groups corresponding to the leftmost-first @@ -289,8 +289,8 @@ impl Regex { pub fn captures_iter<'r, 't>( &'r self, text: &'t [u8], - ) -> CapturesIter<'r, 't> { - CapturesIter(self.0.searcher().captures_iter(text)) + ) -> CaptureMatches<'r, 't> { + CaptureMatches(self.0.searcher().captures_iter(text)) } /// Returns an iterator of substrings of `text` delimited by a match of the @@ -313,8 +313,8 @@ impl Regex { /// ]); /// # } /// ``` - pub fn split<'r, 't>(&'r self, 
text: &'t [u8]) -> SplitsIter<'r, 't> { - SplitsIter { + pub fn split<'r, 't>(&'r self, text: &'t [u8]) -> Split<'r, 't> { + Split { finder: self.find_iter(text), last: 0, } @@ -344,8 +344,8 @@ impl Regex { &'r self, text: &'t [u8], limit: usize, - ) -> SplitsNIter<'r, 't> { - SplitsNIter { + ) -> SplitN<'r, 't> { + SplitN { splits: self.split(text), n: limit, } @@ -644,9 +644,9 @@ impl Regex { /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the matched byte string. -pub struct FindIter<'r, 't>(re_trait::FindIter<'t, ExecNoSync<'r>>); +pub struct Matches<'r, 't>(re_trait::Matches<'t, ExecNoSync<'r>>); -impl<'r, 't> Iterator for FindIter<'r, 't> { +impl<'r, 't> Iterator for Matches<'r, 't> { type Item = Match<'t>; fn next(&mut self) -> Option> { @@ -662,9 +662,9 @@ impl<'r, 't> Iterator for FindIter<'r, 't> { /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the matched byte string. -pub struct CapturesIter<'r, 't>(re_trait::CapturesIter<'t, ExecNoSync<'r>>); +pub struct CaptureMatches<'r, 't>(re_trait::CaptureMatches<'t, ExecNoSync<'r>>); -impl<'r, 't> Iterator for CapturesIter<'r, 't> { +impl<'r, 't> Iterator for CaptureMatches<'r, 't> { type Item = Captures<'t>; fn next(&mut self) -> Option> { @@ -680,12 +680,12 @@ impl<'r, 't> Iterator for CapturesIter<'r, 't> { /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the byte string being split. -pub struct SplitsIter<'r, 't> { - finder: FindIter<'r, 't>, +pub struct Split<'r, 't> { + finder: Matches<'r, 't>, last: usize, } -impl<'r, 't> Iterator for SplitsIter<'r, 't> { +impl<'r, 't> Iterator for Split<'r, 't> { type Item = &'t [u8]; fn next(&mut self) -> Option<&'t [u8]> { @@ -715,12 +715,12 @@ impl<'r, 't> Iterator for SplitsIter<'r, 't> { /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the byte string being split. 
-pub struct SplitsNIter<'r, 't> { - splits: SplitsIter<'r, 't>, +pub struct SplitN<'r, 't> { + splits: Split<'r, 't>, n: usize, } -impl<'r, 't> Iterator for SplitsNIter<'r, 't> { +impl<'r, 't> Iterator for SplitN<'r, 't> { type Item = &'t [u8]; fn next(&mut self) -> Option<&'t [u8]> { diff --git a/src/re_trait.rs b/src/re_trait.rs index 81bfbc0d6b..9f3407c98b 100644 --- a/src/re_trait.rs +++ b/src/re_trait.rs @@ -151,8 +151,8 @@ pub trait RegularExpression: Sized { fn find_iter<'t>( self, text: &'t Self::Text, - ) -> FindIter<'t, Self> { - FindIter { + ) -> Matches<'t, Self> { + Matches { re: self, text: text, last_end: 0, @@ -165,20 +165,20 @@ pub trait RegularExpression: Sized { fn captures_iter<'t>( self, text: &'t Self::Text, - ) -> CapturesIter<'t, Self> { - CapturesIter(self.find_iter(text)) + ) -> CaptureMatches<'t, Self> { + CaptureMatches(self.find_iter(text)) } } /// An iterator over all non-overlapping successive leftmost-first matches. -pub struct FindIter<'t, R> where R: RegularExpression, R::Text: 't { +pub struct Matches<'t, R> where R: RegularExpression, R::Text: 't { re: R, text: &'t R::Text, last_end: usize, last_match: Option, } -impl<'t, R> FindIter<'t, R> where R: RegularExpression, R::Text: 't { +impl<'t, R> Matches<'t, R> where R: RegularExpression, R::Text: 't { /// Return the text being searched. pub fn text(&self) -> &'t R::Text { self.text @@ -190,7 +190,7 @@ impl<'t, R> FindIter<'t, R> where R: RegularExpression, R::Text: 't { } } -impl<'t, R> Iterator for FindIter<'t, R> +impl<'t, R> Iterator for Matches<'t, R> where R: RegularExpression, R::Text: 't + AsRef<[u8]> { type Item = (usize, usize); @@ -222,10 +222,10 @@ impl<'t, R> Iterator for FindIter<'t, R> /// An iterator over all non-overlapping successive leftmost-first matches with /// captures. 
-pub struct CapturesIter<'t, R>(FindIter<'t, R>) +pub struct CaptureMatches<'t, R>(Matches<'t, R>) where R: RegularExpression, R::Text: 't; -impl<'t, R> CapturesIter<'t, R> where R: RegularExpression, R::Text: 't { +impl<'t, R> CaptureMatches<'t, R> where R: RegularExpression, R::Text: 't { /// Return the text being searched. pub fn text(&self) -> &'t R::Text { self.0.text() @@ -237,7 +237,7 @@ impl<'t, R> CapturesIter<'t, R> where R: RegularExpression, R::Text: 't { } } -impl<'t, R> Iterator for CapturesIter<'t, R> +impl<'t, R> Iterator for CaptureMatches<'t, R> where R: RegularExpression, R::Text: 't + AsRef<[u8]> { type Item = Locations; diff --git a/src/re_unicode.rs b/src/re_unicode.rs index 4fefef3187..a8c5983f8a 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -250,15 +250,15 @@ impl Regex { /// } /// # } /// ``` - pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindIter<'r, 't> { + pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> Matches<'r, 't> { match self.0 { _Regex::Dynamic(ref exec) => { let it = exec.searcher_str().find_iter(text); - FindIter(FindIterInner::Dynamic(it)) + Matches(MatchesInner::Dynamic(it)) } _Regex::Plugin(ref plug) => { let it = plug.find_iter(text); - FindIter(FindIterInner::Plugin(it)) + Matches(MatchesInner::Plugin(it)) } } } @@ -363,15 +363,15 @@ impl Regex { pub fn captures_iter<'r, 't>( &'r self, text: &'t str, - ) -> CapturesIter<'r, 't> { + ) -> CaptureMatches<'r, 't> { match self.0 { _Regex::Dynamic(ref exec) => { let it = exec.searcher_str().captures_iter(text); - CapturesIter(CapturesIterInner::Dynamic(it)) + CaptureMatches(CaptureMatchesInner::Dynamic(it)) } _Regex::Plugin(ref plug) => { let it = plug.captures_iter(text); - CapturesIter(CapturesIterInner::Plugin(it)) + CaptureMatches(CaptureMatchesInner::Plugin(it)) } } } @@ -394,8 +394,8 @@ impl Regex { /// assert_eq!(fields, vec!["a", "b", "c", "d", "e"]); /// # } /// ``` - pub fn split<'r, 't>(&'r self, text: &'t str) -> SplitsIter<'r, 't> { - 
SplitsIter { + pub fn split<'r, 't>(&'r self, text: &'t str) -> Split<'r, 't> { + Split { finder: self.find_iter(text), last: 0, } @@ -422,8 +422,8 @@ impl Regex { /// # } /// ``` pub fn splitn<'r, 't>(&'r self, text: &'t str, limit: usize) - -> SplitsNIter<'r, 't> { - SplitsNIter { + -> SplitN<'r, 't> { + SplitN { splits: self.split(text), n: limit, } @@ -719,11 +719,11 @@ impl Regex { } /// Returns an iterator over the capture names. - pub fn capture_names(&self) -> CaptureNamesIter { - CaptureNamesIter(match self.0 { - _Regex::Plugin(ref n) => _CaptureNamesIter::Plugin(n.names.iter()), + pub fn capture_names(&self) -> CaptureNames { + CaptureNames(match self.0 { + _Regex::Plugin(ref n) => _CaptureNames::Plugin(n.names.iter()), _Regex::Dynamic(ref d) => { - _CaptureNamesIter::Dynamic(d.capture_names().iter()) + _CaptureNames::Dynamic(d.capture_names().iter()) } }) } @@ -755,20 +755,20 @@ impl Regex { /// whole matched region) is always unnamed. /// /// `'r` is the lifetime of the compiled regular expression. 
-pub struct CaptureNamesIter<'r>(_CaptureNamesIter<'r>); +pub struct CaptureNames<'r>(_CaptureNames<'r>); -enum _CaptureNamesIter<'r> { +enum _CaptureNames<'r> { Plugin(::std::slice::Iter<'r, Option<&'static str>>), Dynamic(::std::slice::Iter<'r, Option>) } -impl<'r> Iterator for CaptureNamesIter<'r> { +impl<'r> Iterator for CaptureNames<'r> { type Item = Option<&'r str>; fn next(&mut self) -> Option> { match self.0 { - _CaptureNamesIter::Plugin(ref mut i) => i.next().cloned(), - _CaptureNamesIter::Dynamic(ref mut i) => { + _CaptureNames::Plugin(ref mut i) => i.next().cloned(), + _CaptureNames::Dynamic(ref mut i) => { i.next().as_ref().map(|o| o.as_ref().map(|s| s.as_ref())) } } @@ -776,8 +776,8 @@ impl<'r> Iterator for CaptureNamesIter<'r> { fn size_hint(&self) -> (usize, Option) { match self.0 { - _CaptureNamesIter::Plugin(ref i) => i.size_hint(), - _CaptureNamesIter::Dynamic(ref i) => i.size_hint(), + _CaptureNames::Plugin(ref i) => i.size_hint(), + _CaptureNames::Dynamic(ref i) => i.size_hint(), } } } @@ -786,12 +786,12 @@ impl<'r> Iterator for CaptureNamesIter<'r> { /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the string being split. -pub struct SplitsIter<'r, 't> { - finder: FindIter<'r, 't>, +pub struct Split<'r, 't> { + finder: Matches<'r, 't>, last: usize, } -impl<'r, 't> Iterator for SplitsIter<'r, 't> { +impl<'r, 't> Iterator for Split<'r, 't> { type Item = &'t str; fn next(&mut self) -> Option<&'t str> { @@ -821,12 +821,12 @@ impl<'r, 't> Iterator for SplitsIter<'r, 't> { /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the string being split. 
-pub struct SplitsNIter<'r, 't> { - splits: SplitsIter<'r, 't>, +pub struct SplitN<'r, 't> { + splits: Split<'r, 't>, n: usize, } -impl<'r, 't> Iterator for SplitsNIter<'r, 't> { +impl<'r, 't> Iterator for SplitN<'r, 't> { type Item = &'t str; fn next(&mut self) -> Option<&'t str> { @@ -1032,19 +1032,19 @@ impl<'t, 'i> Index<&'i str> for Captures<'t> { /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the matched string. -pub struct CapturesIter<'r, 't>(CapturesIterInner<'r, 't>); +pub struct CaptureMatches<'r, 't>(CaptureMatchesInner<'r, 't>); -enum CapturesIterInner<'r, 't> { - Dynamic(re_trait::CapturesIter<'t, ExecNoSyncStr<'r>>), - Plugin(re_trait::CapturesIter<'t, Plugin>), +enum CaptureMatchesInner<'r, 't> { + Dynamic(re_trait::CaptureMatches<'t, ExecNoSyncStr<'r>>), + Plugin(re_trait::CaptureMatches<'t, Plugin>), } -impl<'r, 't> Iterator for CapturesIter<'r, 't> { +impl<'r, 't> Iterator for CaptureMatches<'r, 't> { type Item = Captures<'t>; fn next(&mut self) -> Option> { match self.0 { - CapturesIterInner::Dynamic(ref mut it) => { + CaptureMatchesInner::Dynamic(ref mut it) => { let named = it.regex().capture_name_idx().clone(); it.next().map(|locs| Captures { text: it.text(), @@ -1052,7 +1052,7 @@ impl<'r, 't> Iterator for CapturesIter<'r, 't> { named_groups: NamedGroups::Dynamic(named), }) } - CapturesIterInner::Plugin(ref mut it) => { + CaptureMatchesInner::Plugin(ref mut it) => { it.next().map(|locs| Captures { text: it.text(), locs: locs, @@ -1070,32 +1070,32 @@ impl<'r, 't> Iterator for CapturesIter<'r, 't> { /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the matched string. 
-pub struct FindIter<'r, 't>(FindIterInner<'r, 't>); +pub struct Matches<'r, 't>(MatchesInner<'r, 't>); -enum FindIterInner<'r, 't> { - Dynamic(re_trait::FindIter<'t, ExecNoSyncStr<'r>>), - Plugin(re_trait::FindIter<'t, Plugin>), +enum MatchesInner<'r, 't> { + Dynamic(re_trait::Matches<'t, ExecNoSyncStr<'r>>), + Plugin(re_trait::Matches<'t, Plugin>), } -impl<'r, 't> FindIter<'r, 't> { +impl<'r, 't> Matches<'r, 't> { fn text(&self) -> &'t str { match self.0 { - FindIterInner::Dynamic(ref it) => it.text(), - FindIterInner::Plugin(ref it) => it.text(), + MatchesInner::Dynamic(ref it) => it.text(), + MatchesInner::Plugin(ref it) => it.text(), } } } -impl<'r, 't> Iterator for FindIter<'r, 't> { +impl<'r, 't> Iterator for Matches<'r, 't> { type Item = Match<'t>; fn next(&mut self) -> Option> { let text = self.text(); match self.0 { - FindIterInner::Dynamic(ref mut it) => { + MatchesInner::Dynamic(ref mut it) => { it.next().map(|(s, e)| Match::new(text, s, e)) } - FindIterInner::Plugin(ref mut it) => { + MatchesInner::Plugin(ref mut it) => { it.next().map(|(s, e)| Match::new(text, s, e)) } }