diff --git a/stl/inc/regex b/stl/inc/regex index d688a526e3..2409ce9bd5 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -1682,14 +1682,19 @@ enum class _Rx_unwind_ops { _Do_nothing, _Loop_simple_nongreedy, _Loop_simple_greedy, + _Loop_nongreedy, + _Loop_greedy, + _Loop_restore_vals, }; template class _Rx_state_frame_t { public: _Rx_unwind_ops _Code; + int _Loop_idx_sav; _Node_base* _Node; _Tgt_state_t<_BidIt> _Match_state; + size_t _Loop_frame_idx_sav; }; template @@ -1816,7 +1821,6 @@ private: void _Decrease_stack_usage_count(); void _Increase_complexity_count(); - bool _Do_rep(_Node_rep*, bool, int); void _Prepare_rep(_Node_rep*); bool _Find_first_inner_capture_group(_Node_base*, _Loop_vals_v2_t*); _It _Do_class(_Node_base*, _It); @@ -3372,7 +3376,7 @@ void _Builder2<_FwdIt, _Elem, _RxTraits>::_Tidy() noexcept { // free memory template size_t _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Push_frame(_Rx_unwind_ops _Code, _Node_base* _Node) { if (_Frames_count >= _Frames.size()) { - _Frames.push_back({_Code, _Node, _Tgt_state}); + _Frames.push_back({_Code, 0, _Node, _Tgt_state, size_t{}}); } else { auto& _Frame = _Frames[_Frames_count]; _Frame._Code = _Code; @@ -3413,74 +3417,6 @@ void _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Increase_complexity_coun } } -template -bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep(_Node_rep* _Node, bool _Greedy, int _Init_idx) { - // apply repetition - bool _Matched0 = false; - _Loop_vals_v2_t* _Psav = &_Loop_vals[_Node->_Loop_number]; - const int _Loop_idx_sav = _Psav->_Loop_idx; - const size_t _Loop_frame_idx_sav = _Psav->_Loop_frame_idx; - const size_t _Frame_idx = _Push_frame(); - const bool _Progress = _Init_idx == 0 || _Frames[_Loop_frame_idx_sav]._Match_state._Cur != _Tgt_state._Cur; - - if (_Init_idx < _Node->_Min) { // try another required match - _Psav->_Loop_frame_idx = _Frame_idx; - _Psav->_Loop_idx = _Progress ? _Init_idx + 1 : _Node->_Min; // try only one more match after an empty match - _STD fill(_Tgt_state._Grp_valid.begin() + static_cast(_Psav->_Group_first), - _Tgt_state._Grp_valid.end(), false); - _Matched0 = _Match_pat(_Node->_Next); - } else if (_Init_idx == _Node->_Min || _Progress) { - if (0 <= _Node->_Max && _Node->_Max <= _Init_idx) { - _Matched0 = _Match_pat(_Node->_End_rep->_Next); // reps done, try tail - } else if (_Longest) { // longest, try any number of repetitions - - // match with no further repetition - _Matched0 = _Match_pat(_Node->_End_rep->_Next); - - // try to match with one more repetition - _Tgt_state = _Frames[_Frame_idx]._Match_state; - _Psav->_Loop_idx = _Init_idx + 1; - _Psav->_Loop_frame_idx = _Frame_idx; - if (_Match_pat(_Node->_Next)) { // always call _Match_pat, even when _Matched0 is already true - _Matched0 = true; - } - } else if (!_Greedy) { // not greedy, favor minimum number of reps - _Matched0 = _Match_pat(_Node->_End_rep->_Next); - if (!_Matched0) { // tail failed, try another rep - _Tgt_state = _Frames[_Frame_idx]._Match_state; - _Psav->_Loop_idx = _Init_idx + 1; - _Psav->_Loop_frame_idx = _Frame_idx; - _STD fill(_Tgt_state._Grp_valid.begin() + static_cast(_Psav->_Group_first), - _Tgt_state._Grp_valid.end(), false); - _Matched0 = _Match_pat(_Node->_Next); - } - } else { // greedy, favor maximum number of reps, - // so try another rep - _Psav->_Loop_idx = _Init_idx + 1; - _Psav->_Loop_frame_idx = _Frame_idx; - _STD fill(_Tgt_state._Grp_valid.begin() + static_cast(_Psav->_Group_first), - _Tgt_state._Grp_valid.end(), false); - _Matched0 = _Match_pat(_Node->_Next); - - if (!_Matched0) { // rep failed, try tail - _Psav->_Loop_idx = _Loop_idx_sav; - _Psav->_Loop_frame_idx = _Loop_frame_idx_sav; - _Tgt_state = _Frames[_Frame_idx]._Match_state; - _Matched0 = _Match_pat(_Node->_End_rep->_Next); - } - } - } else if (_Init_idx == 1 && (_Sflags & regex_constants::_Any_posix)) { - // POSIX allows an empty repetition if the subexpression is matched only once, - // so try tail - _Matched0 = _Match_pat(_Node->_End_rep->_Next); - } - - _Psav->_Loop_idx = _Loop_idx_sav; - _Psav->_Loop_frame_idx = _Loop_frame_idx_sav; - _Pop_frame(_Frame_idx); - return _Matched0; -} - template void _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Prepare_rep(_Node_rep* _Node) { _Loop_vals_v2_t* _Psav = &_Loop_vals[_Node->_Loop_number]; @@ -4055,9 +3991,9 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N auto _Node = static_cast<_Node_rep*>(_Nx); _Prepare_rep(_Node); bool _Greedy = (_Node->_Flags & _Fl_greedy) != 0; + auto& _Sav = _Loop_vals[_Node->_Loop_number]; if (_Node->_Simple_loop == 1) { - auto& _Sav = _Loop_vals[_Node->_Loop_number]; _Sav._Loop_frame_idx = _Push_frame(_Rx_unwind_ops::_Do_nothing); _Increase_complexity_count(); if (_Node->_Min > 0 || (_Greedy && !_Longest && _Node->_Max != 0)) { // try a rep first @@ -4078,8 +4014,33 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N } } } else { - _Failed = !_Do_rep(_Node, _Greedy, 0); - _Next = nullptr; + if (_Node->_Min > 0 || (_Greedy && !_Longest && _Node->_Max != 0)) { // try a rep first + // set up stack unwinding for greedy matching or loop val restoration + const auto _Code = + _Node->_Min == 0 ? _Rx_unwind_ops::_Loop_greedy : _Rx_unwind_ops::_Loop_restore_vals; + auto _Frame_idx = _Push_frame(_Code, _Node); + auto& _Frame = _Frames[_Frame_idx]; + _Frame._Loop_idx_sav = _Sav._Loop_idx; + _Frame._Loop_frame_idx_sav = _Sav._Loop_frame_idx; + _Sav._Loop_idx = 1; + _Sav._Loop_frame_idx = _Frame_idx; + _Increase_stack_usage_count(); + // _Next is already assigned correctly for matching a rep + } else { // try tail first + _Next = _Node->_End_rep->_Next; + // set up stack unwinding for non-greedy matching if at least one rep is allowed + if (_Node->_Max != 0) { + auto _Frame_idx = _Push_frame(_Rx_unwind_ops::_Loop_nongreedy, _Node); + auto& _Frame = _Frames[_Frame_idx]; + _Frame._Loop_idx_sav = _Sav._Loop_idx; + _Frame._Loop_frame_idx_sav = _Sav._Loop_frame_idx; + _Sav._Loop_idx = 0; + _Sav._Loop_frame_idx = _Frame_idx; + _Increase_stack_usage_count(); + } else { + _Increase_complexity_count(); + } + } } } @@ -4128,8 +4089,62 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N _Increase_complexity_count(); } } else { - _Failed = !_Do_rep(_Nr, _Greedy, _Sav._Loop_idx); - _Next = nullptr; + const bool _Progress = _Frames[_Sav._Loop_frame_idx]._Match_state._Cur != _Tgt_state._Cur; + if (_Sav._Loop_idx < _Nr->_Min) { // try another required match + auto _Frame_idx = _Push_frame(_Rx_unwind_ops::_Loop_restore_vals, _Nr); + auto& _Frame = _Frames[_Frame_idx]; + _Frame._Loop_idx_sav = _Sav._Loop_idx; + _Frame._Loop_frame_idx_sav = _Sav._Loop_frame_idx; + _Sav._Loop_frame_idx = _Frame_idx; + if (_Progress) { + ++_Sav._Loop_idx; + } else { // try only one more match after an empty match + _Sav._Loop_idx = _Nr->_Min; + } + _STD fill(_Tgt_state._Grp_valid.begin() + static_cast(_Sav._Group_first), + _Tgt_state._Grp_valid.end(), false); + _Next = _Nr->_Next; + _Increase_stack_usage_count(); + } else if (!_Progress) { // latest rep match empty + // An empty match is allowed if it is needed to reach the minimum number of reps. + // Moreover, POSIX allows an empty repetition if the subexpression is matched only once. + // So try tail in either case, else fail. + if (_Sav._Loop_idx != _Nr->_Min + && !((_Sflags & regex_constants::_Any_posix) && _Sav._Loop_idx == 1)) { + _Failed = true; + } else { + _Increase_complexity_count(); + } + // _Next is already assigned correctly for matching tail + } else if (_Greedy && !_Longest && _Sav._Loop_idx != _Nr->_Max) { // one more rep to try next + // set up stack unwinding for greedy matching + auto _Frame_idx = _Push_frame(_Rx_unwind_ops::_Loop_greedy, _Nr); + auto& _Frame = _Frames[_Frame_idx]; + _Frame._Loop_idx_sav = _Sav._Loop_idx; + _Frame._Loop_frame_idx_sav = _Sav._Loop_frame_idx; + _Sav._Loop_frame_idx = _Frame_idx; + if (_Sav._Loop_idx < INT_MAX) { + ++_Sav._Loop_idx; + } + + _STD fill(_Tgt_state._Grp_valid.begin() + static_cast(_Sav._Group_first), + _Tgt_state._Grp_valid.end(), false); + _Next = _Nr->_Next; + _Increase_stack_usage_count(); + } else { // non-greedy matching or greedy matching with maximum reached + // set up stack unwinding for non-greedy matching if one more rep is allowed + if (_Sav._Loop_idx != _Nr->_Max) { + auto _Frame_idx = _Push_frame(_Rx_unwind_ops::_Loop_nongreedy, _Nr); + auto& _Frame = _Frames[_Frame_idx]; + _Frame._Loop_idx_sav = _Sav._Loop_idx; + _Frame._Loop_frame_idx_sav = _Sav._Loop_frame_idx; + _Sav._Loop_frame_idx = _Frame_idx; + _Increase_stack_usage_count(); + } else { + _Increase_complexity_count(); + } + // _Next is already assigned correctly for matching tail + } } break; } @@ -4249,6 +4264,51 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N } break; + case _Rx_unwind_ops::_Loop_greedy: + // try tail if matching one more rep failed + if (_Failed) { + auto _Node = static_cast<_Node_rep*>(_Frame._Node); + + _Increase_complexity_count(); + _Nx = _Node->_End_rep->_Next; + _Tgt_state = _Frame._Match_state; + _Failed = false; + } + _FALLTHROUGH; + + case _Rx_unwind_ops::_Loop_restore_vals: + { // restore loop vals after processing of a rep is completed + auto _Node = static_cast<_Node_rep*>(_Frame._Node); + auto& _Sav = _Loop_vals[_Node->_Loop_number]; + + _Sav._Loop_idx = _Frame._Loop_idx_sav; + _Sav._Loop_frame_idx = _Frame._Loop_frame_idx_sav; + + _Decrease_stack_usage_count(); + } + break; + + case _Rx_unwind_ops::_Loop_nongreedy: + // try another rep if matching tail failed or longest mode + if (_Failed || _Longest) { + auto _Node = static_cast<_Node_rep*>(_Frame._Node); + auto& _Sav = _Loop_vals[_Node->_Loop_number]; + + _Increase_complexity_count(); + _Nx = _Node->_Next; + _Tgt_state = _Frame._Match_state; + _STD fill(_Tgt_state._Grp_valid.begin() + static_cast(_Sav._Group_first), + _Tgt_state._Grp_valid.end(), false); + _Failed = false; + if (_Sav._Loop_idx < INT_MAX) { // avoid overflowing _Loop_idx + ++_Sav._Loop_idx; + } + + _Frame._Code = _Rx_unwind_ops::_Loop_restore_vals; + ++_Frames_count; + } + break; + default: #if _ITERATOR_DEBUG_LEVEL != 0 _STL_REPORT_ERROR("internal stack of regex matcher corrupted"); diff --git a/tests/std/tests/VSO_0000000_regex_use/test.cpp b/tests/std/tests/VSO_0000000_regex_use/test.cpp index b779b4f051..794e3ffd06 100644 --- a/tests/std/tests/VSO_0000000_regex_use/test.cpp +++ b/tests/std/tests/VSO_0000000_regex_use/test.cpp @@ -899,6 +899,37 @@ void test_gh_993() { } } +void test_gh_997() { + // GH-997: : Grouping within repetition causes regex stack error + // GH-1528: : regex_match gets caught in recursive loop until stack overflow occurs + + try { + (void) regex_match(string(1025, 'a'), regex("(?:a)+")); + assert(false); // adjust test when matching succeeds + } catch (const regex_error& ex) { + assert(ex.code() == error_stack); + } + + wregex rgx(LR"(^http[s]?://([^.]+\.)*example\.com/.*$)", icase); + + assert(regex_match(L"https://www.example.com/meow", rgx)); + + try { + assert(!regex_match( + L"https://www.bogus.invalid/" + L"123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-" + L"123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-" + L"123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-" + L"123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-" + L"123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456.89-123456789-123456789." + L"123456789-12345678.-123456789-123456789-1.3456789-123456789-123456789-123456789-123456789-123456789-" + L"123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-1234", + rgx)); + } catch (const regex_error& ex) { + assert(ex.code() == error_stack); + } +} + void test_gh_4995() { // GH-4995: R"([\d-e])" should be rejected g_regexTester.should_throw(R"([\d-e])", error_range); @@ -2232,6 +2263,89 @@ void test_gh_5792() { g_regexTester.should_match("bc", "(?:(?!ab))+bc"); } +void test_gh_5798() { + // GH-5798: : Process generic loops non-recursively. + // This extends our test coverage on non-simple loops, + // especially on bounds on the number of repetitions. + for (string quantifier_suffix : {"", "?"}) { + g_regexTester.should_not_match("", "(a|bc)+" + quantifier_suffix); + g_regexTester.should_match("b", "(a|cd){0}?b" + quantifier_suffix); + g_regexTester.should_not_match("ab", "(a|cd){0}?b" + quantifier_suffix); + g_regexTester.should_match("ab", "(a|cd){0,1}?b" + quantifier_suffix); + g_regexTester.should_not_match("aab", "(a|cd){0,1}?b" + quantifier_suffix); + g_regexTester.should_match("acdb", "(a|cd){0,2}?b" + quantifier_suffix); + g_regexTester.should_match("cdab", "(a|cd){1,2}?b" + quantifier_suffix); + g_regexTester.should_not_match("acdb", "(a|cd){1}?b" + quantifier_suffix); + g_regexTester.should_not_match("cdacdb", "(a|cd){1,2}?b" + quantifier_suffix); + g_regexTester.should_match("cdacdb", "(a|cd){1,3}?b" + quantifier_suffix); + g_regexTester.should_match("a", "(a|(?=^)){2}" + quantifier_suffix); + } + + // Check that greedy and non-greedy search find the appropriate match. + // For the following regexes, greedy and leftmost-longest search yield the same matches. + for (syntax_option_type options : {ECMAScript, extended}) { + { + test_regex greedy_a_or_bc_star(&g_regexTester, "(a|bc)*", options); + greedy_a_or_bc_star.should_search_match("aabcabcabcbcaa", "aabcabcabcbcaa"); + } + + { + test_regex bounded_greedy_a_or_bc_rep(&g_regexTester, "(a|bc){5}", options); + bounded_greedy_a_or_bc_rep.should_search_match("aabcabcabcbcaa", "aabcabc"); + } + + { + test_regex upper_bounded_greedy_a_or_bc_rep(&g_regexTester, "(a|bc){0,5}", options); + upper_bounded_greedy_a_or_bc_rep.should_search_match("aabcabcabcbcaa", "aabcabc"); + } + + { + test_regex lower_bounded_greedy_a_or_bc_rep(&g_regexTester, "(a|bc){4,1000}", options); + lower_bounded_greedy_a_or_bc_rep.should_search_match("aabcabcabcbcaa", "aabcabcabcbcaa"); + } + + { + test_regex lower_and_upper_bounded_greedy_a_or_bc_rep(&g_regexTester, "(a|bc){2,5}", options); + lower_and_upper_bounded_greedy_a_or_bc_rep.should_search_match("aabcabcabcbcaa", "aabcabc"); + } + + { + test_regex too_large_min_greedy_a_or_bc_rep(&g_regexTester, "(a|bc){11,1000}", options); + too_large_min_greedy_a_or_bc_rep.should_search_fail("aabcabcabcbcaa"); + } + } + + { + test_regex nongreedy_a_or_bc_star(&g_regexTester, "(a|bc)*?"); + nongreedy_a_or_bc_star.should_search_match("aabcabcabcbcaa", ""); + } + + { + test_regex bounded_nongreedy_a_or_bc_rep(&g_regexTester, "(a|bc){5}?"); + bounded_nongreedy_a_or_bc_rep.should_search_match("aabcabcabcbcaa", "aabcabc"); + } + + { + test_regex upper_bounded_nongreedy_a_or_bc_rep(&g_regexTester, "(a|bc){0,5}?"); + upper_bounded_nongreedy_a_or_bc_rep.should_search_match("aabcabcabcbcaa", ""); + } + + { + test_regex lower_bounded_nongreedy_a_or_bc_rep(&g_regexTester, "(a|bc){4,1000}?"); + lower_bounded_nongreedy_a_or_bc_rep.should_search_match("aabcabcabcbcaa", "aabca"); + } + + { + test_regex lower_and_upper_bounded_nongreedy_a_or_bc_rep(&g_regexTester, "(a|bc){2,5}?"); + lower_and_upper_bounded_nongreedy_a_or_bc_rep.should_search_match("aabcabcabcbcaa", "aa"); + } + + { + test_regex too_large_min_nongreedy_a_or_bc_rep(&g_regexTester, "(a|bc){11,1000}?"); + too_large_min_nongreedy_a_or_bc_rep.should_search_fail("aabcabcabcbcaa"); + } +} + int main() { test_dev10_449367_case_insensitivity_should_work(); test_dev11_462743_regex_collate_should_not_disable_regex_icase(); @@ -2263,6 +2377,7 @@ int main() { test_gh_731(); test_gh_992(); test_gh_993(); + test_gh_997(); test_gh_4995(); test_gh_5058(); test_gh_5160(); @@ -2286,6 +2401,7 @@ int main() { test_gh_5774(); test_gh_5790(); test_gh_5792(); + test_gh_5798(); return g_regexTester.result(); }