|  | 
| 6 | 6 | 
 | 
| 7 | 7 | import automata.base.exceptions as exceptions | 
| 8 | 8 | import automata.regex.regex as re | 
| 9 |  | -from automata.fa.nfa import NFA, RESERVED_CHARACTERS | 
|  | 9 | +from automata.fa.nfa import NFA | 
| 10 | 10 | from automata.regex.parser import StringToken, WildcardToken | 
| 11 | 11 | 
 | 
| 12 | 12 | 
 | 
| @@ -114,13 +114,13 @@ def test_intersection(self) -> None: | 
| 114 | 114 |         # Test intersection subset | 
| 115 | 115 |         regex_3 = "bcdaaa" | 
| 116 | 116 |         nfa_5 = NFA.from_regex(regex_3) | 
| 117 |  | -        nfa_6 = NFA.from_regex(f"({regex_3}) & (bcda*)") | 
|  | 117 | +        nfa_6 = NFA.from_regex(f"({regex_3})&(bcda*)") | 
| 118 | 118 | 
 | 
| 119 | 119 |         self.assertEqual(nfa_5, nfa_6) | 
| 120 | 120 | 
 | 
| 121 | 121 |         # Test distributive law | 
| 122 |  | -        regex_4 = f"{regex_1} & (({regex_2}) | ({regex_3}))" | 
| 123 |  | -        regex_5 = f"(({regex_1}) & ({regex_2})) | (({regex_1}) & ({regex_3}))" | 
|  | 122 | +        regex_4 = f"{regex_1}&(({regex_2})|({regex_3}))" | 
|  | 123 | +        regex_5 = f"(({regex_1})&({regex_2}))|(({regex_1})&({regex_3}))" | 
| 124 | 124 |         nfa_7 = NFA.from_regex(regex_4) | 
| 125 | 125 |         nfa_8 = NFA.from_regex(regex_5) | 
| 126 | 126 | 
 | 
| @@ -159,18 +159,18 @@ def test_shuffle(self) -> None: | 
| 159 | 159 |         self.assertTrue( | 
| 160 | 160 |             re.isequal( | 
| 161 | 161 |                 "ab^cd", | 
| 162 |  | -                "abcd | acbd | cabd | acdb | cadb | cdab", | 
|  | 162 | +                "abcd|acbd|cabd|acdb|cadb|cdab", | 
| 163 | 163 |                 input_symbols=input_symbols, | 
| 164 | 164 |             ) | 
| 165 | 165 |         ) | 
| 166 | 166 |         self.assertTrue( | 
| 167 | 167 |             re.isequal("(a*)^(b*)^(c*)^(d*)", ".*", input_symbols=input_symbols) | 
| 168 | 168 |         ) | 
| 169 | 169 |         self.assertTrue( | 
| 170 |  | -            re.isequal("ca^db", "(c^db)a | (ca^d)b", input_symbols=input_symbols) | 
|  | 170 | +            re.isequal("ca^db", "(c^db)a|(ca^d)b", input_symbols=input_symbols) | 
| 171 | 171 |         ) | 
| 172 | 172 |         self.assertTrue( | 
| 173 |  | -            re.isequal("a^(b|c)", "ab | ac | ba | ca", input_symbols=input_symbols) | 
|  | 173 | +            re.isequal("a^(b|c)", "ab|ac|ba|ca", input_symbols=input_symbols) | 
| 174 | 174 |         ) | 
| 175 | 175 | 
 | 
| 176 | 176 |         reference_nfa = NFA.from_regex("a*^ba") | 
| @@ -229,10 +229,14 @@ def test_blank(self) -> None: | 
| 229 | 229 |         self.assertTrue(re.isequal("a()", "a")) | 
| 230 | 230 |         self.assertTrue(re.isequal("a()b()()c()", "abc")) | 
| 231 | 231 | 
 | 
| 232 |  | -    def test_invalid_symbols(self) -> None: | 
|  | 232 | +    def test_reserved_characters_handled_correctly(self) -> None: | 
| 233 | 233 |         """Should parse reserved characters as operators even when they are input symbols""" | 
| 234 |  | -        with self.assertRaises(exceptions.InvalidSymbolError): | 
| 235 |  | -            NFA.from_regex("a+", input_symbols={"a", "+"}) | 
|  | 234 | +        nfa = NFA.from_regex("a+", input_symbols={"a", "+"}) | 
|  | 235 | +        self.assertTrue(nfa.accepts_input("a")) | 
|  | 236 | +        self.assertTrue(nfa.accepts_input("aa")) | 
|  | 237 | +        self.assertFalse(nfa.accepts_input("a+")) | 
|  | 238 | +        self.assertFalse(nfa.accepts_input("")) | 
|  | 239 | +        self.assertFalse(nfa.accepts_input("+")) | 
| 236 | 240 | 
 | 
| 237 | 241 |     def test_character_class(self) -> None: | 
| 238 | 242 |         """Should correctly handle character classes""" | 
| @@ -344,7 +348,7 @@ def test_character_class(self) -> None: | 
| 344 | 348 |         self.assertFalse(nfa1.accepts_input("b")) | 
| 345 | 349 | 
 | 
| 346 | 350 |         # One more complex test with and without input symbols | 
| 347 |  | -        input_symbols = set(string.printable) - RESERVED_CHARACTERS | 
|  | 351 | +        input_symbols = set(string.printable) | 
| 348 | 352 |         nfa1 = NFA.from_regex("[a-zA-Z0-9._%+-]+", input_symbols=input_symbols) | 
| 349 | 353 |         self.assertTrue(nfa1.accepts_input("a")) | 
| 350 | 354 |         self.assertTrue(nfa1.accepts_input("1")) | 
| @@ -382,8 +386,6 @@ def create_range(start_char: str, end_char: str) -> set[str]: | 
| 382 | 386 |         ascii_chars = set(string.printable) | 
| 383 | 387 |         input_symbols.update(ascii_chars) | 
| 384 | 388 | 
 | 
| 385 |  | -        input_symbols = input_symbols - RESERVED_CHARACTERS | 
| 386 |  | - | 
| 387 | 389 |         latin_nfa = NFA.from_regex("[¡-ƿ]+", input_symbols=input_symbols) | 
| 388 | 390 |         greek_nfa = NFA.from_regex("[Ͱ-Ͽ]+", input_symbols=input_symbols) | 
| 389 | 391 |         cyrillic_nfa = NFA.from_regex("[Ѐ-ӿ]+", input_symbols=input_symbols) | 
| @@ -437,7 +439,7 @@ def create_range(start_char: str, end_char: str) -> set[str]: | 
| 437 | 439 |         self.assertFalse(non_latin_nfa.accepts_input("a¡")) | 
| 438 | 440 | 
 | 
| 439 | 441 |         alphabet = set("abcdefghijklmnopqrstuvwxyz") | 
| 440 |  | -        alphabet = alphabet - RESERVED_CHARACTERS | 
|  | 442 | +        alphabet = alphabet | 
| 441 | 443 |         safe_input_symbols = input_symbols.union(alphabet) | 
| 442 | 444 | 
 | 
| 443 | 445 |         ascii_range_nfa = NFA.from_regex("[i-p]+", input_symbols=safe_input_symbols) | 
| @@ -625,3 +627,224 @@ def test_shorthand_character_classes(self) -> None: | 
| 625 | 627 |         self.assertTrue(complex_nfa.accepts_input("_\t0\n")) | 
| 626 | 628 |         self.assertFalse(complex_nfa.accepts_input("abc 123\n"))  # space instead of tab | 
| 627 | 629 |         self.assertFalse(complex_nfa.accepts_input("abc\t123"))  # missing newline | 
|  | 630 | + | 
|  | 631 | +    def test_negated_class_with_period(self) -> None: | 
|  | 632 | +        """Test that character classes, including negated ones, handle the period""" | 
|  | 633 | + | 
|  | 634 | +        # A period inside a character class is literal; the trailing one is a wildcard | 
|  | 635 | +        nfa = NFA.from_regex(r"[.]+.", input_symbols={"a"}) | 
|  | 636 | +        self.assertTrue(nfa.accepts_input(".a")) | 
|  | 637 | +        self.assertFalse(nfa.accepts_input("<a")) | 
|  | 638 | + | 
|  | 639 | +        # Create an NFA with a negated character class | 
|  | 640 | +        nfa = NFA.from_regex(r"[^<>]+", input_symbols={"a", "."}) | 
|  | 641 | +        self.assertTrue(nfa.accepts_input(".")) | 
|  | 642 | +        self.assertTrue(nfa.accepts_input("...")) | 
|  | 643 | + | 
|  | 644 | +        nfa = NFA.from_regex(r"[^<>]+", input_symbols=set(string.printable)) | 
|  | 645 | +        # This should match any character except < and > | 
|  | 646 | +        self.assertTrue(nfa.accepts_input("abc")) | 
|  | 647 | +        self.assertTrue(nfa.accepts_input("123")) | 
|  | 648 | +        self.assertTrue(nfa.accepts_input('!@#$%^&*()_+{}|:",./?`~')) | 
|  | 649 | + | 
|  | 650 | +        # These should not match | 
|  | 651 | +        self.assertFalse(nfa.accepts_input("<")) | 
|  | 652 | +        self.assertFalse(nfa.accepts_input(">")) | 
|  | 653 | +        self.assertFalse(nfa.accepts_input("a<b"))  # contains < | 
|  | 654 | +        self.assertFalse(nfa.accepts_input("a>b"))  # contains > | 
|  | 655 | + | 
|  | 656 | +    def test_slash_character(self) -> None: | 
|  | 657 | +        """Should correctly handle the slash character""" | 
|  | 658 | +        nfa = NFA.from_regex(r"/", input_symbols=set(string.printable)) | 
|  | 659 | +        self.assertTrue(nfa.accepts_input("/")) | 
|  | 660 | +        self.assertFalse(nfa.accepts_input("a/b")) | 
|  | 661 | + | 
|  | 662 | +    def test_email_like_regexes(self) -> None: | 
|  | 663 | +        """Should correctly handle email-like regexes""" | 
|  | 664 | +        input_symbols = set(string.printable) | 
|  | 665 | + | 
|  | 666 | +        # Pattern for bracketed email content: ">content<something" | 
|  | 667 | +        bracketed_nfa = NFA.from_regex(r">[^<>]+<.*", input_symbols=input_symbols) | 
|  | 668 | +        self.assertTrue(bracketed_nfa.accepts_input(">user@example.com<")) | 
|  | 669 | +        self.assertTrue(bracketed_nfa.accepts_input(">John Doe<john@example.com")) | 
|  | 670 | +        self.assertFalse(bracketed_nfa.accepts_input("user@example.com"))  # missing > | 
|  | 671 | +        self.assertFalse(bracketed_nfa.accepts_input("><"))  # empty content | 
|  | 672 | + | 
|  | 673 | +        # Pattern for "To:" header field | 
|  | 674 | +        to_header_nfa = NFA.from_regex(r"to:[^\r\n]+\r\n", input_symbols=input_symbols) | 
|  | 675 | +        self.assertTrue(to_header_nfa.accepts_input("to:user@example.com\r\n")) | 
|  | 676 | +        self.assertTrue( | 
|  | 677 | +            to_header_nfa.accepts_input( | 
|  | 678 | +                "to:Multiple Recipients <user@example.com>\r\n" | 
|  | 679 | +            ) | 
|  | 680 | +        ) | 
|  | 681 | +        self.assertFalse( | 
|  | 682 | +            to_header_nfa.accepts_input("to:user@example.com") | 
|  | 683 | +        )  # missing newline | 
|  | 684 | +        self.assertFalse( | 
|  | 685 | +            to_header_nfa.accepts_input("from:user@example.com\r\n") | 
|  | 686 | +        )  # wrong header | 
|  | 687 | + | 
|  | 688 | +        # Pattern for "Subject:" header field | 
|  | 689 | +        subject_nfa = NFA.from_regex( | 
|  | 690 | +            r"\)subject:[^\r\n]+\r\n", input_symbols=input_symbols | 
|  | 691 | +        ) | 
|  | 692 | +        self.assertTrue(subject_nfa.accepts_input(")subject:Hello World\r\n")) | 
|  | 693 | +        self.assertTrue( | 
|  | 694 | +            subject_nfa.accepts_input(")subject:Re: Meeting Tomorrow at 10AM\r\n") | 
|  | 695 | +        ) | 
|  | 696 | +        self.assertFalse( | 
|  | 697 | +            subject_nfa.accepts_input("subject:Hello World\r\n") | 
|  | 698 | +        )  # missing ) | 
|  | 699 | + | 
|  | 700 | +        # Pattern for standard email address | 
|  | 701 | +        email_nfa = NFA.from_regex( | 
|  | 702 | +            r"[A-Za-z0-9!#$%&'*+=?\-\^_`{|}~.\/]+@[A-Za-z0-9.\-@]+", | 
|  | 703 | +            input_symbols=input_symbols, | 
|  | 704 | +        ) | 
|  | 705 | +        self.assertTrue(email_nfa.accepts_input("user@example.com")) | 
|  | 706 | +        self.assertTrue(email_nfa.accepts_input("first.last@sub.example.com")) | 
|  | 707 | +        self.assertTrue(email_nfa.accepts_input("unusual!#$%&'*user@example.com")) | 
|  | 708 | +        self.assertFalse(email_nfa.accepts_input("@example.com"))  # missing local part | 
|  | 709 | +        self.assertFalse(email_nfa.accepts_input("user@"))  # missing domain | 
|  | 710 | + | 
|  | 711 | +        # Pattern for DKIM signature with Base64 hash | 
|  | 712 | +        dkim_bh_nfa = NFA.from_regex( | 
|  | 713 | +            r"dkim-signature:([a-z]+=[^;]+; )+bh=[a-zA-Z0-9+/=]+;", | 
|  | 714 | +            input_symbols=input_symbols, | 
|  | 715 | +        ) | 
|  | 716 | +        self.assertTrue( | 
|  | 717 | +            dkim_bh_nfa.accepts_input( | 
|  | 718 | +                "dkim-signature:v=1; a=rsa-sha256; bh=47DEQpj8HBSa+/TImW+5JCeuQeR;" | 
|  | 719 | +            ) | 
|  | 720 | +        ) | 
|  | 721 | +        self.assertTrue( | 
|  | 722 | +            dkim_bh_nfa.accepts_input( | 
|  | 723 | +                "dkim-signature:v=1; a=rsa-sha256; d=example.org; bh=base64+/hash=;" | 
|  | 724 | +            ) | 
|  | 725 | +        ) | 
|  | 726 | +        self.assertFalse( | 
|  | 727 | +            dkim_bh_nfa.accepts_input("dkim-signature:v=1; bh=;") | 
|  | 728 | +        )  # empty hash | 
|  | 729 | + | 
|  | 730 | +        # Pattern for alternative email address format | 
|  | 731 | +        alt_email_nfa = NFA.from_regex( | 
|  | 732 | +            r"[A-Za-z0-9!#$%&'*+=?\-\^_`{|}~.\/@]+@[A-Za-z0-9.\-]+", | 
|  | 733 | +            input_symbols=input_symbols, | 
|  | 734 | +        ) | 
|  | 735 | +        self.assertTrue(alt_email_nfa.accepts_input("user@example.com")) | 
|  | 736 | +        self.assertTrue( | 
|  | 737 | +            alt_email_nfa.accepts_input("user/name@example.com") | 
|  | 738 | +        )  # with slash | 
|  | 739 | +        self.assertFalse(alt_email_nfa.accepts_input("user@"))  # missing domain | 
|  | 740 | + | 
|  | 741 | +        # Pattern for "From:" header field | 
|  | 742 | +        from_header_nfa = NFA.from_regex( | 
|  | 743 | +            r"from:[^\r\n]+\r\n", input_symbols=input_symbols | 
|  | 744 | +        ) | 
|  | 745 | +        self.assertTrue(from_header_nfa.accepts_input("from:user@example.com\r\n")) | 
|  | 746 | +        self.assertTrue( | 
|  | 747 | +            from_header_nfa.accepts_input("from:John Doe <john@example.com>\r\n") | 
|  | 748 | +        ) | 
|  | 749 | +        self.assertFalse( | 
|  | 750 | +            from_header_nfa.accepts_input("from:user@example.com") | 
|  | 751 | +        )  # missing newline | 
|  | 752 | + | 
|  | 753 | +        # Pattern for DKIM signature with timestamp | 
|  | 754 | +        dkim_time_nfa = NFA.from_regex( | 
|  | 755 | +            r"dkim-signature:([a-z]+=[^;]+; )+t=[0-9]+;", input_symbols=input_symbols | 
|  | 756 | +        ) | 
|  | 757 | +        self.assertTrue( | 
|  | 758 | +            dkim_time_nfa.accepts_input( | 
|  | 759 | +                "dkim-signature:v=1; a=rsa-sha256; t=1623456789;" | 
|  | 760 | +            ) | 
|  | 761 | +        ) | 
|  | 762 | +        self.assertTrue( | 
|  | 763 | +            dkim_time_nfa.accepts_input( | 
|  | 764 | +                "dkim-signature:v=1; a=rsa-sha256; s=selector; t=1623456789;" | 
|  | 765 | +            ) | 
|  | 766 | +        ) | 
|  | 767 | +        self.assertFalse( | 
|  | 768 | +            dkim_time_nfa.accepts_input("dkim-signature:v=1; t=;") | 
|  | 769 | +        )  # empty timestamp | 
|  | 770 | + | 
|  | 771 | +        # Pattern for Message-ID header | 
|  | 772 | +        msgid_nfa = NFA.from_regex( | 
|  | 773 | +            r"message-id:<[A-Za-z0-9=@\.\+_-]+>\r\n", input_symbols=input_symbols | 
|  | 774 | +        ) | 
|  | 775 | +        self.assertTrue(msgid_nfa.accepts_input("message-id:<12345@example.com>\r\n")) | 
|  | 776 | +        self.assertTrue( | 
|  | 777 | +            msgid_nfa.accepts_input("message-id:<abc.def+ghi_123@mail.example.com>\r\n") | 
|  | 778 | +        ) | 
|  | 779 | +        self.assertFalse( | 
|  | 780 | +            msgid_nfa.accepts_input("message-id:<invalid chars!>\r\n") | 
|  | 781 | +        )  # invalid chars | 
|  | 782 | +        self.assertFalse( | 
|  | 783 | +            msgid_nfa.accepts_input("message-id:<12345@example.com>") | 
|  | 784 | +        )  # missing newline | 
|  | 785 | + | 
|  | 786 | +    def test_repeating_group_with_space(self) -> None: | 
|  | 787 | +        """Test a simpler version of the DKIM signature pattern to isolate the issue""" | 
|  | 788 | +        input_symbols = set(string.printable) | 
|  | 789 | + | 
|  | 790 | +        # Try another variation without the space in the pattern | 
|  | 791 | +        no_space = NFA.from_regex(r"([a-z]+=[^;]+;)+", input_symbols=input_symbols) | 
|  | 792 | +        self.assertTrue(no_space.accepts_input("v=1;")) | 
|  | 793 | +        self.assertTrue(no_space.accepts_input("v=1;a=2;")) | 
|  | 794 | + | 
|  | 795 | +        # Test with explicit space character instead of relying on character class | 
|  | 796 | +        explicit_space = NFA.from_regex( | 
|  | 797 | +            r"([a-z]+=[^;]+; )+", input_symbols=input_symbols | 
|  | 798 | +        ) | 
|  | 799 | +        self.assertTrue(explicit_space.accepts_input("v=1; ")) | 
|  | 800 | + | 
|  | 801 | +        # Simplified version of the problematic pattern | 
|  | 802 | +        simple_repeat = NFA.from_regex( | 
|  | 803 | +            r"([a-z]+=[^;]+; )+", input_symbols=input_symbols | 
|  | 804 | +        ) | 
|  | 805 | +        self.assertTrue(simple_repeat.accepts_input("v=1; ")) | 
|  | 806 | +        self.assertTrue(simple_repeat.accepts_input("v=1; a=2; ")) | 
|  | 807 | + | 
|  | 808 | +        # Test the full pattern but simplified | 
|  | 809 | +        full_simple = NFA.from_regex( | 
|  | 810 | +            r"header:([a-z]+=[^;]+; )+value;", input_symbols=input_symbols | 
|  | 811 | +        ) | 
|  | 812 | +        self.assertTrue(full_simple.accepts_input("header:v=1; value;")) | 
|  | 813 | +        self.assertTrue(full_simple.accepts_input("header:v=1; a=2; value;")) | 
|  | 814 | + | 
|  | 815 | +    def test_space_in_patterns(self) -> None: | 
|  | 816 | +        """Test different patterns with spaces to isolate the issue""" | 
|  | 817 | +        input_symbols = set(string.printable) | 
|  | 818 | + | 
|  | 819 | +        # Test 1: Basic pattern with space at the end | 
|  | 820 | +        basic = NFA.from_regex(r"a ", input_symbols=input_symbols) | 
|  | 821 | +        self.assertTrue(basic.accepts_input("a ")) | 
|  | 822 | + | 
|  | 823 | +        # Test 2: Character class with space | 
|  | 824 | +        with_class = NFA.from_regex(r"a[b ]", input_symbols=input_symbols) | 
|  | 825 | +        self.assertTrue(with_class.accepts_input("a ")) | 
|  | 826 | +        self.assertTrue(with_class.accepts_input("ab")) | 
|  | 827 | + | 
|  | 828 | +        # Test 3: Simple repetition with space | 
|  | 829 | +        simple_repeat = NFA.from_regex(r"(a )+", input_symbols=input_symbols) | 
|  | 830 | +        self.assertTrue(simple_repeat.accepts_input("a ")) | 
|  | 831 | +        self.assertTrue(simple_repeat.accepts_input("a a ")) | 
|  | 832 | + | 
|  | 833 | +        # Test 4: Specific repeating pattern without the semicolon | 
|  | 834 | +        no_semicolon = NFA.from_regex(r"([a-z]+=. )+", input_symbols=input_symbols) | 
|  | 835 | +        self.assertTrue(no_semicolon.accepts_input("v=1 ")) | 
|  | 836 | +        self.assertTrue(no_semicolon.accepts_input("v=1 a=2 ")) | 
|  | 837 | + | 
|  | 838 | +        # Test 5: With semicolon but space before | 
|  | 839 | +        space_before = NFA.from_regex(r"([a-z]+=[^;]+ ;)+", input_symbols=input_symbols) | 
|  | 840 | +        self.assertTrue(space_before.accepts_input("v=1 ;")) | 
|  | 841 | +        self.assertTrue(space_before.accepts_input("v=1 ;a=2 ;")) | 
|  | 842 | + | 
|  | 843 | +        # Test 6: Space as part of negated class | 
|  | 844 | +        space_in_neg = NFA.from_regex(r"([a-z]+=[^; ]+;)+", input_symbols=input_symbols) | 
|  | 845 | +        self.assertTrue(space_in_neg.accepts_input("v=1;")) | 
|  | 846 | + | 
|  | 847 | +        # Test 7: Bare minimum to reproduce | 
|  | 848 | +        minimal = NFA.from_regex(r"(a; )+", input_symbols=input_symbols) | 
|  | 849 | +        self.assertTrue(minimal.accepts_input("a; ")) | 
|  | 850 | +        self.assertTrue(minimal.accepts_input("a; a; ")) | 
0 commit comments