Skip to content
49 changes: 43 additions & 6 deletions automata/fa/nfa.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from __future__ import annotations

import re
from collections import deque
from itertools import chain, count, product, repeat
from typing import (
Expand Down Expand Up @@ -216,21 +217,57 @@ def from_regex(
Self
The NFA accepting the language of the input regex.
"""

if input_symbols is None:
input_symbols = frozenset(regex) - RESERVED_CHARACTERS
else:
# First check user-provided input_symbols for reserved characters
if input_symbols is not None:
conflicting_symbols = RESERVED_CHARACTERS & input_symbols
if conflicting_symbols:
raise exceptions.InvalidSymbolError(
f"Invalid input symbols: {conflicting_symbols}"
)

nfa_builder = parse_regex(regex, input_symbols)
# Extract all characters from character classes
class_symbols = set()
range_pattern = re.compile(r"\[([^\]]*)\]")
for match in range_pattern.finditer(regex):
class_content = match.group(1)
pos = 0
while pos < len(class_content):
if pos + 2 < len(class_content) and class_content[pos + 1] == "-":
start_char, end_char = (
class_content[pos],
class_content[pos + 2],
)
if ord(start_char) <= ord(end_char):
for i in range(ord(start_char), ord(end_char) + 1):
class_symbols.add(chr(i))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this can be done with a python update call instead of a loop.

pos += 3
else:
if class_content[pos] != "^": # Skip negation symbol
class_symbols.add(class_content[pos])
pos += 1

# Set up the final input symbols
if input_symbols is None:
# If no input_symbols provided, collect all non-reserved chars from regex
input_symbols_set = set()
for char in regex:
if char not in RESERVED_CHARACTERS:
input_symbols_set.add(char)

# Include all character class symbols
input_symbols_set.update(class_symbols)
final_input_symbols = frozenset(input_symbols_set)
else:
# For user-provided input_symbols, we need to update with character class
# Create a copy to avoid modifying the original input_symbols
final_input_symbols = frozenset(input_symbols).union(class_symbols)

# Build the NFA
nfa_builder = parse_regex(regex, final_input_symbols)

return cls(
states=frozenset(nfa_builder._transitions.keys()),
input_symbols=input_symbols,
input_symbols=final_input_symbols,
transitions=nfa_builder._transitions,
initial_state=nfa_builder._initial_state,
final_states=nfa_builder._final_states,
Expand Down
175 changes: 168 additions & 7 deletions automata/regex/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
BuilderTransitionsT = Dict[int, Dict[str, Set[int]]]

# Characters that carry syntactic meaning in the regex grammar and therefore
# cannot be used as input symbols. "[" and "]" are reserved because they
# delimit character classes.
RESERVED_CHARACTERS = frozenset(
    ("*", "|", "(", ")", "?", " ", "\t", "&", "+", ".", "^", "{", "}", "[", "]")
)


Expand Down Expand Up @@ -414,12 +414,52 @@ def __init__(self, text: str, lower_bound: int, upper_bound: Optional[int]) -> N
self.upper_bound = upper_bound

@classmethod
def from_match(cls: Type[Self], match: re.Match) -> Self:
    """Build a quantifier token from a lexer match.

    Parameters
    ----------
    match : re.Match
        Match for a quantifier. Group 1 holds the lower-bound text and
        group 2 the upper-bound text; group 2 is ``None`` when the
        quantifier contains no comma (the ``{n}`` form) and empty for the
        ``{n,}`` form.

    Returns
    -------
    Self
        Token built from the matched text, the lower bound and the upper
        bound (``None`` meaning "no upper limit").

    Raises
    ------
    exceptions.InvalidRegexError
        If a bound is not an integer, is negative, or the lower bound
        exceeds the upper bound.
    """
    lower_bound_str = match.group(1)
    upper_bound_str = match.group(2)

    # Parse lower bound; an omitted one (as in "{,m}") defaults to zero.
    if not lower_bound_str:
        lower_bound = 0
    else:
        try:
            lower_bound = int(lower_bound_str)
        except ValueError as err:
            # Reached for text such as a lone "-", which is not an integer.
            raise exceptions.InvalidRegexError(
                f"Invalid lower bound: {lower_bound_str}"
            ) from err
        # Checked outside the try block so this specific message can never
        # be intercepted by the ValueError handler above.
        if lower_bound < 0:
            raise exceptions.InvalidRegexError(
                f"Lower bound cannot be negative: {lower_bound}"
            )

    # Parse upper bound
    upper_bound: Optional[int]
    if upper_bound_str is None:
        # Format {n}: exactly n repetitions
        upper_bound = lower_bound
    elif not upper_bound_str:
        # Format {n,}: no upper limit
        upper_bound = None
    else:
        try:
            upper_bound = int(upper_bound_str)
        except ValueError as err:
            raise exceptions.InvalidRegexError(
                f"Invalid upper bound: {upper_bound_str}"
            ) from err
        if upper_bound < 0:
            raise exceptions.InvalidRegexError(
                f"Upper bound cannot be negative: {upper_bound}"
            )

    # Validate bounds relationship. Both literal pieces need the f prefix,
    # otherwise the second placeholder is emitted verbatim.
    if upper_bound is not None and lower_bound > upper_bound:
        raise exceptions.InvalidRegexError(
            f"Lower bound {lower_bound} cannot be "
            f"greater than upper bound {upper_bound}"
        )

    return cls(match.group(), lower_bound, upper_bound)

Expand Down Expand Up @@ -494,14 +534,73 @@ def val(self) -> NFARegexBuilder:
return NFARegexBuilder.wildcard(self.input_symbols, self.counter)


class CharacterClassToken(Literal[NFARegexBuilder]):
    """Subclass of literal token defining a character class.

    A character class such as ``[abc]`` or ``[0-9]`` matches one symbol out of
    ``class_chars``; a negated class such as ``[^abc]`` matches one symbol of
    ``input_symbols`` that is not in ``class_chars``.
    """

    __slots__: Tuple[str, ...] = ("input_symbols", "class_chars", "negated", "counter")

    def __init__(
        self,
        text: str,
        class_chars: Set[str],
        negated: bool,
        input_symbols: AbstractSet[str],
        counter: count,
    ) -> None:
        """Store the parsed class.

        Parameters
        ----------
        text : str
            The matched text of the class, passed to the base token.
        class_chars : Set[str]
            The characters listed in the class, with ranges already expanded.
        negated : bool
            Whether the class is negated.
        input_symbols : AbstractSet[str]
            The alphabet a negated class is taken relative to.
        counter : count
            Counter handed to ``NFARegexBuilder.wildcard`` when the token is
            evaluated.
        """
        super().__init__(text)
        self.class_chars = class_chars
        self.negated = negated
        self.input_symbols = input_symbols
        self.counter = counter

    @classmethod
    def from_match(
        cls: Type[Self],
        match: re.Match,
        input_symbols: AbstractSet[str] = frozenset(),
        counter: Optional[count] = None,
    ) -> Self:
        """Build a character class token from a lexer match.

        The bracket contents are parsed by ``process_char_class`` so that the
        range and negation rules live in one place.

        Parameters
        ----------
        match : re.Match
            Match whose full text is the character class including brackets.
        input_symbols : AbstractSet[str], optional
            Alphabet used to evaluate a negated class. With the empty default
            a negated class accepts no symbol at all.
        counter : count, optional
            Counter to hand to the NFA builder. A fresh ``count()`` is created
            when omitted; NOTE(review): callers combining this token with
            others presumably need to pass the lexer's shared counter, as
            ``get_regex_lexer`` does for the other tokens -- confirm.

        Returns
        -------
        Self
            The token for the matched character class.
        """
        class_str = match.group()
        negated, class_chars = process_char_class(class_str)
        if counter is None:
            counter = count()
        return cls(class_str, class_chars, negated, input_symbols, counter)

    def val(self) -> NFARegexBuilder:
        """Return a builder accepting exactly one symbol allowed by the class."""
        if self.negated:
            # For negated class, accept any input symbol not in class_chars
            acceptable_chars = self.input_symbols - self.class_chars
            return NFARegexBuilder.wildcard(acceptable_chars, self.counter)

        # Otherwise accept any character listed in the class
        return NFARegexBuilder.wildcard(self.class_chars, self.counter)


def add_concat_and_empty_string_tokens(
token_list: List[Token[NFARegexBuilder]],
state_name_counter: count,
) -> List[Token[NFARegexBuilder]]:
"""Add concat tokens to list of parsed infix tokens."""

final_token_list = []

# Pairs of token types to insert concat tokens in between
concat_pairs = [
(Literal, Literal),
Expand All @@ -524,7 +623,6 @@ def add_concat_and_empty_string_tokens(
next_token, secondClass
):
final_token_list.append(ConcatToken(""))

for firstClass, secondClass in empty_string_pairs:
if isinstance(curr_token, firstClass) and isinstance(
next_token, secondClass
Expand All @@ -548,11 +646,27 @@ def get_regex_lexer(
lexer.register_token(KleeneStarToken.from_match, r"\*")
lexer.register_token(KleenePlusToken.from_match, r"\+")
lexer.register_token(OptionToken.from_match, r"\?")
lexer.register_token(QuantifierToken.from_match, r"\{(.*?),(.*?)\}")
# Match the {n}, {n,m}, {n,} and {,m} quantifier formats
lexer.register_token(QuantifierToken.from_match, r"\{(-?\d*)(?:,(-?\d*))?\}")
# Register wildcard and character classes next
lexer.register_token(
lambda match: WildcardToken(match.group(), input_symbols, state_name_counter),
r"\.",
)

# Add character class token
def character_class_factory(match: re.Match) -> CharacterClassToken:
class_str = match.group()
negated, class_chars = process_char_class(class_str)
return CharacterClassToken(
class_str, class_chars, negated, input_symbols, state_name_counter
)

lexer.register_token(
character_class_factory,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would personally prefer to use the from_match syntax the way the other token types are registered, but either syntax is fine. But it seems like the from_match in the new token class isn't being called at all.

r"\[[^\]]*\]", # Match anything between [ and ]
)

lexer.register_token(
lambda match: StringToken(match.group(), state_name_counter), r"\S"
)
Expand All @@ -577,3 +691,50 @@ def parse_regex(regexstr: str, input_symbols: AbstractSet[str]) -> NFARegexBuild
postfix = tokens_to_postfix(tokens_with_concats)

return parse_postfix_tokens(postfix)


def process_char_class(class_str: str) -> Tuple[bool, Set[str]]:
    """Process a character class string into a negation flag and a set of characters.

    Parameters
    ----------
    class_str : str
        The character class string including brackets, e.g., '[a-z]' or '[^abc]'

    Returns
    -------
    Tuple[bool, Set[str]]
        A tuple containing (is_negated, set_of_characters)

    Raises
    ------
    exceptions.InvalidRegexError
        If the class is empty ('[]' or '[^]') or contains a reversed range
        such as '[z-a]'.
    """
    # Drop the surrounding brackets
    content = class_str[1:-1]

    if not content:
        raise exceptions.InvalidRegexError("Empty character class '[]' is not allowed")

    negated = content.startswith("^")
    if negated:
        content = content[1:]

    if not content:
        raise exceptions.InvalidRegexError(
            "Empty negated character class '[^]' is not allowed"
        )

    chars: Set[str] = set()
    i = 0
    while i < len(content):
        # Special case: - at the beginning or end is treated as literal
        if content[i] == "-" and (i == 0 or i == len(content) - 1):
            chars.add("-")
            i += 1
        # Handle ranges - but only when there are characters on both sides
        elif i + 2 < len(content) and content[i + 1] == "-":
            # Range like a-z
            start, end = content[i], content[i + 2]
            if ord(start) > ord(end):
                # A reversed range would expand to nothing and silently drop
                # the characters, so reject it instead.
                raise exceptions.InvalidRegexError(
                    f"Invalid character range '{start}-{end}' in "
                    f"character class '{class_str}'"
                )
            chars.update(chr(c) for c in range(ord(start), ord(end) + 1))
            i += 3
        else:
            chars.add(content[i])
            i += 1

    return negated, chars
4 changes: 4 additions & 0 deletions automata/regex/regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@
- `&`: Intersection. Ex: `a&b`
- `.`: Wildcard. Ex: `a.b`
- `^`: Shuffle. Ex: `a^b`
- `[...]`: Character class, matching any single character from the class.
Ex: `[abc]`, `[0-9]`
- `[^...]`: Negated character class, matching any single character not in the class.
Ex: `[^abc]`
- `{}`: Quantifiers expressing repetition counts. Ex: `a{2}`, `a{1,2}`, `a{3,}`
- `()`: The empty string.
- `(...)`: Grouping.
Expand Down
Loading
Loading