Changes from 9 commits
1 change: 1 addition & 0 deletions .gitignore
@@ -3,3 +3,4 @@ venv
.env
__pycache__
runtime
poetry.lock
58 changes: 46 additions & 12 deletions README.md
@@ -91,49 +91,83 @@ ast_tuple_of_nodes = parse(message_content)
These are the types of nodes the parser will output:
```
TEXT
- fields: "text_content"
- fields: "content"
- Just standard text, no additional formatting
- No child nodes

ITALIC, BOLD, UNDERLINE, STRIKETHROUGH, SPOILER, CODE_INLINE
- fields: "children"
- fields: "children", "content"
- self-explanatory

QUOTE_BLOCK
- fields: "children"
- fields: "children", "content"
- represents a single, uninterrupted quote block (no gaps in Discord's client)
- cannot contain another quote block (Discord has no nested quotes)

CODE_BLOCK
- fields: "children", "code_lang"
- fields: "children", "code_lang", "content"
- can only contain a single TEXT node; all other markdown syntax inside the code block
  is ignored
- may or may not have a language specifier
- first newline is stripped according to the same rules that the Discord client uses

USER, ROLE, CHANNEL
- fields: "discord_id"
- fields: "id"
- user, role, or channel mention
- there is no way to retrieve the user/role/channel name, color or channel type
(text/voice/stage) from just the message, so you'll have to use the API
(or discord.py) to query that

URL_WITH_PREVIEW, URL_WITHOUT_PREVIEW
- fields: "url"
URL_WITH_PREVIEW, URL_WITHOUT_PREVIEW, URL_WITH_PREVIEW_EMBEDDED, URL_WITHOUT_PREVIEW_EMBEDDED
- fields: "url", "content"
- an HTTP URL
- this is only recognized if the link actually contains "http". This is the same for the
  Discord client, except that the Discord client also scans for invite links
  that don't start with http, e.g., "discord.gg/pxa"
- the WITHOUT_PREVIEW variant appears when the message contains the URL in the <URL>
form, which causes the Discord client to suppress the preview
- "content" (the link's display text) is only provided for the URL_WITH_PREVIEW_EMBEDDED
  and URL_WITHOUT_PREVIEW_EMBEDDED variants

EMOJI_CUSTOM
- fields: "emoji_name", "emoji_id"
- you can get the custom emoji's image by querying to
EMOJI_CUSTOM, EMOJI_CUSTOM_ANIMATED
- fields: "content", "id", "url"
- the emoji's image URL is returned in the "url" field:
  https://cdn.discordapp.com/emojis/EMOJI_ID.png (EMOJI_CUSTOM)
  https://cdn.discordapp.com/emojis/EMOJI_ID.gif (EMOJI_CUSTOM_ANIMATED)

EMOJI_UNICODE
- fields: "content", "url"
- unicode emoji, e.g., 🚗
- the emoji's image URL is returned in the "url" field:
  https://emoji.fileformat.info/png/1f697.png

EMOJI_UNICODE_ENCODED
- fields: "emoji_name"
- fields: "content"
- unicode emojis that are encoded using the Discord client's emoji encoding method
- this will appear very rarely. unicode emojis are usually just posted as unicode
characters and thus end up in a TEXT node

EMOJI_CUSTOM_ENCODED, EMOJI_CUSTOM_ANIMATED_ENCODED
- fields: "content", "id"
- custom emojis that are encoded using the Discord client's emoji encoding method
- you can get the custom emoji's image by querying
  https://cdn.discordapp.com/emojis/EMOJI_ID.png

EMOJI_CUSTOM_NAME, EMOJI_CUSTOM_ANIMATED_NAME
- fields: "content", "name"
- custom emojis that are posted using their name, e.g., :red_car:
- you can get the custom emoji's image by querying
  https://cdn.discordapp.com/emojis/EMOJI_ID.png

EMOJI_CUSTOM_NAME_ENCODED, EMOJI_CUSTOM_ANIMATED_NAME_ENCODED
- fields: "content", "name"
- custom emojis that are posted using their name and encoded using the Discord client's
emoji encoding method, e.g., <:red_car:123456789123456789>
- you can get the custom emoji's image by querying
  https://cdn.discordapp.com/emojis/EMOJI_ID.png

EMOJI_UNICODE_ENCODED
- fields: "content"
- this will appear very rarely. unicode emojis are usually just posted as unicode
  characters and thus end up in a TEXT node. It is, however, possible to send a message
  from a bot that uses, e.g., :red_car: instead of the actual red_car unicode emoji.
@@ -162,4 +196,4 @@ with how it's rendered in the Discord client:
will be detected as spoilers spanning the code segments, although the Discord
client will only show spoiler bars before and after the code segment, but not on top
of it.

- Custom parsers are experimental and may not work reliably for every pattern and value pair.
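The emoji CDN URLs listed above follow a simple pattern. A minimal sketch of a helper that derives the image URL from a parsed node — note that the dict shape used here (a `node_type` string plus the `id` field) is an assumption for illustration, not the package's documented output:

```python
# Sketch: build the CDN image URL for a parsed custom-emoji node.
# The node dict shape here is hypothetical; adapt the keys to the actual output.
def custom_emoji_url(node: dict) -> str:
    # Animated custom emojis are served as GIFs, static ones as PNGs.
    ext = "gif" if "ANIMATED" in node["node_type"] else "png"
    return f"https://cdn.discordapp.com/emojis/{node['id']}.{ext}"

node = {"node_type": "EMOJI_CUSTOM_ANIMATED", "id": "123456789123456789"}
url = custom_emoji_url(node)
# url → "https://cdn.discordapp.com/emojis/123456789123456789.gif"
```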
25 changes: 16 additions & 9 deletions discord_markdown_ast_parser/__init__.py
@@ -1,23 +1,30 @@
from typing import Any, Dict, List
from typing import Any, Dict, List, Union

from discord_markdown_ast_parser.lexer import lex
from discord_markdown_ast_parser.parser import Node, parse_tokens
from .lexer import lex, Lexing
from .parser import Node, parse_tokens


def parse(text) -> List[Node]:
def lexing_list_convert(lexing: Union[List[Lexing], Lexing]) -> List[Lexing]:
if not isinstance(lexing, list):
lexing = [lexing]
return [Lexing(item) if isinstance(item, str) else item for item in lexing]


def parse(text, custom: Dict[str, List[Lexing]] = None) -> List[Node]:
"""
Parses the text and returns an AST, using this package's internal Node
representation.
See parse_to_dict for a more generic string representation.
"""
tokens = list(lex(text))
return parse_tokens(tokens)
custom = custom if custom is not None else {}
custom = {k: lexing_list_convert(v) for k, v in custom.items()}
tokens = list(lex(text, custom))
return parse_tokens(tokens, custom)


def parse_to_dict(text) -> List[Dict[str, Any]]:
def parse_to_dict(text, custom: Dict[str, List[Lexing]] = None) -> List[Dict[str, Any]]:
"""
Parses the text and returns an AST, represented as a dict.
See the README for information on the structure of this dict.
"""
node_ast = parse(text)
return [node.to_dict() for node in node_ast]
return [node.to_dict() for node in parse(text, custom)]
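The `lexing_list_convert` helper above normalizes the `custom` argument so callers can pass a single rule, a bare pattern string, or a mixed list. A self-contained sketch of the same normalization, using a minimal stand-in for the package's `Lexing` class:

```python
from typing import List, Union

class Lexing:  # minimal stand-in for the package's Lexing class
    def __init__(self, pattern=None):
        self.pattern = pattern

def lexing_list_convert(lexing: Union[List, Lexing, str]) -> List[Lexing]:
    # A single item is wrapped in a list; bare pattern strings become Lexing objects.
    if not isinstance(lexing, list):
        lexing = [lexing]
    return [Lexing(item) if isinstance(item, str) else item for item in lexing]

# A bare string, a Lexing instance, or a mixed list all normalize the same way:
rules = lexing_list_convert([r"\d+", Lexing(r"[a-z]+")])
```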
154 changes: 63 additions & 91 deletions discord_markdown_ast_parser/lexer.py
@@ -1,76 +1,75 @@
import re
from dataclasses import dataclass
from dataclasses import dataclass, field
from enum import Enum
from typing import Optional, List, Generator


class TokenType(Enum):
TEXT_INLINE = 1
NEWLINE = 2
STAR = 3
UNDERSCORE = 4
TILDE = 5
SPOILER_DELIMITER = 6
USER_MENTION = 7
ROLE_MENTION = 8
CHANNEL_MENTION = 9
EMOJI_CUSTOM = 10
EMOJI_UNICODE_ENCODED = 11
URL_WITH_PREVIEW = 12
URL_WITHOUT_PREVIEW = 13
QUOTE_LINE_PREFIX = 14
CODE_INLINE_DELIMITER = 15
CODE_BLOCK_DELIMITER = 16
from typing import Optional, List, Generator, Dict
import itertools


@dataclass
class Token:
token_type: TokenType
value: str
groups: Optional[List[str]] = None
class Lexing:
    def __init__(self, pattern: Optional[str] = None, flags: re.RegexFlag = re.RegexFlag(0)):
self.regex = re.compile(pattern, flags=flags) if pattern else None

def __call__(self, text: str):
return self.regex and self.regex.match(text)

def __repr__(self):
return f"{self.__class__.__name__}({self.regex and self.regex.pattern!r})"


URL_REGEX = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-z]{2,4}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"


class LexingRule(Lexing, Enum):
USER_MENTION = r"<@!?(\d{15,20})>"
ROLE_MENTION = r"<@&(\d{15,20})>"
SLASH_COMMAND_MENTION = r"</([a-zA-Z0-9_ ]{2,}):(\d{15,20})>"
CHANNEL_MENTION = r"<#(\d{15,20})>"
TIMESTAMP = r"<t:(-?\d+)(?::([tTdDfFR]))?>"
EMOJI_CUSTOM = r"<:([a-zA-Z0-9_]{2,}):(\d{15,20})>"
EMOJI_CUSTOM_ANIMATED = r"<a:([a-zA-Z0-9_]{2,}):(\d{15,20})>"
EMOJI_UNICODE = r"(\u00a9|\u00ae|[\u2000-\u3300]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff])"
EMOJI_UNICODE_ENCODED = r":([a-zA-Z0-9_]+):"
URL_WITHOUT_PREVIEW_EMBEDDED = fr"\[([^\]]+)\]\(<({URL_REGEX})>\)"
URL_WITH_PREVIEW_EMBEDDED = fr"\[([^\]]+)\]\(({URL_REGEX})\)"
URL_WITHOUT_PREVIEW = fr"<{URL_REGEX}>"
URL_WITH_PREVIEW = URL_REGEX
QUOTE_LINE_PREFIX = r"(>>)?> "
TILDE = r"~"
STAR = r"\*"
UNDERSCORE = r"_"
SPOILER_DELIMITER = r"\|\|"
CODE_BLOCK_DELIMITER = r"```"
CODE_INLINE_DELIMITER = r"`"
NEWLINE = r"\n"
TEXT_INLINE = ""


@dataclass
class LexingRule:
token_type: TokenType
pattern: Optional[str] = None
class Token:
value: str = ""
lexing_rule: Lexing = LexingRule.TEXT_INLINE
groups: List[str] = field(default_factory=list)

def __contains__(self, rule: Lexing):
return self.lexing_rule == rule

def lex(input_text: str) -> Generator[Token, None, None]:
"""
Scans the input text for sequences of characters (=tokens), identified by regular
expressions, that have a special meaning in Discord's Markdown.

This function takes care of identifying the low-level elements of the text such as
markdown special characters. It also does pretty much all of the parsing work for
simple structures such as user mentions that can be identified via regular
expressions.
def lex(input_text: str, custom: Optional[Dict[str, List[Lexing]]] = None) -> Generator[Token, None, None]:
    """Lexes the input text and yields a token for each lexing rule that matches.

    Outputs the tokens in the order that they appear in the input text; runs of
    plain text that match no rule are yielded as single text tokens.

    Args:
        input_text (str): String to lex
        custom (Optional[Dict[str, List[Lexing]]]): Extra lexing rules, tried before the built-in ones

    Yields:
        Generator[Token, None, None]: Generator of tokens
    """
# There will be cases when no specific lexing rules matches.
#
# This happens when what we're looking at is just simple text with no special
# markdown meaning.
#
# Problem is: We're generally only trying to match our regex pattern against the
# prefix of what we're looking at, so if we go through all of our rules and end up
# noticing "Oh, that's just text", then we don't know how long that text segment
# is going to be.
#
# So we're going to continue scanning until we arrive at something that is not just
# text, at which point we're going to output all the text we've found as a single
# text token.
seen_simple_text = ""
custom = custom or {}

while True:
if len(input_text) == 0:
if len(seen_simple_text) > 0:
yield Token(TokenType.TEXT_INLINE, seen_simple_text)
return

for rule in lexing_rules:
match = re.match(rule.pattern, input_text)
while input_text:
for rule in itertools.chain(*custom.values(), LexingRule):
match = rule(input_text)
if match is not None:
matching_rule = rule
break
@@ -84,37 +83,10 @@ def lex(input_text: str) -> Generator[Token, None, None]:

# yield inline text if we have some left
if len(seen_simple_text) > 0:
yield Token(TokenType.TEXT_INLINE, seen_simple_text)
yield Token(seen_simple_text)
seen_simple_text = ""

groups = None
if len(match.groups()) > 0:
groups = match.groups()

yield Token(matching_rule.token_type, match[0], groups)


# stolen from https://www.urlregex.com/
URL_REGEX = (
r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
)

lexing_rules = [
LexingRule(token_type=TokenType.USER_MENTION, pattern="<@!?([0-9]+)>"),
LexingRule(token_type=TokenType.ROLE_MENTION, pattern="<@&([0-9]+)>"),
LexingRule(token_type=TokenType.CHANNEL_MENTION, pattern="<#([0-9]+)>"),
LexingRule(
token_type=TokenType.EMOJI_CUSTOM, pattern="<:([a-zA-Z0-9_]{2,}):([0-9]+)>"
),
LexingRule(token_type=TokenType.EMOJI_UNICODE_ENCODED, pattern=":([a-zA-Z0-9_]+):"),
LexingRule(token_type=TokenType.URL_WITHOUT_PREVIEW, pattern=f"<{URL_REGEX}>"),
LexingRule(token_type=TokenType.URL_WITH_PREVIEW, pattern=URL_REGEX),
LexingRule(token_type=TokenType.QUOTE_LINE_PREFIX, pattern=r"(>>)?> "),
LexingRule(token_type=TokenType.TILDE, pattern=r"~"),
LexingRule(token_type=TokenType.STAR, pattern=r"\*"),
LexingRule(token_type=TokenType.UNDERSCORE, pattern=r"_"),
LexingRule(token_type=TokenType.SPOILER_DELIMITER, pattern=r"\|\|"),
LexingRule(token_type=TokenType.CODE_BLOCK_DELIMITER, pattern=r"```"),
LexingRule(token_type=TokenType.CODE_INLINE_DELIMITER, pattern=r"`"),
LexingRule(token_type=TokenType.NEWLINE, pattern="\n"),
]
yield Token(match[0], matching_rule, match.groups())

if len(seen_simple_text) > 0:
yield Token(seen_simple_text)
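The `Lexing` wrapper above pairs a compiled regex with a callable interface, so each rule can be asked whether it matches the prefix of the remaining input. A minimal self-contained sketch (the mention pattern is taken from the rules above):

```python
import re
from typing import Optional

class Lexing:
    # Wraps an optional regex; calling the instance tries to match the prefix of a string.
    def __init__(self, pattern: Optional[str] = None, flags: int = 0):
        self.regex = re.compile(pattern, flags=flags) if pattern else None

    def __call__(self, text: str):
        # Returns None when there is no pattern or the prefix does not match.
        return self.regex and self.regex.match(text)

user_mention = Lexing(r"<@!?(\d{15,20})>")
match = user_mention("<@123456789012345678> hello")
# When the prefix matches, match.group(1) holds the captured snowflake ID.
```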