diff --git a/elixir/filters/__init__.py b/elixir/filters/__init__.py index b06eae8f..e65e9d08 100755 --- a/elixir/filters/__init__.py +++ b/elixir/filters/__init__.py @@ -1,23 +1,51 @@ -from typing import List - -from .utils import Filter, FilterContext -from .projects import project_filters, default_filters - -# Returns a list of applicable filters for project_name under provided filter context -def get_filters(ctx: FilterContext, project_name: str) -> List[Filter]: - filter_classes = project_filters.get(project_name, default_filters) - filters = [] - - for filter_cls in filter_classes: - if type(filter_cls) == tuple and len(filter_cls) == 2: - cls, kwargs = filter_cls - filters.append(cls(**kwargs)) - elif type(filter_cls) == type: - filters.append(filter_cls()) - else: - raise ValueError(f"Invalid filter: {filter_cls}, " \ - "should be either a two element tuple or a type. " \ - "Make sure project_filters in project.py is valid.") - - return [f for f in filters if f.check_if_applies(ctx)] +from .ident import IdentFilter + +from .cppinc import CppIncFilter +from .cpppathinc import CppPathIncFilter + +from .defconfig import DefConfigIdentsFilter +from .configin import ConfigInFilter + +from .kconfig import KconfigFilter +from .kconfigidents import KconfigIdentsFilter + +from .dtsi import DtsiFilter +from .dtscompdocs import DtsCompDocsFilter +from .dtscompcode import DtsCompCodeFilter +from .dtscompdts import DtsCompDtsFilter + +from .makefileo import MakefileOFilter +from .makefiledtb import MakefileDtbFilter +from .makefiledir import MakefileDirFilter +from .makefilesubdir import MakefileSubdirFilter +from .makefilefile import MakefileFileFilter +from .makefilesrctree import MakefileSrcTreeFilter +from .makefilesubdir import MakefileSubdirFilter + + +# List of filters applied to all projects +default_filters = [ + DtsCompCodeFilter, + DtsCompDtsFilter, + DtsCompDocsFilter, + IdentFilter, + CppIncFilter, +] + +# List of filters for Kconfig files +common_kconfig_filters = [ + KconfigFilter, + KconfigIdentsFilter, + DefConfigIdentsFilter, +] + +# List of filters for Makefiles +common_makefile_filters = [ + MakefileOFilter, + MakefileDtbFilter, + MakefileDirFilter, + MakefileFileFilter, + MakefileSubdirFilter, + MakefileSrcTreeFilter, +] diff --git a/elixir/lexers/__init__.py b/elixir/lexers/__init__.py new file mode 100644 index 00000000..f4f3fa32 --- /dev/null +++ b/elixir/lexers/__init__.py @@ -0,0 +1,10 @@ +from .lexers import * + +default_lexers = { + r'.*\.(c|h|cpp|hpp|c++|cxx|cc)': CLexer, + r'makefile\..*': MakefileLexer, + r'.*\.dts(i)?': DTSLexer, + r'.*\.s': GasLexer, + r'kconfig.*': KconfigLexer, #TODO negative lookahead for .rst +} + diff --git a/elixir/lexers/__main__.py b/elixir/lexers/__main__.py new file mode 100644 index 00000000..676b2ed4 --- /dev/null +++ b/elixir/lexers/__main__.py @@ -0,0 +1,35 @@ +if __name__ == "__main__": + import sys + from . 
import lexers + + if not (len(sys.argv) == 2 or (len(sys.argv) == 3 and sys.argv[1] == '-s')): + print("usage:", sys.argv[0], "[-s]", "path/to/file") + exit(1) + + short = sys.argv[1] == '-s' + + filename = sys.argv[-1] + + with open(filename) as f: + if filename.endswith(('.c', '.h', '.cpp', '.hpp')): + lexer = lexers.CLexer(f.read()) + elif filename.endswith(('.dts', '.dtsi')): + lexer = lexers.DTSLexer(f.read()) + elif filename.endswith('Kconfig'): + lexer = lexers.KconfigLexer(f.read()) + elif filename.endswith(('.s', '.S')): + lexer = lexers.GasLexer(f.read()) + elif filename.endswith('Makefile'): + lexer = lexers.MakefileLexer(f.read()) + else: + raise Exception("no lexer for filetype") + + for token in lexer.lex(): + if not short: + print(token.line, token.token_type.name, token.span, token.token.encode()) + else: + if token.token_type.name == 'IDENTIFIER' or token.token_type.name == 'STRING': + print(f"|{token.token}|", end='') + else: + print(token.token, end='') + diff --git a/elixir/lexers/lexers.py b/elixir/lexers/lexers.py new file mode 100644 index 00000000..b470e749 --- /dev/null +++ b/elixir/lexers/lexers.py @@ -0,0 +1,395 @@ +import re + +from . import shared +from .utils import TokenType, simple_lexer, FirstInLine, split_by_groups, regex_concat, token_from_string, token_from_match, \ + regex_or, match_token, Token + +# Lexers used to extract possible references from source files +# Design inspired by Pygments lexers interface + +# https://en.cppreference.com/w/c/language +# https://www.iso-9899.info/wiki/The_Standard +class CLexer: + # NOTE: does not support unicode identifiers + c_identifier = r'[a-zA-Z_][a-zA-Z_0-9]*' + + c_punctuation = r'[!#%&`()*+,./:;<=>?\[\]\\^_{|}~-]' + + # NOTE: macros don't always contain C code, but detecting that in pratice is hard + # without information about context (where the file is included from). 
+ c_punctuation_extra = r'[$\\@]' + + rules = [ + (shared.whitespace, TokenType.WHITESPACE), + (shared.common_slash_comment, TokenType.COMMENT), + (shared.common_string_and_char, TokenType.STRING), + (shared.c_number, TokenType.NUMBER), + (c_identifier, TokenType.IDENTIFIER), + (FirstInLine(shared.c_preproc_ignore), TokenType.SPECIAL), + (c_punctuation, TokenType.PUNCTUATION), + (c_punctuation_extra, TokenType.PUNCTUATION), + ] + + def __init__(self, code): + self.code = code + + def lex(self, **kwargs): + return simple_lexer(self.rules, self.code, **kwargs) + + +# https://www.devicetree.org/specifications/ +class DTSLexer: + # TODO handle macros separately + + # NOTE: previous versions would split identifiers by commas (and other special characters), + # this changes the old behavior + + # 6.2 + # technically shall be 1-31 characters long BUT /linux/v6.9.4/source/arch/arm64/boot/dts/qcom/sm8250.dtsi#L3506 + dts_label = r'[a-zA-Z_][a-zA-Z_0-9]*' + # no whitespace between label and ampersand/colon is allowed + dts_label_reference = f'(&)({ dts_label })' + dts_label_definition = f'({ dts_label })(:)' + + # 2.2.1 + # same with label lenght, just in case + dts_node_name = r'[a-zA-Z0-9,._+-]+' + # can contain macro symbols + dts_unit_address = r'[a-zA-Z0-9,._+-]*' + + dts_node_name_with_unit_address = f'({ dts_node_name })(@)({ dts_unit_address })' + r'(\s*)({)' + dts_node_name_without_unit_address = f'({ dts_node_name })' + r'(\s*)({)' + + # 2.2.4 + dts_property_name = r'[0-9a-zA-Z,._+?#-]+' + dts_property_assignment = f'({ dts_property_name })' + r'(\s*)(=)' + dts_property_empty = f'({ dts_property_name })' + r'(\s*)(;)' + + dts_directive = r'/[a-zA-Z0-9-]+/'; + dts_delete_node = regex_concat(r'/delete-node/\s+', dts_node_name) + dts_delete_property = regex_concat(r'/delete-property/\s+', dts_property_name) + + # 6.3 + dts_node_reference = r'(&)({)([a-zA-Z0-9,._+/@-]+?)(})' + + dts_punctuation = r'[#@:;{}\[\]()^<>=+*/%&\\|~!?,-]' + # other, unknown, identifiers - for exmple macros + dts_default_identifier = r'[0-9a-zA-Z_]+' + + # Parse DTS node reference, ex: &{/path/to/node@20/test} + @staticmethod + def parse_dts_node_reference(ctx, match): + # & + token, ctx = token_from_string(ctx, match.group(1), TokenType.PUNCTUATION) + yield token + + # { + token, ctx = token_from_string(ctx, match.group(2), TokenType.PUNCTUATION) + yield token + + path = match.group(3) + path_part_matcher = re.compile(DTSLexer.dts_unit_address) + strpos = 0 + + while strpos < len(path): + if path[strpos] == '@' or path[strpos] == '/': + token, ctx = token_from_string(ctx, path[strpos], TokenType.PUNCTUATION) + yield token + strpos += 1 + else: + part_match = path_part_matcher.match(path, strpos) + if part_match is None: + token, _ = token_from_string(ctx, TokenType.ERROR, '') + yield token + return None + + token, ctx = token_from_string(ctx, part_match.group(0), TokenType.IDENTIFIER) + yield token + strpos += len(part_match.group(0)) + # } + token, ctx = token_from_string(ctx, match.group(4), TokenType.PUNCTUATION) + yield token + + rules = [ + (shared.whitespace, TokenType.WHITESPACE), + (shared.common_slash_comment, TokenType.COMMENT), + (shared.common_string_and_char, TokenType.STRING), + (shared.c_number, TokenType.NUMBER), + + (dts_label_reference, split_by_groups(TokenType.PUNCTUATION, TokenType.IDENTIFIER)), + (dts_label_definition, split_by_groups(TokenType.IDENTIFIER, TokenType.PUNCTUATION)), + (dts_node_reference, parse_dts_node_reference), + + (dts_property_assignment, + 
split_by_groups(TokenType.IDENTIFIER, TokenType.WHITESPACE, TokenType.PUNCTUATION)), + (dts_property_empty, + split_by_groups(TokenType.IDENTIFIER, TokenType.WHITESPACE, TokenType.PUNCTUATION)), + + (dts_node_name_with_unit_address, + split_by_groups(TokenType.IDENTIFIER, TokenType.PUNCTUATION, + TokenType.IDENTIFIER, TokenType.WHITESPACE, TokenType.PUNCTUATION)), + (dts_node_name_without_unit_address, + split_by_groups(TokenType.IDENTIFIER, TokenType.WHITESPACE, TokenType.PUNCTUATION)), + + (dts_directive, TokenType.SPECIAL), + (dts_delete_node, split_by_groups(TokenType.SPECIAL, TokenType.IDENTIFIER)), + (dts_delete_property, split_by_groups(TokenType.SPECIAL, TokenType.IDENTIFIER)), + (dts_default_identifier, TokenType.IDENTIFIER), + (FirstInLine(shared.c_preproc_ignore), TokenType.SPECIAL), + (dts_punctuation, TokenType.PUNCTUATION), + ] + + def __init__(self, code): + self.code = code + + def lex(self, **kwargs): + return simple_lexer(self.rules, self.code, **kwargs) + + +# https://www.kernel.org/doc/html/next/kbuild/kconfig-language.html#kconfig-syntax +# https://www.kernel.org/doc/html/next/kbuild/kconfig-language.html#kconfig-hints + +# TODO better macros calls support + +class KconfigLexer: + hash_comment = r'#' + shared.singleline_comment_with_escapes_base + + # NOTE pretty much all kconfig identifiers either start uppercase or with a number. this saves us from parsing macro calls + kconfig_identifier_starts_with_letters = r'[A-Z_][A-Z0-9a-z_-]*' + kconfig_identifier_starts_with_digits = r'[0-9]+[A-Z_a-z-][A-Z0-9a-z_-]*' + kconfig_identifier = regex_or(kconfig_identifier_starts_with_letters, kconfig_identifier_starts_with_digits) + # other perhaps interesting identifiers + kconfig_minor_identifier = r'[a-zA-Z0-9_/][a-zA-Z0-9_/.-]*' + kconfig_punctuation = r'[|&!=$()/_.+<>,-]' + kconfig_number = f'[0-9]+' # TODO does not handle hex numbers + + # NOTE no identifiers are parsed out of KConfig help texts now, this changes the + # old behavior + # for example see all instances of USB in /u-boot/v2024.07/source/drivers/usb/Kconfig#L3 + + @staticmethod + def count_kconfig_help_whitespace(start_whitespace_str): + tabs = start_whitespace_str.count('\t') + spaces = start_whitespace_str.count(' ') + return 8*tabs + spaces + (len(start_whitespace_str)-tabs-spaces) + + @staticmethod + def parse_kconfig_help_text(ctx, match): + # assumes called with matched help keyword, return the keyword + token, ctx = token_from_match(ctx, match, TokenType.SPECIAL) + yield token + + # match whitespace after help + whitespace_after_help, ctx = match_token(ctx, r'\s*?\n', TokenType.WHITESPACE) + if whitespace_after_help is None: + # failed to match whitespace and newline after kconfig help - perhaps it's not the right context (macro call for exapmle) + return + else: + yield whitespace_after_help + + line_matcher = re.compile(r'[^\n]*\n', flags=re.MULTILINE|re.UNICODE) + + start_help_text_pos = ctx.pos + current_pos = ctx.pos + min_whitespace = None + + def collect_tokens(start, end): + return Token(TokenType.COMMENT, ctx.code[start:end], (start, end), ctx.line) + + # match first line with whitespace at the beginning + while current_pos < len(ctx.code): + line = line_matcher.match(ctx.code, current_pos) + if line is None: + yield collect_tokens(start_help_text_pos, current_pos) + return + + token = line.group(0) + span = line.span() + + if token == '\n': + # just an empty line + current_pos = span[1] + continue + else: + start_whitespace = re.match(r'\s*', token) + if start_whitespace is None: + # no 
whitespace at the beginning of the line + yield collect_tokens(start_help_text_pos, current_pos) + return + elif min_whitespace is None: + # first nonemtpy line - save amount of whitespace + min_whitespace = KconfigLexer.count_kconfig_help_whitespace(start_whitespace.group(0)) + current_pos = span[1] + else: + cur_whitespace = KconfigLexer.count_kconfig_help_whitespace(start_whitespace.group(0)) + if cur_whitespace < min_whitespace: + yield collect_tokens(start_help_text_pos, current_pos) + return + else: + current_pos = span[1] + + yield collect_tokens(start_help_text_pos, current_pos) + + rules = [ + (shared.whitespace, TokenType.WHITESPACE), + (hash_comment, TokenType.COMMENT), + (shared.common_string_and_char, TokenType.STRING), + # for whatever reason u-boot kconfigs sometimes use ---help--- instead of help + # /u-boot/v2024.07/source/arch/arm/mach-sunxi/Kconfig#L732 + (FirstInLine('-+help-+'), parse_kconfig_help_text), + (kconfig_punctuation, TokenType.PUNCTUATION), + (FirstInLine('help'), parse_kconfig_help_text), + (kconfig_identifier, TokenType.IDENTIFIER), + (kconfig_number, TokenType.NUMBER), + (kconfig_minor_identifier, TokenType.SPECIAL), + # things that do not match are probably things from a macro call. + # unless the syntax changed, or the help parser got confused. + # https://www.kernel.org/doc/html/next/kbuild/kconfig-macro-language.html + # both shell call and warning/error would require additinal parsing + (r'[^\n]+', TokenType.SPECIAL), + ] + + def __init__(self, code): + self.code = code + + def lex(self): + return simple_lexer(self.rules, self.code) + + +# https://sourceware.org/binutils/docs/as.html#Syntax +class GasLexer: + # https://sourceware.org/binutils/docs/as.html#Symbol-Intro + # apparently dots are okay, BUT ctags removes the first dot from labels, for example. same with dollars + # /musl/v1.2.5/source/src/string/aarch64/memcpy.S#L92 + gasm_identifier = r'[a-zA-Z0-9_][a-zA-Z0-9_$.]*' + + gasm_flonum = r'0?[a-zA-Z][+-]?([0-9]|\\s*\n\s*)*\.([0-9]|\\s*\n\s*)*([eE][+-]?[0-9]+)?' + gasm_number = regex_or(gasm_flonum, shared.common_hexidecimal_integer, shared.common_binary_integer, + shared.common_decimal_integer) + + gasm_char = r"'(\\.|.|\n)" + gasm_string = f'(({ shared.double_quote_string_with_escapes })|({ gasm_char }))' + + gasm_comment_chars_map = { + 'generic': (r'#\s',), + + 'nios2': (r'#',), + 'openrisc': (r'#',), + 'powerpc': (r'#',), + 's390': (r'#',), + 'xtensa': (r'#',), + 'microblaze': (r'#',), + 'mips': (r'#',), + 'alpha': (r'#',), + 'csky': (r'#',), + # BUT double pipe in macros is an operator... 
and # not in the first line in + # /linux/v6.10.7/source/arch/m68k/ifpsp060/src/fplsp.S + 'm68k': ('|', '^#', r'#\s'), + 'arc': ('# ', ';'), + + # https://sourceware.org/binutils/docs/as.html#HPPA-Syntax + # /linux/v6.10.7/source/arch/parisc/kernel/perf_asm.S#L28 + 'parisc': (';',), + 'x86': (';',), + 'tic6x': (';', '*'), # cx6, tms320, although the star is sketchy + + # in below, # can be a comment only if the first character of the line + + # https://sourceware.org/binutils/docs/as.html#SH-Syntax + # /linux/v6.10.7/source/arch/sh/kernel/head_32.S#L58 + 'sh': ('!', '^#'), + # https://sourceware.org/binutils/docs/as.html#Sparc_002dSyntax + # /linux/v6.10.7/source/arch/sparc/lib/memset.S#L125 + 'sparc': ('!', '^#'), + # used in ARM https://sourceware.org/binutils/docs/as.html#ARM-Syntax + # /linux/v6.10.7/source/arch/arm/mach-sa1100/sleep.S#L33 + 'arm32': ('@', '^#'), + 'cris': (';', '^#'), + 'avr': (';', '^#'), + # blackfin, tile + } + + gasm_punctuation = r'[.,\[\]()<>{}%&+*!|@#$;:^/\\=~-]' + # TODO make sure all relevant directives are listed here + gasm_preprocessor = r'#[ \t]*(define|ifdef|ifndef|undef|if|else|elif|endif)' + + rules_before_comments = [ + (shared.whitespace, TokenType.WHITESPACE), + # don't interpret macro concatenate as a comment + ('##', TokenType.PUNCTUATION), + # don't interpret or as a comment + (r'\|\|', TokenType.PUNCTUATION), + (FirstInLine(regex_or(shared.c_preproc_include, shared.c_preproc_warning_and_error)), TokenType.SPECIAL), + (FirstInLine(gasm_preprocessor), TokenType.SPECIAL), + (shared.common_slash_comment, TokenType.COMMENT), + ] + + rules_after_comments = [ + (gasm_string, TokenType.STRING), + (gasm_number, TokenType.NUMBER), + (gasm_identifier, TokenType.IDENTIFIER), + (gasm_punctuation, TokenType.PUNCTUATION), + ] + + def __init__(self, code, arch='generic'): + self.code = code + self.comment_chars = self.gasm_comment_chars_map[arch] + + def get_arch_rules(self): + result = [] + + regex_chars = '*?+^.$\\[]|()' + add_slash = lambda ch: '\\' + ch if ch in regex_chars else ch + + for comment_char in self.comment_chars: + if comment_char[0] == '^': + result.append(( + FirstInLine(add_slash(comment_char[1]) + shared.singleline_comment_with_escapes_base), + TokenType.COMMENT + )) + else: + result.append(( + add_slash(comment_char) + shared.singleline_comment_with_escapes_base, + TokenType.COMMENT) + ) + + return result + + def lex(self): + rules = self.rules_before_comments + \ + self.get_arch_rules() + \ + self.rules_after_comments + + return simple_lexer(rules, self.code) + + +# https://www.gnu.org/software/make/manual/make.html +class MakefileLexer: + # https://pubs.opengroup.org/onlinepubs/007904975/utilities/make.html + + # NOTE same as in KConfig, we only care about screaming case names + make_identifier = r'[A-Z0-9_]+' + make_minor_identifier = r'[a-zA-Z0-9_][a-zA-Z0-9-_]*' + make_variable = r'(\$\([a-zA-Z0-9_-]\)|\$\{[a-zA-Z0-9_-]\})' + make_single_quote_string = r"'*?'" + make_string = f'(({ make_single_quote_string })|({ shared.double_quote_string_with_escapes }))' + make_escape = r'\\[#"\']' + make_punctuation = r'[~\\`\[\](){}<>.,:;|%$^@&?!+*/=-]' + make_comment = r'(?|".*?")' +# match warning and error directives with the error string +c_preproc_warning_and_error = r'#\s*(warning|error)\s(\\\s*\n|[^\n])*\n' +# match other preprocessor directives, but don't consume the whole line +c_preproc_other = r'#\s*[a-z]+' +c_preproc_ignore = regex_or(c_preproc_include, c_preproc_warning_and_error, c_preproc_other) + +# \, any amount of whitespace, 
newline or any character that's not backslash newline or a quote, any escaped character +double_quote_string_with_escapes = r'"(\\\s*\n|[^\\"\n]|\\(.|\s))*?"' +single_quote_string_with_escapes = r"'(\\\s*\n|[^\\'\n]|\\(.|\s))*?'" + +common_string_and_char = regex_or(double_quote_string_with_escapes, single_quote_string_with_escapes) + +c_exponent = r'([eE][+-]?[0-9][0-9\']*)' +c_hexidecimal_exponent = r'([pP][+-]?[0-9][0-9\']*)' + +c_decimal_double_part = r'\.[0-9\']*' + c_exponent + '?' +c_octal_double_part = r'\.[0-7\']*' + c_exponent + '?' +c_hexidecimal_double_part = r'\.[0-9a-fA-F\']*' + c_hexidecimal_exponent + '?' + +c_decimal = f'{ common_decimal_integer }({ c_decimal_double_part })?' +c_hexidecimal = f'{ common_hexidecimal_integer }({ c_hexidecimal_double_part })?' +c_octal = f'{ common_octal_integer }({ c_octal_double_part })?' + +# not entirely correct... accepts way more than the standard allows +c_number_suffix = r'([uU]|[lL]|(wb|WB)|[fF]|[zZ]){0,5}' + +c_number = regex_concat(regex_or(c_hexidecimal, common_binary_integer, c_decimal, c_octal), c_number_suffix) + diff --git a/elixir/lexers/tests/__init__.py b/elixir/lexers/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/elixir/lexers/tests/base.py b/elixir/lexers/tests/base.py new file mode 100644 index 00000000..e234df33 --- /dev/null +++ b/elixir/lexers/tests/base.py @@ -0,0 +1,65 @@ +import unittest + +class LexerTest(unittest.TestCase): + default_filtered_tokens = ("SPECIAL", "COMMENT", "STRING", "IDENTIFIER", "SPECIAL", "ERROR") + + # Checks if each token starts in the claimed position of code, if tokens cover all code and if no tokens overlap + def verify_positions(self, code, tokens): + last_token = None + for t in tokens: + if code[t.span[0]:t.span[1]] != t.token: + self.fail(f"token {t} span != code span {code[t.span[0]:t.span[1]].encode()}") + + if last_token is not None and last_token.span[1] != t.span[0]: + self.fail(f"token does not start where the previous token ends. 
prev: {last_token}, next: {t}") + elif last_token is None and t.span[0] != 0: + self.fail(f"first token does not start at zero: {t}") + + last_token = t + + if last_token.span[1] != len(code): + self.fail(f"code is longer than position of the last token: {t}, code len: {len(code)}") + + # Checks if each token is in the claimed line of code + def verify_lines(self, code, tokens): + lines = [""] + code.split("\n") # zero line is emtpy + last_line_number = None + last_line_contents_left = None + for t in tokens: + if last_line_number != t.line: + last_line_number = t.line + last_line_contents_left = lines[t.line] + + if last_line_contents_left is None: + self.fail(f"nothing left in line {t.line} for {t.token} {t}") + + newline_count = t.token.count("\n") + all_token_lines = last_line_contents_left + "\n" + \ + "\n".join([lines[i] for i in range(t.line+1, t.line+newline_count+1)]) + "\n" + token_pos_in_lines = all_token_lines.find(t.token) + if token_pos_in_lines == -1: + self.fail(f"token {t.token} not found in line {t.line}: {all_token_lines.encode()}") + if token_pos_in_lines < len(last_line_contents_left): + last_line_contents_left = last_line_contents_left[token_pos_in_lines:] + else: + last_line_contents_left = None + + # Lex code, do basic soundness checks on tokens (lines and positions) and compare lexing results with a list of tokens + def lex(self, code, expected, filtered_tokens=None, lexer_options={}): + if filtered_tokens is None: + filtered_tokens = self.default_filtered_tokens + + code = code.lstrip() + tokens = list(self.lexer_cls(code, **lexer_options).lex()) + self.verify_positions(code, tokens) + self.verify_lines(code, tokens) + + tokens = [[type.name, token] for type, token, span, line in tokens] + tokens = [t for t in tokens if t[0] in filtered_tokens] + try: + self.assertEqual(tokens, expected) + except Exception as e: + print() + for t in tokens: print(t, end=",\n") + raise e + diff --git a/elixir/lexers/tests/test_c.py b/elixir/lexers/tests/test_c.py new file mode 100644 index 00000000..ffd48cee --- /dev/null +++ b/elixir/lexers/tests/test_c.py @@ -0,0 +1,567 @@ +from ..lexers import CLexer +from .base import LexerTest + +class CLexerTest(LexerTest): + lexer_cls = CLexer + default_filtered_tokens = ("SPECIAL", "COMMENT", "STRING", "IDENTIFIER", "SPECIAL", "ERROR") + + def test_if0(self): + self.lex(r""" +#if 0 +static bool test_v3_0_test(void *h, + enum type_enum e) { + return false; +} +#endif +static bool test_v3_0_test(void *h, + enum type_enum e) { + return false; +} +""", [ + ['SPECIAL', '#if'], + ['NUMBER', '0'], + ['IDENTIFIER', 'static'], + ['IDENTIFIER', 'bool'], + ['IDENTIFIER', 'test_v3_0_test'], + ['IDENTIFIER', 'void'], + ['IDENTIFIER', 'h'], + ['IDENTIFIER', 'enum'], + ['IDENTIFIER', 'type_enum'], + ['IDENTIFIER', 'e'], + ['IDENTIFIER', 'return'], + ['IDENTIFIER', 'false'], + ['SPECIAL', '#endif'], + ['IDENTIFIER', 'static'], + ['IDENTIFIER', 'bool'], + ['IDENTIFIER', 'test_v3_0_test'], + ['IDENTIFIER', 'void'], + ['IDENTIFIER', 'h'], + ['IDENTIFIER', 'enum'], + ['IDENTIFIER', 'type_enum'], + ['IDENTIFIER', 'e'], + ['IDENTIFIER', 'return'], + ['IDENTIFIER', 'false'], + ], self.default_filtered_tokens + ("NUMBER",)) + + def test_preproc(self): + self.lex(r""" +#include +# include +# include "test.h" +# include "test.h" + +# warning war +# error err + # error err + #warning war + +#error "escaped\ + message" + +#warning "escaped\ + message" + +# if defined(TEST) +# elif defined(TEST2) +#else +""", [ + ['SPECIAL', '#include '], + ['SPECIAL', '# include '], + 
['SPECIAL', '# include "test.h"'], + ['SPECIAL', '# include "test.h"'], + ['SPECIAL', '# warning war\n'], + ['SPECIAL', '# error err\n'], + ['SPECIAL', '# error err\n'], + ['SPECIAL', '#warning war\n'], + ['SPECIAL', '#error "escaped\\\n message"\n'], + ['SPECIAL', '#warning "escaped\\ \n message"\n'], + ['SPECIAL', '# if'], + ['IDENTIFIER', 'defined'], + ['IDENTIFIER', 'TEST'], + ['SPECIAL', '# elif'], + ['IDENTIFIER', 'defined'], + ['IDENTIFIER', 'TEST2'], + ['SPECIAL', '#else'], + ]) + + def test_defines(self): + self.lex(""" +# define test "long string \ + escaped newline" + + #define test define1 +# define test2 define12323 + +#define func(name, arg1,arg2...) \ + void name##f() { \ + return arg1 + arg2; + } +""", [ + ['SPECIAL', '# define'], + ['IDENTIFIER', 'test'], + ['STRING', '"long string escaped newline"'], + ['SPECIAL', '#define'], + ['IDENTIFIER', 'test'], + ['IDENTIFIER', 'define1'], + ['SPECIAL', '# define'], + ['IDENTIFIER', 'test2'], + ['IDENTIFIER', 'define12323'], + ['SPECIAL', '#define'], + ['IDENTIFIER', 'func'], + ['IDENTIFIER', 'name'], + ['IDENTIFIER', 'arg1'], + ['IDENTIFIER', 'arg2'], + ['IDENTIFIER', 'void'], + ['IDENTIFIER', 'name'], + ['IDENTIFIER', 'f'], + ['IDENTIFIER', 'return'], + ['IDENTIFIER', 'arg1'], + ['IDENTIFIER', 'arg2'], + ]) + + def test_strings(self): + self.lex(r""" +"asdsad \ + asdasd"; +'asdsad \ + asdasd'; +u8"test string"; +u"test string"; +u"test string"; +L"test string"; +"test \" string"; +"test ' string"; +"test \' string"; +"test \n string"; +"\xff"; +"test" "string"; +"test""string"; +"test" +""", [ + ['STRING', '"asdsad \\ \n asdasd"'], + ['STRING', "'asdsad \\\n asdasd'"], + ['IDENTIFIER', 'u8'], + ['STRING', '"test string"'], + ['IDENTIFIER', 'u'], + ['STRING', '"test string"'], + ['IDENTIFIER', 'u'], + ['STRING', '"test string"'], + ['IDENTIFIER', 'L'], + ['STRING', '"test string"'], + ['STRING', '"test \\" string"'], + ['STRING', '"test \' string"'], + ['STRING', '"test \\\' string"'], + ['STRING', '"test \\n string"'], + ['STRING', '"\\xff"'], + ['STRING', '"test"'], + ['STRING', '"string"'], + ['STRING', '"test"'], + ['STRING', '"string"'], + ['STRING', '"test"'], + ]) + + def test_strings2(self): + self.lex(r""" + "string"; + char* s1 = "asdjlsajdlksad""asdsajdlsad"; //comment6 + char* s2 = "asdjlsajdlksad" "asdsajdlsad"; // \ + single line comment \ + with escapes + char* s3 = " asdsaldjkas \""; + char* s4 = " asdsaldjkas \" zxclzxclk \" asljda"; + char* s5 = " asdsaldjkas \' zxclzxclk \" asljda"; + char* s6 = " asdsaldjkas \"\"\" zxclzxclk \'\'\' ; asljda"; + char* s7 = u8"test"; +""", [ + ['STRING', '"string"'], + ['IDENTIFIER', 'char'], + ['IDENTIFIER', 's1'], + ['STRING', '"asdjlsajdlksad"'], + ['STRING', '"asdsajdlsad"'], + ['COMMENT', '//comment6\n'], + ['IDENTIFIER', 'char'], + ['IDENTIFIER', 's2'], + ['STRING', '"asdjlsajdlksad"'], + ['STRING', '"asdsajdlsad"'], + ['COMMENT', '// \\\n single line comment \\\n with escapes\n'], + ['IDENTIFIER', 'char'], + ['IDENTIFIER', 's3'], + ['STRING', '" asdsaldjkas \\""'], + ['IDENTIFIER', 'char'], + ['IDENTIFIER', 's4'], + ['STRING', '" asdsaldjkas \\" zxclzxclk \\" asljda"'], + ['IDENTIFIER', 'char'], + ['IDENTIFIER', 's5'], + ['STRING', '" asdsaldjkas \\\' zxclzxclk \\" asljda"'], + ['IDENTIFIER', 'char'], + ['IDENTIFIER', 's6'], + ['STRING', '" asdsaldjkas \\"\\"\\" zxclzxclk \\\'\\\'\\\' ; asljda"'], + ['IDENTIFIER', 'char'], + ['IDENTIFIER', 's7'], + ['IDENTIFIER', 'u8'], + ['STRING', '"test"'], + ]) + + def test_chars(self): + self.lex(r""" +'a'; +u8'a'; +u'a'; +U'a'; 
+'\''; +'\"'; +'\\'; +'\n'; +'\f'; +'\U0001f34c'; +'\13'; +'\x1234'; +'\u213'; +u'ą'; +""", [ + ['STRING', "'a'"], + ['IDENTIFIER', 'u8'], + ['STRING', "'a'"], + ['IDENTIFIER', 'u'], + ['STRING', "'a'"], + ['IDENTIFIER', 'U'], + ['STRING', "'a'"], + ['STRING', "'\\''"], + ['STRING', '\'\\"\''], + ['STRING', "'\\\\'"], + ['STRING', "'\\n'"], + ['STRING', "'\\f'"], + ['STRING', "'\\U0001f34c'"], + ['STRING', "'\\13'"], + ['STRING', "'\\x1234'"], + ['STRING', "'\\u213'"], + ['IDENTIFIER', 'u'], + ['STRING', "'ą'"], + ]) + + def test_numbers(self): + self.lex(r""" +1239183; +-1239183; +0xAB08902; +-0xAB08902; +0Xab08902; +-0Xab08902; +0b0101001; +-0b0101001; +0B0101001; +-0B0101001; +0231273; +-0231273; +""", [ + ['NUMBER', '1239183'], + ['NUMBER', '1239183'], + ['NUMBER', '0xAB08902'], + ['NUMBER', '0xAB08902'], + ['NUMBER', '0Xab08902'], + ['NUMBER', '0Xab08902'], + ['NUMBER', '0b0101001'], + ['NUMBER', '0b0101001'], + ['NUMBER', '0B0101001'], + ['NUMBER', '0B0101001'], + ['NUMBER', '0231273'], + ['NUMBER', '0231273'], + ], self.default_filtered_tokens + ("NUMBER",)) + + def test_floats(self): + self.lex(r""" +double e = 0x2ABDEFabcdef; +double + f = 017.048509495; +double -g = 0b1010010; +double g = 0b1010010; +-017.048509495; +017.048509495; +-017.048509495e-12329123; +017.048509495e-12329123; +-0x123.fp34; +0x123.fp34; +-0x123.fP34; +0x123.fP34; +-0x123.fe1p123; +0x123.fe1p123; +-0x123.fe1p123; +0x123.fe1p123; +-.1; +.1; +-1.; +1.; +-0x1.ep+3; +0x1.ep+3; +-0X183083; +0X183083; +-0x213213.1231212'31e21p-2; +0x213213.1231212'31e21p-2; +-123123.123e2; +123123.123e2; +""", [ + ['IDENTIFIER', 'double'], + ['IDENTIFIER', 'e'], + ['NUMBER', '0x2ABDEFabcdef'], + ['IDENTIFIER', 'double'], + ['IDENTIFIER', 'f'], + ['NUMBER', '017.048509495'], + ['IDENTIFIER', 'double'], + ['IDENTIFIER', 'g'], + ['NUMBER', '0b1010010'], + ['IDENTIFIER', 'double'], + ['IDENTIFIER', 'g'], + ['NUMBER', '0b1010010'], + ['NUMBER', '017.048509495'], + ['NUMBER', '017.048509495'], + ['NUMBER', '017.048509495e-12329123'], + ['NUMBER', '017.048509495e-12329123'], + ['NUMBER', '0x123.fp34'], + ['NUMBER', '0x123.fp34'], + ['NUMBER', '0x123.fP34'], + ['NUMBER', '0x123.fP34'], + ['NUMBER', '0x123.fe1p123'], + ['NUMBER', '0x123.fe1p123'], + ['NUMBER', '0x123.fe1p123'], + ['NUMBER', '0x123.fe1p123'], + ['NUMBER', '1'], + ['NUMBER', '1'], + ['NUMBER', '1.'], + ['NUMBER', '1.'], + ['NUMBER', '0x1.ep+3'], + ['NUMBER', '0x1.ep+3'], + ['NUMBER', '0X183083'], + ['NUMBER', '0X183083'], + ['NUMBER', "0x213213.1231212'31e21p-2"], + ['NUMBER', "0x213213.1231212'31e21p-2"], + ['NUMBER', '123123.123e2'], + ['NUMBER', '123123.123e2'], + ], self.default_filtered_tokens + ("NUMBER",)) + + def test_longs(self): + self.lex(r""" +-123213092183ul; +123213092183ul; +-123213092183ull; +123213092183ull; +-123213092183llu; +123213092183llu; +-123213092183uLL; +123213092183uLL; +-123213092183LLU; +123213092183LLU; +-1232'13092183LLU; +1232'13092183LLU; +-1232'1309'2183LLU; +1232'1309'2183LLU; +-1232'1309'218'3LLU; +1232'1309'218'3LLU; +""", [ + ['NUMBER', '123213092183ul'], + ['NUMBER', '123213092183ul'], + ['NUMBER', '123213092183ull'], + ['NUMBER', '123213092183ull'], + ['NUMBER', '123213092183llu'], + ['NUMBER', '123213092183llu'], + ['NUMBER', '123213092183uLL'], + ['NUMBER', '123213092183uLL'], + ['NUMBER', '123213092183LLU'], + ['NUMBER', '123213092183LLU'], + ['NUMBER', "1232'13092183LLU"], + ['NUMBER', "1232'13092183LLU"], + ['NUMBER', "1232'1309'2183LLU"], + ['NUMBER', "1232'1309'2183LLU"], + ['NUMBER', "1232'1309'218'3LLU"], + ['NUMBER', 
"1232'1309'218'3LLU"], + ], self.default_filtered_tokens + ("NUMBER",)) + + def test_comments(self): + self.lex(r""" + /*comment1*/ + /* comment2*/ + /* comment3 */ + /* + * + comment4 + _+}{|":?><~!@#$%&*()_+`123567890-=[];'\,./ + * */ + + /* comment 5 \*\// */ + +// comment5 +char* s2 = "asdjlsajdlksad" "asdsajdlsad"; // \ + single line comment \ + with escapes +char statement; +""", [ + ['COMMENT', '/*comment1*/'], + ['COMMENT', '/* comment2*/'], + ['COMMENT', '/* comment3 */'], + ['COMMENT', '/*\n *\n comment4\n _+}{|":?><~!@#$%&*()_+`123567890-=[];\'\\,./\n * */'], + ['COMMENT', '/* comment 5 \\*\\// */'], + ['COMMENT', '// comment5\n'], + ['IDENTIFIER', 'char'], + ['IDENTIFIER', 's2'], + ['STRING', '"asdjlsajdlksad"'], + ['STRING', '"asdsajdlsad"'], + ['COMMENT', '// \\\n single line comment \\\n with escapes\n'], + ['IDENTIFIER', 'char'], + ['IDENTIFIER', 'statement'], + ]) + + # https://en.cppreference.com/w/cpp/language/pack_indexing + def test_cpp_templates(self): + self.lex(r""" +template +constexpr auto f(Ts&&... ts) { + return sizeof...(Ts); +} + +template +int f() { + std::cout << t << std::endl; + ns1::ns2::type v; + ns1::ns2::type2 v2; + ns1::ns2::type3 v3; +} +""", [ + ['IDENTIFIER', 'template'], + ['IDENTIFIER', 'typename'], + ['IDENTIFIER', 'Ts'], + ['IDENTIFIER', 'constexpr'], + ['IDENTIFIER', 'auto'], + ['IDENTIFIER', 'f'], + ['IDENTIFIER', 'Ts'], + ['IDENTIFIER', 'ts'], + ['IDENTIFIER', 'return'], + ['IDENTIFIER', 'sizeof'], + ['IDENTIFIER', 'Ts'], + ['IDENTIFIER', 'template'], + ['IDENTIFIER', 'typename'], + ['IDENTIFIER', 'T'], + ['IDENTIFIER', 'T'], + ['IDENTIFIER', 't'], + ['IDENTIFIER', 't'], + ['IDENTIFIER', 'int'], + ['IDENTIFIER', 'f'], + ['IDENTIFIER', 'std'], + ['IDENTIFIER', 'cout'], + ['IDENTIFIER', 't'], + ['IDENTIFIER', 'std'], + ['IDENTIFIER', 'endl'], + ['IDENTIFIER', 'ns1'], + ['IDENTIFIER', 'ns2'], + ['IDENTIFIER', 'type'], + ['IDENTIFIER', 'v'], + ['IDENTIFIER', 'ns1'], + ['IDENTIFIER', 'ns2'], + ['IDENTIFIER', 'type2'], + ['IDENTIFIER', 'int'], + ['IDENTIFIER', 'v2'], + ['IDENTIFIER', 'ns1'], + ['IDENTIFIER', 'ns2'], + ['IDENTIFIER', 'type3'], + ['IDENTIFIER', 'int'], + ['IDENTIFIER', 'double'], + ['IDENTIFIER', 'v3'], + ]) + + # https://en.cppreference.com/w/cpp/language/requires + def test_cpp_concepts(self): + self.lex(r""" +template +concept C = requires(T x) { + {x.count()} -> std::same_as; + requires Same +}; +""", [ + ['IDENTIFIER', 'template'], + ['IDENTIFIER', 'typename'], + ['IDENTIFIER', 'T'], + ['IDENTIFIER', 'concept'], + ['IDENTIFIER', 'C'], + ['IDENTIFIER', 'requires'], + ['IDENTIFIER', 'T'], + ['IDENTIFIER', 'x'], + ['IDENTIFIER', 'x'], + ['IDENTIFIER', 'count'], + ['IDENTIFIER', 'std'], + ['IDENTIFIER', 'same_as'], + ['IDENTIFIER', 'int'], + ['IDENTIFIER', 'requires'], + ['IDENTIFIER', 'Same'], + ['IDENTIFIER', 'T'], + ['IDENTIFIER', 'decltype'], + ['IDENTIFIER', 'x'], + ]) + + def test_cpp_class(self): + self.lex(r""" +using namespace std; + +auto f() -> std::string; + +class test { +public: + int operator ""_tx(int); + int a = 123_tx; +}; +""", [ + ['IDENTIFIER', 'using'], + ['IDENTIFIER', 'namespace'], + ['IDENTIFIER', 'std'], + ['IDENTIFIER', 'auto'], + ['IDENTIFIER', 'f'], + ['IDENTIFIER', 'std'], + ['IDENTIFIER', 'string'], + ['IDENTIFIER', 'class'], + ['IDENTIFIER', 'test'], + ['IDENTIFIER', 'public'], + ['IDENTIFIER', 'int'], + ['IDENTIFIER', 'operator'], + ['STRING', '""'], + ['IDENTIFIER', '_tx'], + ['IDENTIFIER', 'int'], + ['IDENTIFIER', 'int'], + ['IDENTIFIER', 'a'], + ['IDENTIFIER', '_tx'], + ]) + + def 
test_cpp_attrs(self): + self.lex(r""" +[[using test: atr1]] [[atr2]] +int f[[atr3]](); +""", [ + ['IDENTIFIER', 'using'], + ['IDENTIFIER', 'test'], + ['IDENTIFIER', 'atr1'], + ['IDENTIFIER', 'atr2'], + ['IDENTIFIER', 'int'], + ['IDENTIFIER', 'f'], + ['IDENTIFIER', 'atr3'], + ]) + + # https://en.cppreference.com/w/cpp/language/noexcept_spec + def test_cpp_noexpect(self): + self.lex(r""" +void f() noexpect(true) {} +""", [ + ['IDENTIFIER', 'void'], + ['IDENTIFIER', 'f'], + ['IDENTIFIER', 'noexpect'], + ['IDENTIFIER', 'true'], + ]) + + # https://en.cppreference.com/w/cpp/language/coroutines + def test_cpp_coroutines(self): + self.lex(r""" +task<> test() { + co_await test2(); +} +""", [ + ['IDENTIFIER', 'task'], + ['IDENTIFIER', 'test'], + ['IDENTIFIER', 'co_await'], + ['IDENTIFIER', 'test2'], + ]) + diff --git a/elixir/lexers/tests/test_dts.py b/elixir/lexers/tests/test_dts.py new file mode 100644 index 00000000..72f39d7f --- /dev/null +++ b/elixir/lexers/tests/test_dts.py @@ -0,0 +1,271 @@ +from ..lexers import DTSLexer +from .base import LexerTest + +class DTSLexerTests(LexerTest): + lexer_cls = DTSLexer + default_filtered_tokens = ("SPECIAL", "COMMENT", "STRING", "IDENTIFIER", "SPECIAL", "ERROR") + + def test_preproc(self): + self.lex(r""" +#include +#include "file2.dtsi" +#error error message asldjlksajdlksad +#warning warning message alsjdlkasjdlksajd +#define MACRO(arg) \ + arg = <3>; +#if 0 +/ { + property = <2>; + MACRO(test) +}; +#endif +""", [ + ['SPECIAL', '#include '], + ['SPECIAL', '#include "file2.dtsi"'], + ['SPECIAL', '#error error message asldjlksajdlksad\n'], + ['SPECIAL', '#warning warning message alsjdlkasjdlksajd\n'], + ['SPECIAL', '#define'], + ['IDENTIFIER', 'MACRO'], + ['IDENTIFIER', 'arg'], + ['IDENTIFIER', 'arg'], + ['SPECIAL', '#if'], + ['IDENTIFIER', 'property'], + ['IDENTIFIER', 'MACRO'], + ['IDENTIFIER', 'test'], + ['SPECIAL', '#endif'], + ]) + + def test_dts_directives(self): + self.lex(r""" +/include/ "file.dtsi" +/dts-v1/; +/memreserve/ 0x100 0x2; +/ { + test_label: test-node { + test-prop2 = <3>; + }; + test-prop = <2>; + /delete-node/ test-node; + /delete-node/ &test_label; + /delete-property/ test-prop; +}; +""", [ + ['SPECIAL', '/include/'], + ['STRING', '"file.dtsi"'], + ['SPECIAL', '/dts-v1/'], + ['SPECIAL', '/memreserve/'], + ['IDENTIFIER', 'test_label'], + ['IDENTIFIER', 'test-node'], + ['IDENTIFIER', 'test-prop2'], + ['IDENTIFIER', 'test-prop'], + ['SPECIAL', '/delete-node/'], + ['IDENTIFIER', 'test-node'], + ['SPECIAL', '/delete-node/'], + ['IDENTIFIER', 'test_label'], + ['SPECIAL', '/delete-property/'], + ['IDENTIFIER', 'test-prop'], + ]) + + def test_dts_unusual_identifiers(self): + self.lex(r""" +/ { + _test_label: 5id,test._+asd-2 { + property,name = <2>; + 0p,r.o_p+e?r#t-y,name = [1,2,3]; + way_too_long_label_123219380921830218309218309213 : node@234 { + compatible = "asd,zxc"; + } + test = <&way_too_long_label_123219380921830218309218309213>; + }; +}; +""", [ + ['IDENTIFIER', '_test_label'], + ['IDENTIFIER', 'id,test._+asd-2'], + ['IDENTIFIER', 'property,name'], + ['IDENTIFIER', 'p,r.o_p+e?r#t-y,name'], + ['IDENTIFIER', 'way_too_long_label_123219380921830218309218309213'], + ['IDENTIFIER', 'node'], + ['IDENTIFIER', '234'], + ['IDENTIFIER', 'compatible'], + ['STRING', '"asd,zxc"'], + ['IDENTIFIER', 'test'], + ['IDENTIFIER', 'way_too_long_label_123219380921830218309218309213'], + ]) + + def test_non_numeric_unit_address(self): + self.lex(r""" +/ { + test: node@test_address { + }; + test2: node@MACRO_ADDRESS(123) { + }; +}; +""", [ + 
['IDENTIFIER', 'test'], + ['IDENTIFIER', 'node'], + ['IDENTIFIER', 'test_address'], + ['IDENTIFIER', 'test2'], + ['IDENTIFIER', 'node'], + ['IDENTIFIER', 'MACRO_ADDRESS'], + ]) + + def test_values_with_labels(self): + self.lex(r""" +/ { + prop1 = label1: <0 label2: 0x21323>; + prop2 = [1 2 3 label3: 4]; + prop3 = label4: "val" label5: ; +}; +""", [ + ['PUNCTUATION', '/'], + ['PUNCTUATION', '{'], + ['IDENTIFIER', 'prop1'], + ['PUNCTUATION', '='], + ['IDENTIFIER', 'label1'], + ['PUNCTUATION', ':'], + ['PUNCTUATION', '<'], + ['NUMBER', '0'], + ['IDENTIFIER', 'label2'], + ['PUNCTUATION', ':'], + ['NUMBER', '0x21323'], + ['PUNCTUATION', '>'], + ['PUNCTUATION', ';'], + ['IDENTIFIER', 'prop2'], + ['PUNCTUATION', '='], + ['PUNCTUATION', '['], + ['NUMBER', '1'], + ['NUMBER', '2'], + ['NUMBER', '3'], + ['IDENTIFIER', 'label3'], + ['PUNCTUATION', ':'], + ['NUMBER', '4'], + ['PUNCTUATION', ']'], + ['PUNCTUATION', ';'], + ['IDENTIFIER', 'prop3'], + ['PUNCTUATION', '='], + ['IDENTIFIER', 'label4'], + ['PUNCTUATION', ':'], + ['STRING', '"val"'], + ['IDENTIFIER', 'label5'], + ['PUNCTUATION', ':'], + ['PUNCTUATION', ';'], + ['PUNCTUATION', '}'], + ['PUNCTUATION', ';'], + ], self.default_filtered_tokens + ('PUNCTUATION', 'NUMBER')) + + def test_references(self): + self.lex(r""" +/ { + interrupt-parent = < &{/node@c2342/another_node@address(2)/node3} >; + property2 = <&{/node@c2342/another_node@address(2)}>; + power-domains = <&power DEVICE_DOMAIN>; +}; +""", [ + ['IDENTIFIER', 'interrupt-parent'], + ['IDENTIFIER', 'node'], + ['IDENTIFIER', 'c2342'], + ['IDENTIFIER', 'another_node'], + ['IDENTIFIER', 'address'], + ['IDENTIFIER', 'node3'], + ['IDENTIFIER', 'property2'], + ['IDENTIFIER', 'node'], + ['IDENTIFIER', 'c2342'], + ['IDENTIFIER', 'another_node'], + ['IDENTIFIER', 'address'], + ['IDENTIFIER', 'power-domains'], + ['IDENTIFIER', 'power'], + ['IDENTIFIER', 'DEVICE_DOMAIN'], + ]) + + def test_property_types(self): + self.lex(r""" +/ { + prop1 = <0 0x21323>; + prop2 = [1 2 3 4]; + prop3 = "val", "val4" ; + prop4 = <~1+2-3*4/5%6&7|8^9<<10>>11>; + prop5; +}; +""", [ + ['PUNCTUATION', '/'], + ['PUNCTUATION', '{'], + ['IDENTIFIER', 'prop1'], + ['PUNCTUATION', '='], + ['PUNCTUATION', '<'], + ['NUMBER', '0'], + ['NUMBER', '0x21323'], + ['PUNCTUATION', '>'], + ['PUNCTUATION', ';'], + ['IDENTIFIER', 'prop2'], + ['PUNCTUATION', '='], + ['PUNCTUATION', '['], + ['NUMBER', '1'], + ['NUMBER', '2'], + ['NUMBER', '3'], + ['NUMBER', '4'], + ['PUNCTUATION', ']'], + ['PUNCTUATION', ';'], + ['IDENTIFIER', 'prop3'], + ['PUNCTUATION', '='], + ['STRING', '"val"'], + ['PUNCTUATION', ','], + ['STRING', '"val4"'], + ['PUNCTUATION', ';'], + ['IDENTIFIER', 'prop4'], + ['PUNCTUATION', '='], + ['PUNCTUATION', '<'], + ['PUNCTUATION', '~'], + ['NUMBER', '1'], + ['PUNCTUATION', '+'], + ['NUMBER', '2'], + ['PUNCTUATION', '-'], + ['NUMBER', '3'], + ['PUNCTUATION', '*'], + ['NUMBER', '4'], + ['PUNCTUATION', '/'], + ['NUMBER', '5'], + ['PUNCTUATION', '%'], + ['NUMBER', '6'], + ['PUNCTUATION', '&'], + ['NUMBER', '7'], + ['PUNCTUATION', '|'], + ['NUMBER', '8'], + ['PUNCTUATION', '^'], + ['NUMBER', '9'], + ['PUNCTUATION', '<'], + ['PUNCTUATION', '<'], + ['NUMBER', '10'], + ['PUNCTUATION', '>'], + ['PUNCTUATION', '>'], + ['NUMBER', '11'], + ['PUNCTUATION', '>'], + ['PUNCTUATION', ';'], + ['IDENTIFIER', 'prop5'], + ['PUNCTUATION', ';'], + ['PUNCTUATION', '}'], + ['PUNCTUATION', ';'], + ], self.default_filtered_tokens + ('PUNCTUATION', 'NUMBER')) + + def test_comments(self): + self.lex(r""" +//license info +/ { + interrupts = , /* comment 1 */ + ; 
// comemnt2 + /* long + * coment + * asdasd + */ +}; +""", [ + ['COMMENT', '//license info\n'], + ['IDENTIFIER', 'interrupts'], + ['IDENTIFIER', 'NAME'], + ['IDENTIFIER', 'TYPE'], + ['COMMENT', '/* comment 1 */'], + ['IDENTIFIER', 'NAME'], + ['IDENTIFIER', 'TYPE'], + ['COMMENT', '// comemnt2\n'], + ['COMMENT', '/* long\n * coment\n * asdasd\n */'], + ], self.default_filtered_tokens) + diff --git a/elixir/lexers/tests/test_gas.py b/elixir/lexers/tests/test_gas.py new file mode 100644 index 00000000..3c541f22 --- /dev/null +++ b/elixir/lexers/tests/test_gas.py @@ -0,0 +1,282 @@ +from ..lexers import GasLexer +from .base import LexerTest + +class GasLexerTest(LexerTest): + lexer_cls = GasLexer + default_filtered_tokens = ("SPECIAL", "COMMENT", "STRING", "IDENTIFIER", "SPECIAL", "ERROR") + + def test_comments_m68k(self): + self.lex(r""" +# comment 1 +#comment 2 + clrl d1 | comment 3 + clrl d0 |comment 4 +| comment 4 + + clrl d2 # comment 3 + +#if defined(C1) || !defined(C2) + addql #4,%sp +label: + movel #-IDNENT,%sp@(IDENT)| comment 5 +// /linux/v6.10.7/source/arch/m68k/ifpsp060/src/fplsp.S + test # comment 6 +# endif + +#define macro(x) inst &IDENT,%pc@(ident); inst x +""", [ + ['COMMENT', '# comment 1\n'], + ['COMMENT', '#comment 2\n'], + ['IDENTIFIER', 'clrl'], + ['IDENTIFIER', 'd1'], + ['COMMENT', '| comment 3\n'], + ['IDENTIFIER', 'clrl'], + ['IDENTIFIER', 'd0'], + ['COMMENT', '|comment 4\n'], + ['COMMENT', '| comment 4\n'], + ['IDENTIFIER', 'clrl'], + ['IDENTIFIER', 'd2'], + ['COMMENT', '# comment 3\n'], + ['SPECIAL', '#if'], + ['IDENTIFIER', 'defined'], + ['IDENTIFIER', 'C1'], + ['IDENTIFIER', 'defined'], + ['IDENTIFIER', 'C2'], + ['IDENTIFIER', 'addql'], + ['IDENTIFIER', 'sp'], + ['IDENTIFIER', 'label'], + ['IDENTIFIER', 'movel'], + ['IDENTIFIER', 'IDNENT'], + ['IDENTIFIER', 'sp'], + ['IDENTIFIER', 'IDENT'], + ['COMMENT', '| comment 5\n'], + ['COMMENT', '// /linux/v6.10.7/source/arch/m68k/ifpsp060/src/fplsp.S\n'], + ['IDENTIFIER', 'test'], + ['COMMENT', '# comment 6\n'], + ['SPECIAL', '# endif'], + ['SPECIAL', '#define'], + ['IDENTIFIER', 'macro'], + ['IDENTIFIER', 'x'], + ['IDENTIFIER', 'inst'], + ['IDENTIFIER', 'IDENT'], + ['IDENTIFIER', 'pc'], + ['IDENTIFIER', 'ident'], + ['IDENTIFIER', 'inst'], + ['IDENTIFIER', 'x'], + ], lexer_options={"arch": "m68k"}) + + def test_comments_sparc(self): + self.lex(r""" +#define F(i) \ + .type i,@function; + + std t1, [0x00]; + +/*comment default */ +//comment default2 + .type identifier,#function +label: + sethi %hi(IDENT), %g0 !test comment + wrpr %g1, %sp ! test comment +# comment +#comment + sethi %hi(IDENT_1 | IDENT_2), %l0 +""", [ + ['SPECIAL', '#define'], + ['IDENTIFIER', 'F'], + ['IDENTIFIER', 'i'], + ['IDENTIFIER', 'type'], + ['IDENTIFIER', 'i'], + ['IDENTIFIER', 'function'], + ['IDENTIFIER', 'std'], + ['IDENTIFIER', 't1'], + ['COMMENT', '/*comment default */'], + ['COMMENT', '//comment default2\n'], + ['IDENTIFIER', 'type'], + ['IDENTIFIER', 'identifier'], + ['IDENTIFIER', 'function'], + ['IDENTIFIER', 'label'], + ['IDENTIFIER', 'sethi'], + ['IDENTIFIER', 'hi'], + ['IDENTIFIER', 'IDENT'], + ['IDENTIFIER', 'g0'], + ['COMMENT', '!test comment\n'], + ['IDENTIFIER', 'wrpr'], + ['IDENTIFIER', 'g1'], + ['IDENTIFIER', 'sp'], + ['COMMENT', '! 
test comment\n'], + ['COMMENT', '# comment\n'], + ['COMMENT', '#comment\n'], + ['IDENTIFIER', 'sethi'], + ['IDENTIFIER', 'hi'], + ['IDENTIFIER', 'IDENT_1'], + ['IDENTIFIER', 'IDENT_2'], + ['IDENTIFIER', 'l0'], + ], lexer_options={"arch": "sparc"}) + + def test_comments_arm32(self): + self.lex(r""" +// comment default +/* comment default2 */ +test: + bic r0, r1, #10 + # comment 1 + #comment 1 +""" ++ "\t# comment 1" + r""" + moveq r0, #IDENTIFIER @ Comment +# comment 2 +#comment 2 + push {r0} + add \addr, \addr, \tmp @comment3 + ldr r1, =TEST3 + ldr TEST, [sp, IDENT(i)]; + .long PMD_TYPE_SECT | \ + PMD_BIT4 + stmfd sp!, {r0, r1, r2, r3} + eor RT0, d, b; +""", [ + ['COMMENT', '// comment default\n'], + ['COMMENT', '/* comment default2 */'], + ['IDENTIFIER', 'test'], + ['IDENTIFIER', 'bic'], + ['IDENTIFIER', 'r0'], + ['IDENTIFIER', 'r1'], + ['NUMBER', '10'], + ['COMMENT', '# comment 1\n'], + ['COMMENT', '#comment 1\n'], + ['COMMENT', '# comment 1\n'], + ['IDENTIFIER', 'moveq'], + ['IDENTIFIER', 'r0'], + ['IDENTIFIER', 'IDENTIFIER'], + ['COMMENT', '@ Comment\n'], + ['COMMENT', '# comment 2\n'], + ['COMMENT', '#comment 2\n'], + ['IDENTIFIER', 'push'], + ['IDENTIFIER', 'r0'], + ['IDENTIFIER', 'add'], + ['IDENTIFIER', 'addr'], + ['IDENTIFIER', 'addr'], + ['IDENTIFIER', 'tmp'], + ['COMMENT', '@comment3\n'], + ['IDENTIFIER', 'ldr'], + ['IDENTIFIER', 'r1'], + ['IDENTIFIER', 'TEST3'], + ['IDENTIFIER', 'ldr'], + ['IDENTIFIER', 'TEST'], + ['IDENTIFIER', 'sp'], + ['IDENTIFIER', 'IDENT'], + ['IDENTIFIER', 'i'], + ['IDENTIFIER', 'long'], + ['IDENTIFIER', 'PMD_TYPE_SECT'], + ['IDENTIFIER', 'PMD_BIT4'], + ['IDENTIFIER', 'stmfd'], + ['IDENTIFIER', 'sp'], + ['IDENTIFIER', 'r0'], + ['IDENTIFIER', 'r1'], + ['IDENTIFIER', 'r2'], + ['IDENTIFIER', 'r3'], + ['IDENTIFIER', 'eor'], + ['IDENTIFIER', 'RT0'], + ['IDENTIFIER', 'd'], + ['IDENTIFIER', 'b'], + ], self.default_filtered_tokens + ("NUMBER",), {"arch": "arm32"}) + + def test_comments_generic(self): + self.lex(r""" +/* comment + * more comment + * more comment + */ + mov r0, r1 //test + mov x0, #IDENT + stp x1, x2, [sp, #-4]! 
+#if defined(IDENT1) || defined(IDENT2) +#endif +""", [ + ['COMMENT', '/* comment\n * more comment\n * more comment\n */'], + ['IDENTIFIER', 'mov'], + ['IDENTIFIER', 'r0'], + ['PUNCTUATION', ','], + ['IDENTIFIER', 'r1'], + ['COMMENT', '//test\n'], + ['IDENTIFIER', 'mov'], + ['IDENTIFIER', 'x0'], + ['PUNCTUATION', ','], + ['PUNCTUATION', '#'], + ['IDENTIFIER', 'IDENT'], + ['IDENTIFIER', 'stp'], + ['IDENTIFIER', 'x1'], + ['PUNCTUATION', ','], + ['IDENTIFIER', 'x2'], + ['PUNCTUATION', ','], + ['PUNCTUATION', '['], + ['IDENTIFIER', 'sp'], + ['PUNCTUATION', ','], + ['PUNCTUATION', '#'], + ['PUNCTUATION', '-'], + ['NUMBER', '4'], + ['PUNCTUATION', ']'], + ['PUNCTUATION', '!'], + ['SPECIAL', '#if'], + ['IDENTIFIER', 'defined'], + ['PUNCTUATION', '('], + ['IDENTIFIER', 'IDENT1'], + ['PUNCTUATION', ')'], + ['PUNCTUATION', '||'], + ['IDENTIFIER', 'defined'], + ['PUNCTUATION', '('], + ['IDENTIFIER', 'IDENT2'], + ['PUNCTUATION', ')'], + ['SPECIAL', '#endif'], + ], self.default_filtered_tokens + ("PUNCTUATION", "NUMBER")) + + def test_comments_preproc(self): + self.lex(r""" + # error "test" +#warning "test" +#include "test.h" +#include +#if defined(T1) || defined(T2) +#endif +""", [ + ['SPECIAL', '# error "test"\n'], + ['SPECIAL', '#warning "test"\n'], + ['SPECIAL', '#include "test.h"'], + ['SPECIAL', '#include '], + ['SPECIAL', '#if'], + ['IDENTIFIER', 'defined'], + ['IDENTIFIER', 'T1'], + ['IDENTIFIER', 'defined'], + ['IDENTIFIER', 'T2'], + ['SPECIAL', '#endif'], + ]) + + def test_comments_literals(self): + self.lex(r""" +.byte 12, 0b1010, 0B1010, 0x34, 0123, 0X45, 'a, '\b +.ascii "asdsad\"zxczc" +.float 0f-12321321030982394324\ + 21321432432.234324324E-14 +.float 0f-123.123213e+13 +.float 0e-123.123213e+13 +""", [ + ['IDENTIFIER', 'byte'], + ['NUMBER', '12'], + ['NUMBER', '0b1010'], + ['NUMBER', '0B1010'], + ['NUMBER', '0x34'], + ['NUMBER', '0123'], + ['NUMBER', '0X45'], + ['STRING', "'a"], + ['STRING', "'\\b"], + ['IDENTIFIER', 'ascii'], + ['STRING', '"asdsad\\"zxczc"'], + ['IDENTIFIER', 'float'], + ['NUMBER', '0f-12321321030982394324\\\n 21321432432.234324324E-14'], + ['IDENTIFIER', 'float'], + ['NUMBER', '0f-123.123213e+13'], + ['IDENTIFIER', 'float'], + ['NUMBER', '0e-123.123213e+13'], + ], self.default_filtered_tokens + ("NUMBER",)) + diff --git a/elixir/lexers/tests/test_kconfig.py b/elixir/lexers/tests/test_kconfig.py new file mode 100644 index 00000000..e0adf379 --- /dev/null +++ b/elixir/lexers/tests/test_kconfig.py @@ -0,0 +1,372 @@ +from ..lexers import KconfigLexer +from .base import LexerTest + +class KconfigLexerTest(LexerTest): + lexer_cls = KconfigLexer + default_filtered_tokens = ("SPECIAL", "COMMENT", "STRING", "IDENTIFIER", "SPECIAL", "ERROR") + + # TODO improve macro calls + + def test_comments(self): + self.lex(r""" +# comment1 +config 64BIT # comment2 + bool # comment3 + default "# asd" + default $(shell, \#) + help + asdasdsajdlakjd # not a comment + + asdasdsajdlakjd # not a comment + + # comment 5 + + # comment 6 +""", [ + ['COMMENT', '# comment1\n'], + ['SPECIAL', 'config'], + ['IDENTIFIER', '64BIT'], + ['COMMENT', '# comment2\n'], + ['SPECIAL', 'bool'], + ['COMMENT', '# comment3\n'], + ['SPECIAL', 'default'], + ['STRING', '"# asd"'], + ['SPECIAL', 'default'], + ['SPECIAL', 'shell'], + ['SPECIAL', '\\#)'], + ['SPECIAL', 'help'], + ['COMMENT', ' asdasdsajdlakjd # not a comment\n\n asdasdsajdlakjd # not a comment\n\n # comment 5\n\n'], + ['COMMENT', '# comment 6\n'], + ]) + + + def test_keywords(self): + self.lex(r""", +menu "menu name" + +visible if y + +choice + prompt 
"test prompt" + default y + +config 86CONIFG + bool "text" + prompt "prompt" + default y + tristate "test" + def_bool TEST_bool + depends on TEST + select TEST2 + imply TEST3 + range 5 512 if CONFIG_512 + help + help text + + more help text + +endmenu +""", [ + ['SPECIAL', 'menu'], + ['STRING', '"menu name"'], + ['SPECIAL', 'visible'], + ['SPECIAL', 'if'], + ['SPECIAL', 'y'], + ['SPECIAL', 'choice'], + ['SPECIAL', 'prompt'], + ['STRING', '"test prompt"'], + ['SPECIAL', 'default'], + ['SPECIAL', 'y'], + ['SPECIAL', 'config'], + ['IDENTIFIER', '86CONIFG'], + ['SPECIAL', 'bool'], + ['STRING', '"text"'], + ['SPECIAL', 'prompt'], + ['STRING', '"prompt"'], + ['SPECIAL', 'default'], + ['SPECIAL', 'y'], + ['SPECIAL', 'tristate'], + ['STRING', '"test"'], + ['SPECIAL', 'def_bool'], + ['IDENTIFIER', 'TEST_bool'], + ['SPECIAL', 'depends'], + ['SPECIAL', 'on'], + ['IDENTIFIER', 'TEST'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST2'], + ['SPECIAL', 'imply'], + ['IDENTIFIER', 'TEST3'], + ['SPECIAL', 'range'], + ['SPECIAL', 'if'], + ['IDENTIFIER', 'CONFIG_512'], + ['SPECIAL', 'help'], + ['COMMENT', ' help text\n\n more help text\n\n'], + ['SPECIAL', 'endmenu'], + ]) + + def test_conditions(self): + self.lex(r""" +config TEST + select TEST1 if TEST2 = TEST3 + select TEST2 if TEST5 != TEST6 + select TEST7 if TEST8 < TEST9 + select TEST10 if TEST11 > TEST12 + select TEST13 if TEST14 <= TEST15 +""", [ + ['SPECIAL', 'config'], + ['IDENTIFIER', 'TEST'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST1'], + ['SPECIAL', 'if'], + ['IDENTIFIER', 'TEST2'], + ['PUNCTUATION', '='], + ['IDENTIFIER', 'TEST3'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST2'], + ['SPECIAL', 'if'], + ['IDENTIFIER', 'TEST5'], + ['PUNCTUATION', '!'], + ['PUNCTUATION', '='], + ['IDENTIFIER', 'TEST6'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST7'], + ['SPECIAL', 'if'], + ['IDENTIFIER', 'TEST8'], + ['PUNCTUATION', '<'], + ['IDENTIFIER', 'TEST9'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST10'], + ['SPECIAL', 'if'], + ['IDENTIFIER', 'TEST11'], + ['PUNCTUATION', '>'], + ['IDENTIFIER', 'TEST12'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST13'], + ['SPECIAL', 'if'], + ['IDENTIFIER', 'TEST14'], + ['PUNCTUATION', '<'], + ['PUNCTUATION', '='], + ['IDENTIFIER', 'TEST15'], + ], self.default_filtered_tokens + ("PUNCTUATION",)) + + def test_conditions2(self): + self.lex(r""" +config TEST + select TEST16 if TEST17 >= TEST3 + select TEST17 if (TEST18 = TEST19) + + select TEST20 if !(TEST21 = TEST22) + select TEST23 if TEST24 && TEST25 + select TEST26 if TEST27 || TEST28 +""", [ + ['SPECIAL', 'config'], + ['IDENTIFIER', 'TEST'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST16'], + ['SPECIAL', 'if'], + ['IDENTIFIER', 'TEST17'], + ['PUNCTUATION', '>'], + ['PUNCTUATION', '='], + ['IDENTIFIER', 'TEST3'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST17'], + ['SPECIAL', 'if'], + ['PUNCTUATION', '('], + ['IDENTIFIER', 'TEST18'], + ['PUNCTUATION', '='], + ['IDENTIFIER', 'TEST19'], + ['PUNCTUATION', ')'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST20'], + ['SPECIAL', 'if'], + ['PUNCTUATION', '!'], + ['PUNCTUATION', '('], + ['IDENTIFIER', 'TEST21'], + ['PUNCTUATION', '='], + ['IDENTIFIER', 'TEST22'], + ['PUNCTUATION', ')'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST23'], + ['SPECIAL', 'if'], + ['IDENTIFIER', 'TEST24'], + ['PUNCTUATION', '&'], + ['PUNCTUATION', '&'], + ['IDENTIFIER', 'TEST25'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST26'], + ['SPECIAL', 'if'], + ['IDENTIFIER', 'TEST27'], + ['PUNCTUATION', '|'], + ['PUNCTUATION', 
'|'], + ['IDENTIFIER', 'TEST28'], + ], self.default_filtered_tokens + ("PUNCTUATION",)) + + def test_macros(self): + self.lex(r""" +config TEST + depends on $(shell,cat file | grep -vi "option 2") + depends on $(info,info to print) + depends on $(warning-if,a != b,warning to print) +""", [ + ['SPECIAL', 'config'], + ['IDENTIFIER', 'TEST'], + ['SPECIAL', 'depends'], + ['SPECIAL', 'on'], + ['PUNCTUATION', '$'], + ['PUNCTUATION', '('], + ['SPECIAL', 'shell'], + ['PUNCTUATION', ','], + ['SPECIAL', 'cat'], + ['SPECIAL', 'file'], + ['PUNCTUATION', '|'], + ['SPECIAL', 'grep'], + ['PUNCTUATION', '-'], + ['SPECIAL', 'vi'], + ['STRING', '"option 2"'], + ['PUNCTUATION', ')'], + ['SPECIAL', 'depends'], + ['SPECIAL', 'on'], + ['PUNCTUATION', '$'], + ['PUNCTUATION', '('], + ['SPECIAL', 'info'], + ['PUNCTUATION', ','], + ['SPECIAL', 'info'], + ['SPECIAL', 'to'], + ['SPECIAL', 'print'], + ['PUNCTUATION', ')'], + ['SPECIAL', 'depends'], + ['SPECIAL', 'on'], + ['PUNCTUATION', '$'], + ['PUNCTUATION', '('], + ['SPECIAL', 'warning-if'], + ['PUNCTUATION', ','], + ['SPECIAL', 'a'], + ['PUNCTUATION', '!'], + ['PUNCTUATION', '='], + ['SPECIAL', 'b'], + ['PUNCTUATION', ','], + ['SPECIAL', 'warning'], + ['SPECIAL', 'to'], + ['SPECIAL', 'print'], + ['PUNCTUATION', ')'], + ], self.default_filtered_tokens + ("PUNCTUATION",)) + +def test_macros2(self): + self.lex(r""" +config TEST + depends on $(error-if,a != b,warning to print) + depends on $(filename) + depends on $(lineno) +""", [ + ['SPECIAL', 'config'], + ['IDENTIFIER', 'TEST'], + ['SPECIAL', 'depends'], + ['SPECIAL', 'on'], + ['PUNCTUATION', '$'], + ['PUNCTUATION', '('], + ['SPECIAL', 'error-if'], + ['PUNCTUATION', ','], + ['SPECIAL', 'a'], + ['PUNCTUATION', '!'], + ['PUNCTUATION', '='], + ['SPECIAL', 'b'], + ['PUNCTUATION', ','], + ['SPECIAL', 'warning'], + ['SPECIAL', 'to'], + ['SPECIAL', 'print'], + ['PUNCTUATION', ')'], + ['SPECIAL', 'depends'], + ['SPECIAL', 'on'], + ['PUNCTUATION', '$'], + ['PUNCTUATION', '('], + ['SPECIAL', 'filename'], + ['PUNCTUATION', ')'], + ['SPECIAL', 'depends'], + ['SPECIAL', 'on'], + ['PUNCTUATION', '$'], + ['PUNCTUATION', '('], + ['SPECIAL', 'lineno'], + ['PUNCTUATION', ')'], + ], self.default_filtered_tokens + ("PUNCTUATION",)) + + def test_help(self): + self.lex(r""" +config + help + help test lasdlkajdk sadlksajd + lsajdlad + + salkdjaldlksajd + + " + asdlkajsdlkjsadlajdsk + + salkdjlsakdj' +config + select TEST +config + ---help--- + help test lasdlkajdk sadlksajd + lsajdlad + + salkdjaldlksajd + +config + select TEST +""", [ + ['SPECIAL', 'config'], + ['SPECIAL', 'help'], + ['COMMENT', ' help test lasdlkajdk sadlksajd\n lsajdlad\n\n salkdjaldlksajd\n\n "\n asdlkajsdlkjsadlajdsk\n\n salkdjlsakdj\'\n'], + ['SPECIAL', 'config'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST'], + ['SPECIAL', 'config'], + ['SPECIAL', '---help---'], + ['COMMENT', ' help test lasdlkajdk sadlksajd\n lsajdlad\n\n salkdjaldlksajd\n \n'], + ['SPECIAL', 'config'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST'], + ]) + + def test_types(self): + self.lex(r""" +config + bool + default y + +config + tristate + default m + +config + hex + default 0xdfffffff00000000 + +config + string + default "string \" test # \# zxc" + +config + int + default 21312323 +""", [ + ['SPECIAL', 'config'], + ['SPECIAL', 'bool'], + ['SPECIAL', 'default'], + ['SPECIAL', 'y'], + ['SPECIAL', 'config'], + ['SPECIAL', 'tristate'], + ['SPECIAL', 'default'], + ['SPECIAL', 'm'], + ['SPECIAL', 'config'], + ['SPECIAL', 'hex'], + ['SPECIAL', 'default'], + ['IDENTIFIER', 
'0xdfffffff00000000'],
+    ['SPECIAL', 'config'],
+    ['SPECIAL', 'string'],
+    ['SPECIAL', 'default'],
+    ['STRING', '"string \\" test # \\# zxc"'],
+    ['SPECIAL', 'config'],
+    ['SPECIAL', 'int'],
+    ['SPECIAL', 'default'],
+    ])
diff --git a/elixir/lexers/utils.py b/elixir/lexers/utils.py
new file mode 100644
index 00000000..7b991dd8
--- /dev/null
+++ b/elixir/lexers/utils.py
@@ -0,0 +1,210 @@
+import re
+import enum
+from collections import namedtuple
+
+# Supported token types
+class TokenType(enum.Enum):
+    WHITESPACE = 'whitespace'
+    COMMENT = 'comment'
+    STRING = 'string'
+    NUMBER = 'number'
+    IDENTIFIER = 'identifier'
+    # may require extra parsing or context information
+    SPECIAL = 'special'
+    PUNCTUATION = 'punctuation'
+    # lexing failure - should be logged, at least until update jobs are preemptible
+    ERROR = 'error'
+
+Token = namedtuple('Token', 'token_type, token, span, line')
+
+def match_regex(regex):
+    rule = re.compile(regex, flags=re.MULTILINE)
+    return lambda code, pos, _: rule.match(code, pos)
+
+def match_token(ctx, pattern, token_type):
+    match = re.compile(pattern).match(ctx.code, ctx.pos)
+    if match is None:
+        return None, ctx
+    else:
+        span = match.span()
+        result = Token(token_type, ctx.code[span[0]:span[1]], span, ctx.line)
+        ctx.pos = span[1]
+        ctx.line += result.token.count('\n')
+        return result, ctx
+
+def split_by_groups(*token_types):
+    def split(ctx, match):
+        pos = ctx.pos
+        line = ctx.line
+        for gi in range(len(match.groups())):
+            token = match.group(gi+1)
+            if len(token) != 0:
+                action = token_types[gi]
+                yield Token(action, token, (pos, pos+len(token)), line)
+                line += token.count("\n")
+                pos += len(token)
+
+    return split
+
+def token_from_match(ctx, match, token_type):
+    span = match.span()
+    result = Token(token_type, ctx.code[span[0]:span[1]], span, ctx.line)
+    ctx.pos = span[1]
+    ctx.line = ctx.line+result.token.count('\n')
+    return result, ctx
+
+def token_from_string(ctx, match, token_type):
+    span = (ctx.pos, ctx.pos+len(match))
+    result = Token(token_type, ctx.code[span[0]:span[1]], span, ctx.line)
+    ctx.pos = span[1]
+    ctx.line = ctx.line+result.token.count('\n')
+    return result, ctx
+
+# Interface class that allows matching only if certain conditions,
+# hard to express in regex, are true
+class Matcher:
+    def update_after_match(self, code: str, pos: int, line: int, token: Token) -> None:
+        pass
+
+    def match(self, code: str, pos: int, line: int) -> None | re.Match:
+        pass
+
+# Match token only if it's the first token in line (skipping whitespace)
+class FirstInLine(Matcher):
+    whitespace = re.compile(r'\s*')
+
+    def __init__(self, regex):
+        self.rule = re.compile(regex, flags=re.MULTILINE)
+        self.first_in_line = True
+
+    def update_after_match(self, code, pos, line, token):
+        # first token is always first in line
+        if pos == 0:
+            self.first_in_line = True
+            return
+
+        # check if matched token contains a newline
+        newline_pos = code.rfind('\n', token.span[0], token.span[1])
+
+        # if it does contain a newline, check the part after the newline
+        if newline_pos != -1:
+            post_newline_tok = code[newline_pos+1:token.span[1]]
+
+            # if part after newline contains only whitespace (or nothing), the next token is first in line
+            if self.whitespace.fullmatch(post_newline_tok):
+                self.first_in_line = True
+        # if currently matched is the first in line, and only contains whitespace,
+        # the next token also counts as first in line
+        elif self.first_in_line and self.whitespace.fullmatch(code, token.span[0], token.span[1]):
+            self.first_in_line = True
+        # otherwise
reset first in line marker + else: + self.first_in_line = False + + def match(self, code, pos, line): + if self.first_in_line: + return self.rule.match(code, pos) + +class LexerContext: + def __init__(self, code, pos, line, filter_tokens): + self.code = code + self.pos = pos + self.line = line + self.filter_tokens = filter_tokens + +def simple_lexer(rules, code, filter_tokens=None): + if len(code) == 0: + return + + # to avoid dealing with files without trailing newlines + if code[-1] != '\n': + code += '\n' + + rules_compiled = [] + after_match_hooks = [] + + # compile rules + for rule, action in rules: + # string rules are actually match regex rules + if type(rule) is str: + rules_compiled.append((match_regex(rule), action)) + # rules can also be callables + elif callable(rule): + rules_compiled.append((rule, action)) + # rules can also be matchers - matchers get more information during parsing, + # that information can stored in their state + elif isinstance(rule, Matcher): + rules_compiled.append((rule.match, action)) + after_match_hooks.append(rule.update_after_match) + + # helper function that calls hooks before yielding + def yield_token(to_yield): + for hook in after_match_hooks: + hook(code, pos, line, to_yield) + return to_yield + + pos = 0 + line = 1 + while pos < len(code): + rule_matched = False + for rule, action in rules_compiled: + match = rule(code, pos, line) + + if match is not None: + span = match.span() + # if match is empty - continue + if span[0] == span[1]: + continue + + rule_matched = True + + if isinstance(action, TokenType): + # only parse tokens of interest - slices apparently copy + if filter_tokens is None or action in filter_tokens: + token = code[span[0]:span[1]] + else: + token = None + + token_obj = Token(action, token, span, line) + yield yield_token(token_obj) + line += code.count('\n', span[0], span[1]) + pos = span[1] + break + elif callable(action): + last_token = None + for token in action(LexerContext(code, pos, line, filter_tokens), match): + last_token = token + yield yield_token(token) + + if last_token is not None: + pos = last_token.span[1] + line = last_token.line + last_token.token.count('\n') + + break + else: + raise Exception(f"invalid action {action}") + + # if no rules match, an error token with a single character is produced. 
+ # this isn't always a big problem, hence it's the decision of the caller + # to decide whether to quit or continue + if not rule_matched: + token = Token(TokenType.ERROR, code[pos], (pos, pos+1), line) + yield yield_token(token) + if code[pos] == '\n': + line += 1 + pos += 1 + +# Combines regexes passed as arguments with pipe operator +def regex_or(*regexes): + result = '(' + for r in regexes: + result += f'({ r })|' + return result[:-1] + ')' + +# Concatenates regexes, putting each in a separate group +def regex_concat(*regexes): + result = '' + for r in regexes: + result += f'({ r })' + return result + diff --git a/elixir/project_utils.py b/elixir/project_utils.py new file mode 100644 index 00000000..242a62c1 --- /dev/null +++ b/elixir/project_utils.py @@ -0,0 +1,47 @@ +import re +from typing import List + +from .filters.utils import Filter, FilterContext +from .filters import default_filters +from .projects import projects +from .lexers import default_lexers + +# Returns a list of applicable filters for project_name under provided filter context +def get_filters(ctx: FilterContext, project_name: str) -> List[Filter]: + project_config = projects.get(project_name) + if project_config is None or 'filters' not in project_config: + filter_classes = default_filters + else: + filter_classes = project_config['filters'] + + filters = [] + + for filter_cls in filter_classes: + if type(filter_cls) == tuple and len(filter_cls) == 2: + cls, kwargs = filter_cls + filters.append(cls(**kwargs)) + elif type(filter_cls) == type: + filters.append(filter_cls()) + else: + raise ValueError(f"Invalid filter: {filter_cls}, " \ + "should be either a two element tuple or a type. " \ + "Make sure project_filters in project.py is valid.") + + return [f for f in filters if f.check_if_applies(ctx)] + +def get_lexer(path: str, project_name: str): + project_config = projects.get(project_name) + if project_config is None or 'lexers' not in project_config: + lexers = default_lexers + else: + lexers = project_config['lexers'] + + path = path.lower() + for regex, lexer in lexers.items(): + if re.match(regex, path): + if type(lexer) == tuple: + lexer_cls, kwargs = lexer + return lambda code: lexer_cls(code, **kwargs) + else: + return lambda code: lexer(code) + diff --git a/elixir/projects.py b/elixir/projects.py new file mode 100644 index 00000000..53d4065e --- /dev/null +++ b/elixir/projects.py @@ -0,0 +1,126 @@ +from .filters import * +from collections import OrderedDict +from .filters import * +from .lexers import * + +# Dictionary of custom per-projects settings. +# filters: +# Projects not present in this dictionary only use default_filters. +# Use `*` to unpack filter lists defined above, +# you can pass additional options to filters by putting a Filter +# class and a dictionary with options in a tuple, like this: +# (FilterCls, {"option": True}). 
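As an illustration of the conventions described in the comment above, a hypothetical project entry could look like the sketch below. The project name, the path_exceptions pattern and the lexer table contents are invented; default_filters, common_kconfig_filters, CppPathIncFilter, CLexer, KconfigLexer and OrderedDict are the names imported at the top of this file, and (class, kwargs) tuples are instantiated by get_filters() as CppPathIncFilter(path_exceptions=...).

example_projects = {
    'some-project': {
        'filters': [
            *default_filters,
            *common_kconfig_filters,
            # tuple form: filter class plus keyword arguments
            (CppPathIncFilter, {"path_exceptions": {'^/include/generated/.*'}}),
        ],
        'lexers': OrderedDict({
            r'.*\.(c|h|cpp|hpp|c++|cxx|cc)': CLexer,
            r'kconfig.*': KconfigLexer,
        }),
    },
}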
+# Check filter files and utils.py for information about available options +projects = { + 'amazon-freertos': { + 'filters': [ + *default_filters, + MakefileSubdirFilter, + ], + }, + 'arm-trusted-firmware': { + 'filters': [ + *default_filters, + CppPathIncFilter, + ], + }, + 'barebox': { + 'filters': [ + *default_filters, + DtsiFilter, + *common_kconfig_filters, + CppPathIncFilter, + *common_makefile_filters, + ], + }, + 'coreboot': { + 'filters': [ + *default_filters, + DtsiFilter, + *common_kconfig_filters, + *common_makefile_filters, + ], + }, + 'linux': { + 'filters': [ + *default_filters, + DtsiFilter, + *common_kconfig_filters, + *common_makefile_filters, + # include/uapi contains includes to user headers under #ifndef __KERNEL__ + # Our solution is to ignore all includes in such paths + (CppPathIncFilter, {"path_exceptions": {'^/include/uapi/.*'}}), + ], + 'lexers': OrderedDict({ + r'.*\.(c|h|cpp|hpp|c++|cxx|cc)': CLexer, + r'makefile\..*': MakefileLexer, + r'.*\.dts(i)?': DTSLexer, + r'kconfig.*': KconfigLexer, #TODO negative lookahead for .rst + + r'/arch/alpha/.*\.s': (GasLexer, {"arch": "alpha"}), + r'/arch/arc/.*\.s': (GasLexer, {"arch": "arc"}), + r'/arch/arm/.*\.s': (GasLexer, {"arch": "arm32"}), + r'/arch/csky/.*\.s': (GasLexer, {"arch": "csky"}), + r'/arch/m68k/.*\.s': (GasLexer, {"arch": "m68k"}), + r'/arch/microblaze/.*\.s': (GasLexer, {"arch": "microblaze"}), + r'/arch/mips/.*\.s': (GasLexer, {"arch": "mips"}), + r'/arch/openrisc/.*\.s': (GasLexer, {"arch": "openrisc"}), + r'/arch/parisc/.*\.s': (GasLexer, {"arch": "parisc"}), + r'/arch/s390/.*\.s': (GasLexer, {"arch": "s390"}), + r'/arch/sh/.*\.s': (GasLexer, {"arch": "sh"}), + r'/arch/sparc/.*\.s': (GasLexer, {"arch": "sparc"}), + r'/arch/um/.*\.s': (GasLexer, {"arch": "x86"}), + r'/arch/x86/.*\.s': (GasLexer, {"arch": "x86"}), + r'/arch/xtensa/.*\.s': (GasLexer, {"arch": "xtensa"}), + r'.*\.s': GasLexer, + }), + }, + 'qemu': { + 'filters': [ + *default_filters, + *common_kconfig_filters, + ], + }, + 'u-boot': { + 'filters': [ + *default_filters, + DtsiFilter, + *common_kconfig_filters, + CppPathIncFilter, + *common_makefile_filters, + ], + 'lexers': OrderedDict({ + r'.*\.(c|h|cpp|hpp|c++|cxx|cc)': CLexer, + r'makefile\..*': MakefileLexer, + r'.*\.dts(i)?': DTSLexer, + r'kconfig.*': KconfigLexer, #TODO negative lookahead for .rst + + r'/arch/arc/.*\.s': (GasLexer, {"arch": "arc"}), + r'/arch/arm/.*\.s': (GasLexer, {"arch": "arm32"}), + r'/arch/m68k/.*\.s': (GasLexer, {"arch": "m68k"}), + r'/arch/microblaze/.*\.s': (GasLexer, {"arch": "microblaze"}), + r'/arch/mips/.*\.s': (GasLexer, {"arch": "mips"}), + r'/arch/riscv/.*\.s': (GasLexer, {"arch": "riscv"}), + r'/arch/sh/.*\.s': (GasLexer, {"arch": "sh"}), + r'/arch/x86/.*\.s': (GasLexer, {"arch": "x86"}), + r'/arch/sandbox/.*\.s': (GasLexer, {"arch": "x86"}), + r'/arch/xtensa/.*\.s': (GasLexer, {"arch": "xtensa"}), + r'.*\.s': GasLexer, + }), + }, + 'uclibc-ng': { + 'filters': [ + *default_filters, + ConfigInFilter, + ], + }, + 'zephyr': { + 'filters': [ + *default_filters, + DtsiFilter, + *common_kconfig_filters, + CppPathIncFilter, + ], + }, +} + diff --git a/elixir/query.py b/elixir/query.py index ff14d4b1..5476dc6d 100755 --- a/elixir/query.py +++ b/elixir/query.py @@ -21,7 +21,8 @@ from .lib import script, scriptLines, decode from . import lib from . 
import data -import os +from .lexers import TokenType +import os, sys from collections import OrderedDict from urllib import parse @@ -172,29 +173,38 @@ def query(self, cmd, *args): version = args[0] path = args[1] + lexer = args[2] filename = os.path.basename(path) family = lib.getFileFamily(filename) - if family != None: + if family is not None and lexer is not None: buffer = BytesIO() - tokens = self.scriptLines('tokenize-file', version, path, family) - even = True + code = self.get_file_raw(version, path) prefix = b'' if family == 'K': prefix = b'CONFIG_' - for tok in tokens: - even = not even - tok2 = prefix + tok - if (even and self.db.defs.exists(tok2) and - (lib.compatibleFamily(self.db.defs.get(tok2).get_families(), family) or - lib.compatibleMacro(self.db.defs.get(tok2).get_macros(), family))): - tok = b'\033[31m' + tok2 + b'\033[0m' - else: - tok = lib.unescape(tok) - buffer.write(tok) + for token_type, token, _, line in lexer(code).lex(): + token = token.encode() + + if token_type == TokenType.ERROR: + print("error token: ", token, token_type, filename, line, file=sys.stderr) + elif token_type == TokenType.IDENTIFIER: + token_with_prefix = prefix + token + token_in_db = self.db.defs.exists(token_with_prefix) + if token_in_db: + compatible = \ + lib.compatibleFamily(self.db.defs.get(token_with_prefix).get_families(), family) or \ + lib.compatibleMacro(self.db.defs.get(token_with_prefix).get_macros(), family) + + if compatible: + buffer.write(b'\033[31m' + token_with_prefix + b'\033[0m') + continue + + buffer.write(token) + return decode(buffer.getvalue()) else: return decode(self.script('get-file', version, path)) diff --git a/elixir/web.py b/elixir/web.py index 2a0cbbbb..d25745b0 100755 --- a/elixir/web.py +++ b/elixir/web.py @@ -33,7 +33,7 @@ from .lib import validFamily from .query import Query, SymbolInstance -from .filters import get_filters +from .project_utils import get_filters, get_lexer from .filters.utils import FilterContext from .autocomplete import AutocompleteResource from .api import ApiIdentGetterResource @@ -485,7 +485,8 @@ def format_code(filename, code): # version: requested version of the project # path: path to the file in the repository def generate_source(q, project, version, path): - code = q.query('file', version, path) + lexer = get_lexer(path, project) + code = q.query('file', version, path, lexer) _, fname = os.path.split(path) _, extension = os.path.splitext(fname) diff --git a/update.py b/update.py index 79cb4dcf..3d14e8ce 100755 --- a/update.py +++ b/update.py @@ -22,13 +22,16 @@ # Throughout, an "idx" is the sequential number associated with a blob. # This is different from that blob's Git hash. +import sys from sys import argv from threading import Thread, Lock, Event, Condition +from elixir.lexers import TokenType import elixir.lib as lib from elixir.lib import script, scriptLines import elixir.data as data from elixir.data import PathList +from elixir.project_utils import get_lexer from find_compatible_dts import FindCompatibleDTS verbose = False @@ -56,6 +59,7 @@ bindings_idxes = [] # DT bindings documentation files idx_key_mod = 1000000 defs_idxes = {} # Idents definitions stored with (idx*idx_key_mod + line) as the key. 
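Taken together, the web.py and query.py hunks above mean that file rendering now resolves a lexer per path and walks its tokens. A rough standalone sketch of that loop follows; it is illustrative only, assumes the patched elixir package is importable, and the path, project and code string are invented, while get_lexer and TokenType are the ones introduced by this patch.

from elixir.project_utils import get_lexer
from elixir.lexers import TokenType

code = "static int foo(void)\n{\n\treturn CONFIG_BAR;\n}\n"

lexer = get_lexer("drivers/foo/bar.c", "linux")   # C/C++ pattern -> CLexer factory
if lexer is not None:
    for token_type, token, span, line in lexer(code).lex():
        if token_type == TokenType.IDENTIFIER:
            # query.py looks these up in db.defs and colors known identifiers
            print("identifier", token, "on line", line)
        elif token_type == TokenType.ERROR:
            print("lexing error on line", line, repr(token))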
+file_paths = {} tags_done = False # True if all tags have been added to new_idxes @@ -163,7 +167,7 @@ def run(self): progress('vers: Thread finished', index) def update_versions(self, tag): - global blobs_lock + global blobs_lock, file_paths # Get blob hashes and associated file paths blobs = scriptLines('list-blobs', '-p', tag) @@ -174,12 +178,14 @@ def update_versions(self, tag): with blobs_lock: idx = db.blob.get(hash) buf.append((idx, path)) + file_paths[idx] = path buf = sorted(buf) obj = PathList() for idx, path in buf: obj.append(idx, path) + # Store DT bindings documentation files to parse them later if path[:33] == b'Documentation/devicetree/bindings': bindings_idxes.append(idx) @@ -275,6 +281,7 @@ def run(self): new_idxes[self.index][1].wait() # Make sure the tag is ready new_idxes[self.index][2].wait() # Make sure UpdateDefs processed the tag + new_idxes[self.index][4].wait() # Tell that UpdateVersions processed the tag with tags_refs_lock: tags_refs[0] += 1 @@ -288,45 +295,53 @@ def run(self): progress('refs: Thread ' + str(tags_refs[1]) + '/' + str(self.inc) + ' finished', tags_refs[0]) def update_references(self, idxes): - global hash_file_lock, defs_lock, refs_lock, tags_refs + global hash_file_lock, defs_lock, refs_lock, tags_refs, file_paths for idx in idxes: if idx % 1000 == 0: progress('refs: ' + str(idx), tags_refs[0]) with hash_file_lock: hash = db.hash.get(idx) - filename = db.file.get(idx) + filename = file_paths[idx].decode() family = lib.getFileFamily(filename) if family == None: continue + lexer = get_lexer(filename, project) + if lexer is None: + continue + + try: + code = script('get-blob', hash).decode() + except UnicodeDecodeError: + code = script('get-blob', hash).decode('raw_unicode_escape') + prefix = b'' # Kconfig values are saved as CONFIG_ if family == 'K': prefix = b'CONFIG_' - tokens = scriptLines('tokenize-file', '-b', hash, family) - even = True - line_num = 1 idents = {} with defs_lock: - for tok in tokens: - even = not even - if even: - tok = prefix + tok - - if (db.defs.exists(tok) and - not ( (idx*idx_key_mod + line_num) in defs_idxes and - defs_idxes[idx*idx_key_mod + line_num] == tok ) and - (family != 'M' or tok.startswith(b'CONFIG_'))): - # We only index CONFIG_??? in makefiles - if tok in idents: - idents[tok] += ',' + str(line_num) - else: - idents[tok] = str(line_num) + for token_type, token, _, line in lexer(code).lex(): + if token_type == TokenType.ERROR: + print("error token: ", token, token_type, filename, line, file=sys.stderr) + continue - else: - line_num += tok.count(b'\1') + token = prefix + token.encode() + + if token_type != TokenType.IDENTIFIER: + continue + + if (db.defs.exists(token) and + not ( (idx*idx_key_mod + line) in defs_idxes and + defs_idxes[idx*idx_key_mod + line] == token ) and + (family != 'M' or token.startswith(b'CONFIG_'))): + # We only index CONFIG_??? in makefiles + if token in idents: + idents[token] += ',' + str(line) + else: + idents[token] = str(line) with refs_lock: for ident, lines in idents.items(): @@ -579,6 +594,7 @@ def progress(msg, current): for tag in scriptLines('list-tags'): if not db.vers.exists(tag): tag_buf.append(tag) + break num_tags = len(tag_buf) project = lib.currentProject()
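To close, the reworked update_references loop boils down to the following condensed, illustrative sketch. It is not code from the patch: collect_references, the known_defs set and the sample Makefile line are invented, it assumes the patched elixir package is importable, and the real code additionally skips the line on which an identifier is defined (the defs_idxes check).

from elixir.lexers import TokenType
from elixir.project_utils import get_lexer

def collect_references(filename, code, project, family, known_defs):
    lexer = get_lexer(filename, project)
    if lexer is None:
        return {}

    # Kconfig symbols are stored in the definitions database with a CONFIG_ prefix
    prefix = b'CONFIG_' if family == 'K' else b''

    idents = {}
    for token_type, token, _, line in lexer(code).lex():
        if token_type != TokenType.IDENTIFIER:
            continue
        token = prefix + token.encode()
        # as in the patch, Makefiles only index CONFIG_* identifiers
        if token in known_defs and (family != 'M' or token.startswith(b'CONFIG_')):
            idents.setdefault(token, []).append(str(line))
    return {ident: ','.join(lines) for ident, lines in idents.items()}

# Toy run: one Makefile line and a single known definition
print(collect_references("Makefile.build", "obj-$(CONFIG_FOO) += foo.o\n",
                         "linux", 'M', {b'CONFIG_FOO'}))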