Skip to content

lexer

Contains code to run lexer, turning code into sequence of tokens

This sequence of tokens is then used by other modules and scripts to determine the type of line (for example, line changed by the commit or a patch): does it contain only documentation (only comments or docstrings, possibly with whitespace), or does it contain at least some code.

Currently, the only lexer supported is from Pygments (Python syntax highligter) https://pygments.org/

Example usage:

from pathlib import Path from diffannotator.lexer import Lexer LEXER = Lexer() file_path = Path('tests/test_code_fragments/example_line_callback_func.py') tokens_list = LEXER.lex(file_path.name, file_path.read_text()) LEXER.lexers {'.py': } tokens_list[:3] [(0, Token.Keyword, 'def'), (3, Token.Text, ' '), (4, Token.Name.Function, 'detect_all_whitespace_line')]

This module is used by the diff-annotate script, with sources in annotate.py source code file.

Lexer

Bases: object

Holder and proxy for lexers

Made to be able to reuse lexer objects, and to call the lexing method required by the :meth:AnnotatedHunk.process() method.

Source code in src/diffannotator/lexer.py
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
class Lexer(object):
    """Holder and proxy for lexers

    Made to be able to reuse lexer objects, and to call the lexing method
    required by the :meth:`AnnotatedHunk.process()` method.

    Lexer objects are cached in the `lexers` attribute, keyed by the file
    suffix (or by the file basename, for files without a suffix), so that
    a lexer is looked up in Pygments only once per kind of file.
    """

    def __init__(self):
        """Construct the Lexer object, creating the holder for lexers"""
        # cache: file suffix (or basename, if there is no suffix) -> lexer
        self.lexers: dict[str, PygmentsLexer] = {}

    def get_lexer(self, filename: str) -> PygmentsLexer:
        """Get lexer suitable for file with given path

        The lexer is taken from the cache if one was already selected for
        a file with the same suffix (or the same basename, if the file has
        no suffix); otherwise it is looked up in Pygments and cached.
        If Pygments knows no lexer for the file, the plain text lexer
        is used (and cached) instead.

        :param filename: path to a file inside repository
        :return: appropriate lexer
        """
        path = Path(filename)
        # there are many different file types with an empty suffix,
        # so for those use basename of the file as key in self.lexers
        key = path.suffix or path.name

        if key in self.lexers:
            return self.lexers[key]

        try:
            lexer = pygments.lexers.get_lexer_for_filename(filename)
        except pygments.util.ClassNotFound:
            logger.warning(f"Warning: No lexer found for '{filename}', trying Text lexer")
            # TODO: use Text lexer directly: pygments.lexers.special.TextLexer
            # NOTE: use the same qualified name as in the `try` block above,
            # instead of relying on a separate bare `lexers` name being in scope
            lexer = pygments.lexers.get_lexer_for_filename("Test.txt")

        self.lexers[key] = lexer

        return lexer

    def lex(self, filename: str, code: str) -> Iterable[tuple]:
        """Run lexer on a fragment of code from file with given filename

        :param filename: path to file within the repository
        :param code: source code or text to parse
        :return: an iterable of (index, token_type, text_fragment) tuples;
            empty list if no lexer could be selected
        """
        lexer = self.get_lexer(filename)

        if not lexer:
            logger.error(f"Error in lex: no lexer selected for file '{filename}'")
            return []

        # TODO: consider returning generator or iterator, instead of iterable/list
        return list(lexer.get_tokens_unprocessed(code))

__init__()

Construct the Lexer object, creating the holder for lexers

Source code in src/diffannotator/lexer.py
47
48
49
def __init__(self):
    """Set up the Lexer object with an empty holder for lexer objects"""
    # starts out empty; maps string keys to Pygments lexer objects
    self.lexers: dict[str, PygmentsLexer] = dict()

get_lexer(filename)

Get lexer suitable for file with given path

:param filename: path to a file inside repository :return: appropriate lexer

Source code in src/diffannotator/lexer.py
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
def get_lexer(self, filename: str) -> PygmentsLexer:
    """Get lexer suitable for file with given path

    The lexer is taken from the `self.lexers` cache if one was already
    selected for a file with the same suffix (or the same basename, if
    the file has no suffix); otherwise it is looked up in Pygments and
    cached.  If Pygments knows no lexer for the file, the plain text
    lexer is used (and cached) instead.

    :param filename: path to a file inside repository
    :return: appropriate lexer
    """
    path = Path(filename)
    # there are many different file types with an empty suffix,
    # so for those use basename of the file as key in self.lexers
    key = path.suffix or path.name

    if key in self.lexers:
        return self.lexers[key]

    try:
        lexer = pygments.lexers.get_lexer_for_filename(filename)
    except pygments.util.ClassNotFound:
        logger.warning(f"Warning: No lexer found for '{filename}', trying Text lexer")
        # TODO: use Text lexer directly: pygments.lexers.special.TextLexer
        # NOTE: use the same qualified name as in the `try` block above,
        # instead of relying on a separate bare `lexers` name being in scope
        lexer = pygments.lexers.get_lexer_for_filename("Test.txt")

    self.lexers[key] = lexer

    return lexer

lex(filename, code)

Run lexer on a fragment of code from file with given filename

:param filename: path to file within the repository :param code: source code or text to parse :return: an iterable of (index, token_type, text_fragment) tuples

Source code in src/diffannotator/lexer.py
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
def lex(self, filename: str, code: str) -> Iterable[tuple]:
    """Tokenize a fragment of code, using the lexer chosen for `filename`

    :param filename: path to file within the repository
    :param code: source code or text to parse
    :return: an iterable of (index, token_type, text_fragment) tuples;
        empty list if no lexer was selected for the file
    """
    selected = self.get_lexer(filename)

    if selected:
        # TODO: consider returning generator or iterator, instead of iterable/list
        tokens = selected.get_tokens_unprocessed(code)
        return list(tokens)

    logger.error(f"Error in lex: no lexer selected for file '{filename}'")
    return []