
lexer

Contains code to run a lexer, turning source code into a sequence of tokens.

This sequence of tokens is then used by other modules and scripts to determine the type of a line (for example, a line changed by a commit or a patch): does it contain only documentation (comments or docstrings, possibly with whitespace), or does it contain at least some code.
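Such a check can look only at the token types that Pygments emits. A minimal sketch (the helper name is_documentation_line is hypothetical, not part of this module):

from pygments.token import Token

def is_documentation_line(tokens):
    """Return True if tokens contain only comments, docstrings, and whitespace.

    Expects an iterable of (index, token_type, text) tuples,
    as returned by Lexer.lex().  Hypothetical helper, for illustration.
    """
    for _index, token_type, text in tokens:
        if token_type in Token.Comment:
            continue  # line and block comments
        if token_type in Token.Literal.String.Doc:
            continue  # docstrings
        if not text or text.isspace():
            continue  # whitespace-only fragments
        return False  # anything else means the line contains code
    return True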

Currently, the only supported lexer is from Pygments (a Python syntax highlighter): https://pygments.org/

Example usage:

>>> from pathlib import Path
>>> from diffannotator.lexer import Lexer
>>> LEXER = Lexer()
>>> file_path = Path('tests/test_code_fragments/example_line_callback_func.py')
>>> tokens_list = LEXER.lex(file_path.name, file_path.read_text())
>>> LEXER.lexers
{'.py': <pygments.lexers.PythonLexer>}
>>> tokens_list[:3]
[(0, Token.Keyword, 'def'), (3, Token.Text, ' '), (4, Token.Name.Function, 'detect_all_whitespace_line')]

This module is used by the diff-annotate script, whose sources are in the annotate.py source file.

Lexer

Bases: object

Holder and proxy for lexers

Made so that lexer objects can be reused, and to provide the lexing method required by the AnnotatedHunk.process() method.

Source code in src/diffannotator/lexer.py
class Lexer(object):
    """Holder and proxy for lexers

    Made to be able to reuse lexer objects, and to call the lexing method
    required by the :meth:`AnnotatedHunk.process()` method.
    """

    def __init__(self):
        """Construct the Lexer object, creating the holder for lexers"""
        self.lexers: dict[str, PygmentsLexer] = {}

    def get_lexer(self, filename: str) -> PygmentsLexer:
        """Get lexer suitable for file with given path

        Parameters
        ----------
        filename
            path to a file inside repository

        Returns
        -------
        PygmentsLexer
            appropriate lexer
        """
        suffix = Path(filename).suffix
        # there are many different file types with an empty suffix
        if not suffix:
            # use basename of the file as key in self.lexers
            suffix = Path(filename).name

        if suffix in self.lexers:
            return self.lexers[suffix]

        try:
            lexer = pygments.lexers.get_lexer_for_filename(filename)
        except pygments.util.ClassNotFound:
        logger.warning(f"No lexer found for '{filename}', trying Text lexer")
        # TODO: use Text lexer directly: pygments.lexers.special.TextLexer
        lexer = pygments.lexers.get_lexer_for_filename("Test.txt")

        self.lexers[suffix] = lexer

        return lexer

    def lex(self, filename: str, code: str) -> Iterable[tuple]:
        """Run lexer on a fragment of code from file with given filename

        Parameters
        ----------
        filename
            path to file within the repository
        code
            source code or text to parse

        Returns
        -------
        Iterable[tuple]
            iterable of (index, token_type, text_fragment) tuples
        """
        lexer = self.get_lexer(filename)

        if not lexer:
            logger.error(f"Error in lex: no lexer selected for file '{filename}'")
            return []

        # TODO: consider returning generator or iterator, instead of iterable/list
        return list(lexer.get_tokens_unprocessed(code))
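Because lexers are cached by file suffix, repeated calls for files of the same type reuse a single lexer instance; for example (file names are illustrative):

>>> from diffannotator.lexer import Lexer
>>> LEXER = Lexer()
>>> LEXER.get_lexer('foo.py') is LEXER.get_lexer('bar.py')
True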

__init__

__init__()

Construct the Lexer object, creating the holder for lexers

Source code in src/diffannotator/lexer.py
def __init__(self):
    """Construct the Lexer object, creating the holder for lexers"""
    self.lexers: dict[str, PygmentsLexer] = {}

get_lexer

get_lexer(filename: str) -> PygmentsLexer

Get lexer suitable for file with given path

Parameters
----------
filename : str
    path to a file inside the repository

Returns
-------
PygmentsLexer
    appropriate lexer

Source code in src/diffannotator/lexer.py
def get_lexer(self, filename: str) -> PygmentsLexer:
    """Get lexer suitable for file with given path

    Parameters
    ----------
    filename
        path to a file inside repository

    Returns
    -------
    PygmentsLexer
        appropriate lexer
    """
    suffix = Path(filename).suffix
    # there are many different file types with an empty suffix
    if not suffix:
        # use basename of the file as key in self.lexers
        suffix = Path(filename).name

    if suffix in self.lexers:
        return self.lexers[suffix]

    try:
        lexer = pygments.lexers.get_lexer_for_filename(filename)
    except pygments.util.ClassNotFound:
        logger.warning(f"No lexer found for '{filename}', trying Text lexer")
        # TODO: use Text lexer directly: pygments.lexers.special.TextLexer
        lexer = pygments.lexers.get_lexer_for_filename("Test.txt")

    self.lexers[suffix] = lexer

    return lexer
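When no lexer matches the given filename, get_lexer() falls back to the plain-text lexer; the repr shown below assumes Pygments' usual conventions, and the file name is illustrative:

>>> LEXER.get_lexer('data.unknownext')
<pygments.lexers.TextLexer>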

lex

lex(filename: str, code: str) -> Iterable[tuple]

Run lexer on a fragment of code from file with given filename

Parameters
----------
filename : str
    path to file within the repository
code : str
    source code or text to parse

Returns
-------
Iterable[tuple]
    iterable of (index, token_type, text_fragment) tuples

Source code in src/diffannotator/lexer.py
def lex(self, filename: str, code: str) -> Iterable[tuple]:
    """Run lexer on a fragment of code from file with given filename

    Parameters
    ----------
    filename
        path to file within the repository
    code
        source code or text to parse

    Returns
    -------
    Iterable[tuple]
        iterable of (index, token_type, text_fragment) tuples
    """
    lexer = self.get_lexer(filename)

    if not lexer:
        logger.error(f"Error in lex: no lexer selected for file '{filename}'")
        return []

    # TODO: consider returning generator or iterator, instead of iterable/list
    return list(lexer.get_tokens_unprocessed(code))
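The returned (index, token_type, text_fragment) tuples can be filtered by token type, for example to keep only comment tokens (the input string here is illustrative):

>>> from pygments.token import Token
>>> tokens = LEXER.lex('example.py', '# a comment\n')
>>> [tok for _i, tok, _txt in tokens if tok in Token.Comment]
[Token.Comment.Single]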