Skip to content

languages

Contains code to detect language of a file based on its file name

It uses the 'languages.yml' file ('src/diffannotator/languages.yml') taken from the GitHub Linguist project, which is the library used on GitHub.com to detect blob languages, ignore binary or vendored files, suppress generated files in diffs, and generate language breakdown graphs (written in Ruby, MIT license) https://github.com/github-linguist/linguist

This module currently has a much more limited scope: it uses only the file name, and does not check file contents, nor does it take into account user-provided configuration included in the '.gitattributes' file, like GitHub Linguist does https://github.com/github-linguist/linguist/blob/master/docs/overrides.md#using-gitattributes

Overrides to the data extracted from 'languages.yml' are provided via the following global variables:

  • FILENAME_TO_LANGUAGES - mapping from filenames (basenames) of files to single-element list of language corresponding to that name, for example FILENAME_TO_LANGUAGES['COPYING'] == ['Text']
  • EXT_TO_LANGUAGES - mapping from file extension (including the dot '.') to single-element list of language corresponding to that name, for example EXT_TO_LANGUAGES['.md'] == ['Markdown']
  • PATTERN_TO_PURPOSE - mapping from file wildcard / glob pattern to the purpose of the file (which can be used to determine line types), for example PATTERN_TO_PURPOSE['*.cmake'] == 'project'

NOTE that currently some of those rules are built into the languages_exceptions() function.

Example usage:

    from diffannotator.languages import Languages
    LANGUAGES = Languages()
    LANGUAGES.annotate("src/main.cpp")

This module is used by the diff-annotate script, with sources in annotate.py source code file.

Languages

Bases: object

Linguists file support with some simplification

Source code in src/diffannotator/languages.py
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
class Languages(object):
    """Detect language, file type, and purpose of a file from its path.

    A simplified take on GitHub Linguist: only the file name is consulted
    (basename first, then extension); file contents are never read.

    Attributes set while constructing an instance:

    - ``yaml``: path to the 'languages.yml' file that gets parsed
    - ``languages``: parsed contents of that file, keyed by language name
    - ``ext_primary``: reverse lookup built from "primary_extension" entries
    - ``ext_lang``: reverse lookup built from "extensions" entries
    - ``filenames_lang``: reverse lookup built from "filenames" entries
    """

    def __init__(self, languages_yaml: PathLike = "languages.yml"):
        """Locate and load the languages definition file

        :param languages_yaml: path to the 'languages.yml' file; a relative
            path that does not exist as given is looked up next to this module
        """
        super(Languages, self).__init__()
        self.yaml = Path(languages_yaml)

        # make it an absolute path, so that scripts work from any working directory
        if not self.yaml.exists() and not self.yaml.is_absolute():
            self.yaml = Path(__file__).resolve(strict=True).parent.joinpath(self.yaml)

        self._read()
        self._simplify()

    def _read(self):
        """Read, parse, and extract information from 'languages.yml'"""
        # explicit encoding: without it the file would be decoded using
        # the platform locale, which is not UTF-8 everywhere
        with open(self.yaml, "r", encoding="utf-8") as stream:
            self.languages = yaml.safe_load(stream)

        self.ext_primary = defaultdict(list)
        self.ext_lang = defaultdict(list)
        self.filenames_lang = defaultdict(list)

        # reverse lookup: (key in a language entry) -> (table indexed by that key's values)
        reverse_tables = (
            ("primary_extension", self.ext_primary),
            ("extensions", self.ext_lang),
            ("filenames", self.filenames_lang),
        )
        for lang, v in self.languages.items():
            for key, table in reverse_tables:
                if key in v:
                    for value in v[key]:
                        table[value].append(lang)

    def _simplify(self):
        """simplify languages assigned to file extensions"""
        # only extensions already known from 'languages.yml' are replaced;
        # no new keys are added to the reverse-lookup tables
        for ext, langs in EXT_TO_LANGUAGES.items():
            if ext in self.ext_primary:
                self.ext_primary[ext] = langs

            if ext in self.ext_lang:
                self.ext_lang[ext] = langs

    @staticmethod
    def _pick_language(file_path: str, candidates: list[str], kind: str, table: str) -> str:
        """Reduce candidate languages to a single one, warning about collisions

        :param file_path: file path in the repository
        :param candidates: languages found for the file in a lookup table
        :param kind: what was looked up ("Filename" or "Extension"), for the log message
        :param table: name of the lookup table, for the log message
        :return: first language left after applying languages_exceptions()
        """
        ret = languages_exceptions(file_path, candidates)
        # Debug to catch filenames (basenames) or extensions with language collisions
        if len(ret) > 1:
            logger.warning(f"{kind} collision in {table} for '{file_path}': {ret}")

        return ret[0]

    def _path2lang(self, file_path: str) -> str:
        """Convert path of file in repository to programming language of file

        Lookup order: basename (overrides first), then extension overrides,
        then primary extensions, then all extensions.

        :param file_path: file path in the repository
        :return: name of the language, "/dev/null" for paths containing
            that string, or "unknown" if nothing matched
        """
        # TODO: consider switching from Path.stem to Path.name (basename)
        path = Path(file_path)
        filename, ext, basename = path.stem, path.suffix, path.name

        # NOTE: FILENAME_TO_LANGUAGES overrides what's from Linguist 'languages.yml',
        # so it is consulted first; this avoids building a merged dict on every call
        for table in (FILENAME_TO_LANGUAGES, self.filenames_lang):
            if basename in table:
                return self._pick_language(file_path, table[basename], "Filename", "filenames_lang")

        # NOTE: EXT_TO_LANGUAGES overrides what's from Linguist 'languages.yml'
        if ext in EXT_TO_LANGUAGES:
            return EXT_TO_LANGUAGES[ext][0]

        if ext in self.ext_primary:
            return self._pick_language(file_path, self.ext_primary[ext], "Extension", "ext_primary")

        if ext in self.ext_lang:
            return self._pick_language(file_path, self.ext_lang[ext], "Extension", "ext_lang")

        # TODO: move those exceptions to languages_exceptions()
        if "/dev/null" in file_path:
            return "/dev/null"

        # DEBUG information
        logger.warning(f"Unknown file type for '{file_path}' ({filename}{ext})")

        return "unknown"

    @staticmethod
    def _path2purpose(path: str, filetype: str) -> str:
        """Determine purpose of a file from its path and file type

        :param path: file path in the repository
        :param filetype: type of the file, as found by annotate()
        :return: "test", a purpose from PATTERN_TO_PURPOSE, "documentation",
            the file type itself (for selected types), or "unknown"
        """
        # everything that has test in filename -> test
        # TODO: should it consider only basename?
        if "test" in path.lower():
            return "test"

        path_pure = PurePath(path)
        for pattern, purpose in PATTERN_TO_PURPOSE.items():
            if path_pure.match(pattern):
                return purpose

        # let's assume that prose (i.e. txt, markdown, rst, etc.) is documentation
        if "prose" in filetype:
            return "documentation"

        # limit filetype to selected set of file types
        # from languages.yml: Either data, programming, markup, prose, or nil
        if filetype in ["programming", "data", "markup", "other"]:
            return filetype

        # default unknown
        return "unknown"

    def annotate(self, path: str) -> dict:
        """Annotate file with its primary language metadata

        :param path: file path in the repository
        :return: metadata about language, file type, and purpose of file
        """
        language = self._path2lang(path)

        # languages missing from 'languages.yml' data (e.g. "unknown"),
        # or lacking the "type" field, are given the "other" type
        try:
            filetype = self.languages[language]["type"]
        except KeyError:
            filetype = "other"

        file_purpose = self._path2purpose(path, filetype)

        return {"language": language, "type": filetype, "purpose": file_purpose}

annotate(path)

Annotate file with its primary language metadata

:param path: file path in the repository :return: metadata about language, file type, and purpose of file

Source code in src/diffannotator/languages.py
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
def annotate(self, path: str) -> dict:
    """Describe a repository file by its language, type, and purpose

    :param path: file path in the repository
    :return: dict with "language", "type", and "purpose" keys
    """
    detected = self._path2lang(path)

    # a language missing from self.languages, or an entry without
    # the "type" key, falls back to the "other" file type
    # TODO: maybe convert to .get() with default value
    try:
        kind = self.languages[detected]["type"]
    except KeyError:
        kind = "other"

    return dict(
        language=detected,
        type=kind,
        purpose=self._path2purpose(path, kind),
    )

languages_exceptions(path, lang)

Handle exceptions in determining language of a file

:param path: file path in the repository :param lang: file language determined so far :return: single element list of languages

Source code in src/diffannotator/languages.py
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
def languages_exceptions(path: str, lang: list[str]) -> list[str]:
    """Apply hand-written rules that settle the language of a file

    When one of the rules matches, a single-element list is returned;
    otherwise `lang` is handed back as it was given (and may still
    contain more than one language).

    :param path: file path in the repository
    :param lang: file language determined so far
    :return: list of languages, narrowed down where a rule applies
    """
    lowered = path.lower()

    # path-dependent rules come first
    if "Roff" in lang and "spark" in lowered:
        return ["Text"]
    if "Lex" in lang and "kconfig" in lowered:
        return ["Lex"]

    # languages that win whenever they are among the candidates,
    # listed in order of precedence
    for preferred in ("HTML", "Roff", "M4"):
        if preferred in lang:
            return [preferred]

    # there are multiple entries for the '.spec' extension
    is_rpm_spec = path.startswith("rpm/") and path.endswith(".spec")
    return ["RPM Spec"] if is_rpm_spec else lang