Contains code to detect language of a file based on its file name
It uses the 'languages.yml' file ('src/diffannotator/languages.yml') taken from
the GitHub Linguist project, which is the library used on GitHub.com
to detect blob languages, ignore binary or vendored files, suppress generated
files in diffs, and generate language breakdown graphs (written in Ruby, MIT license)
https://github.com/github-linguist/linguist
This module has currently much more limited scope: it tries to use the file name,
and does not try to check file contents, nor does it take into account user-provided
configuration included in the '.gitattributes' file, like GitHub Linguist does
https://github.com/github-linguist/linguist/blob/master/docs/overrides.md#using-gitattributes
Overrides to the data extracted from 'languages.yml' are provided via
the following global variables:
FILENAME_TO_LANGUAGES
- mapping from filenames (basenames) of files
to single-element list of language corresponding to that name, for example
FILENAME_TO_LANGUAGES['COPYING'] == ['Text']
EXT_TO_LANGUAGES
- mapping from file extension (including the dot '.')
to single-element list of language corresponding to that name, for example
EXT_TO_LANGUAGES['.md'] == ['Markdown']
PATTERN_TO_PURPOSE
- mapping from file wildcard / glob pattern
to the purpose of the file (which can be used to determine line types),
for example PATTERN_TO_PURPOSE['*.cmake'] == 'project'
NOTE that currently some of those rules are built into the
languages_exceptions()
function.
Example usage:
from diffannotator.languages import Languages
LANGUAGES = Languages()
LANGUAGES.annotate("src/main.cpp")
This module is used by the diff-annotate script, with sources in annotate.py
source code file.
Languages
Bases: object
Linguists file support with some simplification
Source code in src/diffannotator/languages.py
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
class Languages(object):
    """Linguist-style file support, with some simplifications

    Detects the language of a file from its path alone (basename first,
    then extension), using data loaded from a 'languages.yml' file together
    with the module-level override tables FILENAME_TO_LANGUAGES and
    EXT_TO_LANGUAGES.  File contents are never inspected.
    """

    def __init__(self, languages_yaml: PathLike = "languages.yml"):
        """Load and index the language definitions

        :param languages_yaml: path to the 'languages.yml' file; a relative
            path that does not exist with respect to the current working
            directory is looked up next to this module instead
        """
        super().__init__()
        self.yaml = Path(languages_yaml)

        # make it an absolute path, so that scripts work from any working directory
        if not self.yaml.exists() and not self.yaml.is_absolute():
            self.yaml = Path(__file__).resolve(strict=True).parent.joinpath(self.yaml)

        self._read()
        self._simplify()

    def _read(self):
        """Read, parse, and extract information from 'languages.yml'"""
        # explicit encoding: do not depend on the platform's locale default
        with open(self.yaml, "r", encoding="utf-8") as stream:
            # an empty YAML document parses to None; treat it as "no languages"
            self.languages = yaml.safe_load(stream) or {}

        self._build_reverse_lookup()

    def _build_reverse_lookup(self):
        """Build extension -> languages and filename -> languages indexes

        Fills `self.ext_primary`, `self.ext_lang` and `self.filenames_lang`
        from the already loaded `self.languages` mapping.
        """
        self.ext_primary = defaultdict(list)
        self.ext_lang = defaultdict(list)
        self.filenames_lang = defaultdict(list)

        # reverse lookup
        for lang, info in self.languages.items():
            if "primary_extension" in info:
                primary = info["primary_extension"]
                # a scalar value (e.g. '.py') must be treated as one extension,
                # not iterated character by character
                if isinstance(primary, str):
                    primary = [primary]
                for ext in primary:
                    self.ext_primary[ext].append(lang)

            if "extensions" in info:
                for ext in info["extensions"]:
                    self.ext_lang[ext].append(lang)

            if "filenames" in info:
                for filename in info["filenames"]:
                    self.filenames_lang[filename].append(lang)

    def _simplify(self):
        """Simplify languages assigned to file extensions

        For every extension present in EXT_TO_LANGUAGES, the list of
        languages extracted from 'languages.yml' is replaced by the override.
        """
        for ext, langs in EXT_TO_LANGUAGES.items():
            if ext in self.ext_primary:
                self.ext_primary[ext] = langs

            if ext in self.ext_lang:
                self.ext_lang[ext] = langs

    @staticmethod
    def _pick_language(file_path: str, candidates: list, collision_label: str) -> str:
        """Narrow down candidate languages and return the first one

        :param file_path: file path in the repository
        :param candidates: languages matched for this file so far
        :param collision_label: prefix of the warning logged when more than
            one candidate remains after applying languages_exceptions()
        :return: first of the remaining candidate languages
        """
        ret = languages_exceptions(file_path, candidates)
        # Debug to catch names / extensions with language collisions
        if len(ret) > 1:
            logger.warning(f"{collision_label} for '{file_path}': {ret}")

        return ret[0]

    def _path2lang(self, file_path: str) -> str:
        """Convert path of file in repository to programming language of file

        Lookup order: basename overrides (FILENAME_TO_LANGUAGES), basenames
        from 'languages.yml', extension overrides (EXT_TO_LANGUAGES), primary
        extensions, then all extensions.

        :param file_path: file path in the repository
        :return: name of the language, "/dev/null" for paths containing
            that string, or "unknown" if nothing matched
        """
        # TODO: consider switching from Path.stem to Path.name (basename)
        path = Path(file_path)
        filename, ext, basename = path.stem, path.suffix, path.name

        # NOTE: FILENAME_TO_LANGUAGES overrides what's from Linguist 'languages.yml';
        # checking it first gives the same precedence as merging both mappings,
        # without building a new dict on every call
        if basename in FILENAME_TO_LANGUAGES:
            return self._pick_language(
                file_path, FILENAME_TO_LANGUAGES[basename],
                "Filename collision in filenames_lang",
            )
        if basename in self.filenames_lang:
            return self._pick_language(
                file_path, self.filenames_lang[basename],
                "Filename collision in filenames_lang",
            )

        # NOTE: EXT_TO_LANGUAGES overrides what's from Linguist 'languages.yml'
        if ext in EXT_TO_LANGUAGES:
            return EXT_TO_LANGUAGES[ext][0]

        if ext in self.ext_primary:
            return self._pick_language(
                file_path, self.ext_primary[ext],
                "Extension collision in ext_primary",
            )

        if ext in self.ext_lang:
            return self._pick_language(
                file_path, self.ext_lang[ext],
                "Extension collision in ext_lang",
            )

        # TODO: move those exceptions to languages_exceptions()
        if "/dev/null" in file_path:
            return "/dev/null"

        # DEBUG information
        logger.warning(f"Unknown file type for '{file_path}' ({filename}{ext})")

        return "unknown"

    @staticmethod
    def _path2purpose(path: str, filetype: str) -> str:
        """Determine the purpose of a file from its path and file type

        :param path: file path in the repository
        :param filetype: type of the file's language, as in 'languages.yml'
            (data, programming, markup, prose), or "other"
        :return: "test", a purpose taken from PATTERN_TO_PURPOSE,
            "documentation", the file type itself, or "unknown"
        """
        # everything that has test in filename -> test
        # TODO: should it consider only basename?
        if "test" in path.lower():
            return "test"

        path_pure = PurePath(path)
        for pattern, purpose in PATTERN_TO_PURPOSE.items():
            if path_pure.match(pattern):
                return purpose

        # let's assume that prose (i.e. txt, markdown, rst, etc.) is documentation
        if "prose" in filetype:
            return "documentation"

        # limit filetype to selected set of file types
        # from languages.yml: Either data, programming, markup, prose, or nil
        if filetype in ["programming", "data", "markup", "other"]:
            return filetype

        # default unknown
        return "unknown"

    def annotate(self, path: str) -> dict:
        """Annotate file with its primary language metadata

        :param path: file path in the repository
        :return: metadata about language, file type, and purpose of file,
            as a dict with "language", "type" and "purpose" keys
        """
        language = self._path2lang(path)

        # languages missing from 'languages.yml' (e.g. "unknown"), or entries
        # without a "type" key, fall back to the "other" file type
        try:
            filetype = self.languages[language]["type"]
        except KeyError:
            filetype = "other"

        file_purpose = self._path2purpose(path, filetype)

        return {"language": language, "type": filetype, "purpose": file_purpose}
|
annotate(path)
Annotate file with its primary language metadata
:param path: file path in the repository
:return: metadata about language, file type, and purpose of file
Source code in src/diffannotator/languages.py
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
def annotate(self, path: str) -> dict:
    """Describe a file by its language, file type, and purpose

    :param path: file path in the repository
    :return: dict with "language", "type" and "purpose" keys
    """
    metadata = {"language": self._path2lang(path)}

    # no entry for the language, or no "type" in the entry -> "other"
    try:
        metadata["type"] = self.languages[metadata["language"]]["type"]
    except KeyError:
        metadata["type"] = "other"

    metadata["purpose"] = self._path2purpose(path, metadata["type"])
    return metadata
|
languages_exceptions(path, lang)
Handle exceptions in determining language of a file
:param path: file path in the repository
:param lang: file language determined so far
:return: list of languages; a single-element list when one of the exception rules applies, otherwise the given list unchanged
Source code in src/diffannotator/languages.py
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
def languages_exceptions(path: str, lang: list[str]) -> list[str]:
    """Resolve known ambiguities in the language detected for a file

    :param path: file path in the repository
    :param lang: candidate languages determined so far
    :return: single-element list when one of the rules below applies,
        otherwise `lang` itself, unchanged
    """
    lowered = path.lower()

    # path-dependent special cases come first
    if "Roff" in lang and "spark" in lowered:
        return ["Text"]
    if "Lex" in lang and "kconfig" in lowered:
        return ["Lex"]

    # these languages win whenever they are among the candidates,
    # checked in this order
    for preferred in ("HTML", "Roff", "M4"):
        if preferred in lang:
            return [preferred]

    # there are multiple entries for the '.spec' extension
    if path.startswith("rpm/") and path.endswith(".spec"):
        return ["RPM Spec"]

    return lang
|