codeintel: Refine language detection for extensions (#59318)

This should reduce file fetching for binary files, as well
as for certain common extensions such as .md, .yaml and .ts.
This commit is contained in:
Varun Gandhi 2024-01-04 23:52:54 +08:00 committed by GitHub
parent de4e5b5496
commit 900dfc3ffa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 417 additions and 4 deletions

View File

@ -227,6 +227,7 @@ export const FILE_ICONS_BY_LANGUAGE: Map<string, UnifiedIcon> = new Map([
['SVG', { react: { icon: SiSvg, className: styles.yellow }, svg: mdiSvg }],
['Swift', { react: { icon: SiSwift, className: styles.blue }, svg: mdiLanguageSwift }],
['Terraform', { react: { icon: SiTerraform, className: styles.blue } }],
['TSX', { react: { icon: SiTypescript, className: styles.blue }, svg: mdiLanguageTypescript }],
['TypeScript', { react: { icon: SiTypescript, className: styles.blue }, svg: mdiLanguageTypescript }],
['Text', { react: { icon: CiTextAlignLeft, className: styles.defaultIcon }, svg: mdiText }],

View File

@ -3,7 +3,10 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library")
go_library(
name = "languages",
srcs = ["languages.go"],
srcs = [
"extensions.go",
"languages.go",
],
importpath = "github.com/sourcegraph/sourcegraph/lib/codeintel/languages",
visibility = ["//visibility:public"],
deps = ["@com_github_go_enry_go_enry_v2//:go-enry"],
@ -11,7 +14,13 @@ go_library(
go_test(
name = "languages_test",
srcs = ["languages_test.go"],
srcs = [
"extensions_test.go",
"languages_test.go",
],
embed = [":languages"],
deps = ["@com_github_stretchr_testify//require"],
deps = [
"@com_github_go_enry_go_enry_v2//:go-enry",
"@com_github_stretchr_testify//require",
],
)

View File

@ -0,0 +1,326 @@
package languages
import (
	"path/filepath"
	"strings"

	"github.com/go-enry/go-enry/v2"
)
// getLanguagesByExtension is a replacement for enry.GetLanguagesByExtension
// to work around the following limitations:
//   - For some extensions which are overwhelmingly used by a certain file type
//     in practice, such as '.ts', '.md' and '.yaml', it returns ambiguous results.
//   - It does not provide any information about binary files.
//
// The extension is taken from the final '.' of the last path element.
// Matching against the binary-extension set and the override map is
// case-insensitive, so 'foo.PNG' is treated like 'foo.png' and 'Foo.TS'
// like 'Foo.ts'.
//
// Returns:
//   - candidates: the possible languages for path; nil if the path has no
//     extension or is likely a binary file.
//   - isLikelyBinaryFile: true if the extension is a well-known binary
//     extension, in which case candidates is nil.
func getLanguagesByExtension(path string) (candidates []string, isLikelyBinaryFile bool) {
	ext := filepath.Ext(path)
	if ext == "" {
		return nil, false
	}
	// ext always starts with '.', and so does its lower-cased form, so
	// slicing off the first byte is safe for both.
	lowerExt := strings.ToLower(ext)
	// Check the extension as written first: the binary list contains the
	// mixed-case entry "DS_Store", which a lower-cased lookup would miss.
	if _, ok := commonBinaryFileExtensions[ext[1:]]; ok {
		return nil, true
	}
	if _, ok := commonBinaryFileExtensions[lowerExt[1:]]; ok {
		return nil, true
	}
	// All override keys are lower-case, so a single lower-cased lookup
	// covers both 'foo.ts' and 'foo.TS'.
	if lang, ok := overrideAmbiguousExtensionsMap[lowerExt]; ok {
		return []string{lang}, false
	}
	return enry.GetLanguagesByExtension(path, nil, nil), false
}
// commonBinaryFileExtensions is the set form of commonBinaryFileExtensionsList,
// built once at package initialization so that membership checks are a
// single map lookup. Keys do not carry a leading '.'.
var commonBinaryFileExtensions = func() map[string]struct{} {
	set := make(map[string]struct{}, len(commonBinaryFileExtensionsList))
	for i := range commonBinaryFileExtensionsList {
		set[commonBinaryFileExtensionsList[i]] = struct{}{}
	}
	return set
}()
// overrideAmbiguousExtensionsMap maps a file extension (including the
// leading '.') to the single language that getLanguagesByExtension reports
// for it, instead of deferring to enry. Each entry is for an extension that
// enry considers ambiguous but that is overwhelmingly used by one language
// in practice; the comment above each entry names the alternatives that are
// deliberately ignored.
var overrideAmbiguousExtensionsMap = map[string]string{
// Ignoring the uncommon usage of '.cs' for Smalltalk.
".cs": "C#",
// The other languages are Filterscript, Forth, GLSL. Out of that,
// Forth and GLSL commonly use other extensions. Ignore Filterscript
// as it is niche.
".fs": "F#",
// Not considering "GCC Machine Description".
".md": "Markdown",
// The other main language using '.rs' is RenderScript, but that's deprecated.
// See https://developer.android.com/guide/topics/renderscript/compute
".rs": "Rust",
// In i18n contexts, there are XML files with '.ts' and '.tsx' extensions,
// but we ignore those for now to avoid penalizing the common case.
".tsx": "TSX",
".ts": "TypeScript",
// Ignoring "Adblock Filter List" and "Vim Help File".
".txt": "Text",
// Ignoring other variants of YAML.
".yaml": "YAML",
// ".yml" is not included here in parallel to ".yaml"
// as it is the first extension for 'YAML' and not the first
// for other variants of YAML, hence only 'YAML' is picked by enry.
}
// commonBinaryFileExtensionsList enumerates file extensions that commonly
// denote binary files. It is converted into the commonBinaryFileExtensions
// set, which getLanguagesByExtension consults before asking enry.
//
// Source: https://github.com/sindresorhus/binary-extensions/blob/main/binary-extensions.json
// License: https://github.com/sindresorhus/binary-extensions/blob/main/license
//
// To refresh the list, replace the contents with the output of:
//
//	curl -L https://raw.githubusercontent.com/sindresorhus/binary-extensions/main/binary-extensions.json | jq '.[]' | awk '{print $1 ","}'
//
// Not adding a leading '.' here to make it easier to update/compare the list.
var commonBinaryFileExtensionsList = []string{
"3dm",
"3ds",
"3g2",
"3gp",
"7z",
"a",
"aac",
"adp",
"ai",
"aif",
"aiff",
"alz",
"ape",
"apk",
"appimage",
"ar",
"arj",
"asf",
"au",
"avi",
"bak",
"baml",
"bh",
"bin",
"bk",
"bmp",
"btif",
"bz2",
"bzip2",
"cab",
"caf",
"cgm",
"class",
"cmx",
"cpio",
"cr2",
"cur",
"dat",
"dcm",
"deb",
"dex",
"djvu",
"dll",
"dmg",
"dng",
"doc",
"docm",
"docx",
"dot",
"dotm",
"dra",
"DS_Store",
"dsk",
"dts",
"dtshd",
"dvb",
"dwg",
"dxf",
"ecelp4800",
"ecelp7470",
"ecelp9600",
"egg",
"eol",
"eot",
"epub",
"exe",
"f4v",
"fbs",
"fh",
"fla",
"flac",
"flatpak",
"fli",
"flv",
"fpx",
"fst",
"fvt",
"g3",
"gh",
"gif",
"graffle",
"gz",
"gzip",
"h261",
"h263",
"h264",
"icns",
"ico",
"ief",
"img",
"ipa",
"iso",
"jar",
"jpeg",
"jpg",
"jpgv",
"jpm",
"jxr",
"key",
"ktx",
"lha",
"lib",
"lvp",
"lz",
"lzh",
"lzma",
"lzo",
"m3u",
"m4a",
"m4v",
"mar",
"mdi",
"mht",
"mid",
"midi",
"mj2",
"mka",
"mkv",
"mmr",
"mng",
"mobi",
"mov",
"movie",
"mp3",
"mp4",
"mp4a",
"mpeg",
"mpg",
"mpga",
"mxu",
"nef",
"npx",
"numbers",
"nupkg",
"o",
"odp",
"ods",
"odt",
"oga",
"ogg",
"ogv",
"otf",
"ott",
"pages",
"pbm",
"pcx",
"pdb",
"pdf",
"pea",
"pgm",
"pic",
"png",
"pnm",
"pot",
"potm",
"potx",
"ppa",
"ppam",
"ppm",
"pps",
"ppsm",
"ppsx",
"ppt",
"pptm",
"pptx",
"psd",
"pya",
"pyc",
"pyo",
"pyv",
"qt",
"rar",
"ras",
"raw",
"resources",
"rgb",
"rip",
"rlc",
"rmf",
"rmvb",
"rpm",
"rtf",
"rz",
"s3m",
"s7z",
"scpt",
"sgi",
"shar",
"snap",
"sil",
"sketch",
"slk",
"smv",
"snk",
"so",
"stl",
"suo",
"sub",
"swf",
"tar",
"tbz",
"tbz2",
"tga",
"tgz",
"thmx",
"tif",
"tiff",
"tlz",
"ttc",
"ttf",
"txz",
"udf",
"uvh",
"uvi",
"uvm",
"uvp",
"uvs",
"uvu",
"viv",
"vob",
"war",
"wav",
"wax",
"wbmp",
"wdp",
"weba",
"webm",
"webp",
"whl",
"wim",
"wm",
"wma",
"wmv",
"wmx",
"woff",
"woff2",
"wrm",
"wvx",
"xbm",
"xif",
"xla",
"xlam",
"xls",
"xlsb",
"xlsm",
"xlsx",
"xlt",
"xltm",
"xltx",
"xm",
"xmind",
"xpi",
"xpm",
"xwd",
"xz",
"z",
"zip",
"zipx",
}

View File

@ -0,0 +1,71 @@
package languages
import (
"testing"
"github.com/go-enry/go-enry/v2"
"github.com/stretchr/testify/require"
)
// TestOverrideExtensions checks that every entry in
// overrideAmbiguousExtensionsMap is still justified: enry must list the
// chosen language among its candidates for the extension, and enry must
// still consider the extension ambiguous (more than one candidate).
//
// Each extension runs as its own subtest. Map iteration order is random and
// require aborts the current test on the first failure, so without subtests
// a failing run would report only one arbitrary extension.
func TestOverrideExtensions(t *testing.T) {
	for ext, language := range overrideAmbiguousExtensionsMap {
		t.Run(ext, func(t *testing.T) {
			filename := "foo" + ext
			enryLangs := enry.GetLanguagesByExtension(filename, nil, nil)
			require.Contains(t, enryLangs, language,
				"maybe a typo in `overrideAmbiguousExtensionsMap`?")
			require.Greaterf(t, len(enryLangs), 1,
				"extension %v is not ambiguous according to enry, remove it from `overrideAmbiguousExtensionsMap`",
				ext)
		})
	}
}
// TestNonAmbiguousExtensions pins the extensions for which
// getLanguagesByExtension must return exactly one language and must not
// flag the file as binary.
//
// Each extension runs as its own subtest so that a failure names the
// offending extension and does not hide failures for the other entries
// (map iteration order is random and require stops at the first failure).
func TestNonAmbiguousExtensions(t *testing.T) {
	// Languages/extensions that we don't want to regress
	nonAmbiguousExtensionsCheck := map[string]string{
		".js": "JavaScript",
		// Linguist removed JSX (but not TSX) as a separate language:
		// https://github.com/github-linguist/linguist/pull/5133
		".jsx":   "JavaScript",
		".ts":    "TypeScript",
		".tsx":   "TSX",
		".py":    "Python",
		".rb":    "Ruby",
		".go":    "Go",
		".java":  "Java",
		".kt":    "Kotlin",
		".scala": "Scala",
		".cs":    "C#",
		".fs":    "F#",
		".rs":    "Rust",
		".c":     "C",
		".cpp":   "C++",
		".cxx":   "C++",
		".hpp":   "C++",
		".hxx":   "C++",
		".lua":   "Lua",
		".dart":  "Dart",
		".swift": "Swift",
		".css":   "CSS",
		".json":  "JSON",
		".yml":   "YAML",
		".xml":   "XML",
	}
	for ext, language := range nonAmbiguousExtensionsCheck {
		t.Run(ext, func(t *testing.T) {
			filename := "foo" + ext
			languages, isLikelyBinaryFile := getLanguagesByExtension(filename)
			require.False(t, isLikelyBinaryFile)
			require.Equal(t, []string{language}, languages,
				"If this test fails when updating enry, maybe `overrideAmbiguousExtensionsMap` needs updating")
		})
	}
}
// TestBinaryExtensions checks that a few well-known binary extensions are
// reported as likely-binary by getLanguagesByExtension. Each extension runs
// as its own subtest so a failure is attributed to the specific extension.
func TestBinaryExtensions(t *testing.T) {
	for _, ext := range []string{".png", ".jpg", ".gif"} {
		t.Run(ext, func(t *testing.T) {
			filename := "foo" + ext
			_, isLikelyBinary := getLanguagesByExtension(filename)
			// The original message concatenated two literals without a
			// separating space ("binary;bug in ..."); fixed here.
			require.Truef(t, isLikelyBinary, "filename: %v was not guessed to be binary; "+
				"bug in extension matching logic in getLanguagesByExtension maybe?",
				filename)
		})
	}
}

View File

@ -51,12 +51,18 @@ func GetMostLikelyLanguage(path, contents string) (lang string, found bool) {
// the content.
//
// Only returns an error if getContent returns an error.
//
// getContent is not called if the file is likely to be a binary file,
// as enry only covers programming languages.
func GetLanguages(path string, getContent func() ([]byte, error)) ([]string, error) {
langs := enry.GetLanguagesByFilename(path, nil, nil)
if len(langs) == 1 {
return langs, nil
}
newLangs := enry.GetLanguagesByExtension(path, nil, langs)
newLangs, isLikelyBinaryFile := getLanguagesByExtension(path)
if isLikelyBinaryFile {
return nil, nil
}
switch len(newLangs) {
case 0:
break