chore: Centralize languages package as source-of-truth (#63292)

This patch does a few things:

- Adds `go-enry` packages to depguard, so that people do not
  accidentally use enry APIs instead of the corresponding APIs
  in the `languages` package.
- Adds more tests for different functions in the languages package
  to ensure mutual consistency in how language<->extension mappings
  are handled.
- Adds tests that flag special cases which become redundant after
  an enry upgrade.
- Adds comments with IDs so that related parts in the code can be
  pieced together easily.
This commit is contained in:
Varun Gandhi 2024-06-18 21:10:24 +08:00 committed by GitHub
parent fea35a2733
commit 3437f8253d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
20 changed files with 935 additions and 793 deletions

File diff suppressed because it is too large Load Diff

View File

@ -392,7 +392,6 @@ go_test(
"access_tokens_test.go",
"client_configuration_test.go",
"code_hosts_test.go",
"enry_test.go",
"event_log_test.go",
"event_logs_test.go",
"executor_secrets_test.go",
@ -541,7 +540,6 @@ go_test(
"@com_github_davecgh_go_spew//spew",
"@com_github_derision_test_go_mockgen_v2//testutil/assert",
"@com_github_derision_test_go_mockgen_v2//testutil/require",
"@com_github_go_enry_go_enry_v2//:go-enry",
"@com_github_golang_jwt_jwt_v4//:jwt",
"@com_github_google_go_cmp//cmp",
"@com_github_google_go_cmp//cmp/cmpopts",

View File

@ -1,22 +0,0 @@
package graphqlbackend
import (
"testing"
"github.com/go-enry/go-enry/v2"
"github.com/stretchr/testify/require"
)
var matlabFile string = `% matlab function to compute square of a value
function [out] = square(x)
out = x * x;
end
function [out] = fourthpower(x)
out = square(square(x));
end`
func TestEnryLangs(t *testing.T) {
langs := enry.GetLanguages("foo.m", []byte(matlabFile))
require.Equal(t, []string{"MATLAB"}, langs)
}

View File

@ -9,14 +9,13 @@ import (
"io"
"io/fs"
"github.com/go-enry/go-enry/v2"
"github.com/go-enry/go-enry/v2/data"
"github.com/go-enry/go-enry/v2" //nolint:depguard - FIXME: replace this usage of enry with languages package
"github.com/go-enry/go-enry/v2/data" //nolint:depguard - FIXME: replace this usage of enry with languages package
"go.opentelemetry.io/otel/attribute"
"github.com/sourcegraph/sourcegraph/internal/trace"
"github.com/sourcegraph/log"
"github.com/sourcegraph/sourcegraph/internal/trace"
"github.com/sourcegraph/sourcegraph/lib/codeintel/languages"
"github.com/sourcegraph/sourcegraph/lib/errors"
)

View File

@ -13,7 +13,7 @@ import (
"testing"
"time"
"github.com/go-enry/go-enry/v2"
"github.com/go-enry/go-enry/v2" //nolint:depguard - FIXME: replace this usage of enry with languages package
"github.com/sourcegraph/sourcegraph/lib/errors"
)

View File

@ -24,6 +24,9 @@ var Deny map[string]string = map[string]string{
"regexp$": "Use github.com/grafana/regexp instead",
"github.com/hexops/autogold$": "Use github.com/hexops/autogold/v2 instead",
"github.com/google/go-github/github$": "Use github.com/google/go-github/v55/github instead. To convert between v48 and v55, use the internal/extsvc/github/githubconvert package",
"github.com/go-enry/go-enry$": "Use github.com/sourcegraph/sourcegraph/lib/codeintel/languages instead. If some docs are not clear, please ask in #discuss-graph.",
"github.com/go-enry/go-enry/v2$": "Use github.com/sourcegraph/sourcegraph/lib/codeintel/languages instead. If some docs are not clear, please ask in #discuss-graph.",
"github.com/go-enry/go-enry/v2/data$": "Use github.com/sourcegraph/sourcegraph/lib/codeintel/languages instead. If some needed API is missing, please ask in #discuss-graph.",
}
func createAnalyzer() *analysis.Analyzer {

View File

@ -1066,8 +1066,12 @@ func (s *Service) SyntacticUsages(
// (Meaning we just need a single Searcher/Zoekt search)
searchSymbol := symbolsAtRange[0]
langs, langErr := languages.GetLanguages(path, func() ([]byte, error) { return nil, errors.New("Ambiguous language") })
if langErr != nil || len(langs) == 0 {
langs, _ := languages.GetLanguages(path, nil)
if len(langs) != 1 {
langErr := errors.New("Unknown language")
if len(langs) > 1 {
langErr = errors.New("Ambiguous language")
}
return nil, &SyntacticUsagesError{
Code: SU_FailedToSearch,
UnderlyingError: langErr,

View File

@ -273,12 +273,14 @@ func NewMetaEnvironment(r searchresult.Match, content string) *MetaEnvironment {
Content: string(m.Name),
}
case *searchresult.FileMatch:
// GetLanguages can return multiple matches for ambiguous languages. If there are multiple
// we will take the first one.
languages, _ := languages.GetLanguages(m.Path, nil)
var lang string
if len(languages) > 0 {
lang = languages[0]
// FIXME(id: language-detection-failure-handling):
// Handle failure in language detection as well as ambiguity
langs, _ := languages.GetLanguages(m.Path, func() ([]byte, error) {
return []byte(content), nil
})
lang := ""
if len(langs) > 0 {
lang = langs[0]
}
return &MetaEnvironment{
Repo: string(m.Repo.Name),
@ -298,12 +300,14 @@ func NewMetaEnvironment(r searchresult.Match, content string) *MetaEnvironment {
}
case *searchresult.CommitDiffMatch:
path := m.Path()
// GetLanguages can return multiple matches for ambiguous languages. If there are multiple
// we will take the first one.
languages, _ := languages.GetLanguages(path, nil)
var lang string
if len(languages) > 0 {
lang = languages[0]
// FIXME(id: language-detection-failure-handling):
// Handle failure in language detection as well as ambiguity
langs, _ := languages.GetLanguages(path, func() ([]byte, error) {
return []byte(content), nil
})
lang := ""
if len(langs) > 0 {
lang = langs[0]
}
return &MetaEnvironment{
Repo: string(m.Repo.Name),

View File

@ -722,7 +722,7 @@ func toTextPatternInfo(b query.Basic, resultTypes result.Types, feat *search.Fea
func toLangFilters(aliases []string) []string {
var filters []string
for _, alias := range aliases {
lang, _ := languages.GetLanguageByAlias(alias) // Invariant: lang is valid.
lang, _ := languages.GetLanguageByNameOrAlias(alias) // Invariant: lang is valid.
if !slices.Contains(filters, lang) {
filters = append(filters, lang)
}

View File

@ -4,7 +4,7 @@ import (
"sort"
"strings"
"github.com/go-enry/go-enry/v2/data"
"github.com/go-enry/go-enry/v2/data" //nolint:depguard - FIXME: Expose needed APIs in codeintel/languages
"github.com/grafana/regexp"
"github.com/sourcegraph/sourcegraph/lib/codeintel/languages"
@ -58,7 +58,7 @@ var filenamesFromLanguage = func() map[string][]string {
// LangToFileRegexp converts a lang: parameter to its corresponding file
// patterns for file filters. The lang value must be valid, cf. validate.go
func LangToFileRegexp(lang string) string {
lang, _ = languages.GetLanguageByAlias(lang) // Invariant: lang is valid.
lang, _ = languages.GetLanguageByNameOrAlias(lang) // Invariant: lang is valid.
extensions := languages.GetLanguageExtensions(lang)
patterns := make([]string, len(extensions))
for i, e := range extensions {

View File

@ -199,7 +199,7 @@ func validateField(field, value string, negated bool, seen map[string]struct{})
}
isLanguage := func() error {
_, ok := languages.GetLanguageByAlias(value)
_, ok := languages.GetLanguageByNameOrAlias(value)
if !ok {
return errors.Errorf("unknown language: %q", value)
}

View File

@ -19,8 +19,8 @@ go_library(
"//internal/search/query",
"//internal/search/repos",
"//internal/search/streaming",
"//lib/codeintel/languages",
"//lib/errors",
"@com_github_go_enry_go_enry_v2//:go-enry",
"@com_github_grafana_regexp//:regexp",
"@io_opentelemetry_go_otel//attribute",
"@org_gonum_v1_gonum//stat/combin",

View File

@ -6,9 +6,9 @@ import (
"regexp/syntax" //nolint:depguard // using the grafana fork of regexp clashes with zoekt, which uses the std regexp/syntax.
"strings"
"github.com/go-enry/go-enry/v2"
"github.com/grafana/regexp"
"github.com/sourcegraph/sourcegraph/internal/search/query"
"github.com/sourcegraph/sourcegraph/lib/codeintel/languages"
)
// rule represents a transformation function on a Basic query. Transformation
@ -442,7 +442,7 @@ func langPatterns(b query.Basic) *query.Basic {
var lang string // store the first pattern that matches a recognized language.
isNegated := false
newPattern := query.MapPattern(rawPatternTree, func(value string, negated bool, annotation query.Annotation) query.Node {
langAlias, ok := enry.GetLanguageByAlias(value)
langAlias, ok := languages.GetLanguageByNameOrAlias(value)
if !ok || changed {
return query.Pattern{
Value: value,

View File

@ -81,7 +81,7 @@ func QueryToZoektQuery(b query.Basic, resultTypes result.Types, feat *search.Fea
}
func toLangFilter(lang string) zoekt.Q {
lang, _ = languages.GetLanguageByAlias(lang) // Invariant: lang is valid.
lang, _ = languages.GetLanguageByNameOrAlias(lang) // Invariant: lang is valid.
return &zoekt.Language{Language: lang}
}

View File

@ -4,6 +4,7 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library")
go_library(
name = "languages",
srcs = [
"enry_vendored.go",
"extensions.go",
"languages.go",
],
@ -23,6 +24,7 @@ go_test(
tags = [TAG_PLATFORM_GRAPH],
deps = [
"@com_github_go_enry_go_enry_v2//:go-enry",
"@com_github_go_enry_go_enry_v2//data",
"@com_github_stretchr_testify//require",
"@net_pgregory_rapid//:rapid",
],

View File

@ -0,0 +1,15 @@
package languages
import "strings"
// This file contains private functions
// vendored from the go-enry codebase.
// convertToAliasKey normalizes a language name into the key format used
// for alias lookups. It is vendored from go-enry so that both code bases
// normalize strings in exactly the same way.
//
// Everything from the first comma onwards is dropped, every space becomes
// an underscore, and the result is lowercased.
func convertToAliasKey(langName string) string {
	// Only the part before the first comma takes part in the key.
	head, _, _ := strings.Cut(langName, `,`)
	return strings.ToLower(strings.ReplaceAll(head, ` `, `_`))
}

View File

@ -2,30 +2,59 @@ package languages
import (
"path/filepath"
"strings"
"slices"
"github.com/go-enry/go-enry/v2"
"github.com/go-enry/go-enry/v2" //nolint:depguard - Only this package can use enry
)
// getLanguagesByAlias is a replacement for enry.GetLanguagesByAlias
// It supports languages that are missing in go-enry
func GetLanguageByAlias(alias string) (lang string, ok bool) {
normalizedAlias := strings.ToLower(alias)
if lang, ok = unsupportedByEnryAliasMap[normalizedAlias]; ok {
// GetLanguageByNameOrAlias returns the standardized name for
// a language based on its name (in which case this is an identity operation)
// or based on its alias, which is potentially an alternate name for
// the language.
//
// Aliases are fully lowercase, and map many-to-one onto languages
// (several aliases may refer to the same language).
//
// For example,
//
// GetLanguageByNameOrAlias("ada") == "Ada", true
// GetLanguageByNameOrAlias("ada95") == "Ada", true
//
// Historical note: This function was added for replacing usages of
// enry.GetLanguageByAlias, which, unlike the name suggests, also
// handles non-normalized names such as those with spaces.
func GetLanguageByNameOrAlias(nameOrAlias string) (lang string, ok bool) {
alias := convertToAliasKey(nameOrAlias)
if lang, ok = unsupportedByEnryAliasMap[alias]; ok {
return lang, true
}
return enry.GetLanguageByAlias(normalizedAlias)
return enry.GetLanguageByAlias(alias)
}
// getLanguagesByExtension is a replacement for enry.GetLanguagesByExtension
// It supports languages that are missing in go-enry
func GetLanguageExtensions(alias string) []string {
if lang, ok := unsupportedByEnryNameToExtensionMap[alias]; ok {
// GetLanguageExtensions returns the list of file extensions for a given
// language. Returned extensions are always prefixed with a '.'.
//
// The returned slice will be empty iff the language is not known.
//
// Handles more languages than enry.GetLanguageExtensions.
//
// Mutually consistent with getLanguagesByExtension, see the tests
// for the exact invariants.
func GetLanguageExtensions(language string) []string {
// Languages that enry does not track are answered from our own table first.
if lang, ok := unsupportedByEnryNameToExtensionMap[language]; ok {
// NOTE(review): despite its name, `lang` is the map value here, which is
// presumably the file extension registered for `language` (the map is
// built by reversing the extension -> name table) — confirm.
return []string{lang}
}
return enry.GetLanguageExtensions(alias)
// nicheExtensionUsages holds, per language, the extensions that must not
// be reported for it.
ignoreExts, isNiche := nicheExtensionUsages[language]
// Force a copy to avoid accidentally modifying the global variable
enryExts := slices.Clone(enry.GetLanguageExtensions(language))
if !isNiche {
// NOTE(review): enryExts is already a fresh copy (made just above), so
// this second Clone looks redundant.
return slices.Clone(enryExts)
}
// Filter out the ignored extensions. DeleteFunc edits its argument in
// place, which is fine here because enryExts is our own copy.
return slices.DeleteFunc(enryExts, func(ext string) bool {
_, shouldIgnore := ignoreExts[ext]
return shouldIgnore
})
}
// getLanguagesByExtension is a replacement for enry.GetLanguagesByExtension
@ -85,23 +114,62 @@ var overrideAmbiguousExtensionsMap = map[string]string{
}
var unsupportedByEnryExtensionToNameMap = map[string]string{
// Pkl Configuration Language (https://pkl-lang.org/)
// NOTE: Add to linguist on 6/7/24
// can remove once go-enry package updates
// to that linguist version
".pkl": "Pkl",
// Magik Language
// See TODO(id: remove-pkl-special-case)
".pkl": "Pkl",
".magik": "Magik",
}
// nicheExtensionUsage keeps track of which (lang, extension) mappings
// should not be considered.
//
// We cannot wholesale ignore these languages, as this list includes
// languages like XML, but it can contain unusual extensions like '.tsx'
// which we generally want to classify as TypeScript.
var nicheExtensionUsages = func() map[string]map[string]struct{} {
niche := map[string]map[string]struct{}{}
considered := map[string]struct{}{}
for _, lang := range overrideAmbiguousExtensionsMap {
considered[lang] = struct{}{}
}
for ext := range overrideAmbiguousExtensionsMap {
langs := enry.GetLanguagesByExtension("foo"+ext, nil, nil)
for _, lang := range langs {
if _, found := considered[lang]; !found {
if m, hasMap := niche[lang]; hasMap {
m[ext] = struct{}{}
} else {
niche[lang] = map[string]struct{}{ext: {}}
}
}
}
}
for specialOverrideExt, lang := range unsupportedByEnryExtensionToNameMap {
considered[lang] = struct{}{}
langs := enry.GetLanguagesByExtension("foo"+specialOverrideExt, nil, nil)
for _, lang := range langs {
if _, found := considered[lang]; !found {
if m, hasMap := niche[lang]; hasMap {
m[specialOverrideExt] = struct{}{}
} else {
niche[lang] = map[string]struct{}{specialOverrideExt: {}}
}
}
}
}
return niche
}()
var unsupportedByEnryNameToExtensionMap = reverseMap(unsupportedByEnryExtensionToNameMap)
var unsupportedByEnryAliasMap = map[string]string{
// Pkl Configuration Language (https://pkl-lang.org/)
"pkl": "Pkl",
// Magik Language
"magik": "Magik",
}
// unsupportedByEnryAliasMap maps alias -> language name for languages
// not tracked by go-enry.
//
// It is derived from unsupportedByEnryExtensionToNameMap: each language
// name found there is registered under its convertToAliasKey form.
var unsupportedByEnryAliasMap = func() map[string]string {
	aliases := make(map[string]string, len(unsupportedByEnryExtensionToNameMap))
	for _, name := range unsupportedByEnryExtensionToNameMap {
		aliases[convertToAliasKey(name)] = name
	}
	return aliases
}()
func reverseMap(m map[string]string) map[string]string {
n := make(map[string]string, len(m))

View File

@ -1,9 +1,12 @@
package languages
import (
"slices"
"strings"
"testing"
"github.com/go-enry/go-enry/v2"
"github.com/go-enry/go-enry/v2" //nolint:depguard - This package is allowed to use enry
enrydata "github.com/go-enry/go-enry/v2/data" //nolint:depguard - This package is allowed to use enry
"github.com/stretchr/testify/require"
)
@ -42,7 +45,7 @@ var nonAmbiguousExtensionsCheck = map[string]string{
func TestGetLanguageByAlias_UnsupportedLanguages(t *testing.T) {
for alias, name := range unsupportedByEnryAliasMap {
resName, _ := GetLanguageByAlias(alias)
resName, _ := GetLanguageByNameOrAlias(alias)
require.Equal(t, name, resName,
"maybe a typo in `unsupportedByEnryAliasMap`?")
}
@ -50,7 +53,7 @@ func TestGetLanguageByAlias_UnsupportedLanguages(t *testing.T) {
func TestGetLanguageByAlias_NonAmbiguousLanguages(t *testing.T) {
for _, language := range nonAmbiguousExtensionsCheck {
_, ok := GetLanguageByAlias(language)
_, ok := GetLanguageByNameOrAlias(language)
require.True(t, ok,
"unable to find language %s in go-enry", language)
}
@ -113,3 +116,68 @@ func TestGetLanguagesByExtension_BinaryExtensions(t *testing.T) {
filename)
}
}
// TestExtensionsConsistency checks, for every extension listed in
// overrideAmbiguousExtensionsMap, that:
//
//   - enry really considers the extension ambiguous and knows the override
//     language for it,
//   - getLanguagesByExtension resolves the extension to exactly one language,
//   - every other language enry lists for the extension has a non-empty
//     entry in nicheExtensionUsages, and GetLanguageExtensions does not
//     report the ignored extensions for that language.
func TestExtensionsConsistency(t *testing.T) {
	for ext, overrideLang := range overrideAmbiguousExtensionsMap {
		path := "foo" + ext
		// Work on a copy: the slice handed out by enry may alias its internal
		// tables, and slices.DeleteFunc below edits its argument in place.
		enryLangsForExt := slices.Clone(enry.GetLanguagesByExtension(path, nil, nil))
		require.Containsf(t, enryLangsForExt, overrideLang, "overrideAmbiguousExtensionsMap maps extension %q to language %q but "+
			"that mapping is not present in enry's list %v", ext, overrideLang, enryLangsForExt)
		require.Greaterf(t, len(enryLangsForExt), 1, "overrideAmbiguousExtensionsMap states that "+
			"%q extension is ambiguous, but only found langs: %v", ext, enryLangsForExt)

		candidates, isLikelyBinary := getLanguagesByExtension(path)
		require.False(t, isLikelyBinary, "ambiguous files are all source code")
		require.Len(t, candidates, 1, "getLanguagesByExtension should respect overrideAmbiguousExtensionsMap")

		// Everything enry lists besides the override language must be
		// treated as a niche usage of this extension.
		shouldBeIgnoredLangsForExt := slices.DeleteFunc(enryLangsForExt, func(s string) bool {
			return s == overrideLang
		})
		for _, shouldBeIgnoredLang := range shouldBeIgnoredLangsForExt {
			ignoredExts, found := nicheExtensionUsages[shouldBeIgnoredLang]
			require.Truef(t, found, "expected lang: %q to have an entry in nicheExtensionUsages for consistency with GetLanguagesByExtension", shouldBeIgnoredLang)
			require.Truef(t, len(ignoredExts) >= 1, "sets in nicheExtensionUsages must be non-empty")
			nonNicheExts := GetLanguageExtensions(shouldBeIgnoredLang)
			for ignoredExt := range ignoredExts {
				require.Falsef(t, slices.Contains(nonNicheExts, ignoredExt),
					"GetLanguageExtensions should not return %q for lang %q for consistency with GetLanguagesByExtension",
					ignoredExt, shouldBeIgnoredLang)
			}
		}
	}
}
// TestExtensionsConsistency2 checks the language -> extension -> language
// round trip: for every language in enry's table, each extension reported
// by GetLanguageExtensions must map back to that language through
// getLanguagesByExtension, unless the extension is classified as likely
// binary.
func TestExtensionsConsistency2(t *testing.T) {
	for lang := range enrydata.ExtensionsByLanguage {
		for _, ext := range GetLanguageExtensions(lang) {
			// Ignore unusual edge cases like .coffee.md for Literate CoffeeScript
			if strings.Count(ext, ".") > 1 {
				continue
			}
			langsByExt, isLikelyBinary := getLanguagesByExtension("foo" + ext)
			if isLikelyBinary {
				continue
			}
			require.Truef(t, slices.Contains(langsByExt, lang),
				"expected getLanguagesByExtension result %v to contain %q (extension: %q)", langsByExt, lang, ext)
		}
	}
}
// TODO(id: remove-pkl-special-case) Linguist v7.30.0 adds support for Pkl,
// so when we upgrade to a matching go-enry version, we can remove special
// cases for Pkl.
//
// TestUnsupportedByEnry fails as soon as enry's language table contains a
// language that is still special-cased in one of the unsupportedByEnry*
// maps, so that stale special cases are noticed after an enry upgrade.
func TestUnsupportedByEnry(t *testing.T) {
	// The *f variants are required so that %q is actually filled in with
	// the offending language name.
	for lang := range unsupportedByEnryNameToExtensionMap {
		_, found := enrydata.ExtensionsByLanguage[lang]
		require.Falsef(t, found, "looks like language %q is supported by enry; remove it from unsupportedByEnryNameToExtensionMap", lang)
	}
	for _, lang := range unsupportedByEnryAliasMap {
		_, found := enrydata.ExtensionsByLanguage[lang]
		require.Falsef(t, found, "looks like language %q is supported by enry; remove it from unsupportedByEnryAliasMap", lang)
	}
	for _, lang := range unsupportedByEnryExtensionToNameMap {
		_, found := enrydata.ExtensionsByLanguage[lang]
		require.Falsef(t, found, "looks like language %q is supported by enry; remove it from unsupportedByEnryExtensionToNameMap", lang)
	}
}

View File

@ -4,7 +4,7 @@ import (
"slices"
"strings"
"github.com/go-enry/go-enry/v2"
"github.com/go-enry/go-enry/v2" //nolint:depguard - This package is allowed to use enry.
)
// Make sure all names are lowercase here, since they are normalized
@ -62,52 +62,57 @@ func GetMostLikelyLanguage(path, contents string) (lang string, found bool) {
// for simple `.h` files with just comments and macros, they may
// be valid C, C++ or any of their derivative languages (e.g. Objective-C).
func GetLanguages(path string, getContent func() ([]byte, error)) ([]string, error) {
langs := enry.GetLanguagesByFilename(path, nil, nil)
if len(langs) == 1 {
return langs, nil
}
newLangs, isLikelyBinaryFile := getLanguagesByExtension(path)
if isLikelyBinaryFile {
return nil, nil
}
switch len(newLangs) {
case 0:
break
case 1:
return newLangs, nil
default:
langs = newLangs
}
if getContent == nil {
return langs, nil
}
content, err := getContent()
if err != nil {
return nil, err
}
if len(content) == 0 {
return langs, nil
}
if enry.IsBinary(content) {
return nil, nil
}
// enry doesn't expose a way to call GetLanguages with a specific set of
// strategies, so just hand-roll that code here.
var languages = langs
for _, strategy := range []enry.Strategy{enry.GetLanguagesByModeline, getLanguagesByShebang, enry.GetLanguagesByContent, enry.GetLanguagesByClassifier} {
candidates := strategy(path, content, languages)
switch len(candidates) {
case 0:
continue
case 1:
return candidates, nil
default:
languages = candidates
impl := func() ([]string, error) {
langs := enry.GetLanguagesByFilename(path, nil, nil)
if len(langs) == 1 {
return langs, nil
}
newLangs, isLikelyBinaryFile := getLanguagesByExtension(path)
if isLikelyBinaryFile {
return nil, nil
}
switch len(newLangs) {
case 0:
break
case 1:
return newLangs, nil
default:
langs = newLangs
}
if getContent == nil {
return langs, nil
}
content, err := getContent()
if err != nil {
return nil, err
}
if len(content) == 0 {
return langs, nil
}
if enry.IsBinary(content) {
return nil, nil
}
// enry doesn't expose a way to call GetLanguages with a specific set of
// strategies, so just hand-roll that code here.
var languages = langs
for _, strategy := range []enry.Strategy{enry.GetLanguagesByModeline, getLanguagesByShebang, enry.GetLanguagesByContent, enry.GetLanguagesByClassifier} {
candidates := strategy(path, content, languages)
switch len(candidates) {
case 0:
continue
case 1:
return candidates, nil
default:
languages = candidates
}
}
return languages, nil
}
return languages, nil
langs, err := impl()
return slices.Clone(langs), err
}
// getLanguagesByShebang is a replacement for enry.GetLanguagesByShebang.
@ -125,5 +130,5 @@ func getLanguagesByShebang(path string, content []byte, candidates []string) []s
return []string{"Raku"}
}
}
return languages
return slices.Clone(languages)
}

View File

@ -3,7 +3,7 @@ package languages
import (
"testing"
"github.com/go-enry/go-enry/v2"
"github.com/go-enry/go-enry/v2" //nolint:depguard - This package is allowed to use enry
"github.com/stretchr/testify/require"
"pgregory.net/rapid"
)