fix: code insights - language stats - prevent data race condition and drop lock file exclusion

Co-authored-by: Michael Bahr <michael.bahr@sourcegraph.com>
This commit is contained in:
Camden Cheek 2024-04-25 02:32:45 -06:00 committed by GitHub
parent 7d80a583c7
commit 53c2e6d955
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 16 additions and 70 deletions

View File

@ -15,7 +15,6 @@ go_library(
"//lib/errors",
"@com_github_go_enry_go_enry_v2//:go-enry",
"@com_github_go_enry_go_enry_v2//data",
"@com_github_grafana_regexp//:regexp",
"@com_github_sourcegraph_conc//iter",
"@com_github_sourcegraph_log//:log",
"@io_opentelemetry_go_otel//attribute",
@ -29,7 +28,6 @@ go_test(
"inventory_test.go",
],
embed = [":inventory"],
race = "off",
deps = [
"//internal/fileutil",
"//lib/errors",

View File

@ -8,23 +8,15 @@ import (
"sort"
)
// fileReadBufferSize is the size of the buffer we'll use while reading file contents
const fileReadBufferSize = 16 * 1024
// Entries computes the inventory of languages for the given entries. It traverses trees recursively
// and caches results for each subtree. Results for listed files are cached.
//
// If a file is referenced more than once (e.g., because it is a descendent of a subtree and it is
// passed directly), it will be double-counted in the result.
func (c *Context) Entries(ctx context.Context, entries ...fs.FileInfo) (inv Inventory, err error) {
buf := make([]byte, fileReadBufferSize)
return c.entries(ctx, entries, buf)
}
func (c *Context) entries(ctx context.Context, entries []fs.FileInfo, buf []byte) (Inventory, error) {
func (c *Context) Entries(ctx context.Context, entries ...fs.FileInfo) (Inventory, error) {
invs := make([]Inventory, len(entries))
for i, entry := range entries {
var f func(context.Context, fs.FileInfo, []byte) (Inventory, error)
var f func(context.Context, fs.FileInfo) (Inventory, error)
switch {
case entry.Mode().IsRegular():
f = c.file
@ -36,7 +28,7 @@ func (c *Context) entries(ctx context.Context, entries []fs.FileInfo, buf []byte
}
var err error
invs[i], err = f(ctx, entry, buf)
invs[i], err = f(ctx, entry)
if err != nil {
return Inventory{}, err
}
@ -45,7 +37,7 @@ func (c *Context) entries(ctx context.Context, entries []fs.FileInfo, buf []byte
return Sum(invs), nil
}
func (c *Context) tree(ctx context.Context, tree fs.FileInfo, buf []byte) (inv Inventory, err error) {
func (c *Context) tree(ctx context.Context, tree fs.FileInfo) (inv Inventory, err error) {
// Get and set from the cache.
if c.CacheGet != nil {
if inv, ok := c.CacheGet(ctx, tree); ok {
@ -72,10 +64,10 @@ func (c *Context) tree(ctx context.Context, tree fs.FileInfo, buf []byte) (inv I
// Don't individually cache files that we found during tree traversal. The hit rate for
// those cache entries is likely to be much lower than cache entries for files whose
// inventory was directly requested.
lang, err := getLang(ctx, e, buf, c.NewFileReader)
lang, err := getLang(ctx, e, c.NewFileReader)
return Inventory{Languages: []Lang{lang}}, err
case e.Mode().IsDir(): // subtree
subtreeInv, err := c.tree(ctx, e, buf)
subtreeInv, err := c.tree(ctx, e)
return subtreeInv, err
default:
// Skip symlinks, submodules, etc.
@ -91,7 +83,7 @@ func (c *Context) tree(ctx context.Context, tree fs.FileInfo, buf []byte) (inv I
}
// file computes the inventory of a single file. It caches the result.
func (c *Context) file(ctx context.Context, file fs.FileInfo, buf []byte) (inv Inventory, err error) {
func (c *Context) file(ctx context.Context, file fs.FileInfo) (inv Inventory, err error) {
// Get and set from the cache.
if c.CacheGet != nil {
if inv, ok := c.CacheGet(ctx, file); ok {
@ -106,7 +98,7 @@ func (c *Context) file(ctx context.Context, file fs.FileInfo, buf []byte) (inv I
}()
}
lang, err := getLang(ctx, file, buf, c.NewFileReader)
lang, err := getLang(ctx, file, c.NewFileReader)
if err != nil {
return Inventory{}, errors.Wrapf(err, "inventory file %q", file.Name())
}

View File

@ -8,7 +8,6 @@ import (
"fmt"
"github.com/go-enry/go-enry/v2"
"github.com/go-enry/go-enry/v2/data"
"github.com/grafana/regexp"
"github.com/sourcegraph/sourcegraph/internal/trace"
"go.opentelemetry.io/otel/attribute"
"io"
@ -19,6 +18,9 @@ import (
"github.com/sourcegraph/sourcegraph/lib/errors"
)
// fileReadBufferSize is the size of the buffer we'll use while reading file contents
const fileReadBufferSize = 4 * 1024
// Inventory summarizes a tree's contents (e.g., which programming
// languages are used).
type Inventory struct {
@ -41,35 +43,13 @@ type Lang struct {
var newLine = []byte{'\n'}
func isExcluded(name string) (bool, error) {
// Exclude lock files by default. We can later make the patterns configurable.
excludedFileNamePatterns := []string{".*\\.lock"}
for _, pattern := range excludedFileNamePatterns {
matched, err := regexp.MatchString(pattern, name)
if err != nil {
return false, err
}
if matched {
return matched, nil
}
}
return false, nil
}
func getLang(ctx context.Context, file fs.FileInfo, buf []byte, getFileReader func(ctx context.Context, path string) (io.ReadCloser, error)) (Lang, error) {
func getLang(ctx context.Context, file fs.FileInfo, getFileReader func(ctx context.Context, path string) (io.ReadCloser, error)) (Lang, error) {
if file == nil {
return Lang{}, nil
}
if !file.Mode().IsRegular() || enry.IsVendor(file.Name()) {
return Lang{}, nil
}
fileExcluded, err := isExcluded(file.Name())
if err != nil {
return Lang{}, errors.Wrap(err, "Failed to evaluate regex of excluded files.")
}
if fileExcluded {
return Lang{}, nil
}
trc, ctx := trace.New(ctx, "getLang")
defer trc.End()
@ -99,6 +79,8 @@ func getLang(ctx context.Context, file fs.FileInfo, buf []byte, getFileReader fu
return lang, nil
}
buf := make([]byte, fileReadBufferSize)
if !safe {
// Detect language from content
n, err := io.ReadFull(rc, buf)

View File

@ -65,7 +65,6 @@ func TestGetLang_language(t *testing.T) {
t.Run(label, func(t *testing.T) {
lang, err := getLang(context.Background(),
test.file,
make([]byte, fileReadBufferSize),
makeFileReader(test.file.Contents))
if err != nil {
t.Fatal(err)
@ -130,7 +129,7 @@ func TestGet_readFile(t *testing.T) {
for _, test := range tests {
t.Run(test.file.Name(), func(t *testing.T) {
fr := makeFileReader(test.file.(fi).Contents)
lang, err := getLang(context.Background(), test.file, make([]byte, fileReadBufferSize), fr)
lang, err := getLang(context.Background(), test.file, fr)
if err != nil {
t.Fatal(err)
}
@ -160,12 +159,11 @@ func BenchmarkGetLang(b *testing.B) {
b.Fatal(err)
}
fr := newFileReader(files)
buf := make([]byte, fileReadBufferSize)
b.Logf("Calling Get on %d files.", len(files))
b.ResetTimer()
for range b.N {
for _, file := range files {
_, err = getLang(context.Background(), file, buf, fr)
_, err = getLang(context.Background(), file, fr)
if err != nil {
b.Fatal(err)
}
@ -256,27 +254,3 @@ export function baz(n) {
return ""
}
}
func TestIsExcluded(t *testing.T) {
t.Run("should exclude lock file", func(t *testing.T) {
excluded, err := isExcluded("yarn.lock")
if err != nil {
t.Fatal(err)
}
if !excluded {
t.Errorf("expected file to be excluded")
}
})
t.Run("should not exclude non-lock file", func(t *testing.T) {
excluded, err := isExcluded("hello-world.md")
if err != nil {
t.Fatal(err)
}
if excluded {
t.Errorf("expected file to not be excluded")
}
})
}