feat(code insights): language stats speed improvements by using archive loading (#62946)

We previously improved the performance of Language Stats Insights by
introducing parallel requests to gitserver:
https://github.com/sourcegraph/sourcegraph/pull/62011

This PR replaces the previous approach where we would iterate through
and request each file from gitserver with an approach where we request
just one archive. This eliminates a lot of network traffic, and gives us
an additional(!) performance improvement of 70-90%.

Even repositories like chromium (42 GB) can now be processed — on my
machine, in just one minute.

---

Caching: We dropped most of the caching, and kept only the top-level
caching (repo@commit). This means that we only need to compute the
language stats once per commit, and subsequent users/requests can see
the cached data. We dropped the file/directory level caching, because
(1) the code to do that got very complex and (2) we can assume that most
repositories are able to compute within the 5-minute timeout (which can
be increased via the environment variable `GET_INVENTORY_TIMEOUT`). The
timeout is not bound to the user's request anymore. Before, the frontend
would request the stats up to three times to let the computation move
forward and pick up where the last request aborted. While we still have
this frontend retry mechanism, we don't have to worry about an
abort-and-continue mechanism in the backend.

---

Credits for the code to @eseliger:
https://github.com/sourcegraph/sourcegraph/issues/62019#issuecomment-2119278481

I've taken the diff, and updated the caching methods to allow for more
advanced use cases should we decide to introduce more caching. We can
take that out again if the current caching is sufficient.

Todos:

- [x] Check if CI passes, manual testing seems to be fine
- [x] Verify that insights are cached at the top level

---

Test data:

- sourcegraph/sourcegraph: 9.07s (main) -> 1.44s (current): 74% better
- facebook/react: 17.52s (main) -> 0.87s (current): 95% better
- godotengine/godot: 28.92s (main) -> 1.98s (current): 93% better
- chromium/chromium: ~1 minute: 100% better, because it didn't complete
before

## Changelog

- Language stats queries now request one archive from gitserver instead
of individual file requests. This leads to a huge performance
improvement. Even extra large repositories like chromium are now able to
compute within one minute. Previously they timed out.

## Test plan

- New unit tests
- Plenty of manual testing
This commit is contained in:
Michael Bahr 2024-07-18 08:40:48 +02:00 committed by GitHub
parent c38e0b1687
commit f61e637062
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 632 additions and 186 deletions

View File

@ -37,6 +37,7 @@ All notable changes to Sourcegraph are documented in this file.
- The default and recommended chat model for Anthropic and Cody Gateway configurations is now `claude-3-sonnet-20240229`. [#62757](https://github.com/sourcegraph/sourcegraph/pull/62757)
- The default and recommended autocomplete model for Cody Gateway configurations is now `fireworks/starcoder`. [#62757](https://github.com/sourcegraph/sourcegraph/pull/62757)
- Code Insights: Language Stats Insights performance improved by another 70-90%. It's now able to handle repositories above 40 GB. [#62946](https://github.com/sourcegraph/sourcegraph/pull/62946)
- The keyword search toggle has been removed from the search results page. [Keyword search](https://sourcegraph.com/docs/code-search/queries#keyword-search-default) is now enabled by default for all searches in the Sourcegraph web app. [#63584](https://github.com/sourcegraph/sourcegraph/pull/63584)
### Fixed

View File

@ -52,20 +52,16 @@ func InventoryContext(logger log.Logger, repo api.RepoName, gsClient gitserver.C
return inventory.Context{}, errors.Errorf("refusing to compute inventory for non-absolute commit ID %q", commitID)
}
cacheKey := func(e fs.FileInfo) string {
info, ok := e.Sys().(gitdomain.ObjectInfo)
if !ok {
return "" // not cacheable
}
return info.OID().String()
}
gitServerSemaphore := semaphore.NewWeighted(int64(gitServerConcurrency))
redisSemaphore := semaphore.NewWeighted(int64(redisConcurrency))
logger = logger.Scoped("InventoryContext").
With(log.String("repo", string(repo)), log.String("commitID", string(commitID)))
invCtx := inventory.Context{
Repo: repo,
CommitID: commitID,
ShouldSkipEnhancedLanguageDetection: !useEnhancedLanguageDetection && !forceEnhancedLanguageDetection,
GitServerClient: gsClient,
ReadTree: func(ctx context.Context, path string) ([]fs.FileInfo, error) {
trc, ctx := trace.New(ctx, "ReadTree waits for semaphore")
err := gitServerSemaphore.Acquire(ctx, 1)
@ -109,14 +105,20 @@ func InventoryContext(logger log.Logger, repo api.RepoName, gsClient gitserver.C
gitServerSemaphore.Release(1)
}}, nil
},
CacheGet: func(ctx context.Context, e fs.FileInfo) (inventory.Inventory, bool) {
cacheKey := cacheKey(e)
CacheKey: func(e fs.FileInfo) string {
info, ok := e.Sys().(gitdomain.ObjectInfo)
if !ok {
return "" // not cacheable
}
return info.OID().String()
},
CacheGet: func(ctx context.Context, cacheKey string) (inventory.Inventory, bool) {
if cacheKey == "" {
return inventory.Inventory{}, false // not cacheable
}
if err := redisSemaphore.Acquire(ctx, 1); err != nil {
logger.Warn("Failed to acquire semaphore for redis cache.", log.String("path", e.Name()), log.Error(err))
logger.Warn("Failed to acquire semaphore for redis cache.", log.String("cacheKey", cacheKey), log.Error(err))
return inventory.Inventory{}, false
}
defer redisSemaphore.Release(1)
@ -124,26 +126,25 @@ func InventoryContext(logger log.Logger, repo api.RepoName, gsClient gitserver.C
if b, ok := inventoryCache.Get(cacheKey); ok {
var inv inventory.Inventory
if err := json.Unmarshal(b, &inv); err != nil {
logger.Warn("Failed to unmarshal cached JSON inventory.", log.String("path", e.Name()), log.Error(err))
logger.Warn("Failed to unmarshal cached JSON inventory.", log.String("cacheKey", cacheKey), log.Error(err))
return inventory.Inventory{}, false
}
return inv, true
}
return inventory.Inventory{}, false
},
CacheSet: func(ctx context.Context, e fs.FileInfo, inv inventory.Inventory) {
cacheKey := cacheKey(e)
CacheSet: func(ctx context.Context, cacheKey string, inv inventory.Inventory) {
if cacheKey == "" {
return // not cacheable
}
b, err := json.Marshal(&inv)
if err != nil {
logger.Warn("Failed to marshal JSON inventory for cache.", log.String("path", e.Name()), log.Error(err))
logger.Warn("Failed to marshal JSON inventory for cache.", log.String("cacheKey", cacheKey), log.Error(err))
return
}
if err := redisSemaphore.Acquire(ctx, 1); err != nil {
logger.Warn("Failed to acquire semaphore for redis cache.", log.String("path", e.Name()), log.Error(err))
logger.Warn("Failed to acquire semaphore for redis cache.", log.String("cacheKey", cacheKey), log.Error(err))
return
}
defer redisSemaphore.Release(1)
@ -151,13 +152,5 @@ func InventoryContext(logger log.Logger, repo api.RepoName, gsClient gitserver.C
},
}
if !useEnhancedLanguageDetection && !forceEnhancedLanguageDetection {
// If USE_ENHANCED_LANGUAGE_DETECTION is disabled, do not read file contents to determine
// the language. This means we won't calculate the number of lines per language.
invCtx.NewFileReader = func(ctx context.Context, path string) (io.ReadCloser, error) {
return nil, nil
}
}
return invCtx, nil
}

View File

@ -3,14 +3,7 @@ package backend
import (
"context"
"fmt"
"strconv"
"time"
"github.com/sourcegraph/sourcegraph/internal/env"
"github.com/sourcegraph/log"
"go.opentelemetry.io/otel/attribute"
"github.com/sourcegraph/sourcegraph/cmd/frontend/internal/inventory"
"github.com/sourcegraph/sourcegraph/internal/api"
"github.com/sourcegraph/sourcegraph/internal/database"
@ -19,6 +12,7 @@ import (
"github.com/sourcegraph/sourcegraph/internal/trace"
"github.com/sourcegraph/sourcegraph/internal/types"
"github.com/sourcegraph/sourcegraph/lib/errors"
"go.opentelemetry.io/otel/attribute"
)
type ReposService interface {
@ -112,8 +106,6 @@ func (s *repos) ListIndexable(ctx context.Context) (repos []types.MinimalRepo, e
})
}
var getInventoryTimeout, _ = strconv.Atoi(env.Get("GET_INVENTORY_TIMEOUT", "5", "Time in minutes before cancelling getInventory requests. Raise this if your repositories are large and need a long time to process."))
func (s *repos) GetInventory(ctx context.Context, repo api.RepoName, commitID api.CommitID, forceEnhancedLanguageDetection bool) (res *inventory.Inventory, err error) {
if Mocks.Repos.GetInventory != nil {
return Mocks.Repos.GetInventory(ctx, repo, commitID)
@ -122,24 +114,12 @@ func (s *repos) GetInventory(ctx context.Context, repo api.RepoName, commitID ap
ctx, done := startTrace(ctx, "GetInventory", map[string]any{"repo": repo, "commitID": commitID}, &err)
defer done()
ctx, cancel := context.WithTimeout(ctx, time.Duration(getInventoryTimeout)*time.Minute)
defer cancel()
invCtx, err := InventoryContext(s.logger, repo, s.gitserverClient, commitID, forceEnhancedLanguageDetection)
if err != nil {
return nil, err
}
root, err := s.gitserverClient.Stat(ctx, repo, commitID, "")
if err != nil {
return nil, err
}
// In computing the inventory, sub-tree inventories are cached based on the OID of the Git
// tree. Compared to per-blob caching, this creates many fewer cache entries, which means fewer
// stores, fewer lookups, and less cache storage overhead. Compared to per-commit caching, this
// yields a higher cache hit rate because most trees are unchanged across commits.
inv, err := invCtx.Entries(ctx, root)
inv, err := invCtx.All(ctx)
if err != nil {
return nil, err
}

View File

@ -1,6 +1,7 @@
package backend
import (
"archive/tar"
"bytes"
"context"
"flag"
@ -69,6 +70,43 @@ func (oid gitObjectInfo) OID() gitdomain.OID {
return v
}
// FileData describes one entry to place into a test tar archive.
type FileData struct {
	Name    string
	Content string
}

// createInMemoryTarArchive builds a tar archive in memory containing the
// given files with their contents and returns the raw archive bytes.
func createInMemoryTarArchive(files []FileData) ([]byte, error) {
	var buf bytes.Buffer
	tw := tar.NewWriter(&buf)
	for _, f := range files {
		hdr := &tar.Header{
			Name: f.Name,
			Size: int64(len(f.Content)),
		}
		if err := tw.WriteHeader(hdr); err != nil {
			return nil, err
		}
		// The file body follows its header in the stream.
		if _, err := io.WriteString(tw, f.Content); err != nil {
			return nil, err
		}
	}
	// Close flushes all buffered archive data into buf.
	if err := tw.Close(); err != nil {
		return nil, err
	}
	return buf.Bytes(), nil
}
func TestReposGetInventory(t *testing.T) {
ctx := testContext()
@ -108,13 +146,27 @@ func TestReposGetInventory(t *testing.T) {
switch name {
case "b.go":
data = []byte("package main")
case "a/c.m":
case "c.m":
data = []byte("@interface X:NSObject {}")
default:
panic("unhandled mock ReadFile " + name)
}
return io.NopCloser(bytes.NewReader(data)), nil
})
gitserverClient.ArchiveReaderFunc.SetDefaultHook(func(_ context.Context, _ api.RepoName, archiveOptions gitserver.ArchiveOptions) (io.ReadCloser, error) {
files := []FileData{
{Name: "b.go", Content: "package main"},
{Name: "a/c.m", Content: "@interface X:NSObject {}"},
}
// Create the in-memory tar archive.
archiveData, err := createInMemoryTarArchive(files)
if err != nil {
t.Fatalf("Failed to create in-memory tar archive: %v", err)
}
return io.NopCloser(bytes.NewReader(archiveData)), nil
})
s := repos{
logger: logtest.Scoped(t),
gitserverClient: gitserverClient,

View File

@ -1,6 +1,7 @@
package graphqlbackend
import (
"archive/tar"
"bytes"
"context"
"io"
@ -22,6 +23,43 @@ import (
"github.com/sourcegraph/sourcegraph/internal/types"
)
// FileData describes one entry to place into a test tar archive.
type FileData struct {
	Name    string
	Content string
}

// createInMemoryTarArchive builds a tar archive in memory containing the
// given files with their contents and returns the raw archive bytes.
func createInMemoryTarArchive(files []FileData) ([]byte, error) {
	var buf bytes.Buffer
	tw := tar.NewWriter(&buf)
	for _, f := range files {
		hdr := &tar.Header{
			Name: f.Name,
			Size: int64(len(f.Content)),
		}
		if err := tw.WriteHeader(hdr); err != nil {
			return nil, err
		}
		// The file body follows its header in the stream.
		if _, err := io.WriteString(tw, f.Content); err != nil {
			return nil, err
		}
	}
	// Close flushes all buffered archive data into buf.
	if err := tw.Close(); err != nil {
		return nil, err
	}
	return buf.Bytes(), nil
}
func TestSearchResultsStatsLanguages(t *testing.T) {
logger := logtest.Scoped(t)
wantCommitID := api.CommitID(strings.Repeat("a", 40))
@ -58,6 +96,20 @@ func TestSearchResultsStatsLanguages(t *testing.T) {
gsClient.StatFunc.SetDefaultHook(func(_ context.Context, _ api.RepoName, _ api.CommitID, path string) (fs.FileInfo, error) {
return &fileutil.FileInfo{Name_: path, Mode_: os.ModeDir}, nil
})
gsClient.ArchiveReaderFunc.SetDefaultHook(func(_ context.Context, _ api.RepoName, archiveOptions gitserver.ArchiveOptions) (io.ReadCloser, error) {
files := []FileData{
{Name: "two.go", Content: "a\nb\n"},
{Name: "three.go", Content: "a\nb\nc\n"},
}
// Create the in-memory tar archive.
archiveData, err := createInMemoryTarArchive(files)
if err != nil {
t.Fatalf("Failed to create in-memory tar archive: %v", err)
}
return io.NopCloser(bytes.NewReader(archiveData)), nil
})
mkResult := func(path string, lineNumbers ...int) *result.FileMatch {
rn := types.MinimalRepo{

View File

@ -12,14 +12,15 @@ go_library(
tags = [TAG_SEARCHSUITE],
visibility = ["//cmd/frontend:__subpackages__"],
deps = [
"//internal/trace",
"//internal/api",
"//internal/env",
"//internal/gitserver",
"//lib/codeintel/languages",
"//lib/errors",
"@com_github_go_enry_go_enry_v2//:go-enry",
"@com_github_go_enry_go_enry_v2//data",
"@com_github_sourcegraph_conc//iter",
"@com_github_sourcegraph_log//:log",
"@io_opentelemetry_go_otel//attribute",
],
)
@ -32,9 +33,7 @@ go_test(
embed = [":inventory"],
tags = [TAG_SEARCHSUITE],
deps = [
"//internal/fileutil",
"//lib/errors",
"@com_github_go_enry_go_enry_v2//:go-enry",
"@com_github_google_go_cmp//cmp",
],
)

View File

@ -2,12 +2,23 @@ package inventory
import (
"context"
"github.com/sourcegraph/sourcegraph/internal/gitserver"
"io"
"io/fs"
"github.com/sourcegraph/sourcegraph/internal/api"
)
// Context defines the environment in which the inventory is computed.
type Context struct {
Repo api.RepoName
CommitID api.CommitID
ShouldSkipEnhancedLanguageDetection bool
GitServerClient gitserver.Client
// ReadTree is called to list the immediate children of a tree at path. The returned fs.FileInfo
// values' Name method must return the full path (that can be passed to another ReadTree or
// ReadFile call), not just the basename.
@ -16,9 +27,11 @@ type Context struct {
// NewFileReader is called to get an io.ReadCloser from the file at path.
NewFileReader func(ctx context.Context, path string) (io.ReadCloser, error)
CacheKey func(e fs.FileInfo) string
// CacheGet, if set, returns the cached inventory and true for the given tree, or false for a cache miss.
CacheGet func(context.Context, fs.FileInfo) (Inventory, bool)
CacheGet func(context.Context, string) (Inventory, bool)
// CacheSet, if set, stores the inventory in the cache for the given tree.
CacheSet func(context.Context, fs.FileInfo, Inventory)
CacheSet func(context.Context, string, Inventory)
}

View File

@ -1,17 +1,113 @@
package inventory
import (
"archive/tar"
"context"
"fmt"
"github.com/sourcegraph/conc/iter"
"github.com/sourcegraph/sourcegraph/internal/env"
"github.com/sourcegraph/sourcegraph/internal/gitserver"
"github.com/sourcegraph/sourcegraph/lib/errors"
"io"
"io/fs"
"sort"
"time"
)
// Based on the benchmarking in entries_test.go, 1_000 is a good number after which we see diminishing returns.
var maxInvsLength = env.MustGetInt("GET_INVENTORY_MAX_INV_IN_MEMORY", 1_000, "When computing the language stats, every nth iteration all loaded files are aggregated into the inventory to reduce the memory footprint. Increasing this value may make the computation run faster, but will require more memory.")
var getInventoryTimeout = env.MustGetInt("GET_INVENTORY_TIMEOUT", 5, "Time in minutes before cancelling getInventory requests. Raise this if your repositories are large and need a long time to process.")
// All computes the language inventory for the whole repository at
// c.CommitID by streaming a single tar archive from gitserver, consulting
// the repo@commit cache first and storing the result on success.
func (c *Context) All(ctx context.Context) (inv Inventory, err error) {
	// Top-level caching is what we need for users, directory-level caching is to speed up retries if the repo doesn't compute within the timeout.
	// To help with the latter we detach the context above, instead of introducing rather complicated caching logic (it turned out to be a leetcode problem).
	// Since we're planning to move this to a pre-computed style anyway, the upsides of introducing complicated code
	// for retryable caching become less relevant.
	ctx = context.WithoutCancel(ctx)
	ctx, cancel := context.WithTimeout(ctx, time.Duration(getInventoryTimeout)*time.Minute)
	defer cancel()

	key := fmt.Sprintf("%s@%s", c.Repo, c.CommitID)
	if c.CacheGet != nil {
		if cached, ok := c.CacheGet(ctx, key); ok {
			return cached, nil
		}
	}
	if c.CacheSet != nil {
		// Store the computed inventory only on success. Defers run LIFO, so
		// this fires before cancel() above.
		defer func() {
			if err == nil {
				c.CacheSet(ctx, key, inv)
			}
		}()
	}

	r, err := c.GitServerClient.ArchiveReader(ctx, c.Repo, gitserver.ArchiveOptions{Treeish: string(c.CommitID), Format: gitserver.ArchiveFormatTar})
	if err != nil {
		return Inventory{}, err
	}
	defer r.Close()

	tr := tar.NewReader(r)
	return c.ArchiveProcessor(ctx, func() (*NextRecord, error) {
		hdr, err := tr.Next()
		if err != nil {
			return nil, err
		}
		return &NextRecord{
			Header: hdr,
			GetFileReader: func(ctx context.Context, path string) (io.ReadCloser, error) {
				// The tar reader is already positioned at the current
				// entry's body; wrap it so Close is a no-op.
				return io.NopCloser(tr), nil
			},
		}, nil
	})
}
// NextRecord is a single archive entry yielded by an iterator passed to
// ArchiveProcessor: the entry's tar header plus a callback producing a
// reader for the entry's content.
type NextRecord struct {
	Header *tar.Header
	// GetFileReader returns a reader for the entry's content. For tar
	// streams the reader is presumably only valid until the iterator
	// advances to the next entry — confirm with the producer.
	GetFileReader func(ctx context.Context, path string) (io.ReadCloser, error)
}
// ArchiveProcessor consumes archive entries from next until io.EOF and
// aggregates the language inventory of every regular file it sees.
// Non-regular entries (directories, symlinks, ...) are skipped.
func (c *Context) ArchiveProcessor(ctx context.Context, next func() (*NextRecord, error)) (inv Inventory, err error) {
	var partial []Inventory
	for {
		rec, err := next()
		if errors.Is(err, io.EOF) {
			// We've seen everything and can sum up the rest.
			return Sum(partial), nil
		}
		if err != nil {
			return Inventory{}, err
		}
		// Periodically collapse the accumulated per-file inventories into
		// one to bound memory usage (see maxInvsLength).
		if len(partial) >= maxInvsLength {
			collapsed := Sum(partial)
			partial = append(partial[:0], collapsed)
		}
		info := rec.Header.FileInfo()
		if !info.Mode().IsRegular() {
			// Skip anything that is not a readable file (e.g. directories, symlinks, ...)
			continue
		}
		lang, err := getLang(ctx, info, rec.GetFileReader, c.ShouldSkipEnhancedLanguageDetection)
		if err != nil {
			return Inventory{}, err
		}
		partial = append(partial, Inventory{Languages: []Lang{lang}})
	}
}
// Entries computes the inventory of languages for the given entries. It traverses trees recursively
// and caches results for each subtree. Results for listed files are cached.
//
// If a file is referenced more than once (e.g., because it is a descendent of a subtree and it is
// If a file is referenced more than once (e.g., because it is a descendent of a subtree, and it is
// passed directly), it will be double-counted in the result.
func (c *Context) Entries(ctx context.Context, entries ...fs.FileInfo) (Inventory, error) {
invs := make([]Inventory, len(entries))
@ -40,14 +136,14 @@ func (c *Context) Entries(ctx context.Context, entries ...fs.FileInfo) (Inventor
func (c *Context) tree(ctx context.Context, tree fs.FileInfo) (inv Inventory, err error) {
// Get and set from the cache.
if c.CacheGet != nil {
if inv, ok := c.CacheGet(ctx, tree); ok {
if inv, ok := c.CacheGet(ctx, c.CacheKey(tree)); ok {
return inv, nil // cache hit
}
}
if c.CacheSet != nil {
defer func() {
if err == nil {
c.CacheSet(ctx, tree, inv) // store in cache
c.CacheSet(ctx, c.CacheKey(tree), inv) // store in cache
}
}()
}
@ -64,7 +160,7 @@ func (c *Context) tree(ctx context.Context, tree fs.FileInfo) (inv Inventory, er
// Don't individually cache files that we found during tree traversal. The hit rate for
// those cache entries is likely to be much lower than cache entries for files whose
// inventory was directly requested.
lang, err := getLang(ctx, e, c.NewFileReader)
lang, err := getLang(ctx, e, c.NewFileReader, c.ShouldSkipEnhancedLanguageDetection)
return Inventory{Languages: []Lang{lang}}, err
case e.Mode().IsDir(): // subtree
subtreeInv, err := c.tree(ctx, e)
@ -86,19 +182,19 @@ func (c *Context) tree(ctx context.Context, tree fs.FileInfo) (inv Inventory, er
func (c *Context) file(ctx context.Context, file fs.FileInfo) (inv Inventory, err error) {
// Get and set from the cache.
if c.CacheGet != nil {
if inv, ok := c.CacheGet(ctx, file); ok {
if inv, ok := c.CacheGet(ctx, c.CacheKey(file)); ok {
return inv, nil // cache hit
}
}
if c.CacheSet != nil {
defer func() {
if err == nil {
c.CacheSet(ctx, file, inv) // store in cache
c.CacheSet(ctx, c.CacheKey(file), inv) // store in cache
}
}()
}
lang, err := getLang(ctx, file, c.NewFileReader)
lang, err := getLang(ctx, file, c.NewFileReader, c.ShouldSkipEnhancedLanguageDetection)
if err != nil {
return Inventory{}, errors.Wrapf(err, "inventory file %q", file.Name())
}

View File

@ -1,123 +1,310 @@
package inventory
import (
"bytes"
"archive/tar"
"context"
"io"
"io/fs"
"os"
"reflect"
"sync"
"strings"
"testing"
"github.com/google/go-cmp/cmp"
"github.com/sourcegraph/sourcegraph/internal/fileutil"
)
func TestContext_Entries(t *testing.T) {
var (
readTreeCalls []string
newFileReaderCalls []string
cacheGetCalls []string
cacheSetCalls = map[string]Inventory{}
)
var mu sync.Mutex
c := Context{
ReadTree: func(ctx context.Context, path string) ([]fs.FileInfo, error) {
mu.Lock()
defer mu.Unlock()
readTreeCalls = append(readTreeCalls, path)
switch path {
case "d":
return []fs.FileInfo{
&fileutil.FileInfo{Name_: "d/a", Mode_: os.ModeDir},
&fileutil.FileInfo{Name_: "d/b.go", Size_: 12},
}, nil
case "d/a":
return []fs.FileInfo{&fileutil.FileInfo{Name_: "d/a/c.m", Size_: 24}}, nil
default:
panic("unhandled mock ReadTree " + path)
}
type FileData struct {
Name string
Content string
IsDirectory bool
}
func TestFileMode(t *testing.T) {
testCases := []struct {
name string
isDirectory bool
}{
{
name: "regular file",
isDirectory: false,
},
NewFileReader: func(ctx context.Context, path string) (io.ReadCloser, error) {
mu.Lock()
defer mu.Unlock()
newFileReaderCalls = append(newFileReaderCalls, path)
var data []byte
switch path {
case "f.go":
data = []byte("package f")
case "d/b.go":
data = []byte("package main")
case "d/a/c.m":
data = []byte("@interface X:NSObject {}")
default:
panic("unhandled mock ReadFile " + path)
}
return io.NopCloser(bytes.NewReader(data)), nil
},
CacheGet: func(ctx context.Context, e fs.FileInfo) (Inventory, bool) {
mu.Lock()
defer mu.Unlock()
cacheGetCalls = append(cacheGetCalls, e.Name())
return Inventory{}, false
},
CacheSet: func(ctx context.Context, e fs.FileInfo, inv Inventory) {
mu.Lock()
defer mu.Unlock()
if _, ok := cacheSetCalls[e.Name()]; ok {
t.Fatalf("already stored %q in cache", e.Name())
}
cacheSetCalls[e.Name()] = inv
{
name: "directory",
isDirectory: true,
},
}
inv, err := c.Entries(context.Background(),
&fileutil.FileInfo{Name_: "d", Mode_: os.ModeDir},
&fileutil.FileInfo{Name_: "f.go", Mode_: 0, Size_: 1 /* HACK to force read */},
)
if err != nil {
t.Fatal(err)
}
if want := (Inventory{
Languages: []Lang{
{Name: "Go", TotalBytes: 21, TotalLines: 2},
{Name: "Objective-C", TotalBytes: 24, TotalLines: 1},
},
}); !reflect.DeepEqual(inv, want) {
t.Fatalf("got %#v\nwant %#v", inv, want)
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
h := &tar.Header{
Name: tc.name,
Mode: func() int64 {
if tc.isDirectory {
// 04 (in octal) sets the directory bit
return 040000
}
return 0
}(),
}
got := h.FileInfo().Mode().IsDir()
if got != tc.isDirectory {
t.Errorf("got %v, want %v", got, tc.isDirectory)
}
})
}
// Check that our mocks were called as expected.
if want := []string{"d", "d/a"}; !reflect.DeepEqual(readTreeCalls, want) {
t.Errorf("ReadTree calls: got %q, want %q", readTreeCalls, want)
}
if len(newFileReaderCalls) != 3 {
t.Errorf("GetFileReader calls: got %d, want %d", len(newFileReaderCalls), 3)
}
if want := []string{"d", "d/a", "f.go"}; !reflect.DeepEqual(cacheGetCalls, want) {
t.Errorf("CacheGet calls: got %q, want %q", cacheGetCalls, want)
}
want := map[string]Inventory{
"d": {
Languages: []Lang{
}
func TestEntriesNextProcessor(t *testing.T) {
testCases := []struct {
name string
files []FileData
expected Inventory
}{
{
name: "Go and Objective-C (in directory) files",
files: []FileData{
{Name: "b.go", Content: "package main"},
{Name: "a", Content: "", IsDirectory: true},
{Name: "a/c.m", Content: "@interface X:NSObject {}"},
},
expected: Inventory{Languages: []Lang{
{Name: "Objective-C", TotalBytes: 24, TotalLines: 1},
{Name: "Go", TotalBytes: 12, TotalLines: 1},
},
}},
},
"d/a": {
Languages: []Lang{
{
name: "Single file",
files: []FileData{
{Name: "b.go", Content: "package main"},
},
expected: Inventory{Languages: []Lang{
{Name: "Go", TotalBytes: 12, TotalLines: 1},
}},
},
{
name: "Two root file",
files: []FileData{
{Name: "a.go", Content: "package main"},
{Name: "b.go", Content: "package main"},
},
expected: Inventory{Languages: []Lang{
{Name: "Go", TotalBytes: 24, TotalLines: 2},
}},
},
{
name: "File and empty directory",
files: []FileData{
{Name: "a.go", Content: "package main"},
{Name: "b", Content: "", IsDirectory: true},
},
expected: Inventory{Languages: []Lang{
{Name: "Go", TotalBytes: 12, TotalLines: 1},
}},
},
{
name: "Two directories with files",
files: []FileData{
{Name: "a", Content: "", IsDirectory: true},
{Name: "a/a.m", Content: "@interface X:NSObject {}"},
{Name: "b", Content: "", IsDirectory: true},
{Name: "b/b.go", Content: "package main"},
},
expected: Inventory{Languages: []Lang{
{Name: "Objective-C", TotalBytes: 24, TotalLines: 1},
{Name: "Go", TotalBytes: 12, TotalLines: 1},
}},
},
{
name: "Two directories with files, and a root file",
files: []FileData{
{Name: "a", Content: "", IsDirectory: true},
{Name: "a/a.m", Content: "@interface X:NSObject {}"},
{Name: "b", Content: "", IsDirectory: true},
{Name: "b/b.go", Content: "package main"},
{Name: "c.go", Content: "package main"},
},
expected: Inventory{Languages: []Lang{
{Name: "Go", TotalBytes: 24, TotalLines: 2},
{Name: "Objective-C", TotalBytes: 24, TotalLines: 1},
}},
},
{
name: "Directory with sub-directory and surrounding files",
files: []FileData{
{Name: "a", Content: "", IsDirectory: true},
{Name: "a/A.m", Content: "@interface X:NSObject {}"},
{Name: "a/b", Content: "", IsDirectory: true},
{Name: "a/b/B.m", Content: "@interface X:NSObject {}"},
{Name: "a/c", Content: "", IsDirectory: true},
{Name: "a/c/D.m", Content: "@interface X:NSObject {}"},
{Name: "E.go", Content: "package main"},
},
expected: Inventory{Languages: []Lang{
{Name: "Objective-C", TotalBytes: 72, TotalLines: 3},
{Name: "Go", TotalBytes: 12, TotalLines: 1},
}},
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
index := 0
c := Context{
Repo: "testRepo",
CommitID: "testCommit",
ReadTree: nil,
NewFileReader: nil,
CacheKey: nil,
CacheGet: nil,
CacheSet: nil,
ShouldSkipEnhancedLanguageDetection: false,
GitServerClient: nil,
}
next := func() (*NextRecord, error) {
defer func() {
index += 1
}()
if index >= len(tc.files) {
return nil, io.EOF
}
content := tc.files[index].Content
return &NextRecord{
Header: &tar.Header{
Name: tc.files[index].Name,
Size: int64(len(content)),
Mode: func() int64 {
if tc.files[index].IsDirectory {
// 04 (in octal) sets the directory bit
return 040000
}
return 0
}(),
},
GetFileReader: func(ctx context.Context, path string) (io.ReadCloser, error) {
return io.NopCloser(strings.NewReader(content)), nil
},
}, nil
}
got, err := c.ArchiveProcessor(context.Background(), next)
if err != nil {
t.Fatal(err)
}
if !reflect.DeepEqual(tc.expected, got) {
t.Errorf("got %+v, want %+v", got, tc.expected)
}
})
}
}
func TestSum(t *testing.T) {
testCases := []struct {
name string
invs []Inventory
expected Inventory
}{
{
name: "empty input",
invs: []Inventory{},
expected: Inventory{
Languages: []Lang{},
},
},
"f.go": {
Languages: []Lang{
{Name: "Go", TotalBytes: 9, TotalLines: 1},
{
name: "single inventory",
invs: []Inventory{
{
Languages: []Lang{
{Name: "Go", TotalBytes: 100, TotalLines: 10},
{Name: "Python", TotalBytes: 200, TotalLines: 20},
},
},
},
expected: Inventory{
Languages: []Lang{
{Name: "Python", TotalBytes: 200, TotalLines: 20},
{Name: "Go", TotalBytes: 100, TotalLines: 10},
},
},
},
{
name: "multiple inventories",
invs: []Inventory{
{
Languages: []Lang{
{Name: "Go", TotalBytes: 100, TotalLines: 10},
{Name: "Python", TotalBytes: 200, TotalLines: 20},
},
},
{
Languages: []Lang{
{Name: "Go", TotalBytes: 50, TotalLines: 5},
{Name: "Ruby", TotalBytes: 300, TotalLines: 30},
},
},
},
expected: Inventory{
Languages: []Lang{
{Name: "Ruby", TotalBytes: 300, TotalLines: 30},
{Name: "Python", TotalBytes: 200, TotalLines: 20},
{Name: "Go", TotalBytes: 150, TotalLines: 15},
},
},
},
{
name: "empty language name",
invs: []Inventory{
{
Languages: []Lang{
{Name: "", TotalBytes: 100, TotalLines: 10},
{Name: "Python", TotalBytes: 200, TotalLines: 20},
},
},
},
expected: Inventory{
Languages: []Lang{
{Name: "Python", TotalBytes: 200, TotalLines: 20},
},
},
},
}
if diff := cmp.Diff(want, cacheSetCalls); diff != "" {
t.Error(diff)
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
got := Sum(tc.invs)
if !reflect.DeepEqual(got, tc.expected) {
t.Errorf("got %+v, want %+v", got, tc.expected)
}
})
}
}
// BenchmarkSum measures merging n per-file inventories into one.
//
// Benchmark results
// n=10000: ~8400 ns/op => 8.4 ns/record
// n=1000: ~8500 ns/op => 8.5 ns/record
// n=100: ~1100 ns/op => 11 ns/record
// n=10: ~360 ns/op => 36 ns/record
// n=5: ~300 ns/op => 60 ns/record
// n=2: ~280 ns/op => 140 ns/record
func BenchmarkSum(b *testing.B) {
	const n = 10_000
	invs := make([]Inventory, n)
	for i := range invs {
		invs[i] = Inventory{
			Languages: []Lang{
				{Name: "Go", TotalBytes: uint64(100 + i), TotalLines: uint64(10 + i)},
				{Name: "Python", TotalBytes: uint64(200 + i), TotalLines: uint64(20 + i)},
			},
		}
		// Every other inventory swaps Python for Ruby so Sum has to merge
		// overlapping and disjoint language sets.
		if i%2 == 1 {
			invs[i].Languages[1] = Lang{Name: "Ruby", TotalBytes: uint64(300 + i), TotalLines: uint64(30 + i)}
		}
	}
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		Sum(invs)
	}
}

View File

@ -6,16 +6,12 @@ import (
"bytes"
"context"
"fmt"
"github.com/go-enry/go-enry/v2" //nolint:depguard - FIXME: replace this usage of enry with languages package
"github.com/go-enry/go-enry/v2/data" //nolint:depguard - FIXME: replace this usage of enry with languages package
"github.com/sourcegraph/log"
"io"
"io/fs"
"github.com/go-enry/go-enry/v2" //nolint:depguard - FIXME: replace this usage of enry with languages package
"github.com/go-enry/go-enry/v2/data" //nolint:depguard - FIXME: replace this usage of enry with languages package
"go.opentelemetry.io/otel/attribute"
"github.com/sourcegraph/log"
"github.com/sourcegraph/sourcegraph/internal/trace"
"github.com/sourcegraph/sourcegraph/lib/codeintel/languages"
"github.com/sourcegraph/sourcegraph/lib/errors"
)
@ -45,7 +41,7 @@ type Lang struct {
var newLine = []byte{'\n'}
func getLang(ctx context.Context, file fs.FileInfo, getFileReader func(ctx context.Context, path string) (io.ReadCloser, error)) (Lang, error) {
func getLang(ctx context.Context, file fs.FileInfo, getFileReader func(ctx context.Context, path string) (io.ReadCloser, error), skipEnhancedLanguageDetection bool) (Lang, error) {
if file == nil {
return Lang{}, nil
}
@ -53,34 +49,25 @@ func getLang(ctx context.Context, file fs.FileInfo, getFileReader func(ctx conte
return Lang{}, nil
}
trc, ctx := trace.New(ctx, "getLang")
defer trc.End()
rc, err := getFileReader(ctx, file.Name())
if err != nil {
return Lang{}, errors.Wrap(err, "getting file reader")
}
if rc != nil {
defer rc.Close()
}
var lang Lang
// In many cases, GetLanguageByFilename can detect the language conclusively just from the
// filename. If not, we pass a subset of the file contents for analysis.
matchedLang, safe := GetLanguageByFilename(file.Name())
trc.AddEvent("GetLanguageByFilename",
attribute.String("FileName", file.Name()),
attribute.Bool("Safe", safe),
attribute.String("MatchedLang", matchedLang),
)
// No content
if rc == nil {
if skipEnhancedLanguageDetection {
lang.Name = matchedLang
lang.TotalBytes = uint64(file.Size())
return lang, nil
}
rc, err := getFileReader(ctx, file.Name())
if err != nil {
return Lang{}, errors.Wrap(err, "Failed to create a file reader.")
}
if rc != nil {
defer rc.Close()
}
buf := make([]byte, fileReadBufferSize)
if !safe {

View File

@ -65,7 +65,8 @@ func TestGetLang_language(t *testing.T) {
t.Run(label, func(t *testing.T) {
lang, err := getLang(context.Background(),
test.file,
makeFileReader(test.file.Contents))
makeFileReader(test.file.Contents),
false)
if err != nil {
t.Fatal(err)
}
@ -76,6 +77,91 @@ func TestGetLang_language(t *testing.T) {
}
}
// mockCloser is an io.Closer whose Close delegates to an injected
// function, letting tests observe whether Close was called.
type mockCloser struct {
	closeFunc func() error
}

// Close invokes the injected closeFunc and returns its result.
func (m *mockCloser) Close() error {
	return m.closeFunc()
}
// TestGetLang_fileReader verifies getLang's file-reader lifecycle: when the
// reader is opened it must be closed, and when enhanced language detection
// is skipped the reader must never be opened (and thus never closed).
func TestGetLang_fileReader(t *testing.T) {
	// newTrackingFileReader builds a file-reader func that records whether
	// it was opened and whether the returned reader was closed. Shared by
	// both subtests to avoid duplicating the fake ReadCloser construction.
	newTrackingFileReader := func(opened, closed *bool) func(ctx context.Context, path string) (io.ReadCloser, error) {
		return func(ctx context.Context, path string) (io.ReadCloser, error) {
			*opened = true
			return struct {
				io.Reader
				io.Closer
			}{
				Reader: strings.NewReader(""),
				Closer: &mockCloser{closeFunc: func() error {
					*closed = true
					return nil
				}},
			}, nil
		}
	}

	t.Run("If the file reader is opened, it must be closed", func(t *testing.T) {
		var openCalled, closeCalled bool
		_, err := getLang(context.Background(),
			fi{"a.java", ""},
			newTrackingFileReader(&openCalled, &closeCalled),
			false)
		if err != nil {
			t.Fatal(err)
		}
		if !openCalled {
			t.Fatal("Open should have been called")
		}
		if !closeCalled {
			t.Error("Close should have been called")
		}
	})

	t.Run("If the file reader is NOT opened, it should not be closed", func(t *testing.T) {
		var openCalled, closeCalled bool
		_, err := getLang(context.Background(),
			fi{"a.java", ""},
			newTrackingFileReader(&openCalled, &closeCalled),
			true)
		if err != nil {
			t.Fatal(err)
		}
		if openCalled {
			t.Fatal("Open should NOT have been called")
		}
		if closeCalled {
			t.Error("Close should NOT have been called")
		}
	})
}
func makeFileReader(contents string) func(context.Context, string) (io.ReadCloser, error) {
return func(ctx context.Context, path string) (io.ReadCloser, error) {
return io.NopCloser(strings.NewReader(contents)), nil
@ -135,7 +221,7 @@ func TestGet_readFile(t *testing.T) {
for _, test := range tests {
t.Run(test.file.Name(), func(t *testing.T) {
fr := makeFileReader(test.file.(fi).Contents)
lang, err := getLang(context.Background(), test.file, fr)
lang, err := getLang(context.Background(), test.file, fr, false)
if err != nil {
t.Fatal(err)
}
@ -169,7 +255,7 @@ func BenchmarkGetLang(b *testing.B) {
b.ResetTimer()
for range b.N {
for _, file := range files {
_, err = getLang(context.Background(), file, fr)
_, err = getLang(context.Background(), file, fr, false)
if err != nil {
b.Fatal(err)
}