From f61e637062dea440a1fe65a4a63e7da474823fd9 Mon Sep 17 00:00:00 2001 From: Michael Bahr <1830132+bahrmichael@users.noreply.github.com> Date: Thu, 18 Jul 2024 08:40:48 +0200 Subject: [PATCH] feat(code insights): language stats speed improvements by using archive loading (#62946) We previously improved the performance of Language Stats Insights by introducing parallel requests to gitserver: https://github.com/sourcegraph/sourcegraph/pull/62011 This PR replaces the previous approach where we would iterate through and request each file from gitserver with an approach where we request just one archive. This eliminates a lot of network traffic, and gives us an additional(!) performance improvement of 70-90%. Even repositories like chromium (42GB) can now be processed (on my machine in just one minute). --- Caching: We dropped most of the caching, and kept only the top-level caching (repo@commit). This means that we only need to compute the language stats once per commit, and subsequent users/requests can see the cached data. We dropped the file/directory level caching, because (1) the code to do that got very complex and (2) we can assume that most repositories are able to compute within the 5 minutes timeout (which can be increase via the environment variable `GET_INVENTORY_TIMEOUT`). The timeout is not bound to the user's request anymore. Before, the frontend would request the stats up to three times to let the computation move forward and pick up where the last request aborted. While we still have this frontend retry mechanism, we don't have to worry about an abort-and-continue mechanism in the backend. --- Credits for the code to @eseliger: https://github.com/sourcegraph/sourcegraph/issues/62019#issuecomment-2119278481 I've taken the diff, and updated the caching methods to allow for more advanced use cases should we decide to introduce more caching. We can take that out again if the current caching is sufficient. 
Todos: - [x] Check if CI passes, manual testing seems to be fine - [x] Verify that insights are cached at the top level --- Test data: - sourcegraph/sourcegraph: 9.07s (main) -> 1.44s (current): 74% better - facebook/react: 17.52s (main) -> 0.87s (current): 95% better - godotengine/godot: 28.92s (main) -> 1.98s (current): 93% better - chromium/chromium: ~1 minute: 100% better, because it didn't compute before ## Changelog - Language stats queries now request one archive from gitserver instead of individual file requests. This leads to a huge performance improvement. Even extra large repositories like chromium are now able to compute within one minute. Previously they timed out. ## Test plan - New unit tests - Plenty of manual testing --- CHANGELOG.md | 1 + cmd/frontend/backend/inventory.go | 41 +- cmd/frontend/backend/repos.go | 24 +- cmd/frontend/backend/repos_test.go | 54 ++- .../search_results_stats_languages_test.go | 52 +++ cmd/frontend/internal/inventory/BUILD.bazel | 7 +- cmd/frontend/internal/inventory/context.go | 17 +- cmd/frontend/internal/inventory/entries.go | 110 ++++- .../internal/inventory/entries_test.go | 381 +++++++++++++----- cmd/frontend/internal/inventory/inventory.go | 39 +- .../internal/inventory/inventory_test.go | 92 ++++- 11 files changed, 632 insertions(+), 186 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 783c9512b50..3a972046d85 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -37,6 +37,7 @@ All notable changes to Sourcegraph are documented in this file. - The default and recommended chat model for Anthropic and Cody Gateway configurations is now `claude-3-sonnet-20240229`. [#62757](https://github.com/sourcegraph/sourcegraph/pull/62757) - The default and recommended autocomplete model for Cody Gateway configurations is now `fireworks/starcoder`. [#62757](https://github.com/sourcegraph/sourcegraph/pull/62757) +- Code Insights: Language Stats Insights performance improved by another 70-90%. 
It's now able to handle repositories above 40 GB. [#62946](https://github.com/sourcegraph/sourcegraph/pull/62946) - The keyword search toggle has been removed from the search results page. [Keyword search](https://sourcegraph.com/docs/code-search/queries#keyword-search-default) is now enabled by default for all searches in the Sourcegraph web app. [#63584](https://github.com/sourcegraph/sourcegraph/pull/63584) ### Fixed diff --git a/cmd/frontend/backend/inventory.go b/cmd/frontend/backend/inventory.go index 01d72d5a912..2b2288a6d15 100644 --- a/cmd/frontend/backend/inventory.go +++ b/cmd/frontend/backend/inventory.go @@ -52,20 +52,16 @@ func InventoryContext(logger log.Logger, repo api.RepoName, gsClient gitserver.C return inventory.Context{}, errors.Errorf("refusing to compute inventory for non-absolute commit ID %q", commitID) } - cacheKey := func(e fs.FileInfo) string { - info, ok := e.Sys().(gitdomain.ObjectInfo) - if !ok { - return "" // not cacheable - } - return info.OID().String() - } - gitServerSemaphore := semaphore.NewWeighted(int64(gitServerConcurrency)) redisSemaphore := semaphore.NewWeighted(int64(redisConcurrency)) logger = logger.Scoped("InventoryContext"). 
With(log.String("repo", string(repo)), log.String("commitID", string(commitID))) invCtx := inventory.Context{ + Repo: repo, + CommitID: commitID, + ShouldSkipEnhancedLanguageDetection: !useEnhancedLanguageDetection && !forceEnhancedLanguageDetection, + GitServerClient: gsClient, ReadTree: func(ctx context.Context, path string) ([]fs.FileInfo, error) { trc, ctx := trace.New(ctx, "ReadTree waits for semaphore") err := gitServerSemaphore.Acquire(ctx, 1) @@ -109,14 +105,20 @@ func InventoryContext(logger log.Logger, repo api.RepoName, gsClient gitserver.C gitServerSemaphore.Release(1) }}, nil }, - CacheGet: func(ctx context.Context, e fs.FileInfo) (inventory.Inventory, bool) { - cacheKey := cacheKey(e) + CacheKey: func(e fs.FileInfo) string { + info, ok := e.Sys().(gitdomain.ObjectInfo) + if !ok { + return "" // not cacheable + } + return info.OID().String() + }, + CacheGet: func(ctx context.Context, cacheKey string) (inventory.Inventory, bool) { if cacheKey == "" { return inventory.Inventory{}, false // not cacheable } if err := redisSemaphore.Acquire(ctx, 1); err != nil { - logger.Warn("Failed to acquire semaphore for redis cache.", log.String("path", e.Name()), log.Error(err)) + logger.Warn("Failed to acquire semaphore for redis cache.", log.String("cacheKey", cacheKey), log.Error(err)) return inventory.Inventory{}, false } defer redisSemaphore.Release(1) @@ -124,26 +126,25 @@ func InventoryContext(logger log.Logger, repo api.RepoName, gsClient gitserver.C if b, ok := inventoryCache.Get(cacheKey); ok { var inv inventory.Inventory if err := json.Unmarshal(b, &inv); err != nil { - logger.Warn("Failed to unmarshal cached JSON inventory.", log.String("path", e.Name()), log.Error(err)) + logger.Warn("Failed to unmarshal cached JSON inventory.", log.String("cacheKey", cacheKey), log.Error(err)) return inventory.Inventory{}, false } return inv, true } return inventory.Inventory{}, false }, - CacheSet: func(ctx context.Context, e fs.FileInfo, inv inventory.Inventory) { - 
cacheKey := cacheKey(e) + CacheSet: func(ctx context.Context, cacheKey string, inv inventory.Inventory) { if cacheKey == "" { return // not cacheable } b, err := json.Marshal(&inv) if err != nil { - logger.Warn("Failed to marshal JSON inventory for cache.", log.String("path", e.Name()), log.Error(err)) + logger.Warn("Failed to marshal JSON inventory for cache.", log.String("cacheKey", cacheKey), log.Error(err)) return } if err := redisSemaphore.Acquire(ctx, 1); err != nil { - logger.Warn("Failed to acquire semaphore for redis cache.", log.String("path", e.Name()), log.Error(err)) + logger.Warn("Failed to acquire semaphore for redis cache.", log.String("cacheKey", cacheKey), log.Error(err)) return } defer redisSemaphore.Release(1) @@ -151,13 +152,5 @@ func InventoryContext(logger log.Logger, repo api.RepoName, gsClient gitserver.C }, } - if !useEnhancedLanguageDetection && !forceEnhancedLanguageDetection { - // If USE_ENHANCED_LANGUAGE_DETECTION is disabled, do not read file contents to determine - // the language. This means we won't calculate the number of lines per language. 
- invCtx.NewFileReader = func(ctx context.Context, path string) (io.ReadCloser, error) { - return nil, nil - } - } - return invCtx, nil } diff --git a/cmd/frontend/backend/repos.go b/cmd/frontend/backend/repos.go index 4e40373c457..b871a8f568c 100644 --- a/cmd/frontend/backend/repos.go +++ b/cmd/frontend/backend/repos.go @@ -3,14 +3,7 @@ package backend import ( "context" "fmt" - "strconv" - "time" - - "github.com/sourcegraph/sourcegraph/internal/env" - "github.com/sourcegraph/log" - "go.opentelemetry.io/otel/attribute" - "github.com/sourcegraph/sourcegraph/cmd/frontend/internal/inventory" "github.com/sourcegraph/sourcegraph/internal/api" "github.com/sourcegraph/sourcegraph/internal/database" @@ -19,6 +12,7 @@ import ( "github.com/sourcegraph/sourcegraph/internal/trace" "github.com/sourcegraph/sourcegraph/internal/types" "github.com/sourcegraph/sourcegraph/lib/errors" + "go.opentelemetry.io/otel/attribute" ) type ReposService interface { @@ -112,8 +106,6 @@ func (s *repos) ListIndexable(ctx context.Context) (repos []types.MinimalRepo, e }) } -var getInventoryTimeout, _ = strconv.Atoi(env.Get("GET_INVENTORY_TIMEOUT", "5", "Time in minutes before cancelling getInventory requests. 
Raise this if your repositories are large and need a long time to process.")) - func (s *repos) GetInventory(ctx context.Context, repo api.RepoName, commitID api.CommitID, forceEnhancedLanguageDetection bool) (res *inventory.Inventory, err error) { if Mocks.Repos.GetInventory != nil { return Mocks.Repos.GetInventory(ctx, repo, commitID) @@ -122,24 +114,12 @@ func (s *repos) GetInventory(ctx context.Context, repo api.RepoName, commitID ap ctx, done := startTrace(ctx, "GetInventory", map[string]any{"repo": repo, "commitID": commitID}, &err) defer done() - ctx, cancel := context.WithTimeout(ctx, time.Duration(getInventoryTimeout)*time.Minute) - defer cancel() - invCtx, err := InventoryContext(s.logger, repo, s.gitserverClient, commitID, forceEnhancedLanguageDetection) if err != nil { return nil, err } - root, err := s.gitserverClient.Stat(ctx, repo, commitID, "") - if err != nil { - return nil, err - } - - // In computing the inventory, sub-tree inventories are cached based on the OID of the Git - // tree. Compared to per-blob caching, this creates many fewer cache entries, which means fewer - // stores, fewer lookups, and less cache storage overhead. Compared to per-commit caching, this - // yields a higher cache hit rate because most trees are unchanged across commits. - inv, err := invCtx.Entries(ctx, root) + inv, err := invCtx.All(ctx) if err != nil { return nil, err } diff --git a/cmd/frontend/backend/repos_test.go b/cmd/frontend/backend/repos_test.go index 9f711c961f5..c8e01a1ade6 100644 --- a/cmd/frontend/backend/repos_test.go +++ b/cmd/frontend/backend/repos_test.go @@ -1,6 +1,7 @@ package backend import ( + "archive/tar" "bytes" "context" "flag" @@ -69,6 +70,43 @@ func (oid gitObjectInfo) OID() gitdomain.OID { return v } +type FileData struct { + Name string + Content string +} + +// createInMemoryTarArchive creates a tar archive in memory containing multiple files with their given content. 
+func createInMemoryTarArchive(files []FileData) ([]byte, error) { + buf := new(bytes.Buffer) + tarWriter := tar.NewWriter(buf) + + for _, file := range files { + header := &tar.Header{ + Name: file.Name, + Size: int64(len(file.Content)), + } + + err := tarWriter.WriteHeader(header) + if err != nil { + return nil, err + } + + // Write the content to the tar archive. + _, err = io.WriteString(tarWriter, file.Content) + if err != nil { + return nil, err + } + } + + // Close the tar writer to flush the data to the buffer. + err := tarWriter.Close() + if err != nil { + return nil, err + } + + return buf.Bytes(), nil +} + func TestReposGetInventory(t *testing.T) { ctx := testContext() @@ -108,13 +146,27 @@ func TestReposGetInventory(t *testing.T) { switch name { case "b.go": data = []byte("package main") - case "a/c.m": + case "c.m": data = []byte("@interface X:NSObject {}") default: panic("unhandled mock ReadFile " + name) } return io.NopCloser(bytes.NewReader(data)), nil }) + gitserverClient.ArchiveReaderFunc.SetDefaultHook(func(_ context.Context, _ api.RepoName, archiveOptions gitserver.ArchiveOptions) (io.ReadCloser, error) { + files := []FileData{ + {Name: "b.go", Content: "package main"}, + {Name: "a/c.m", Content: "@interface X:NSObject {}"}, + } + + // Create the in-memory tar archive. 
+ archiveData, err := createInMemoryTarArchive(files) + if err != nil { + t.Fatalf("Failed to create in-memory tar archive: %v", err) + } + + return io.NopCloser(bytes.NewReader(archiveData)), nil + }) s := repos{ logger: logtest.Scoped(t), gitserverClient: gitserverClient, diff --git a/cmd/frontend/graphqlbackend/search_results_stats_languages_test.go b/cmd/frontend/graphqlbackend/search_results_stats_languages_test.go index c8f6f4c3051..a1182c20df8 100644 --- a/cmd/frontend/graphqlbackend/search_results_stats_languages_test.go +++ b/cmd/frontend/graphqlbackend/search_results_stats_languages_test.go @@ -1,6 +1,7 @@ package graphqlbackend import ( + "archive/tar" "bytes" "context" "io" @@ -22,6 +23,43 @@ import ( "github.com/sourcegraph/sourcegraph/internal/types" ) +type FileData struct { + Name string + Content string +} + +// createInMemoryTarArchive creates a tar archive in memory containing multiple files with their given content. +func createInMemoryTarArchive(files []FileData) ([]byte, error) { + buf := new(bytes.Buffer) + tarWriter := tar.NewWriter(buf) + + for _, file := range files { + header := &tar.Header{ + Name: file.Name, + Size: int64(len(file.Content)), + } + + err := tarWriter.WriteHeader(header) + if err != nil { + return nil, err + } + + // Write the content to the tar archive. + _, err = io.WriteString(tarWriter, file.Content) + if err != nil { + return nil, err + } + } + + // Close the tar writer to flush the data to the buffer. 
+ err := tarWriter.Close() + if err != nil { + return nil, err + } + + return buf.Bytes(), nil +} + func TestSearchResultsStatsLanguages(t *testing.T) { logger := logtest.Scoped(t) wantCommitID := api.CommitID(strings.Repeat("a", 40)) @@ -58,6 +96,20 @@ func TestSearchResultsStatsLanguages(t *testing.T) { gsClient.StatFunc.SetDefaultHook(func(_ context.Context, _ api.RepoName, _ api.CommitID, path string) (fs.FileInfo, error) { return &fileutil.FileInfo{Name_: path, Mode_: os.ModeDir}, nil }) + gsClient.ArchiveReaderFunc.SetDefaultHook(func(_ context.Context, _ api.RepoName, archiveOptions gitserver.ArchiveOptions) (io.ReadCloser, error) { + files := []FileData{ + {Name: "two.go", Content: "a\nb\n"}, + {Name: "three.go", Content: "a\nb\nc\n"}, + } + + // Create the in-memory tar archive. + archiveData, err := createInMemoryTarArchive(files) + if err != nil { + t.Fatalf("Failed to create in-memory tar archive: %v", err) + } + + return io.NopCloser(bytes.NewReader(archiveData)), nil + }) mkResult := func(path string, lineNumbers ...int) *result.FileMatch { rn := types.MinimalRepo{ diff --git a/cmd/frontend/internal/inventory/BUILD.bazel b/cmd/frontend/internal/inventory/BUILD.bazel index 43e8bb878b5..ed06e1e501a 100644 --- a/cmd/frontend/internal/inventory/BUILD.bazel +++ b/cmd/frontend/internal/inventory/BUILD.bazel @@ -12,14 +12,15 @@ go_library( tags = [TAG_SEARCHSUITE], visibility = ["//cmd/frontend:__subpackages__"], deps = [ - "//internal/trace", + "//internal/api", + "//internal/env", + "//internal/gitserver", "//lib/codeintel/languages", "//lib/errors", "@com_github_go_enry_go_enry_v2//:go-enry", "@com_github_go_enry_go_enry_v2//data", "@com_github_sourcegraph_conc//iter", "@com_github_sourcegraph_log//:log", - "@io_opentelemetry_go_otel//attribute", ], ) @@ -32,9 +33,7 @@ go_test( embed = [":inventory"], tags = [TAG_SEARCHSUITE], deps = [ - "//internal/fileutil", "//lib/errors", "@com_github_go_enry_go_enry_v2//:go-enry", - "@com_github_google_go_cmp//cmp", 
], ) diff --git a/cmd/frontend/internal/inventory/context.go b/cmd/frontend/internal/inventory/context.go index c3737b024c5..89ae317c541 100644 --- a/cmd/frontend/internal/inventory/context.go +++ b/cmd/frontend/internal/inventory/context.go @@ -2,12 +2,23 @@ package inventory import ( "context" + "github.com/sourcegraph/sourcegraph/internal/gitserver" "io" "io/fs" + + "github.com/sourcegraph/sourcegraph/internal/api" ) // Context defines the environment in which the inventory is computed. type Context struct { + Repo api.RepoName + + CommitID api.CommitID + + ShouldSkipEnhancedLanguageDetection bool + + GitServerClient gitserver.Client + // ReadTree is called to list the immediate children of a tree at path. The returned fs.FileInfo // values' Name method must return the full path (that can be passed to another ReadTree or // ReadFile call), not just the basename. @@ -16,9 +27,11 @@ type Context struct { // NewFileReader is called to get an io.ReadCloser from the file at path. NewFileReader func(ctx context.Context, path string) (io.ReadCloser, error) + CacheKey func(e fs.FileInfo) string + // CacheGet, if set, returns the cached inventory and true for the given tree, or false for a cache miss. - CacheGet func(context.Context, fs.FileInfo) (Inventory, bool) + CacheGet func(context.Context, string) (Inventory, bool) // CacheSet, if set, stores the inventory in the cache for the given tree. 
- CacheSet func(context.Context, fs.FileInfo, Inventory) + CacheSet func(context.Context, string, Inventory) } diff --git a/cmd/frontend/internal/inventory/entries.go b/cmd/frontend/internal/inventory/entries.go index 7f53601815a..56cf43a9e22 100644 --- a/cmd/frontend/internal/inventory/entries.go +++ b/cmd/frontend/internal/inventory/entries.go @@ -1,17 +1,113 @@ package inventory import ( + "archive/tar" "context" + "fmt" "github.com/sourcegraph/conc/iter" + "github.com/sourcegraph/sourcegraph/internal/env" + "github.com/sourcegraph/sourcegraph/internal/gitserver" "github.com/sourcegraph/sourcegraph/lib/errors" + "io" "io/fs" "sort" + "time" ) +// Based on the benchmarking in entries_test.go, 1_000 is a good number after which we see diminishing returns. +var maxInvsLength = env.MustGetInt("GET_INVENTORY_MAX_INV_IN_MEMORY", 1_000, "When computing the language stats, every nth iteration all loaded files are aggregated into the inventory to reduce the memory footprint. Increasing this value may make the computation run faster, but will require more memory.") +var getInventoryTimeout = env.MustGetInt("GET_INVENTORY_TIMEOUT", 5, "Time in minutes before cancelling getInventory requests. Raise this if your repositories are large and need a long time to process.") + +func (c *Context) All(ctx context.Context) (inv Inventory, err error) { + // Top-level caching is what we need for users, directory-level caching is to speed up retries if the repo doesn't compute within the timeout. + // To help with the latter we detach the context above, instead of introducing rather complicated caching logic (it turned out to be a leetcode problem). + // Since we're planning to move this to a pre-computed style anyway, the upsides of introducing complicated code + // for retryable caching become less relevant. 
+ ctx = context.WithoutCancel(ctx) + ctx, cancel := context.WithTimeout(ctx, time.Duration(getInventoryTimeout)*time.Minute) + defer cancel() + + cacheKey := fmt.Sprintf("%s@%s", c.Repo, c.CommitID) + if c.CacheGet != nil { + if inv, ok := c.CacheGet(ctx, cacheKey); ok { + return inv, nil + } + } + if c.CacheSet != nil { + defer func() { + if err == nil { + c.CacheSet(ctx, cacheKey, inv) + } + }() + } + + r, err := c.GitServerClient.ArchiveReader(ctx, c.Repo, gitserver.ArchiveOptions{Treeish: string(c.CommitID), Format: gitserver.ArchiveFormatTar}) + if err != nil { + return Inventory{}, err + } + defer func() { + r.Close() + }() + + tr := tar.NewReader(r) + return c.ArchiveProcessor(ctx, func() (*NextRecord, error) { + th, err := tr.Next() + if err != nil { + return nil, err + } + return &NextRecord{ + Header: th, + GetFileReader: func(ctx context.Context, path string) (io.ReadCloser, error) { + return io.NopCloser(tr), nil + }, + }, nil + }) +} + +type NextRecord struct { + Header *tar.Header + GetFileReader func(ctx context.Context, path string) (io.ReadCloser, error) +} + +func (c *Context) ArchiveProcessor(ctx context.Context, next func() (*NextRecord, error)) (inv Inventory, err error) { + var invs []Inventory + + for { + n, err := next() + if err != nil { + // We've seen everything and can sum up the rest. + if errors.Is(err, io.EOF) { + return Sum(invs), nil + } + return Inventory{}, err + } + + if len(invs) >= maxInvsLength { + sum := Sum(invs) + invs = invs[:0] + invs = append(invs, sum) + } + + entry := n.Header.FileInfo() + + switch { + case entry.Mode().IsRegular(): + lang, err := getLang(ctx, entry, n.GetFileReader, c.ShouldSkipEnhancedLanguageDetection) + if err != nil { + return Inventory{}, err + } + invs = append(invs, Inventory{Languages: []Lang{lang}}) + default: + // Skip anything that is not a readable file (e.g. directories, symlinks, ...) + continue + } + } +} + // Entries computes the inventory of languages for the given entries. 
It traverses trees recursively // and caches results for each subtree. Results for listed files are cached. // -// If a file is referenced more than once (e.g., because it is a descendent of a subtree and it is +// If a file is referenced more than once (e.g., because it is a descendent of a subtree, and it is // passed directly), it will be double-counted in the result. func (c *Context) Entries(ctx context.Context, entries ...fs.FileInfo) (Inventory, error) { invs := make([]Inventory, len(entries)) @@ -40,14 +136,14 @@ func (c *Context) Entries(ctx context.Context, entries ...fs.FileInfo) (Inventor func (c *Context) tree(ctx context.Context, tree fs.FileInfo) (inv Inventory, err error) { // Get and set from the cache. if c.CacheGet != nil { - if inv, ok := c.CacheGet(ctx, tree); ok { + if inv, ok := c.CacheGet(ctx, c.CacheKey(tree)); ok { return inv, nil // cache hit } } if c.CacheSet != nil { defer func() { if err == nil { - c.CacheSet(ctx, tree, inv) // store in cache + c.CacheSet(ctx, c.CacheKey(tree), inv) // store in cache } }() } @@ -64,7 +160,7 @@ func (c *Context) tree(ctx context.Context, tree fs.FileInfo) (inv Inventory, er // Don't individually cache files that we found during tree traversal. The hit rate for // those cache entries is likely to be much lower than cache entries for files whose // inventory was directly requested. - lang, err := getLang(ctx, e, c.NewFileReader) + lang, err := getLang(ctx, e, c.NewFileReader, c.ShouldSkipEnhancedLanguageDetection) return Inventory{Languages: []Lang{lang}}, err case e.Mode().IsDir(): // subtree subtreeInv, err := c.tree(ctx, e) @@ -86,19 +182,19 @@ func (c *Context) tree(ctx context.Context, tree fs.FileInfo) (inv Inventory, er func (c *Context) file(ctx context.Context, file fs.FileInfo) (inv Inventory, err error) { // Get and set from the cache. 
if c.CacheGet != nil { - if inv, ok := c.CacheGet(ctx, file); ok { + if inv, ok := c.CacheGet(ctx, c.CacheKey(file)); ok { return inv, nil // cache hit } } if c.CacheSet != nil { defer func() { if err == nil { - c.CacheSet(ctx, file, inv) // store in cache + c.CacheSet(ctx, c.CacheKey(file), inv) // store in cache } }() } - lang, err := getLang(ctx, file, c.NewFileReader) + lang, err := getLang(ctx, file, c.NewFileReader, c.ShouldSkipEnhancedLanguageDetection) if err != nil { return Inventory{}, errors.Wrapf(err, "inventory file %q", file.Name()) } diff --git a/cmd/frontend/internal/inventory/entries_test.go b/cmd/frontend/internal/inventory/entries_test.go index 72db555643c..8ddf64d431d 100644 --- a/cmd/frontend/internal/inventory/entries_test.go +++ b/cmd/frontend/internal/inventory/entries_test.go @@ -1,123 +1,310 @@ package inventory import ( - "bytes" + "archive/tar" "context" "io" - "io/fs" - "os" "reflect" - "sync" + "strings" "testing" - - "github.com/google/go-cmp/cmp" - - "github.com/sourcegraph/sourcegraph/internal/fileutil" ) -func TestContext_Entries(t *testing.T) { - var ( - readTreeCalls []string - newFileReaderCalls []string - cacheGetCalls []string - cacheSetCalls = map[string]Inventory{} - ) - var mu sync.Mutex - c := Context{ - ReadTree: func(ctx context.Context, path string) ([]fs.FileInfo, error) { - mu.Lock() - defer mu.Unlock() - readTreeCalls = append(readTreeCalls, path) - switch path { - case "d": - return []fs.FileInfo{ - &fileutil.FileInfo{Name_: "d/a", Mode_: os.ModeDir}, - &fileutil.FileInfo{Name_: "d/b.go", Size_: 12}, - }, nil - case "d/a": - return []fs.FileInfo{&fileutil.FileInfo{Name_: "d/a/c.m", Size_: 24}}, nil - default: - panic("unhandled mock ReadTree " + path) - } +type FileData struct { + Name string + Content string + IsDirectory bool +} + +func TestFileMode(t *testing.T) { + testCases := []struct { + name string + isDirectory bool + }{ + { + name: "regular file", + isDirectory: false, }, - NewFileReader: func(ctx 
context.Context, path string) (io.ReadCloser, error) { - mu.Lock() - defer mu.Unlock() - newFileReaderCalls = append(newFileReaderCalls, path) - var data []byte - switch path { - case "f.go": - data = []byte("package f") - case "d/b.go": - data = []byte("package main") - case "d/a/c.m": - data = []byte("@interface X:NSObject {}") - default: - panic("unhandled mock ReadFile " + path) - } - return io.NopCloser(bytes.NewReader(data)), nil - }, - CacheGet: func(ctx context.Context, e fs.FileInfo) (Inventory, bool) { - mu.Lock() - defer mu.Unlock() - cacheGetCalls = append(cacheGetCalls, e.Name()) - return Inventory{}, false - }, - CacheSet: func(ctx context.Context, e fs.FileInfo, inv Inventory) { - mu.Lock() - defer mu.Unlock() - if _, ok := cacheSetCalls[e.Name()]; ok { - t.Fatalf("already stored %q in cache", e.Name()) - } - cacheSetCalls[e.Name()] = inv + { + name: "directory", + isDirectory: true, }, } - inv, err := c.Entries(context.Background(), - &fileutil.FileInfo{Name_: "d", Mode_: os.ModeDir}, - &fileutil.FileInfo{Name_: "f.go", Mode_: 0, Size_: 1 /* HACK to force read */}, - ) - if err != nil { - t.Fatal(err) - } - if want := (Inventory{ - Languages: []Lang{ - {Name: "Go", TotalBytes: 21, TotalLines: 2}, - {Name: "Objective-C", TotalBytes: 24, TotalLines: 1}, - }, - }); !reflect.DeepEqual(inv, want) { - t.Fatalf("got %#v\nwant %#v", inv, want) + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + + h := &tar.Header{ + Name: tc.name, + Mode: func() int64 { + if tc.isDirectory { + // 04 (in octal) sets the directory bit + return 040000 + } + return 0 + }(), + } + + got := h.FileInfo().Mode().IsDir() + if got != tc.isDirectory { + t.Errorf("got %v, want %v", got, tc.isDirectory) + } + }) } - // Check that our mocks were called as expected. 
- if want := []string{"d", "d/a"}; !reflect.DeepEqual(readTreeCalls, want) { - t.Errorf("ReadTree calls: got %q, want %q", readTreeCalls, want) - } - if len(newFileReaderCalls) != 3 { - t.Errorf("GetFileReader calls: got %d, want %d", len(newFileReaderCalls), 3) - } - if want := []string{"d", "d/a", "f.go"}; !reflect.DeepEqual(cacheGetCalls, want) { - t.Errorf("CacheGet calls: got %q, want %q", cacheGetCalls, want) - } - want := map[string]Inventory{ - "d": { - Languages: []Lang{ +} + +func TestEntriesNextProcessor(t *testing.T) { + testCases := []struct { + name string + files []FileData + expected Inventory + }{ + { + name: "Go and Objective-C (in directory) files", + files: []FileData{ + {Name: "b.go", Content: "package main"}, + {Name: "a", Content: "", IsDirectory: true}, + {Name: "a/c.m", Content: "@interface X:NSObject {}"}, + }, + expected: Inventory{Languages: []Lang{ {Name: "Objective-C", TotalBytes: 24, TotalLines: 1}, {Name: "Go", TotalBytes: 12, TotalLines: 1}, - }, + }}, }, - "d/a": { - Languages: []Lang{ + { + name: "Single file", + files: []FileData{ + {Name: "b.go", Content: "package main"}, + }, + expected: Inventory{Languages: []Lang{ + {Name: "Go", TotalBytes: 12, TotalLines: 1}, + }}, + }, + { + name: "Two root file", + files: []FileData{ + {Name: "a.go", Content: "package main"}, + {Name: "b.go", Content: "package main"}, + }, + expected: Inventory{Languages: []Lang{ + {Name: "Go", TotalBytes: 24, TotalLines: 2}, + }}, + }, + { + name: "File and empty directory", + files: []FileData{ + {Name: "a.go", Content: "package main"}, + {Name: "b", Content: "", IsDirectory: true}, + }, + expected: Inventory{Languages: []Lang{ + {Name: "Go", TotalBytes: 12, TotalLines: 1}, + }}, + }, + { + name: "Two directories with files", + files: []FileData{ + {Name: "a", Content: "", IsDirectory: true}, + {Name: "a/a.m", Content: "@interface X:NSObject {}"}, + {Name: "b", Content: "", IsDirectory: true}, + {Name: "b/b.go", Content: "package main"}, + }, + expected: 
Inventory{Languages: []Lang{ {Name: "Objective-C", TotalBytes: 24, TotalLines: 1}, + {Name: "Go", TotalBytes: 12, TotalLines: 1}, + }}, + }, + { + name: "Two directories with files, and a root file", + files: []FileData{ + {Name: "a", Content: "", IsDirectory: true}, + {Name: "a/a.m", Content: "@interface X:NSObject {}"}, + {Name: "b", Content: "", IsDirectory: true}, + {Name: "b/b.go", Content: "package main"}, + {Name: "c.go", Content: "package main"}, + }, + expected: Inventory{Languages: []Lang{ + {Name: "Go", TotalBytes: 24, TotalLines: 2}, + {Name: "Objective-C", TotalBytes: 24, TotalLines: 1}, + }}, + }, + { + name: "Directory with sub-directory and surrounding files", + files: []FileData{ + {Name: "a", Content: "", IsDirectory: true}, + {Name: "a/A.m", Content: "@interface X:NSObject {}"}, + {Name: "a/b", Content: "", IsDirectory: true}, + {Name: "a/b/B.m", Content: "@interface X:NSObject {}"}, + {Name: "a/c", Content: "", IsDirectory: true}, + {Name: "a/c/D.m", Content: "@interface X:NSObject {}"}, + {Name: "E.go", Content: "package main"}, + }, + expected: Inventory{Languages: []Lang{ + {Name: "Objective-C", TotalBytes: 72, TotalLines: 3}, + {Name: "Go", TotalBytes: 12, TotalLines: 1}, + }}, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + index := 0 + + c := Context{ + Repo: "testRepo", + CommitID: "testCommit", + ReadTree: nil, + NewFileReader: nil, + CacheKey: nil, + CacheGet: nil, + CacheSet: nil, + ShouldSkipEnhancedLanguageDetection: false, + GitServerClient: nil, + } + + next := func() (*NextRecord, error) { + defer func() { + index += 1 + }() + if index >= len(tc.files) { + return nil, io.EOF + } + content := tc.files[index].Content + return &NextRecord{ + Header: &tar.Header{ + Name: tc.files[index].Name, + Size: int64(len(content)), + Mode: func() int64 { + if tc.files[index].IsDirectory { + // 04 (in octal) sets the directory bit + return 040000 + } + return 0 + }(), + }, + GetFileReader: func(ctx 
context.Context, path string) (io.ReadCloser, error) { + return io.NopCloser(strings.NewReader(content)), nil + }, + }, nil + } + + got, err := c.ArchiveProcessor(context.Background(), next) + if err != nil { + t.Fatal(err) + } + if !reflect.DeepEqual(tc.expected, got) { + t.Errorf("got %+v, want %+v", got, tc.expected) + } + }) + } + +} + +func TestSum(t *testing.T) { + testCases := []struct { + name string + invs []Inventory + expected Inventory + }{ + { + name: "empty input", + invs: []Inventory{}, + expected: Inventory{ + Languages: []Lang{}, }, }, - "f.go": { - Languages: []Lang{ - {Name: "Go", TotalBytes: 9, TotalLines: 1}, + { + name: "single inventory", + invs: []Inventory{ + { + Languages: []Lang{ + {Name: "Go", TotalBytes: 100, TotalLines: 10}, + {Name: "Python", TotalBytes: 200, TotalLines: 20}, + }, + }, + }, + expected: Inventory{ + Languages: []Lang{ + {Name: "Python", TotalBytes: 200, TotalLines: 20}, + {Name: "Go", TotalBytes: 100, TotalLines: 10}, + }, + }, + }, + { + name: "multiple inventories", + invs: []Inventory{ + { + Languages: []Lang{ + {Name: "Go", TotalBytes: 100, TotalLines: 10}, + {Name: "Python", TotalBytes: 200, TotalLines: 20}, + }, + }, + { + Languages: []Lang{ + {Name: "Go", TotalBytes: 50, TotalLines: 5}, + {Name: "Ruby", TotalBytes: 300, TotalLines: 30}, + }, + }, + }, + expected: Inventory{ + Languages: []Lang{ + {Name: "Ruby", TotalBytes: 300, TotalLines: 30}, + {Name: "Python", TotalBytes: 200, TotalLines: 20}, + {Name: "Go", TotalBytes: 150, TotalLines: 15}, + }, + }, + }, + { + name: "empty language name", + invs: []Inventory{ + { + Languages: []Lang{ + {Name: "", TotalBytes: 100, TotalLines: 10}, + {Name: "Python", TotalBytes: 200, TotalLines: 20}, + }, + }, + }, + expected: Inventory{ + Languages: []Lang{ + {Name: "Python", TotalBytes: 200, TotalLines: 20}, + }, }, }, } - if diff := cmp.Diff(want, cacheSetCalls); diff != "" { - t.Error(diff) + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + got := 
Sum(tc.invs) + if !reflect.DeepEqual(got, tc.expected) { + t.Errorf("got %+v, want %+v", got, tc.expected) + } + }) + } +} + +func BenchmarkSum(b *testing.B) { + // Benchmark results + // n=10000: ~8400 ns/op => 8.4 ns/record + // n=1000: ~8500 ns/op => 8.5 ns/record + // n=100: ~1100 ns/op => 11 ns/record + // n=10: ~360 ns/op => 36 ns/record + // n=5: ~300 ns/op => 60 ns/record + // n=2: ~280 ns/op => 140 ns/record + n := 10_000 + invs := make([]Inventory, n) + for i := 0; i < n; i++ { + invs[i] = Inventory{ + Languages: []Lang{ + {Name: "Go", TotalBytes: uint64(100 + int64(i)), TotalLines: uint64(10 + int64(i))}, + {Name: "Python", TotalBytes: uint64(200 + int64(i)), TotalLines: uint64(20 + int64(i))}, + }, + } + if i%2 == 1 { + invs[i].Languages[1] = Lang{Name: "Ruby", TotalBytes: uint64(300 + int64(i)), TotalLines: uint64(30 + int64(i))} + } + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + Sum(invs) } } diff --git a/cmd/frontend/internal/inventory/inventory.go b/cmd/frontend/internal/inventory/inventory.go index 616db35afae..5fa92c52b8b 100644 --- a/cmd/frontend/internal/inventory/inventory.go +++ b/cmd/frontend/internal/inventory/inventory.go @@ -6,16 +6,12 @@ import ( "bytes" "context" "fmt" + "github.com/go-enry/go-enry/v2" //nolint:depguard - FIXME: replace this usage of enry with languages package + "github.com/go-enry/go-enry/v2/data" //nolint:depguard - FIXME: replace this usage of enry with languages package + "github.com/sourcegraph/log" "io" "io/fs" - "github.com/go-enry/go-enry/v2" //nolint:depguard - FIXME: replace this usage of enry with languages package - "github.com/go-enry/go-enry/v2/data" //nolint:depguard - FIXME: replace this usage of enry with languages package - "go.opentelemetry.io/otel/attribute" - - "github.com/sourcegraph/log" - - "github.com/sourcegraph/sourcegraph/internal/trace" "github.com/sourcegraph/sourcegraph/lib/codeintel/languages" "github.com/sourcegraph/sourcegraph/lib/errors" ) @@ -45,7 +41,7 @@ type Lang struct { 
var newLine = []byte{'\n'}
 
-func getLang(ctx context.Context, file fs.FileInfo, getFileReader func(ctx context.Context, path string) (io.ReadCloser, error)) (Lang, error) {
+func getLang(ctx context.Context, file fs.FileInfo, getFileReader func(ctx context.Context, path string) (io.ReadCloser, error), skipEnhancedLanguageDetection bool) (Lang, error) {
 	if file == nil {
 		return Lang{}, nil
 	}
@@ -53,34 +49,25 @@ func getLang(ctx context.Context, file fs.FileInfo, getFileReader func(ctx conte
 		return Lang{}, nil
 	}
 
-	trc, ctx := trace.New(ctx, "getLang")
-	defer trc.End()
-	rc, err := getFileReader(ctx, file.Name())
-	if err != nil {
-		return Lang{}, errors.Wrap(err, "getting file reader")
-	}
-	if rc != nil {
-		defer rc.Close()
-	}
-
 	var lang Lang
 	// In many cases, GetLanguageByFilename can detect the language conclusively just from the
 	// filename. If not, we pass a subset of the file contents for analysis.
 	matchedLang, safe := GetLanguageByFilename(file.Name())
-	trc.AddEvent("GetLanguageByFilename",
-		attribute.String("FileName", file.Name()),
-		attribute.Bool("Safe", safe),
-		attribute.String("MatchedLang", matchedLang),
-	)
-
-	// No content
-	if rc == nil {
+	if skipEnhancedLanguageDetection {
 		lang.Name = matchedLang
 		lang.TotalBytes = uint64(file.Size())
 		return lang, nil
 	}
 
+	rc, err := getFileReader(ctx, file.Name())
+	if err != nil {
+		return Lang{}, errors.Wrap(err, "getting file reader")
+	}
+	if rc != nil {
+		defer rc.Close()
+	}
+
 	buf := make([]byte, fileReadBufferSize)
 	if !safe {
diff --git a/cmd/frontend/internal/inventory/inventory_test.go b/cmd/frontend/internal/inventory/inventory_test.go
index 29106d47964..a907f0c1eed 100644
--- a/cmd/frontend/internal/inventory/inventory_test.go
+++ b/cmd/frontend/internal/inventory/inventory_test.go
@@ -65,7 +65,8 @@ func TestGetLang_language(t *testing.T) {
 		t.Run(label, func(t *testing.T) {
 			lang, err := getLang(context.Background(),
 				test.file,
-				makeFileReader(test.file.Contents))
+				
makeFileReader(test.file.Contents), + false) if err != nil { t.Fatal(err) } @@ -76,6 +77,91 @@ func TestGetLang_language(t *testing.T) { } } +type mockCloser struct { + closeFunc func() error +} + +func (m *mockCloser) Close() error { + return m.closeFunc() +} + +func TestGetLang_fileReader(t *testing.T) { + t.Run("If the file reader is opened, it must be closed", func(t *testing.T) { + + openCalled := false + closeCalled := false + + fakeFileReader := func(ctx context.Context, path string) (io.ReadCloser, error) { + openCalled = true + return struct { + io.Reader + io.Closer + }{ + Reader: strings.NewReader(""), + Closer: &mockCloser{closeFunc: func() error { + closeCalled = true + return nil + }}, + }, nil + } + + _, err := getLang(context.Background(), + fi{"a.java", ""}, + fakeFileReader, + false) + + if err != nil { + t.Fatal(err) + } + + if !openCalled { + t.Fatal("Open should have been called") + } + + if !closeCalled { + t.Error("Close should have been called") + } + }) + + t.Run("If the file reader is NOT opened, it should not be closed", func(t *testing.T) { + + openCalled := false + closeCalled := false + + fakeFileReader := func(ctx context.Context, path string) (io.ReadCloser, error) { + openCalled = true + return struct { + io.Reader + io.Closer + }{ + Reader: strings.NewReader(""), + Closer: &mockCloser{closeFunc: func() error { + closeCalled = true + return nil + }}, + }, nil + } + + _, err := getLang(context.Background(), + fi{"a.java", ""}, + fakeFileReader, + true) + + if err != nil { + t.Fatal(err) + } + + if openCalled { + t.Fatal("Open should NOT have been called") + } + + if closeCalled { + t.Error("Close should NOT have been called") + } + }) + +} + func makeFileReader(contents string) func(context.Context, string) (io.ReadCloser, error) { return func(ctx context.Context, path string) (io.ReadCloser, error) { return io.NopCloser(strings.NewReader(contents)), nil @@ -135,7 +221,7 @@ func TestGet_readFile(t *testing.T) { for _, test := range 
tests { t.Run(test.file.Name(), func(t *testing.T) { fr := makeFileReader(test.file.(fi).Contents) - lang, err := getLang(context.Background(), test.file, fr) + lang, err := getLang(context.Background(), test.file, fr, false) if err != nil { t.Fatal(err) } @@ -169,7 +255,7 @@ func BenchmarkGetLang(b *testing.B) { b.ResetTimer() for range b.N { for _, file := range files { - _, err = getLang(context.Background(), file, fr) + _, err = getLang(context.Background(), file, fr, false) if err != nil { b.Fatal(err) }