codeintel: Speed up syntactic and search-based usages using batch APIs (#64078)

Closes
https://linear.app/sourcegraph/issue/GRAPH-771/chunk-syntactic-usage-tasks-and-use-batch-apis-to-handle-chunks

This is the last step for properly implementing #63971. We split the
`candidateFiles` that the search produces into chunks and process these
chunks in parallel using the new batch API on `MappedIndex`.

## Test plan
Existing tests continue passing

---------

Co-authored-by: Varun Gandhi <varun.gandhi@sourcegraph.com>
This commit is contained in:
Christoph Hegemann 2024-07-29 04:09:05 +02:00 committed by GitHub
parent 51cf4dcba8
commit 3f8620508b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -27,12 +27,21 @@ import (
"github.com/sourcegraph/sourcegraph/lib/errors"
)
// SYNTACTIC_USAGES_DOCUMENTS_CHUNK_SIZE is the batch size for SCIP documents and git diffs we load at a time.
// Candidate files are split into chunks of this size, and the documents for each chunk are
// fetched with a single batch request.
//
// The value was picked empirically: traces were collected for various sizes (on a local machine)
// and 20 gave "nice looking" ones.
// In general 100 documents is expected to be on the "higher end" of the number of documents to retrieve
// for a single syntactic usage search. At that size this yields 100 / 20 = 5 chunks, and 5 concurrent
// queries and git requests seems like a reasonable trade-off for concurrency vs load.
const SYNTACTIC_USAGES_DOCUMENTS_CHUNK_SIZE = 20
// candidateMatch is a single match reported by search for a candidate file.
type candidateMatch struct {
// range_ is the range of the match. Matches belonging to one file are sorted by this field.
range_ scip.Range
// surroundindContent is copied into SearchBasedMatch.SurroundingContent when a
// search-based result is built from this match.
// NOTE(review): the field name is misspelled ("surroundind"); fixing it requires
// updating every use site in the same change.
surroundindContent string
}
// candidateFile groups all candidate matches that search returned for one file.
type candidateFile struct {
// path is the repo-root relative path of the file the matches belong to.
path core.RepoRelPath
matches []candidateMatch // Guaranteed to be sorted by range
// didSearchEntireFile is populated as the negation of the file match's LimitHit flag.
didSearchEntireFile bool // Or did we hit the search count limit?
}
@ -60,16 +69,16 @@ func findCandidateOccurrencesViaSearch(
trace observation.TraceLogger,
client searchclient.SearchClient,
args searchArgs,
) (orderedmap.OrderedMap[core.RepoRelPath, candidateFile], error) {
) ([]candidateFile, error) {
if args.identifier == "" {
return *orderedmap.New[core.RepoRelPath, candidateFile](), nil
return []candidateFile{}, nil
}
resultMap := *orderedmap.New[core.RepoRelPath, candidateFile]()
// TODO: countLimit should be dependent on the number of requested usages, with a configured global limit
// For now we're matching the current web app with 500
searchResults, err := executeQuery(ctx, client, trace, args, "file", 500, 0)
if err != nil {
return resultMap, err
return []candidateFile{}, err
}
nonFileMatches := 0
@ -109,8 +118,10 @@ func findCandidateOccurrencesViaSearch(
return s1.range_.CompareStrict(s2.range_)
})
// OK to use Unchecked method here as search API only returns repo-root relative paths
_, alreadyPresent := resultMap.Set(core.NewRepoRelPathUnchecked(path), candidateFile{
relPath := core.NewRepoRelPathUnchecked(path)
_, alreadyPresent := resultMap.Set(relPath, candidateFile{
matches: matches,
path: relPath,
didSearchEntireFile: !fileMatch.LimitHit,
})
if alreadyPresent {
@ -129,7 +140,11 @@ func findCandidateOccurrencesViaSearch(
trace.Warn("Saw mismatched file paths between chunk matches in the same FileMatch. Report this to the search-platform")
}
return resultMap, nil
results := make([]candidateFile, 0, resultMap.Len())
for pair := resultMap.Oldest(); pair != nil; pair = pair.Next() {
results = append(results, pair.Value)
}
return results, nil
}
type symbolData struct {
@ -309,26 +324,12 @@ func symbolAtRange(
func findSyntacticMatchesForCandidateFile(
ctx context.Context,
trace observation.TraceLogger,
mappedIndex MappedIndex,
filePath core.RepoRelPath,
document MappedDocument,
candidateFile candidateFile,
) ([]SyntacticMatch, []SearchBasedMatch, *SyntacticUsagesError) {
documentOpt, docErr := mappedIndex.GetDocument(ctx, filePath)
if docErr != nil {
return nil, nil, &SyntacticUsagesError{
Code: SU_Fatal,
UnderlyingError: docErr,
}
}
document, isSome := documentOpt.Get()
if !isSome {
return nil, nil, &SyntacticUsagesError{
Code: SU_NoSyntacticIndex,
}
}
) ([]SyntacticMatch, []SearchBasedMatch) {
filePath := candidateFile.path
syntacticMatches := []SyntacticMatch{}
searchBasedMatches := []SearchBasedMatch{}
failedTranslationCount := 0
for _, candidateMatch := range candidateFile.matches {
foundSyntacticMatch := false
@ -361,7 +362,7 @@ func findSyntacticMatchesForCandidateFile(
if failedTranslationCount != 0 {
trace.Info("findSyntacticMatchesForCandidateFile", log.Int("failedTranslationCount", failedTranslationCount))
}
return syntacticMatches, searchBasedMatches, nil
return syntacticMatches, searchBasedMatches
}
func syntacticUsagesImpl(
@ -406,21 +407,31 @@ func syntacticUsagesImpl(
}
}
tasks := make([]orderedmap.Pair[core.RepoRelPath, candidateFile], 0, candidateMatches.Len())
for pair := candidateMatches.Oldest(); pair != nil; pair = pair.Next() {
tasks = append(tasks, *pair)
}
results := conciter.Map(tasks, func(pair *orderedmap.Pair[core.RepoRelPath, candidateFile]) []SyntacticMatch {
// We're assuming the index we found earlier contains the relevant SCIP document
// see NOTE(id: single-syntactic-upload)
syntacticMatches, _, err := findSyntacticMatchesForCandidateFile(ctx, trace, mappedIndex, (*pair).Key, (*pair).Value)
tasks, _ := genslices.ChunkEvery(candidateMatches, SYNTACTIC_USAGES_DOCUMENTS_CHUNK_SIZE)
results, err := conciter.MapErr(tasks, func(files *[]candidateFile) ([]SyntacticMatch, error) {
paths := genslices.Map(*files, func(f candidateFile) core.RepoRelPath {
return f.path
})
documents, err := mappedIndex.GetDocuments(ctx, paths)
if err != nil {
// TODO: Errors that are not "no index found in the DB" should be reported
// TODO: Track metrics about how often this happens (GRAPH-693)
return []SyntacticMatch{}
return []SyntacticMatch{}, err
}
return syntacticMatches
results := [][]SyntacticMatch{}
for _, file := range *files {
if document, ok := documents[file.path]; ok {
syntacticMatches, _ := findSyntacticMatchesForCandidateFile(ctx, trace, document, file)
results = append(results, syntacticMatches)
}
}
return slices.Concat(results...), nil
})
if err != nil {
return SyntacticUsagesResult{}, PreviousSyntacticSearch{}, &SyntacticUsagesError{
Code: SU_Fatal,
UnderlyingError: err,
}
}
return SyntacticUsagesResult{Matches: slices.Concat(results...)}, PreviousSyntacticSearch{
MappedIndex: mappedIndex,
SymbolName: symbolName,
@ -447,7 +458,7 @@ func searchBasedUsagesImpl(
}
var matchResults struct {
candidateMatches orderedmap.OrderedMap[core.RepoRelPath, candidateFile]
candidateMatches []candidateFile
err error
}
var symbolResults struct {
@ -471,30 +482,36 @@ func searchBasedUsagesImpl(
candidateMatches := matchResults.candidateMatches
candidateSymbols := symbolResults.candidateSymbols
tasks := make([]orderedmap.Pair[core.RepoRelPath, candidateFile], 0, candidateMatches.Len())
for pair := candidateMatches.Oldest(); pair != nil; pair = pair.Next() {
tasks = append(tasks, *pair)
}
results := conciter.Map(tasks, func(pair *orderedmap.Pair[core.RepoRelPath, candidateFile]) []SearchBasedMatch {
if index, ok := syntacticIndex.Get(); ok {
_, searchBasedMatches, err := findSyntacticMatchesForCandidateFile(ctx, trace, index, pair.Key, pair.Value)
tasks, _ := genslices.ChunkEvery(candidateMatches, SYNTACTIC_USAGES_DOCUMENTS_CHUNK_SIZE)
results := conciter.Map(tasks, func(files *[]candidateFile) []SearchBasedMatch {
documents := map[core.RepoRelPath]MappedDocument{}
if mappedIndex, ok := syntacticIndex.Get(); ok {
paths := genslices.Map(*files, func(f candidateFile) core.RepoRelPath {
return f.path
})
documentsMap, err := mappedIndex.GetDocuments(ctx, paths)
if err == nil {
return searchBasedMatches
} else {
trace.Info("findSyntacticMatches failed, skipping filtering search-based results", log.Error(err))
documents = documentsMap
}
}
matches := []SearchBasedMatch{}
for _, match := range pair.Value.matches {
matches = append(matches, SearchBasedMatch{
Path: pair.Key,
Range: match.range_,
SurroundingContent: match.surroundindContent,
IsDefinition: candidateSymbols.Contains(pair.Key, match.range_),
})
results := [][]SearchBasedMatch{}
for _, file := range *files {
var searchBasedMatches []SearchBasedMatch
if document, ok := documents[file.path]; ok {
_, searchBasedMatches = findSyntacticMatchesForCandidateFile(ctx, trace, document, file)
} else {
for _, match := range file.matches {
searchBasedMatches = append(searchBasedMatches, SearchBasedMatch{
Path: file.path,
Range: match.range_,
SurroundingContent: match.surroundindContent,
IsDefinition: candidateSymbols.Contains(file.path, match.range_),
})
}
}
results = append(results, searchBasedMatches)
}
return matches
return slices.Concat(results...)
})
return slices.Concat(results...), nil
}