chore: Factor out sub-query for locating nearest uploads (#64210)

A large sub-query was duplicated between two queries, making
the code harder to understand. This patch factors that out.

Ancillary changes:
- Added more doc comments
- Tweak tests to make failures easier to follow.
This commit is contained in:
Varun Gandhi 2024-08-01 21:05:03 +08:00 committed by GitHub
parent 66be2ddcb6
commit 554fd33eff
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 71 additions and 85 deletions

View File

@ -195,6 +195,8 @@ func populateUploadsByTraversal(graph map[api.CommitID][]api.CommitID, order []a
// smaller distance to the source commit will shadow the other. Similarly, If an ancestor and the
// child commit define uploads for the same root and indexer pair, the upload defined on the commit
// will shadow the upload defined on the ancestor.
//
// IMPORTANT: This logic should be kept in sync with store.makeVisibleUploadsQuery.
func populateUploadsForCommit(uploads map[api.CommitID]map[string]UploadMeta, ancestors []api.CommitID, distance uint32, commitGraphView *CommitGraphView, commit api.CommitID) map[string]UploadMeta {
// The capacity chosen here is an underestimate, but seems to perform well in benchmarks using
// live user data. We have attempted to make this value more precise to minimize the number of

View File

@ -42,8 +42,10 @@ go_library(
"@com_github_jackc_pgtype//:pgtype",
"@com_github_keegancsmith_sqlf//:sqlf",
"@com_github_lib_pq//:pq",
"@com_github_life4_genesis//slices",
"@com_github_sourcegraph_log//:log",
"@io_opentelemetry_go_otel//attribute",
"@org_golang_x_exp//maps",
],
)

View File

@ -11,6 +11,7 @@ import (
"github.com/keegancsmith/sqlf"
"go.opentelemetry.io/otel/attribute"
"golang.org/x/exp/maps"
"github.com/sourcegraph/sourcegraph/internal/api"
"github.com/sourcegraph/sourcegraph/internal/codeintel/uploads/internal/commitgraph"
@ -300,7 +301,7 @@ func (s *store) FindClosestCompletedUploads(ctx context.Context, opts shared.Upl
conds := makeFindClosestProcessUploadsConditions(opts)
// TODO(id: completed-state-check) Make sure we only return uploads with state = 'completed' here
query := sqlf.Sprintf(findClosestCompletedUploadsQuery, makeVisibleUploadsQuery(int(opts.RepositoryID), string(opts.Commit)), sqlf.Join(conds, " AND "))
query := sqlf.Sprintf(findClosestCompletedUploadsQuery, makeVisibleUploadsQuery(opts.RepositoryID, opts.Commit), sqlf.Join(conds, " AND "))
uploads, err := scanCompletedUploads(s.db.Query(ctx, query))
if err != nil {
@ -349,17 +350,9 @@ func (s *store) FindClosestCompletedUploadsFromGraphFragment(ctx context.Context
return nil, nil
}
commitQueries := make([]*sqlf.Query, 0, len(commitGraph.Graph()))
for commit := range commitGraph.Graph() {
commitQueries = append(commitQueries, sqlf.Sprintf("%s", dbutil.CommitBytea(commit)))
}
commitGraphView, err := scanCommitGraphView(s.db.Query(ctx, sqlf.Sprintf(
findClosestCompletedUploadsFromGraphFragmentCommitGraphQuery,
opts.RepositoryID,
sqlf.Join(commitQueries, ", "),
opts.RepositoryID,
sqlf.Join(commitQueries, ", "),
makeNearestUploadsQuery(opts.RepositoryID, maps.Keys(commitGraph.Graph())...),
)))
if err != nil {
return nil, err
@ -390,41 +383,14 @@ func (s *store) FindClosestCompletedUploadsFromGraphFragment(ctx context.Context
}
const findClosestCompletedUploadsFromGraphFragmentCommitGraphQuery = `
WITH
visible_uploads AS (
-- Select the set of uploads visible from one of the given commits. This is done by
-- looking at each commit's row in the lsif_nearest_uploads table, and the (adjusted)
-- set of uploads from each commit's nearest ancestor according to the data compressed
-- in the links table.
--
-- NB: A commit should be present in at most one of these tables.
SELECT
nu.repository_id,
upload_id::integer,
nu.commit_bytea,
u_distance::text::integer as distance
FROM lsif_nearest_uploads nu
CROSS JOIN jsonb_each(nu.uploads) as u(upload_id, u_distance)
WHERE nu.repository_id = %s AND nu.commit_bytea IN (%s)
UNION (
SELECT
nu.repository_id,
upload_id::integer,
ul.commit_bytea,
u_distance::text::integer + ul.distance as distance
FROM lsif_nearest_uploads_links ul
JOIN lsif_nearest_uploads nu ON nu.repository_id = ul.repository_id AND nu.commit_bytea = ul.ancestor_commit_bytea
CROSS JOIN jsonb_each(nu.uploads) as u(upload_id, u_distance)
WHERE nu.repository_id = %s AND ul.commit_bytea IN (%s)
)
)
WITH cte_nearest_uploads AS (%s)
SELECT
vu.upload_id,
encode(vu.commit_bytea, 'hex'),
cte_nu.upload_id,
encode(cte_nu.commit_bytea, 'hex'),
md5(u.root || ':' || u.indexer) as token,
vu.distance
FROM visible_uploads vu
JOIN lsif_uploads u ON u.id = vu.upload_id
cte_nu.distance
FROM cte_nearest_uploads cte_nu
JOIN lsif_uploads u ON u.id = cte_nu.upload_id
`
const findClosestCompletedUploadsFromGraphFragmentQuery = `

View File

@ -10,6 +10,7 @@ import (
"os"
"sort"
"strconv"
"strings"
"testing"
"time"
@ -1299,10 +1300,11 @@ func (t *FindClosestCompletedUploadsTestCase) uploadMatchingOptions() shared.Upl
}
func testFindClosestCompletedUploads(t *testing.T, store Store, testCases []FindClosestCompletedUploadsTestCase) {
t.Helper()
for _, testCase := range testCases {
name := fmt.Sprintf(
"commit=%s file=%s rootMustEnclosePath=%v indexer=%s",
testCase.commit,
strings.TrimLeft(testCase.commit, "0"),
testCase.file,
testCase.rootMustEnclosePath,
testCase.indexer,
@ -1337,7 +1339,7 @@ func testFindClosestCompletedUploads(t *testing.T, store Store, testCases []Find
}
if testCase.graph != nil {
t.Run(name+" [graph-fragment]", func(t *testing.T) {
t.Run("[graph-fragment] "+name, func(t *testing.T) {
uploads, err := store.FindClosestCompletedUploadsFromGraphFragment(context.Background(), testCase.uploadMatchingOptions(), testCase.graph)
if err != nil {
t.Fatalf("unexpected error finding closest uploads: %s", err)
@ -1351,7 +1353,7 @@ func testFindClosestCompletedUploads(t *testing.T, store Store, testCases []Find
func testAnyOf(t *testing.T, uploads []shared.CompletedUpload, expectedIDs []int) {
if len(uploads) != 1 {
t.Errorf("unexpected nearest upload length. want=%d have=%d", 1, len(uploads))
t.Errorf("unexpected nearest upload length. want=%d have=%d\nlist: %+v", 1, len(uploads), uploads)
return
}
@ -1462,10 +1464,11 @@ func getProtectedUploads(t testing.TB, db database.DB, repositoryID int) []int {
return ids
}
// getVisibleUploads separately returns the uploads visible at each commit in commits.
func getVisibleUploads(t testing.TB, db database.DB, repositoryID int, commits []string) map[string][]int {
idsByCommit := map[string][]int{}
for _, commit := range commits {
query := makeVisibleUploadsQuery(repositoryID, commit)
query := makeVisibleUploadsQuery(api.RepoID(repositoryID), api.CommitID(commit))
uploadIDs, err := basestore.ScanInts(db.QueryContext(
context.Background(),

View File

@ -5,7 +5,9 @@ import (
"sort"
"github.com/keegancsmith/sqlf"
genslices "github.com/life4/genesis/slices"
"github.com/sourcegraph/sourcegraph/internal/api"
"github.com/sourcegraph/sourcegraph/internal/database/basestore"
"github.com/sourcegraph/sourcegraph/internal/database/dbutil"
)
@ -65,17 +67,54 @@ WHERE r.state = 'queued'
`
// makeVisibleUploadsQuery returns a SQL query returning the set of identifiers of uploads
// visible from the given commit. This is done by removing the "shadowed" values created
// by looking at a commit and it's ancestors visible commits.
func makeVisibleUploadsQuery(repositoryID int, commit string) *sqlf.Query {
return sqlf.Sprintf(
visibleUploadsQuery,
repositoryID, dbutil.CommitBytea(commit),
repositoryID, dbutil.CommitBytea(commit),
)
// visible from the given commit. This is done by removing the "shadowed" values
// created by looking at the commit and it's ancestors visible commits.
//
// This function deliberately accepts a single commit and not a list,
// as the shadowing calculation involves comparison by distance,
// and we don't do that across commits.
//
// IMPORTANT: The shadowing logic should be kept in sync with commitgraph.populateUploadsForCommit.
func makeVisibleUploadsQuery(repositoryID api.RepoID, commit api.CommitID) *sqlf.Query {
return sqlf.Sprintf(visibleUploadsQueryFragment, makeNearestUploadsQuery(repositoryID, commit))
}
const visibleUploadsQuery = `
func makeNearestUploadsQuery(repositoryID api.RepoID, commits ...api.CommitID) *sqlf.Query {
format := func(commit api.CommitID) *sqlf.Query {
return sqlf.Sprintf("%s", dbutil.CommitBytea(commit))
}
commitQueries := sqlf.Join(genslices.Map(commits, format), ", ")
return sqlf.Sprintf(nearestUploadsQuery, int(repositoryID), commitQueries, int(repositoryID), commitQueries)
}
// nearestUploadsQuery finds the set of uploads marked as 'nearest'
// from the given list of commits based on the lsif_nearest_uploads
// and the lsif_nearest_uploads_links tables.
//
// NB: A commit should be present in at most one of these tables.
const nearestUploadsQuery = `
SELECT
nu.repository_id,
upload_id::integer,
nu.commit_bytea,
u_distance::text::integer as distance
FROM lsif_nearest_uploads nu
CROSS JOIN jsonb_each(nu.uploads) as u(upload_id, u_distance)
WHERE nu.repository_id = %s AND nu.commit_bytea IN (%s)
UNION (
SELECT
nu.repository_id,
upload_id::integer,
ul.commit_bytea,
u_distance::text::integer + ul.distance as distance
FROM lsif_nearest_uploads_links ul
JOIN lsif_nearest_uploads nu ON nu.repository_id = ul.repository_id AND nu.commit_bytea = ul.ancestor_commit_bytea
CROSS JOIN jsonb_each(nu.uploads) as u(upload_id, u_distance)
WHERE nu.repository_id = %s AND ul.commit_bytea IN (%s)
)
`
const visibleUploadsQueryFragment = `
SELECT
t.upload_id
FROM (
@ -83,35 +122,9 @@ FROM (
t.*,
-- NOTE(id: closest-uploads-postcondition) Only return a single result
-- for an (indexer, root) pair, see also the WHERE clause at the end.
row_number() OVER (PARTITION BY root, indexer ORDER BY distance) AS r
FROM (
-- Select the set of uploads visible from the given commit. This is done by looking
-- at each commit's row in the lsif_nearest_uploads table, and the (adjusted) set of
-- uploads from each commit's nearest ancestor according to the data compressed in
-- the links table.
--
-- NB: A commit should be present in at most one of these tables.
SELECT
nu.repository_id,
upload_id::integer,
nu.commit_bytea,
u_distance::text::integer as distance
FROM lsif_nearest_uploads nu
CROSS JOIN jsonb_each(nu.uploads) as u(upload_id, u_distance)
WHERE nu.repository_id = %s AND nu.commit_bytea = %s
UNION (
SELECT
nu.repository_id,
upload_id::integer,
ul.commit_bytea,
u_distance::text::integer + ul.distance as distance
FROM lsif_nearest_uploads_links ul
JOIN lsif_nearest_uploads nu ON nu.repository_id = ul.repository_id AND nu.commit_bytea = ul.ancestor_commit_bytea
CROSS JOIN jsonb_each(nu.uploads) as u(upload_id, u_distance)
WHERE nu.repository_id = %s AND ul.commit_bytea = %s
)
) t
JOIN lsif_uploads u ON u.id = upload_id
row_number() OVER (PARTITION BY u.root, u.indexer ORDER BY t.distance) AS r
FROM (%s) t
JOIN lsif_uploads u ON u.id = t.upload_id
) t
-- Remove ranks > 1, as they are shadowed by another upload in the same output set
WHERE t.r <= 1