From 554fd33eff3ac3f5b35a19780b7b13ae3b35ecbf Mon Sep 17 00:00:00 2001 From: Varun Gandhi Date: Thu, 1 Aug 2024 21:05:03 +0800 Subject: [PATCH] chore: Factor out sub-query for locating nearest uploads (#64210) A large sub-query was duplicated between two queries, making the code harder to understand. This patch factors that out. Ancillary changes: - Added more doc comments - Tweak tests to make failures easier to follow. --- .../internal/commitgraph/commit_graph.go | 2 + .../uploads/internal/store/BUILD.bazel | 2 + .../uploads/internal/store/commitgraph.go | 52 ++--------- .../internal/store/commitgraph_test.go | 11 ++- .../codeintel/uploads/internal/store/util.go | 89 +++++++++++-------- 5 files changed, 71 insertions(+), 85 deletions(-) diff --git a/internal/codeintel/uploads/internal/commitgraph/commit_graph.go b/internal/codeintel/uploads/internal/commitgraph/commit_graph.go index d0e3fb65456..2e304e83ebb 100644 --- a/internal/codeintel/uploads/internal/commitgraph/commit_graph.go +++ b/internal/codeintel/uploads/internal/commitgraph/commit_graph.go @@ -195,6 +195,8 @@ func populateUploadsByTraversal(graph map[api.CommitID][]api.CommitID, order []a // smaller distance to the source commit will shadow the other. Similarly, If an ancestor and the // child commit define uploads for the same root and indexer pair, the upload defined on the commit // will shadow the upload defined on the ancestor. +// +// IMPORTANT: This logic should be kept in sync with store.makeVisibleUploadsQuery. func populateUploadsForCommit(uploads map[api.CommitID]map[string]UploadMeta, ancestors []api.CommitID, distance uint32, commitGraphView *CommitGraphView, commit api.CommitID) map[string]UploadMeta { // The capacity chosen here is an underestimate, but seems to perform well in benchmarks using // live user data. We have attempted to make this value more precise to minimize the number of diff --git a/internal/codeintel/uploads/internal/store/BUILD.bazel b/internal/codeintel/uploads/internal/store/BUILD.bazel index c8698297aca..5cb9e3d8f25 100644 --- a/internal/codeintel/uploads/internal/store/BUILD.bazel +++ b/internal/codeintel/uploads/internal/store/BUILD.bazel @@ -42,8 +42,10 @@ go_library( "@com_github_jackc_pgtype//:pgtype", "@com_github_keegancsmith_sqlf//:sqlf", "@com_github_lib_pq//:pq", + "@com_github_life4_genesis//slices", "@com_github_sourcegraph_log//:log", "@io_opentelemetry_go_otel//attribute", + "@org_golang_x_exp//maps", ], ) diff --git a/internal/codeintel/uploads/internal/store/commitgraph.go b/internal/codeintel/uploads/internal/store/commitgraph.go index 60bdf4d1f93..2888a033dad 100644 --- a/internal/codeintel/uploads/internal/store/commitgraph.go +++ b/internal/codeintel/uploads/internal/store/commitgraph.go @@ -11,6 +11,7 @@ import ( "github.com/keegancsmith/sqlf" "go.opentelemetry.io/otel/attribute" + "golang.org/x/exp/maps" "github.com/sourcegraph/sourcegraph/internal/api" "github.com/sourcegraph/sourcegraph/internal/codeintel/uploads/internal/commitgraph" @@ -300,7 +301,7 @@ func (s *store) FindClosestCompletedUploads(ctx context.Context, opts shared.Upl conds := makeFindClosestProcessUploadsConditions(opts) // TODO(id: completed-state-check) Make sure we only return uploads with state = 'completed' here - query := sqlf.Sprintf(findClosestCompletedUploadsQuery, makeVisibleUploadsQuery(int(opts.RepositoryID), string(opts.Commit)), sqlf.Join(conds, " AND ")) + query := sqlf.Sprintf(findClosestCompletedUploadsQuery, makeVisibleUploadsQuery(opts.RepositoryID, opts.Commit), sqlf.Join(conds, " AND ")) uploads, err := scanCompletedUploads(s.db.Query(ctx, query)) if err != nil { @@ -349,17 +350,9 @@ func (s *store) FindClosestCompletedUploadsFromGraphFragment(ctx context.Context return nil, nil } - commitQueries := make([]*sqlf.Query, 0, len(commitGraph.Graph())) - for commit := range commitGraph.Graph() { - commitQueries = append(commitQueries, sqlf.Sprintf("%s", dbutil.CommitBytea(commit))) - } - commitGraphView, err := scanCommitGraphView(s.db.Query(ctx, sqlf.Sprintf( findClosestCompletedUploadsFromGraphFragmentCommitGraphQuery, - opts.RepositoryID, - sqlf.Join(commitQueries, ", "), - opts.RepositoryID, - sqlf.Join(commitQueries, ", "), + makeNearestUploadsQuery(opts.RepositoryID, maps.Keys(commitGraph.Graph())...), ))) if err != nil { return nil, err @@ -390,41 +383,14 @@ func (s *store) FindClosestCompletedUploadsFromGraphFragment(ctx context.Context } const findClosestCompletedUploadsFromGraphFragmentCommitGraphQuery = ` -WITH -visible_uploads AS ( - -- Select the set of uploads visible from one of the given commits. This is done by - -- looking at each commit's row in the lsif_nearest_uploads table, and the (adjusted) - -- set of uploads from each commit's nearest ancestor according to the data compressed - -- in the links table. - -- - -- NB: A commit should be present in at most one of these tables. - SELECT - nu.repository_id, - upload_id::integer, - nu.commit_bytea, - u_distance::text::integer as distance - FROM lsif_nearest_uploads nu - CROSS JOIN jsonb_each(nu.uploads) as u(upload_id, u_distance) - WHERE nu.repository_id = %s AND nu.commit_bytea IN (%s) - UNION ( - SELECT - nu.repository_id, - upload_id::integer, - ul.commit_bytea, - u_distance::text::integer + ul.distance as distance - FROM lsif_nearest_uploads_links ul - JOIN lsif_nearest_uploads nu ON nu.repository_id = ul.repository_id AND nu.commit_bytea = ul.ancestor_commit_bytea - CROSS JOIN jsonb_each(nu.uploads) as u(upload_id, u_distance) - WHERE nu.repository_id = %s AND ul.commit_bytea IN (%s) - ) -) +WITH cte_nearest_uploads AS (%s) SELECT - vu.upload_id, - encode(vu.commit_bytea, 'hex'), + cte_nu.upload_id, + encode(cte_nu.commit_bytea, 'hex'), md5(u.root || ':' || u.indexer) as token, - vu.distance -FROM visible_uploads vu -JOIN lsif_uploads u ON u.id = vu.upload_id + cte_nu.distance +FROM cte_nearest_uploads cte_nu +JOIN lsif_uploads u ON u.id = cte_nu.upload_id ` const findClosestCompletedUploadsFromGraphFragmentQuery = ` diff --git a/internal/codeintel/uploads/internal/store/commitgraph_test.go b/internal/codeintel/uploads/internal/store/commitgraph_test.go index 50877110159..1eb078b6cc1 100644 --- a/internal/codeintel/uploads/internal/store/commitgraph_test.go +++ b/internal/codeintel/uploads/internal/store/commitgraph_test.go @@ -10,6 +10,7 @@ import ( "os" "sort" "strconv" + "strings" "testing" "time" @@ -1299,10 +1300,11 @@ func (t *FindClosestCompletedUploadsTestCase) uploadMatchingOptions() shared.Upl } func testFindClosestCompletedUploads(t *testing.T, store Store, testCases []FindClosestCompletedUploadsTestCase) { + t.Helper() for _, testCase := range testCases { name := fmt.Sprintf( "commit=%s file=%s rootMustEnclosePath=%v indexer=%s", - testCase.commit, + strings.TrimLeft(testCase.commit, "0"), testCase.file, testCase.rootMustEnclosePath, testCase.indexer, @@ -1337,7 +1339,7 @@ func testFindClosestCompletedUploads(t *testing.T, store Store, testCases []Find } if testCase.graph != nil { - t.Run(name+" [graph-fragment]", func(t *testing.T) { + t.Run("[graph-fragment] "+name, func(t *testing.T) { uploads, err := store.FindClosestCompletedUploadsFromGraphFragment(context.Background(), testCase.uploadMatchingOptions(), testCase.graph) if err != nil { t.Fatalf("unexpected error finding closest uploads: %s", err) @@ -1351,7 +1353,7 @@ func testFindClosestCompletedUploads(t *testing.T, store Store, testCases []Find func testAnyOf(t *testing.T, uploads []shared.CompletedUpload, expectedIDs []int) { if len(uploads) != 1 { - t.Errorf("unexpected nearest upload length. want=%d have=%d", 1, len(uploads)) + t.Errorf("unexpected nearest upload length. want=%d have=%d\nlist: %+v", 1, len(uploads), uploads) return } @@ -1462,10 +1464,11 @@ func getProtectedUploads(t testing.TB, db database.DB, repositoryID int) []int { return ids } +// getVisibleUploads separately returns the uploads visible at each commit in commits. func getVisibleUploads(t testing.TB, db database.DB, repositoryID int, commits []string) map[string][]int { idsByCommit := map[string][]int{} for _, commit := range commits { - query := makeVisibleUploadsQuery(repositoryID, commit) + query := makeVisibleUploadsQuery(api.RepoID(repositoryID), api.CommitID(commit)) uploadIDs, err := basestore.ScanInts(db.QueryContext( context.Background(), diff --git a/internal/codeintel/uploads/internal/store/util.go b/internal/codeintel/uploads/internal/store/util.go index 0e047b1584c..2a8e001dd96 100644 --- a/internal/codeintel/uploads/internal/store/util.go +++ b/internal/codeintel/uploads/internal/store/util.go @@ -5,7 +5,9 @@ import ( "sort" "github.com/keegancsmith/sqlf" + genslices "github.com/life4/genesis/slices" + "github.com/sourcegraph/sourcegraph/internal/api" "github.com/sourcegraph/sourcegraph/internal/database/basestore" "github.com/sourcegraph/sourcegraph/internal/database/dbutil" ) @@ -65,17 +67,54 @@ WHERE r.state = 'queued' ` // makeVisibleUploadsQuery returns a SQL query returning the set of identifiers of uploads -// visible from the given commit. This is done by removing the "shadowed" values created -// by looking at a commit and it's ancestors visible commits. -func makeVisibleUploadsQuery(repositoryID int, commit string) *sqlf.Query { - return sqlf.Sprintf( - visibleUploadsQuery, - repositoryID, dbutil.CommitBytea(commit), - repositoryID, dbutil.CommitBytea(commit), - ) +// visible from the given commit. This is done by removing the "shadowed" values +// created by looking at the commit and it's ancestors visible commits. +// +// This function deliberately accepts a single commit and not a list, +// as the shadowing calculation involves comparison by distance, +// and we don't do that across commits. +// +// IMPORTANT: The shadowing logic should be kept in sync with commitgraph.populateUploadsForCommit. +func makeVisibleUploadsQuery(repositoryID api.RepoID, commit api.CommitID) *sqlf.Query { + return sqlf.Sprintf(visibleUploadsQueryFragment, makeNearestUploadsQuery(repositoryID, commit)) } -const visibleUploadsQuery = ` +func makeNearestUploadsQuery(repositoryID api.RepoID, commits ...api.CommitID) *sqlf.Query { + format := func(commit api.CommitID) *sqlf.Query { + return sqlf.Sprintf("%s", dbutil.CommitBytea(commit)) + } + commitQueries := sqlf.Join(genslices.Map(commits, format), ", ") + return sqlf.Sprintf(nearestUploadsQuery, int(repositoryID), commitQueries, int(repositoryID), commitQueries) +} + +// nearestUploadsQuery finds the set of uploads marked as 'nearest' +// from the given list of commits based on the lsif_nearest_uploads +// and the lsif_nearest_uploads_links tables. +// +// NB: A commit should be present in at most one of these tables. +const nearestUploadsQuery = ` +SELECT + nu.repository_id, + upload_id::integer, + nu.commit_bytea, + u_distance::text::integer as distance +FROM lsif_nearest_uploads nu +CROSS JOIN jsonb_each(nu.uploads) as u(upload_id, u_distance) +WHERE nu.repository_id = %s AND nu.commit_bytea IN (%s) +UNION ( + SELECT + nu.repository_id, + upload_id::integer, + ul.commit_bytea, + u_distance::text::integer + ul.distance as distance + FROM lsif_nearest_uploads_links ul + JOIN lsif_nearest_uploads nu ON nu.repository_id = ul.repository_id AND nu.commit_bytea = ul.ancestor_commit_bytea + CROSS JOIN jsonb_each(nu.uploads) as u(upload_id, u_distance) + WHERE nu.repository_id = %s AND ul.commit_bytea IN (%s) +) +` + +const visibleUploadsQueryFragment = ` SELECT t.upload_id FROM ( @@ -83,35 +122,9 @@ FROM ( t.*, -- NOTE(id: closest-uploads-postcondition) Only return a single result -- for an (indexer, root) pair, see also the WHERE clause at the end. - row_number() OVER (PARTITION BY root, indexer ORDER BY distance) AS r - FROM ( - -- Select the set of uploads visible from the given commit. This is done by looking - -- at each commit's row in the lsif_nearest_uploads table, and the (adjusted) set of - -- uploads from each commit's nearest ancestor according to the data compressed in - -- the links table. - -- - -- NB: A commit should be present in at most one of these tables. - SELECT - nu.repository_id, - upload_id::integer, - nu.commit_bytea, - u_distance::text::integer as distance - FROM lsif_nearest_uploads nu - CROSS JOIN jsonb_each(nu.uploads) as u(upload_id, u_distance) - WHERE nu.repository_id = %s AND nu.commit_bytea = %s - UNION ( - SELECT - nu.repository_id, - upload_id::integer, - ul.commit_bytea, - u_distance::text::integer + ul.distance as distance - FROM lsif_nearest_uploads_links ul - JOIN lsif_nearest_uploads nu ON nu.repository_id = ul.repository_id AND nu.commit_bytea = ul.ancestor_commit_bytea - CROSS JOIN jsonb_each(nu.uploads) as u(upload_id, u_distance) - WHERE nu.repository_id = %s AND ul.commit_bytea = %s - ) - ) t - JOIN lsif_uploads u ON u.id = upload_id + row_number() OVER (PARTITION BY u.root, u.indexer ORDER BY t.distance) AS r + FROM (%s) t + JOIN lsif_uploads u ON u.id = t.upload_id ) t -- Remove ranks > 1, as they are shadowed by another upload in the same output set WHERE t.r <= 1