mirror of
https://github.com/sourcegraph/sourcegraph.git
synced 2026-02-06 17:51:57 +00:00
chore: Factor out sub-query for locating nearest uploads (#64210)
A large sub-query was duplicated between two queries, making the code harder to understand. This patch factors that out. Ancillary changes: - Added more doc comments - Tweak tests to make failures easier to follow.
This commit is contained in:
parent
66be2ddcb6
commit
554fd33eff
@ -195,6 +195,8 @@ func populateUploadsByTraversal(graph map[api.CommitID][]api.CommitID, order []a
|
||||
// smaller distance to the source commit will shadow the other. Similarly, If an ancestor and the
|
||||
// child commit define uploads for the same root and indexer pair, the upload defined on the commit
|
||||
// will shadow the upload defined on the ancestor.
|
||||
//
|
||||
// IMPORTANT: This logic should be kept in sync with store.makeVisibleUploadsQuery.
|
||||
func populateUploadsForCommit(uploads map[api.CommitID]map[string]UploadMeta, ancestors []api.CommitID, distance uint32, commitGraphView *CommitGraphView, commit api.CommitID) map[string]UploadMeta {
|
||||
// The capacity chosen here is an underestimate, but seems to perform well in benchmarks using
|
||||
// live user data. We have attempted to make this value more precise to minimize the number of
|
||||
|
||||
@ -42,8 +42,10 @@ go_library(
|
||||
"@com_github_jackc_pgtype//:pgtype",
|
||||
"@com_github_keegancsmith_sqlf//:sqlf",
|
||||
"@com_github_lib_pq//:pq",
|
||||
"@com_github_life4_genesis//slices",
|
||||
"@com_github_sourcegraph_log//:log",
|
||||
"@io_opentelemetry_go_otel//attribute",
|
||||
"@org_golang_x_exp//maps",
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
@ -11,6 +11,7 @@ import (
|
||||
|
||||
"github.com/keegancsmith/sqlf"
|
||||
"go.opentelemetry.io/otel/attribute"
|
||||
"golang.org/x/exp/maps"
|
||||
|
||||
"github.com/sourcegraph/sourcegraph/internal/api"
|
||||
"github.com/sourcegraph/sourcegraph/internal/codeintel/uploads/internal/commitgraph"
|
||||
@ -300,7 +301,7 @@ func (s *store) FindClosestCompletedUploads(ctx context.Context, opts shared.Upl
|
||||
|
||||
conds := makeFindClosestProcessUploadsConditions(opts)
|
||||
// TODO(id: completed-state-check) Make sure we only return uploads with state = 'completed' here
|
||||
query := sqlf.Sprintf(findClosestCompletedUploadsQuery, makeVisibleUploadsQuery(int(opts.RepositoryID), string(opts.Commit)), sqlf.Join(conds, " AND "))
|
||||
query := sqlf.Sprintf(findClosestCompletedUploadsQuery, makeVisibleUploadsQuery(opts.RepositoryID, opts.Commit), sqlf.Join(conds, " AND "))
|
||||
|
||||
uploads, err := scanCompletedUploads(s.db.Query(ctx, query))
|
||||
if err != nil {
|
||||
@ -349,17 +350,9 @@ func (s *store) FindClosestCompletedUploadsFromGraphFragment(ctx context.Context
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
commitQueries := make([]*sqlf.Query, 0, len(commitGraph.Graph()))
|
||||
for commit := range commitGraph.Graph() {
|
||||
commitQueries = append(commitQueries, sqlf.Sprintf("%s", dbutil.CommitBytea(commit)))
|
||||
}
|
||||
|
||||
commitGraphView, err := scanCommitGraphView(s.db.Query(ctx, sqlf.Sprintf(
|
||||
findClosestCompletedUploadsFromGraphFragmentCommitGraphQuery,
|
||||
opts.RepositoryID,
|
||||
sqlf.Join(commitQueries, ", "),
|
||||
opts.RepositoryID,
|
||||
sqlf.Join(commitQueries, ", "),
|
||||
makeNearestUploadsQuery(opts.RepositoryID, maps.Keys(commitGraph.Graph())...),
|
||||
)))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@ -390,41 +383,14 @@ func (s *store) FindClosestCompletedUploadsFromGraphFragment(ctx context.Context
|
||||
}
|
||||
|
||||
const findClosestCompletedUploadsFromGraphFragmentCommitGraphQuery = `
|
||||
WITH
|
||||
visible_uploads AS (
|
||||
-- Select the set of uploads visible from one of the given commits. This is done by
|
||||
-- looking at each commit's row in the lsif_nearest_uploads table, and the (adjusted)
|
||||
-- set of uploads from each commit's nearest ancestor according to the data compressed
|
||||
-- in the links table.
|
||||
--
|
||||
-- NB: A commit should be present in at most one of these tables.
|
||||
SELECT
|
||||
nu.repository_id,
|
||||
upload_id::integer,
|
||||
nu.commit_bytea,
|
||||
u_distance::text::integer as distance
|
||||
FROM lsif_nearest_uploads nu
|
||||
CROSS JOIN jsonb_each(nu.uploads) as u(upload_id, u_distance)
|
||||
WHERE nu.repository_id = %s AND nu.commit_bytea IN (%s)
|
||||
UNION (
|
||||
SELECT
|
||||
nu.repository_id,
|
||||
upload_id::integer,
|
||||
ul.commit_bytea,
|
||||
u_distance::text::integer + ul.distance as distance
|
||||
FROM lsif_nearest_uploads_links ul
|
||||
JOIN lsif_nearest_uploads nu ON nu.repository_id = ul.repository_id AND nu.commit_bytea = ul.ancestor_commit_bytea
|
||||
CROSS JOIN jsonb_each(nu.uploads) as u(upload_id, u_distance)
|
||||
WHERE nu.repository_id = %s AND ul.commit_bytea IN (%s)
|
||||
)
|
||||
)
|
||||
WITH cte_nearest_uploads AS (%s)
|
||||
SELECT
|
||||
vu.upload_id,
|
||||
encode(vu.commit_bytea, 'hex'),
|
||||
cte_nu.upload_id,
|
||||
encode(cte_nu.commit_bytea, 'hex'),
|
||||
md5(u.root || ':' || u.indexer) as token,
|
||||
vu.distance
|
||||
FROM visible_uploads vu
|
||||
JOIN lsif_uploads u ON u.id = vu.upload_id
|
||||
cte_nu.distance
|
||||
FROM cte_nearest_uploads cte_nu
|
||||
JOIN lsif_uploads u ON u.id = cte_nu.upload_id
|
||||
`
|
||||
|
||||
const findClosestCompletedUploadsFromGraphFragmentQuery = `
|
||||
|
||||
@ -10,6 +10,7 @@ import (
|
||||
"os"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
@ -1299,10 +1300,11 @@ func (t *FindClosestCompletedUploadsTestCase) uploadMatchingOptions() shared.Upl
|
||||
}
|
||||
|
||||
func testFindClosestCompletedUploads(t *testing.T, store Store, testCases []FindClosestCompletedUploadsTestCase) {
|
||||
t.Helper()
|
||||
for _, testCase := range testCases {
|
||||
name := fmt.Sprintf(
|
||||
"commit=%s file=%s rootMustEnclosePath=%v indexer=%s",
|
||||
testCase.commit,
|
||||
strings.TrimLeft(testCase.commit, "0"),
|
||||
testCase.file,
|
||||
testCase.rootMustEnclosePath,
|
||||
testCase.indexer,
|
||||
@ -1337,7 +1339,7 @@ func testFindClosestCompletedUploads(t *testing.T, store Store, testCases []Find
|
||||
}
|
||||
|
||||
if testCase.graph != nil {
|
||||
t.Run(name+" [graph-fragment]", func(t *testing.T) {
|
||||
t.Run("[graph-fragment] "+name, func(t *testing.T) {
|
||||
uploads, err := store.FindClosestCompletedUploadsFromGraphFragment(context.Background(), testCase.uploadMatchingOptions(), testCase.graph)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error finding closest uploads: %s", err)
|
||||
@ -1351,7 +1353,7 @@ func testFindClosestCompletedUploads(t *testing.T, store Store, testCases []Find
|
||||
|
||||
func testAnyOf(t *testing.T, uploads []shared.CompletedUpload, expectedIDs []int) {
|
||||
if len(uploads) != 1 {
|
||||
t.Errorf("unexpected nearest upload length. want=%d have=%d", 1, len(uploads))
|
||||
t.Errorf("unexpected nearest upload length. want=%d have=%d\nlist: %+v", 1, len(uploads), uploads)
|
||||
return
|
||||
}
|
||||
|
||||
@ -1462,10 +1464,11 @@ func getProtectedUploads(t testing.TB, db database.DB, repositoryID int) []int {
|
||||
return ids
|
||||
}
|
||||
|
||||
// getVisibleUploads separately returns the uploads visible at each commit in commits.
|
||||
func getVisibleUploads(t testing.TB, db database.DB, repositoryID int, commits []string) map[string][]int {
|
||||
idsByCommit := map[string][]int{}
|
||||
for _, commit := range commits {
|
||||
query := makeVisibleUploadsQuery(repositoryID, commit)
|
||||
query := makeVisibleUploadsQuery(api.RepoID(repositoryID), api.CommitID(commit))
|
||||
|
||||
uploadIDs, err := basestore.ScanInts(db.QueryContext(
|
||||
context.Background(),
|
||||
|
||||
@ -5,7 +5,9 @@ import (
|
||||
"sort"
|
||||
|
||||
"github.com/keegancsmith/sqlf"
|
||||
genslices "github.com/life4/genesis/slices"
|
||||
|
||||
"github.com/sourcegraph/sourcegraph/internal/api"
|
||||
"github.com/sourcegraph/sourcegraph/internal/database/basestore"
|
||||
"github.com/sourcegraph/sourcegraph/internal/database/dbutil"
|
||||
)
|
||||
@ -65,17 +67,54 @@ WHERE r.state = 'queued'
|
||||
`
|
||||
|
||||
// makeVisibleUploadsQuery returns a SQL query returning the set of identifiers of uploads
|
||||
// visible from the given commit. This is done by removing the "shadowed" values created
|
||||
// by looking at a commit and it's ancestors visible commits.
|
||||
func makeVisibleUploadsQuery(repositoryID int, commit string) *sqlf.Query {
|
||||
return sqlf.Sprintf(
|
||||
visibleUploadsQuery,
|
||||
repositoryID, dbutil.CommitBytea(commit),
|
||||
repositoryID, dbutil.CommitBytea(commit),
|
||||
)
|
||||
// visible from the given commit. This is done by removing the "shadowed" values
|
||||
// created by looking at the commit and it's ancestors visible commits.
|
||||
//
|
||||
// This function deliberately accepts a single commit and not a list,
|
||||
// as the shadowing calculation involves comparison by distance,
|
||||
// and we don't do that across commits.
|
||||
//
|
||||
// IMPORTANT: The shadowing logic should be kept in sync with commitgraph.populateUploadsForCommit.
|
||||
func makeVisibleUploadsQuery(repositoryID api.RepoID, commit api.CommitID) *sqlf.Query {
|
||||
return sqlf.Sprintf(visibleUploadsQueryFragment, makeNearestUploadsQuery(repositoryID, commit))
|
||||
}
|
||||
|
||||
const visibleUploadsQuery = `
|
||||
func makeNearestUploadsQuery(repositoryID api.RepoID, commits ...api.CommitID) *sqlf.Query {
|
||||
format := func(commit api.CommitID) *sqlf.Query {
|
||||
return sqlf.Sprintf("%s", dbutil.CommitBytea(commit))
|
||||
}
|
||||
commitQueries := sqlf.Join(genslices.Map(commits, format), ", ")
|
||||
return sqlf.Sprintf(nearestUploadsQuery, int(repositoryID), commitQueries, int(repositoryID), commitQueries)
|
||||
}
|
||||
|
||||
// nearestUploadsQuery finds the set of uploads marked as 'nearest'
|
||||
// from the given list of commits based on the lsif_nearest_uploads
|
||||
// and the lsif_nearest_uploads_links tables.
|
||||
//
|
||||
// NB: A commit should be present in at most one of these tables.
|
||||
const nearestUploadsQuery = `
|
||||
SELECT
|
||||
nu.repository_id,
|
||||
upload_id::integer,
|
||||
nu.commit_bytea,
|
||||
u_distance::text::integer as distance
|
||||
FROM lsif_nearest_uploads nu
|
||||
CROSS JOIN jsonb_each(nu.uploads) as u(upload_id, u_distance)
|
||||
WHERE nu.repository_id = %s AND nu.commit_bytea IN (%s)
|
||||
UNION (
|
||||
SELECT
|
||||
nu.repository_id,
|
||||
upload_id::integer,
|
||||
ul.commit_bytea,
|
||||
u_distance::text::integer + ul.distance as distance
|
||||
FROM lsif_nearest_uploads_links ul
|
||||
JOIN lsif_nearest_uploads nu ON nu.repository_id = ul.repository_id AND nu.commit_bytea = ul.ancestor_commit_bytea
|
||||
CROSS JOIN jsonb_each(nu.uploads) as u(upload_id, u_distance)
|
||||
WHERE nu.repository_id = %s AND ul.commit_bytea IN (%s)
|
||||
)
|
||||
`
|
||||
|
||||
const visibleUploadsQueryFragment = `
|
||||
SELECT
|
||||
t.upload_id
|
||||
FROM (
|
||||
@ -83,35 +122,9 @@ FROM (
|
||||
t.*,
|
||||
-- NOTE(id: closest-uploads-postcondition) Only return a single result
|
||||
-- for an (indexer, root) pair, see also the WHERE clause at the end.
|
||||
row_number() OVER (PARTITION BY root, indexer ORDER BY distance) AS r
|
||||
FROM (
|
||||
-- Select the set of uploads visible from the given commit. This is done by looking
|
||||
-- at each commit's row in the lsif_nearest_uploads table, and the (adjusted) set of
|
||||
-- uploads from each commit's nearest ancestor according to the data compressed in
|
||||
-- the links table.
|
||||
--
|
||||
-- NB: A commit should be present in at most one of these tables.
|
||||
SELECT
|
||||
nu.repository_id,
|
||||
upload_id::integer,
|
||||
nu.commit_bytea,
|
||||
u_distance::text::integer as distance
|
||||
FROM lsif_nearest_uploads nu
|
||||
CROSS JOIN jsonb_each(nu.uploads) as u(upload_id, u_distance)
|
||||
WHERE nu.repository_id = %s AND nu.commit_bytea = %s
|
||||
UNION (
|
||||
SELECT
|
||||
nu.repository_id,
|
||||
upload_id::integer,
|
||||
ul.commit_bytea,
|
||||
u_distance::text::integer + ul.distance as distance
|
||||
FROM lsif_nearest_uploads_links ul
|
||||
JOIN lsif_nearest_uploads nu ON nu.repository_id = ul.repository_id AND nu.commit_bytea = ul.ancestor_commit_bytea
|
||||
CROSS JOIN jsonb_each(nu.uploads) as u(upload_id, u_distance)
|
||||
WHERE nu.repository_id = %s AND ul.commit_bytea = %s
|
||||
)
|
||||
) t
|
||||
JOIN lsif_uploads u ON u.id = upload_id
|
||||
row_number() OVER (PARTITION BY u.root, u.indexer ORDER BY t.distance) AS r
|
||||
FROM (%s) t
|
||||
JOIN lsif_uploads u ON u.id = t.upload_id
|
||||
) t
|
||||
-- Remove ranks > 1, as they are shadowed by another upload in the same output set
|
||||
WHERE t.r <= 1
|
||||
|
||||
Loading…
Reference in New Issue
Block a user