API docs: codeintel: add OOB migration to index API docs for search (#25207)

* migrations/frontend: add OOB migration for API docs search indexing
* codeintel/lsifstore: implement OOB migration for API docs search indexing

Signed-off-by: Stephen Gutekanst <stephen@sourcegraph.com>
This commit is contained in:
Stephen Gutekanst 2021-10-05 13:37:32 -07:00 committed by GitHub
parent 8e54706299
commit b910fedad1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 258 additions and 0 deletions

View File

@ -22,6 +22,8 @@ type Config struct {
ReferencesCountMigrationBatchInterval time.Duration
DocumentColumnSplitMigrationBatchSize int
DocumentColumnSplitMigrationBatchInterval time.Duration
APIDocsSearchMigrationBatchSize int
APIDocsSearchMigrationBatchInterval time.Duration
CommittedAtMigrationBatchSize int
CommittedAtMigrationBatchInterval time.Duration
ReferenceCountMigrationBatchSize int
@ -48,6 +50,8 @@ func init() {
config.ReferencesCountMigrationBatchInterval = config.GetInterval("PRECISE_CODE_INTEL_REFERENCES_COUNT_MIGRATION_BATCH_INTERVAL", "1s", "The timeout between processing migration batches.")
config.DocumentColumnSplitMigrationBatchSize = config.GetInt("PRECISE_CODE_INTEL_DOCUMENT_COLUMN_SPLIT_MIGRATION_BATCH_SIZE", "100", "The maximum number of document records to migrate at a time.")
config.DocumentColumnSplitMigrationBatchInterval = config.GetInterval("PRECISE_CODE_INTEL_DOCUMENT_COLUMN_SPLIT_MIGRATION_BATCH_INTERVAL", "1s", "The timeout between processing migration batches.")
config.APIDocsSearchMigrationBatchSize = config.GetInt("PRECISE_CODE_INTEL_API_DOCS_SEARCH_MIGRATION_BATCH_SIZE", "1", "The maximum number of bundles to migrate at a time.")
config.APIDocsSearchMigrationBatchInterval = config.GetInterval("PRECISE_CODE_INTEL_API_DOCS_SEARCH_MIGRATION_BATCH_INTERVAL", "1s", "The timeout between processing migration batches.")
config.CommittedAtMigrationBatchSize = config.GetInt("PRECISE_CODE_INTEL_COMMITTED_AT_MIGRATION_BATCH_SIZE", "100", "The maximum number of upload records to migrate at a time.")
config.CommittedAtMigrationBatchInterval = config.GetInterval("PRECISE_CODE_INTEL_COMMITTED_AT_MIGRATION_BATCH_INTERVAL", "1s", "The timeout between processing migration batches.")
config.ReferenceCountMigrationBatchSize = config.GetInt("PRECISE_CODE_INTEL_REFERENCE_COUNT_MIGRATION_BATCH_SIZE", "100", "The maximum number of upload records to migrate at a time.")

View File

@ -44,6 +44,20 @@ func registerMigrations(ctx context.Context, db dbutil.DB, outOfBandMigrationRun
return err
}
if err := outOfBandMigrationRunner.Register(
lsifmigrations.APIDocsSearchMigrationID, // 12
lsifmigrations.NewAPIDocsSearchMigrator(
services.lsifStore,
services.dbStore,
services.repoStore,
services.gitserverClient,
config.APIDocsSearchMigrationBatchSize,
),
oobmigration.MigratorOptions{Interval: config.APIDocsSearchMigrationBatchInterval},
); err != nil {
return err
}
if err := outOfBandMigrationRunner.Register(
dbmigrations.CommittedAtMigrationID, // 8
dbmigrations.NewCommittedAtMigrator(services.dbStore, services.gitserverClient, config.CommittedAtMigrationBatchSize),

View File

@ -17,6 +17,7 @@ import (
"github.com/sourcegraph/sourcegraph/enterprise/internal/codeintel/stores/lsifstore"
"github.com/sourcegraph/sourcegraph/enterprise/internal/codeintel/stores/uploadstore"
"github.com/sourcegraph/sourcegraph/internal/conf"
"github.com/sourcegraph/sourcegraph/internal/database"
"github.com/sourcegraph/sourcegraph/internal/database/dbconn"
"github.com/sourcegraph/sourcegraph/internal/database/dbutil"
"github.com/sourcegraph/sourcegraph/internal/database/locker"
@ -29,6 +30,7 @@ var services struct {
dbStore *store.Store
locker *locker.Locker
lsifStore *lsifstore.Store
repoStore *database.RepoStore
uploadStore uploadstore.Store
gitserverClient *gitserver.Client
indexEnqueuer *enqueuer.IndexEnqueuer
@ -72,6 +74,7 @@ func initServices(ctx context.Context, db dbutil.DB) error {
services.dbStore = dbStore
services.locker = locker
services.lsifStore = lsifStore
services.repoStore = database.ReposWith(dbStore.Store)
services.uploadStore = uploadStore
services.gitserverClient = gitserverClient
services.indexEnqueuer = indexEnqueuer

View File

@ -0,0 +1,207 @@
package migration
import (
"context"
"fmt"
"github.com/cockroachdb/errors"
"github.com/hashicorp/go-multierror"
"github.com/inconshreveable/log15"
"github.com/keegancsmith/sqlf"
"github.com/sourcegraph/sourcegraph/enterprise/internal/codeintel/stores/dbstore"
"github.com/sourcegraph/sourcegraph/enterprise/internal/codeintel/stores/lsifstore"
"github.com/sourcegraph/sourcegraph/internal/api"
"github.com/sourcegraph/sourcegraph/internal/conf"
"github.com/sourcegraph/sourcegraph/internal/database"
"github.com/sourcegraph/sourcegraph/internal/database/basestore"
"github.com/sourcegraph/sourcegraph/internal/oobmigration"
"github.com/sourcegraph/sourcegraph/lib/codeintel/precise"
)
// APIDocsSearchMigrationID is the primary key of the migration record handled by an instance of
// apiDocsSearchMigrator. This populates the new lsif_data_documentation_search_* tables using data
// decoded from other tables. This is associated with the out-of-band migration record inserted in
// migrations/frontend/1528395874_oob_lsif_data_documentation_search.up.sql; the value here must
// stay equal to the id inserted by that migration.
const APIDocsSearchMigrationID = 12
// NewAPIDocsSearchMigrator returns an oobmigration.Migrator that walks the
// lsif_data_documentation_pages table, decodes the GOB page payloads, and
// writes the derived rows into the new lsif_data_documentation_search_*
// tables so that API documentation becomes searchable.
func NewAPIDocsSearchMigrator(
	store *lsifstore.Store,
	dbStore *dbstore.Store,
	repoStore *database.RepoStore,
	gitserverClient GitserverClient,
	batchSize int,
) oobmigration.Migrator {
	migrator := &apiDocsSearchMigrator{
		store:           store,
		dbStore:         dbStore,
		repoStore:       repoStore,
		gitserverClient: gitserverClient,
		batchSize:       batchSize,
		serializer:      lsifstore.NewSerializer(),
	}
	return migrator
}
// apiDocsSearchMigrator carries the stores and settings needed to index API
// docs for search. Implements the oobmigration.Migrator interface.
type apiDocsSearchMigrator struct {
	store           *lsifstore.Store          // codeintel-db store holding the documentation pages
	dbStore         *dbstore.Store            // frontend-db store used to resolve uploads by ID
	repoStore       *database.RepoStore       // used to resolve the repository of each upload
	gitserverClient GitserverClient           // used to check default-branch membership of a commit
	serializer      *lsifstore.Serializer     // decodes the GOB-encoded documentation page payloads
	batchSize       int                       // max dumps processed per Up invocation
}
// Progress returns a percentage (in the range range [0, 1]) of data records that need
// to be upgraded in the forward direction. A value of 1 means that no further action
// is required. A value < 1 denotes that a future invocation of the Up method could
// migrate additional data (excluding error conditions and prerequisite migrations).
//
// Concretely: the ratio of distinct dump_ids marked search_indexed to all
// distinct dump_ids in lsif_data_documentation_pages (1 when the table is empty).
func (m *apiDocsSearchMigrator) Progress(ctx context.Context) (float64, error) {
	progress, _, err := basestore.ScanFirstFloat(m.store.Query(ctx, sqlf.Sprintf(apiDocsSearchMigratorProgressQuery)))
	if err != nil {
		return 0, err
	}
	return progress, nil
}

const apiDocsSearchMigratorProgressQuery = `
-- source: enterprise/internal/codeintel/stores/lsifstore/migration/apidocs_search.go:Progress
SELECT CASE c2.count WHEN 0 THEN 1 ELSE cast(c1.count as float) / cast(c2.count as float) END FROM
(SELECT count(DISTINCT dump_id) FROM lsif_data_documentation_pages WHERE search_indexed='true') c1,
(SELECT count(DISTINCT dump_id) FROM lsif_data_documentation_pages) c2
`
// Up runs a batch of the migration. This method is called repeatedly until the Progress
// method reports completion. Errors returned from this method will be associated with the
// migration record.
//
// It selects up to batchSize dump IDs whose pages are not yet search-indexed
// and processes each concurrently via processUpload, collecting all failures.
// When API docs search indexing is disabled in site config, it is a no-op.
func (m *apiDocsSearchMigrator) Up(ctx context.Context) (err error) {
	if !conf.APIDocsSearchIndexingEnabled() {
		return nil
	}

	tx, err := m.store.Transact(ctx)
	if err != nil {
		return err
	}
	// The named result parameter is load-bearing: without it the error from
	// tx.Done (e.g. a failed commit) would be written to a dead local and
	// silently dropped, letting Up report success after a failed commit.
	defer func() { err = tx.Done(err) }()

	dumpIDs, err := basestore.ScanInts(tx.Query(ctx, sqlf.Sprintf(apiDocsSearchMigratorUnprocessedDumpsQuery, m.batchSize)))
	if err != nil {
		return err
	}

	// Buffered to batchSize so no goroutine blocks on send; the loop below
	// receives exactly len(dumpIDs) results, so none are leaked.
	done := make(chan error, m.batchSize)
	for _, dumpID := range dumpIDs {
		dumpID := dumpID // capture loop variable (pre-Go 1.22 semantics)
		go func() {
			done <- m.processUpload(ctx, dumpID)
		}()
	}

	var errs error
	for range dumpIDs {
		if err := <-done; err != nil {
			errs = multierror.Append(errs, err)
		}
	}
	return errs
}

const apiDocsSearchMigratorUnprocessedDumpsQuery = `
-- source: enterprise/internal/codeintel/stores/lsifstore/migration/apidocs_search.go:Up
SELECT DISTINCT dump_id FROM lsif_data_documentation_pages
WHERE search_indexed='false'
LIMIT %s
`
// processUpload indexes all of the API documentation for the given dump ID by decoding the information
// in lsif_data_documentation_pages and inserting into the new lsif_data_documentation_search_* tables.
// On success (or when the upload no longer exists) the dump's pages are marked
// search_indexed so they are not selected again by Up.
func (m *apiDocsSearchMigrator) processUpload(ctx context.Context, uploadID int) (err error) {
	upload, exists, err := m.dbStore.GetUploadByID(ctx, uploadID)
	if err != nil {
		return errors.Wrap(err, "GetUploadByID")
	}
	if !exists {
		// The upload doesn't exist anymore, don't error out - just skip migrating this one.
		log15.Error("API docs: migration: could not find LSIF upload, skipping", "id", uploadID)
		if err := m.store.Exec(ctx, sqlf.Sprintf(apiDocsSearchMigratorProcessedDumpQuery, uploadID)); err != nil {
			return errors.Wrap(err, "marking upload as migrated")
		}
		return nil
	}

	// Find the associated repository.
	repos, err := m.repoStore.GetByIDs(ctx, api.RepoID(upload.RepositoryID))
	if err != nil {
		return errors.Wrap(err, "RepoStore.GetByIDs")
	}
	if len(repos) == 0 {
		return fmt.Errorf("could not get repo id=%v name=%q", upload.RepositoryID, upload.RepositoryName) // Repository no longer exists? nothing we can do
	}
	repo := repos[0]

	// Determine if this bundle was for the default branch or not.
	isDefaultBranch, err := m.gitserverClient.DefaultBranchContains(ctx, upload.RepositoryID, upload.Commit)
	if err != nil {
		return errors.Wrap(err, "gitserver.DefaultBranchContains")
	}

	tx, err := m.store.Transact(ctx)
	if err != nil {
		return err
	}
	// Named result parameter is required so the errors produced by the two
	// deferred calls below (commit/rollback and rows-close) propagate to the
	// caller instead of being assigned to a dead local.
	defer func() { err = tx.Done(err) }()

	rows, err := m.store.Query(ctx, sqlf.Sprintf(apiDocsSearchMigratorPagesQuery, uploadID))
	if err != nil {
		return errors.Wrap(err, "Query")
	}
	defer func() { err = basestore.CloseRows(rows, err) }()

	var (
		indexed = 0
		pages   []*precise.DocumentationPageData
	)
	for rows.Next() {
		indexed++
		var pageBytes []byte
		if err := rows.Scan(&pageBytes); err != nil {
			return errors.Wrap(err, "Scan")
		}
		page, err := m.serializer.UnmarshalDocumentationPageData(pageBytes)
		if err != nil {
			return errors.Wrap(err, "UnmarshalDocumentationPageData")
		}
		pages = append(pages, page)
	}
	if err := tx.WriteDocumentationSearch(ctx, upload, repo, isDefaultBranch, pages); err != nil {
		return errors.Wrap(err, "WriteDocumentationSearch")
	}

	// Mark the dump as indexed through tx so the marker commits (or rolls
	// back) atomically with the search-table writes above; using m.store here
	// would mark pages as indexed even if the transaction failed to commit.
	if err := tx.Exec(ctx, sqlf.Sprintf(apiDocsSearchMigratorProcessedDumpQuery, uploadID)); err != nil {
		return errors.Wrap(err, "marking upload as migrated")
	}
	log15.Info("Indexed API docs pages for search", "pages_indexed", indexed, "repo", upload.RepositoryName, "upload_id", uploadID)
	return nil
}

const apiDocsSearchMigratorPagesQuery = `
-- source: enterprise/internal/codeintel/stores/lsifstore/migration/apidocs_search.go:Up
SELECT data FROM lsif_data_documentation_pages WHERE dump_id=%s
`

const apiDocsSearchMigratorProcessedDumpQuery = `
-- source: enterprise/internal/codeintel/stores/lsifstore/migration/apidocs_search.go:Up
UPDATE lsif_data_documentation_pages SET search_indexed='true' WHERE dump_id=%s
`
// Down runs a batch of the migration in reverse. Nothing needs to be undone
// here: the forward migration is non-destructive — it only populates the new
// lsif_data_documentation_search_* tables and does not transform any fields
// read by previous versions of Sourcegraph, so a downgrade is safe as-is.
func (m *apiDocsSearchMigrator) Down(ctx context.Context) error {
	// Intentionally a no-op; see the doc comment above.
	return nil
}

View File

@ -0,0 +1,7 @@
package migration
import "context"
// GitserverClient is the minimal surface of the gitserver client required by
// the API docs search migrator, declared at the consumer so it can be faked in
// tests.
type GitserverClient interface {
	// DefaultBranchContains reports (per its name) whether the given commit is
	// contained in the repository's default branch; the migrator passes the
	// result through as the isDefaultBranch flag when writing search rows.
	DefaultBranchContains(ctx context.Context, repositoryID int, commit string) (bool, error)
}

View File

@ -0,0 +1,7 @@
BEGIN;

-- No-op down migration: the OOB migration (id 12) adds no tables or columns.
-- It only copies data from lsif_data_documentation_pages into the new
-- lsif_data_documentation_search_* tables, and it's fine to just leave those
-- in place on downgrade.

COMMIT;

View File

@ -0,0 +1,16 @@
BEGIN;

-- Register the out-of-band migration record; see
-- doc/dev/background-information/oobmigrations.md.
INSERT INTO out_of_band_migrations (id, team, component, description, introduced_version_major, introduced_version_minor, non_destructive)
VALUES (
    12,                                            -- Must match APIDocsSearchMigrationID and be consistent across all Sourcegraph instances
    'apidocs',                                     -- Team owning the migration
    'codeintel-db.lsif_data_documentation_search', -- Component being migrated
    'Index API docs for search',                   -- Human-readable description
    3,                                             -- introduced_version_major (the next minor release)
    32,                                            -- introduced_version_minor
    true                                           -- Non-destructive: data remains readable by the previous version without a down migration
)
ON CONFLICT DO NOTHING;

COMMIT;