From 0eaa40528e111735b6c174d08252b13e3179bbb6 Mon Sep 17 00:00:00 2001 From: Stefan Hengl Date: Thu, 11 Jan 2024 14:46:23 +0100 Subject: [PATCH] search jobs: refactor SearchQuery interface (#59468) Relates to #59329 This is mostly a refactor to prepare for a JSON writer that replaces our CSV writer. The high-level plan is to switch the internal (object storage) AND the external (download) format from CSV to JSON. Test plan: - CI - Note: I had to update `SearcherFake`, hence some of the test data changed. It only affects a few tests and the character of the tests remains the same. --- .../search/exhaustive_search_repo_revision.go | 5 ++- .../internal/search/exhaustive_search_test.go | 6 ++-- .../search/exhaustive/service/matchcsv.go | 23 ++++++++---- internal/search/exhaustive/service/search.go | 35 ++++++++++++------- .../search/exhaustive/service/search_test.go | 8 +++-- .../search/exhaustive/service/searcher.go | 6 +--- .../exhaustive/service/searcher_test.go | 13 ++++--- 7 files changed, 61 insertions(+), 35 deletions(-) diff --git a/cmd/worker/internal/search/exhaustive_search_repo_revision.go b/cmd/worker/internal/search/exhaustive_search_repo_revision.go index cae181f3eda..4e790b5e1e1 100644 --- a/cmd/worker/internal/search/exhaustive_search_repo_revision.go +++ b/cmd/worker/internal/search/exhaustive_search_repo_revision.go @@ -72,7 +72,10 @@ func (h *exhaustiveSearchRepoRevHandler) Handle(ctx context.Context, logger log. return err } - csvWriter := service.NewBlobstoreCSVWriter(ctx, h.uploadStore, fmt.Sprintf("%d-%d", jobID, record.ID)) + csvWriter, err := service.NewCSVWriter(ctx, h.uploadStore, fmt.Sprintf("%d-%d", jobID, record.ID)) + if err != nil { + return err + } err = q.Search(ctx, repoRev, csvWriter) if closeErr := csvWriter.Close(); closeErr != nil { diff --git a/cmd/worker/internal/search/exhaustive_search_test.go b/cmd/worker/internal/search/exhaustive_search_test.go index 996d94e442d..0516ace4053 100644 --- a/cmd/worker/internal/search/exhaustive_search_test.go +++ b/cmd/worker/internal/search/exhaustive_search_test.go @@ -118,9 +118,9 @@ func TestExhaustiveSearch(t *testing.T) { } sort.Strings(vals) require.Equal([]string{ - "repo,revspec,revision\n1,spec,rev1\n", - "repo,revspec,revision\n1,spec,rev2\n", - "repo,revspec,revision\n2,spec,rev3\n", + "repository,revision,file_path,match_count,first_match_url\n1,rev1,path/to/file.go,0,/1@rev1/-/blob/path/to/file.go\n", + "repository,revision,file_path,match_count,first_match_url\n1,rev2,path/to/file.go,0,/1@rev2/-/blob/path/to/file.go\n", + "repository,revision,file_path,match_count,first_match_url\n2,rev3,path/to/file.go,0,/2@rev3/-/blob/path/to/file.go\n", }, vals) } diff --git a/internal/search/exhaustive/service/matchcsv.go b/internal/search/exhaustive/service/matchcsv.go index 19c8d920703..31b885841a3 100644 --- a/internal/search/exhaustive/service/matchcsv.go +++ b/internal/search/exhaustive/service/matchcsv.go @@ -1,31 +1,42 @@ package service import ( + "context" "fmt" "net/url" "strconv" "github.com/sourcegraph/sourcegraph/internal/conf" "github.com/sourcegraph/sourcegraph/internal/search/result" + "github.com/sourcegraph/sourcegraph/internal/uploadstore" "github.com/sourcegraph/sourcegraph/lib/errors" ) -type matchCSVWriter struct { +type MatchCSVWriter struct { w CSVWriter headerTyp string host *url.URL } -func newMatchCSVWriter(w CSVWriter) (*matchCSVWriter, error) { +func NewCSVWriter(ctx context.Context, store uploadstore.Store, prefix string) (*MatchCSVWriter, error) { + csvWriter := 
newBlobstoreCSVWriter(ctx, store, prefix)
+	return newMatchCSVWriter(csvWriter)
+}
+
+func newMatchCSVWriter(w CSVWriter) (*MatchCSVWriter, error) {
 	externalURL := conf.Get().ExternalURL
 	u, err := url.Parse(externalURL)
 	if err != nil {
 		return nil, err
 	}
-	return &matchCSVWriter{w: w, host: u}, nil
+	return &MatchCSVWriter{w: w, host: u}, nil
 }
 
-func (w *matchCSVWriter) Write(match result.Match) error {
+func (w *MatchCSVWriter) Close() error {
+	return w.w.Close()
+}
+
+func (w *MatchCSVWriter) Write(match result.Match) error {
 	// TODO compare to logic used by the webapp to convert
 	// results into csv. See
 	// client/web/src/search/results/export/searchResultsExport.ts
@@ -38,7 +49,7 @@ func (w *matchCSVWriter) Write(match result.Match) error {
 	}
 }
 
-func (w *matchCSVWriter) writeFileMatch(fm *result.FileMatch) error {
+func (w *MatchCSVWriter) writeFileMatch(fm *result.FileMatch) error {
 	// Differences to "Export CSV" in webapp. We have removed columns since it
 	// is easier to add columns than to remove them.
 	//
@@ -159,7 +170,7 @@ func minRange(ranges result.Ranges) (result.Range, bool) {
 	return min, true
 }
 
-func (w *matchCSVWriter) writeHeader(typ string) (bool, error) {
+func (w *MatchCSVWriter) writeHeader(typ string) (bool, error) {
 	if w.headerTyp == "" {
 		w.headerTyp = typ
 		return true, nil
diff --git a/internal/search/exhaustive/service/search.go b/internal/search/exhaustive/service/search.go
index 1a96eab8671..cf0aff3461a 100644
--- a/internal/search/exhaustive/service/search.go
+++ b/internal/search/exhaustive/service/search.go
@@ -5,11 +5,15 @@ import (
 	"context"
 	"encoding/csv"
 	"fmt"
+	"io"
 	"strconv"
 	"strings"
 
 	"github.com/sourcegraph/sourcegraph/internal/actor"
+	"github.com/sourcegraph/sourcegraph/internal/api"
 	"github.com/sourcegraph/sourcegraph/internal/search/exhaustive/types"
+	"github.com/sourcegraph/sourcegraph/internal/search/result"
+	types2 "github.com/sourcegraph/sourcegraph/internal/types"
 	"github.com/sourcegraph/sourcegraph/internal/uploadstore"
 	"github.com/sourcegraph/sourcegraph/lib/errors"
 	"github.com/sourcegraph/sourcegraph/lib/iterator"
@@ -72,15 +76,15 @@ type SearchQuery interface {
 
 	ResolveRepositoryRevSpec(context.Context, types.RepositoryRevSpecs) ([]types.RepositoryRevision, error)
 
-	Search(context.Context, types.RepositoryRevision, CSVWriter) error
+	Search(context.Context, types.RepositoryRevision, MatchWriter) error
 }
 
-// CSVWriter makes it so we can avoid caring about search types and leave it
+type MatchWriter interface {
+	Write(match result.Match) error
+}
+
+// CSVWriter makes it so we can avoid caring about search types and leave it
 // up to the search job to decide the shape of data.
-//
-// Note: I expect the implementation of this to handle things like chunking up
-// the CSV/etc. EG once we hit 100MB of data it can write the data out then
-// start a new file. It takes care of remembering the header for the new file.
 type CSVWriter interface {
 	// WriteHeader should be called first and only once.
 	WriteHeader(...string) error
@@ -88,9 +92,11 @@ type CSVWriter interface {
 	// WriteRow should have the same number of values as WriteHeader and can be
 	// called zero or more times.
 	WriteRow(...string) error
+
+	io.Closer
 }
 
-// NewBlobstoreCSVWriter creates a new BlobstoreCSVWriter which writes a CSV to
+// newBlobstoreCSVWriter creates a new BlobstoreCSVWriter which writes a CSV to
 // the store. BlobstoreCSVWriter takes care of chunking the CSV into blobs of
 // 100MiB, each with the same header row. 
Blobs are named {prefix}-{shard} // except for the first blob, which is named {prefix}. @@ -100,7 +106,7 @@ type CSVWriter interface { // // The caller is expected to call Close() once and only once after the last call // to WriteRow. -func NewBlobstoreCSVWriter(ctx context.Context, store uploadstore.Store, prefix string) *BlobstoreCSVWriter { +func newBlobstoreCSVWriter(ctx context.Context, store uploadstore.Store, prefix string) *BlobstoreCSVWriter { c := &BlobstoreCSVWriter{ maxBlobSizeBytes: 100 * 1024 * 1024, @@ -312,15 +318,18 @@ func (s searcherFake) ResolveRepositoryRevSpec(ctx context.Context, repoRevSpec return repoRevs, nil } -func (s searcherFake) Search(ctx context.Context, r types.RepositoryRevision, w CSVWriter) error { +func (s searcherFake) Search(ctx context.Context, r types.RepositoryRevision, w MatchWriter) error { if err := isSameUser(ctx, s.userID); err != nil { return err } - if err := w.WriteHeader("repo", "revspec", "revision"); err != nil { - return err - } - return w.WriteRow(strconv.Itoa(int(r.Repository)), string(r.RevisionSpecifiers), string(r.Revision)) + return w.Write(&result.FileMatch{ + File: result.File{ + Repo: types2.MinimalRepo{Name: api.RepoName(strconv.Itoa(int(r.Repository)))}, + CommitID: api.CommitID(r.Revision), + Path: "path/to/file.go", + }, + }) } func isSameUser(ctx context.Context, userID int32) error { diff --git a/internal/search/exhaustive/service/search_test.go b/internal/search/exhaustive/service/search_test.go index 99cc33d41a5..d5b0fa1d58d 100644 --- a/internal/search/exhaustive/service/search_test.go +++ b/internal/search/exhaustive/service/search_test.go @@ -63,10 +63,14 @@ func (c *csvBuffer) WriteRow(row ...string) error { return err } +func (c *csvBuffer) Close() error { + return nil +} + func TestBlobstoreCSVWriter(t *testing.T) { mockStore := setupMockStore(t) - csvWriter := NewBlobstoreCSVWriter(context.Background(), mockStore, "blob") + csvWriter := newBlobstoreCSVWriter(context.Background(), mockStore, "blob") csvWriter.maxBlobSizeBytes = 12 err := csvWriter.WriteHeader("h", "h", "h") // 3 bytes (letters) + 2 bytes (commas) + 1 byte (newline) = 6 bytes @@ -116,7 +120,7 @@ func TestBlobstoreCSVWriter(t *testing.T) { func TestNoUploadIfNotData(t *testing.T) { mockStore := setupMockStore(t) - csvWriter := NewBlobstoreCSVWriter(context.Background(), mockStore, "blob") + csvWriter := newBlobstoreCSVWriter(context.Background(), mockStore, "blob") // No data written, so no upload should happen. 
err := csvWriter.Close() diff --git a/internal/search/exhaustive/service/searcher.go b/internal/search/exhaustive/service/searcher.go index d3cd0eaef0e..7deea9766da 100644 --- a/internal/search/exhaustive/service/searcher.go +++ b/internal/search/exhaustive/service/searcher.go @@ -161,7 +161,7 @@ func (s searchQuery) toRepoRevSpecs(ctx context.Context, repoRevSpec types.Repos }, nil } -func (s searchQuery) Search(ctx context.Context, repoRev types.RepositoryRevision, w CSVWriter) error { +func (s searchQuery) Search(ctx context.Context, repoRev types.RepositoryRevision, matchWriter MatchWriter) error { if err := isSameUser(ctx, s.userID); err != nil { return err } @@ -181,10 +181,6 @@ func (s searchQuery) Search(ctx context.Context, repoRev types.RepositoryRevisio var mu sync.Mutex // serialize writes to w var writeRowErr error // capture if w.Write fails - matchWriter, err := newMatchCSVWriter(w) - if err != nil { - return err - } // TODO currently ignoring returned Alert _, err = job.Run(ctx, s.clients, streaming.StreamFunc(func(se streaming.SearchEvent) { diff --git a/internal/search/exhaustive/service/searcher_test.go b/internal/search/exhaustive/service/searcher_test.go index 4d9eb77939d..a8363c3208d 100644 --- a/internal/search/exhaustive/service/searcher_test.go +++ b/internal/search/exhaustive/service/searcher_test.go @@ -38,10 +38,10 @@ func TestBackendFake(t *testing.T) { Query: "1@rev1 1@rev2 2@rev3", WantRefSpecs: "RepositoryRevSpec{1@spec} RepositoryRevSpec{2@spec}", WantRepoRevs: "RepositoryRevision{1@rev1} RepositoryRevision{1@rev2} RepositoryRevision{2@rev3}", - WantCSV: autogold.Expect(`repo,revspec,revision -1,spec,rev1 -1,spec,rev2 -2,spec,rev3 + WantCSV: autogold.Expect(`repository,revision,file_path,match_count,first_match_url +1,rev1,path/to/file.go,0,/1@rev1/-/blob/path/to/file.go +1,rev2,path/to/file.go,0,/1@rev2/-/blob/path/to/file.go +2,rev3,path/to/file.go,0,/2@rev3/-/blob/path/to/file.go `), }) } @@ -365,8 +365,11 @@ func testNewSearcher(t *testing.T, ctx context.Context, newSearcher NewSearcher, // Test Search var csv csvBuffer + matchWriter, err := newMatchCSVWriter(&csv) + assert.NoError(err) + for _, repoRev := range repoRevs { - err := searcher.Search(ctx, repoRev, &csv) + err := searcher.Search(ctx, repoRev, matchWriter) assert.NoError(err) } if tc.WantCSV != nil {
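
A note on where this refactor is headed: once SearchQuery.Search only sees a MatchWriter, the output format is a property of the writer, so the planned switch from CSV to JSON should not touch the search path at all. The sketch below illustrates that point; it is not part of this patch. jsonMatchWriter and newJSONMatchWriter are hypothetical names, and a real implementation would target the upload store (with the same sharding concerns newBlobstoreCSVWriter handles) rather than a bare io.WriteCloser.

package service

import (
	"encoding/json"
	"io"

	"github.com/sourcegraph/sourcegraph/internal/search/result"
)

// jsonMatchWriter is a hypothetical MatchWriter that encodes one match per
// line (JSON Lines), playing the role MatchCSVWriter plays for CSV.
type jsonMatchWriter struct {
	enc    *json.Encoder
	closer io.Closer
}

// Compile-time check that the sketch satisfies the new interface.
var _ MatchWriter = (*jsonMatchWriter)(nil)

func newJSONMatchWriter(w io.WriteCloser) *jsonMatchWriter {
	return &jsonMatchWriter{enc: json.NewEncoder(w), closer: w}
}

// Write satisfies MatchWriter. Encode appends a newline after each value,
// which is what yields one JSON object per line.
func (j *jsonMatchWriter) Write(match result.Match) error {
	return j.enc.Encode(match)
}

func (j *jsonMatchWriter) Close() error {
	return j.closer.Close()
}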
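
The worker hunk in exhaustive_search_repo_revision.go ends just as csvWriter.Close() is checked. The contract it relies on is the one documented on newBlobstoreCSVWriter: Close exactly once after the last write, without letting a Close failure mask a Search failure. A self-contained sketch of that call shape follows; runSearchToWriter is illustrative rather than code from the patch, and how the elided branch combines the two errors is an assumption here.

package service

import (
	"context"

	"github.com/sourcegraph/sourcegraph/lib/errors"
)

// runSearchToWriter shows the caller-side contract of MatchCSVWriter:
// Close is called exactly once after the last write, and its error is
// combined with (rather than replacing) any error from the search itself.
func runSearchToWriter(ctx context.Context, search func(context.Context, MatchWriter) error, w *MatchCSVWriter) error {
	err := search(ctx, w)
	if closeErr := w.Close(); closeErr != nil {
		// errors.Append builds a multi-error; when err is nil the
		// Close error is returned on its own.
		err = errors.Append(err, closeErr)
	}
	return err
}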
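
Relatedly, the searcher tests above no longer need a blobstore at all: newMatchCSVWriter accepts anything satisfying CSVWriter, which is the seam search_test.go's csvBuffer exploits. A toy equivalent is sketched below for illustration; memCSV is not a name from the patch, and unlike csvBuffer it skips encoding/csv's quoting.

package service

import (
	"bytes"
	"strings"
)

// memCSV is an illustrative in-memory CSVWriter: headers and rows are
// comma-joined into a buffer, and Close is a no-op.
type memCSV struct {
	buf bytes.Buffer
}

func (m *memCSV) WriteHeader(cols ...string) error { return m.writeLine(cols) }
func (m *memCSV) WriteRow(cols ...string) error    { return m.writeLine(cols) }
func (m *memCSV) Close() error                     { return nil }

func (m *memCSV) writeLine(cols []string) error {
	_, err := m.buf.WriteString(strings.Join(cols, ",") + "\n")
	return err
}

Wrapping it with newMatchCSVWriter(&memCSV{}) yields a MatchWriter whose output can be asserted against directly, which mirrors how testNewSearcher drives Search with its csvBuffer.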