search jobs: refactor SearchQuery interface (#59468)

Relates to #59329 

This is mostly a refactor to prepare for a JSON writer that replaces our CSV writer.

The high-level plan is to switch the internal (object storage) AND the external (download) format from CSV to JSON.
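To make the direction concrete, here is a rough sketch (not part of this commit; the package, names, and output shape are assumptions) of how a JSON Lines writer could later slot in behind the `MatchWriter` interface this PR introduces:

```go
// Hypothetical follow-up, not in this commit: a JSON Lines writer that would
// satisfy the MatchWriter interface introduced below. The output field names
// are assumptions for illustration only.
package service

import (
	"encoding/json"
	"io"

	"github.com/sourcegraph/sourcegraph/internal/search/result"
)

type jsonMatchWriter struct {
	enc    *json.Encoder
	closer io.Closer
}

func newJSONMatchWriter(w io.WriteCloser) *jsonMatchWriter {
	return &jsonMatchWriter{enc: json.NewEncoder(w), closer: w}
}

// Write emits one JSON object per match, so the output can be streamed and
// chunked much like the CSV writer chunks rows today.
func (w *jsonMatchWriter) Write(match result.Match) error {
	switch m := match.(type) {
	case *result.FileMatch:
		return w.enc.Encode(map[string]any{
			"repository": m.File.Repo.Name,
			"revision":   m.File.CommitID,
			"path":       m.File.Path,
		})
	default:
		// Other match types are omitted in this sketch.
		return nil
	}
}

func (w *jsonMatchWriter) Close() error { return w.closer.Close() }
```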

Test plan:
- CI
- Note: I had to update `SearcherFake`, so some of the test data changed. This only affects a few tests, and the character of the tests remains the same.
Stefan Hengl 2024-01-11 14:46:23 +01:00 committed by GitHub
parent f666295b6f
commit 0eaa40528e
7 changed files with 61 additions and 35 deletions

@@ -72,7 +72,10 @@ func (h *exhaustiveSearchRepoRevHandler) Handle(ctx context.Context, logger log.
     return err
   }
-  csvWriter := service.NewBlobstoreCSVWriter(ctx, h.uploadStore, fmt.Sprintf("%d-%d", jobID, record.ID))
+  csvWriter, err := service.NewCSVWriter(ctx, h.uploadStore, fmt.Sprintf("%d-%d", jobID, record.ID))
+  if err != nil {
+    return err
+  }
   err = q.Search(ctx, repoRev, csvWriter)
   if closeErr := csvWriter.Close(); closeErr != nil {

@@ -118,9 +118,9 @@ func TestExhaustiveSearch(t *testing.T) {
   }
   sort.Strings(vals)
   require.Equal([]string{
-    "repo,revspec,revision\n1,spec,rev1\n",
-    "repo,revspec,revision\n1,spec,rev2\n",
-    "repo,revspec,revision\n2,spec,rev3\n",
+    "repository,revision,file_path,match_count,first_match_url\n1,rev1,path/to/file.go,0,/1@rev1/-/blob/path/to/file.go\n",
+    "repository,revision,file_path,match_count,first_match_url\n1,rev2,path/to/file.go,0,/1@rev2/-/blob/path/to/file.go\n",
+    "repository,revision,file_path,match_count,first_match_url\n2,rev3,path/to/file.go,0,/2@rev3/-/blob/path/to/file.go\n",
   }, vals)
 }
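For orientation, the updated fixtures follow the column layout the renamed `MatchCSVWriter` produces: repository name, revision, file path, match count, and a link to the first match (path-only here, since the tests run without an external URL configured). A heavily simplified sketch of how one such row relates to a `result.FileMatch`; `buildRow` and the hard-coded match count are illustrative assumptions, the authoritative logic lives in `writeFileMatch`, which this diff only touches for the rename:

```go
package service

import (
	"fmt"

	"github.com/sourcegraph/sourcegraph/internal/search/result"
)

// buildRow is a hypothetical helper, not code from this PR: it approximates
// how one fixture row above maps onto a result.FileMatch.
func buildRow(fm *result.FileMatch) []string {
	// Matches the "/1@rev1/-/blob/path/to/file.go" shape seen in the fixtures.
	firstMatchURL := fmt.Sprintf("/%s@%s/-/blob/%s", fm.File.Repo.Name, fm.File.CommitID, fm.File.Path)
	return []string{
		string(fm.File.Repo.Name), // repository
		string(fm.File.CommitID),  // revision
		fm.File.Path,              // file_path
		"0",                       // match_count (the fake emits no content matches)
		firstMatchURL,             // first_match_url
	}
}
```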

@@ -1,31 +1,42 @@
 package service

 import (
   "context"
   "fmt"
   "net/url"
   "strconv"

   "github.com/sourcegraph/sourcegraph/internal/conf"
   "github.com/sourcegraph/sourcegraph/internal/search/result"
+  "github.com/sourcegraph/sourcegraph/internal/uploadstore"
   "github.com/sourcegraph/sourcegraph/lib/errors"
 )

-type matchCSVWriter struct {
+type MatchCSVWriter struct {
   w         CSVWriter
   headerTyp string
   host      *url.URL
 }

-func newMatchCSVWriter(w CSVWriter) (*matchCSVWriter, error) {
+func NewCSVWriter(ctx context.Context, store uploadstore.Store, prefix string) (*MatchCSVWriter, error) {
+  csvWriter := newBlobstoreCSVWriter(ctx, store, prefix)
+  return newMatchCSVWriter(csvWriter)
+}
+
+func newMatchCSVWriter(w CSVWriter) (*MatchCSVWriter, error) {
   externalURL := conf.Get().ExternalURL
   u, err := url.Parse(externalURL)
   if err != nil {
     return nil, err
   }
-  return &matchCSVWriter{w: w, host: u}, nil
+  return &MatchCSVWriter{w: w, host: u}, nil
 }

-func (w *matchCSVWriter) Write(match result.Match) error {
+func (w *MatchCSVWriter) Close() error {
+  return w.w.Close()
+}
+
+func (w *MatchCSVWriter) Write(match result.Match) error {
   // TODO compare to logic used by the webapp to convert
   // results into csv. See
   // client/web/src/search/results/export/searchResultsExport.ts
@@ -38,7 +49,7 @@ func (w *matchCSVWriter) Write(match result.Match) error {
   }
 }

-func (w *matchCSVWriter) writeFileMatch(fm *result.FileMatch) error {
+func (w *MatchCSVWriter) writeFileMatch(fm *result.FileMatch) error {
   // Differences to "Export CSV" in webapp. We have removed columns since it
   // is easier to add columns than to remove them.
   //
@@ -159,7 +170,7 @@ func minRange(ranges result.Ranges) (result.Range, bool) {
   return min, true
 }

-func (w *matchCSVWriter) writeHeader(typ string) (bool, error) {
+func (w *MatchCSVWriter) writeHeader(typ string) (bool, error) {
   if w.headerTyp == "" {
     w.headerTyp = typ
     return true, nil

@@ -5,11 +5,15 @@ import (
   "context"
   "encoding/csv"
   "fmt"
+  "io"
   "strconv"
   "strings"

   "github.com/sourcegraph/sourcegraph/internal/actor"
+  "github.com/sourcegraph/sourcegraph/internal/api"
   "github.com/sourcegraph/sourcegraph/internal/search/exhaustive/types"
+  "github.com/sourcegraph/sourcegraph/internal/search/result"
+  types2 "github.com/sourcegraph/sourcegraph/internal/types"
   "github.com/sourcegraph/sourcegraph/internal/uploadstore"
   "github.com/sourcegraph/sourcegraph/lib/errors"
   "github.com/sourcegraph/sourcegraph/lib/iterator"
@@ -72,15 +76,15 @@ type SearchQuery interface {
   ResolveRepositoryRevSpec(context.Context, types.RepositoryRevSpecs) ([]types.RepositoryRevision, error)
-  Search(context.Context, types.RepositoryRevision, CSVWriter) error
+  Search(context.Context, types.RepositoryRevision, MatchWriter) error
 }

-// CSVWriter makes it so we can avoid caring about search types and leave it
+type MatchWriter interface {
+  Write(match result.Match) error
+}
+
+// CSVWriter makes it, so we can avoid caring about search types and leave it
 // up to the search job to decide the shape of data.
 //
 // Note: I expect the implementation of this to handle things like chunking up
 // the CSV/etc. EG once we hit 100MB of data it can write the data out then
 // start a new file. It takes care of remembering the header for the new file.
 type CSVWriter interface {
   // WriteHeader should be called first and only once.
   WriteHeader(...string) error
@@ -88,9 +92,11 @@ type CSVWriter interface {
   // WriteRow should have the same number of values as WriteHeader and can be
   // called zero or more times.
   WriteRow(...string) error
+
+  io.Closer
 }

-// NewBlobstoreCSVWriter creates a new BlobstoreCSVWriter which writes a CSV to
+// newBlobstoreCSVWriter creates a new BlobstoreCSVWriter which writes a CSV to
 // the store. BlobstoreCSVWriter takes care of chunking the CSV into blobs of
 // 100MiB, each with the same header row. Blobs are named {prefix}-{shard}
 // except for the first blob, which is named {prefix}.
@@ -100,7 +106,7 @@ type CSVWriter interface {
 //
 // The caller is expected to call Close() once and only once after the last call
 // to WriteRow.
-func NewBlobstoreCSVWriter(ctx context.Context, store uploadstore.Store, prefix string) *BlobstoreCSVWriter {
+func newBlobstoreCSVWriter(ctx context.Context, store uploadstore.Store, prefix string) *BlobstoreCSVWriter {
   c := &BlobstoreCSVWriter{
     maxBlobSizeBytes: 100 * 1024 * 1024,
@@ -312,15 +318,18 @@ func (s searcherFake) ResolveRepositoryRevSpec(ctx context.Context, repoRevSpec
   return repoRevs, nil
 }

-func (s searcherFake) Search(ctx context.Context, r types.RepositoryRevision, w CSVWriter) error {
+func (s searcherFake) Search(ctx context.Context, r types.RepositoryRevision, w MatchWriter) error {
   if err := isSameUser(ctx, s.userID); err != nil {
     return err
   }
-  if err := w.WriteHeader("repo", "revspec", "revision"); err != nil {
-    return err
-  }
-  return w.WriteRow(strconv.Itoa(int(r.Repository)), string(r.RevisionSpecifiers), string(r.Revision))
+  return w.Write(&result.FileMatch{
+    File: result.File{
+      Repo:     types2.MinimalRepo{Name: api.RepoName(strconv.Itoa(int(r.Repository)))},
+      CommitID: api.CommitID(r.Revision),
+      Path:     "path/to/file.go",
+    },
+  })
 }

 func isSameUser(ctx context.Context, userID int32) error {
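Because `Search` now depends only on the one-method `MatchWriter` interface rather than the full `CSVWriter`, test doubles can stay very small. A minimal in-memory collector (hypothetical, not part of this change; it assumes the same package and the `result` import shown above) would be enough:

```go
// collectingMatchWriter is a hypothetical test helper: it satisfies the new
// MatchWriter interface by simply recording every match it receives.
type collectingMatchWriter struct {
	matches []result.Match
}

func (w *collectingMatchWriter) Write(match result.Match) error {
	w.matches = append(w.matches, match)
	return nil
}
```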

@@ -63,10 +63,14 @@ func (c *csvBuffer) WriteRow(row ...string) error {
   return err
 }

+func (c *csvBuffer) Close() error {
+  return nil
+}
+
 func TestBlobstoreCSVWriter(t *testing.T) {
   mockStore := setupMockStore(t)
-  csvWriter := NewBlobstoreCSVWriter(context.Background(), mockStore, "blob")
+  csvWriter := newBlobstoreCSVWriter(context.Background(), mockStore, "blob")
   csvWriter.maxBlobSizeBytes = 12

   err := csvWriter.WriteHeader("h", "h", "h") // 3 bytes (letters) + 2 bytes (commas) + 1 byte (newline) = 6 bytes
@@ -116,7 +120,7 @@ func TestBlobstoreCSVWriter(t *testing.T) {
 func TestNoUploadIfNotData(t *testing.T) {
   mockStore := setupMockStore(t)
-  csvWriter := NewBlobstoreCSVWriter(context.Background(), mockStore, "blob")
+  csvWriter := newBlobstoreCSVWriter(context.Background(), mockStore, "blob")

   // No data written, so no upload should happen.
   err := csvWriter.Close()

@@ -161,7 +161,7 @@ func (s searchQuery) toRepoRevSpecs(ctx context.Context, repoRevSpec types.Repos
   }, nil
 }

-func (s searchQuery) Search(ctx context.Context, repoRev types.RepositoryRevision, w CSVWriter) error {
+func (s searchQuery) Search(ctx context.Context, repoRev types.RepositoryRevision, matchWriter MatchWriter) error {
   if err := isSameUser(ctx, s.userID); err != nil {
     return err
   }
@@ -181,10 +181,6 @@ func (s searchQuery) Search(ctx context.Context, repoRev types.RepositoryRevisio
   var mu sync.Mutex     // serialize writes to w
   var writeRowErr error // capture if w.Write fails

-  matchWriter, err := newMatchCSVWriter(w)
-  if err != nil {
-    return err
-  }
   // TODO currently ignoring returned Alert
   _, err = job.Run(ctx, s.clients, streaming.StreamFunc(func(se streaming.SearchEvent) {

@@ -38,10 +38,10 @@ func TestBackendFake(t *testing.T) {
     Query:        "1@rev1 1@rev2 2@rev3",
     WantRefSpecs: "RepositoryRevSpec{1@spec} RepositoryRevSpec{2@spec}",
     WantRepoRevs: "RepositoryRevision{1@rev1} RepositoryRevision{1@rev2} RepositoryRevision{2@rev3}",
-    WantCSV: autogold.Expect(`repo,revspec,revision
-1,spec,rev1
-1,spec,rev2
-2,spec,rev3
+    WantCSV: autogold.Expect(`repository,revision,file_path,match_count,first_match_url
+1,rev1,path/to/file.go,0,/1@rev1/-/blob/path/to/file.go
+1,rev2,path/to/file.go,0,/1@rev2/-/blob/path/to/file.go
+2,rev3,path/to/file.go,0,/2@rev3/-/blob/path/to/file.go
 `),
   })
 }
@@ -365,8 +365,11 @@ func testNewSearcher(t *testing.T, ctx context.Context, newSearcher NewSearcher,
   // Test Search
   var csv csvBuffer
+  matchWriter, err := newMatchCSVWriter(&csv)
+  assert.NoError(err)
   for _, repoRev := range repoRevs {
-    err := searcher.Search(ctx, repoRev, &csv)
+    err := searcher.Search(ctx, repoRev, matchWriter)
     assert.NoError(err)
   }

   if tc.WantCSV != nil {