search jobs: refactor SearchQuery interface (#59468)

Relates to #59329 

This is mostly a refactor to prepare for a JSON writer that replaces our CSV writer.

The high-level plan is to switch the internal (object storage) AND the external (download) format from CSV to JSON.
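To make the direction concrete, here is a rough sketch (not part of this commit; the package, names, and output shape are assumptions) of how a JSON Lines writer could later slot in behind the `MatchWriter` interface this PR introduces:

```go
// Hypothetical follow-up, not in this commit: a JSON Lines writer that would
// satisfy the MatchWriter interface introduced below. The output field names
// are assumptions for illustration only.
package service

import (
	"encoding/json"
	"io"

	"github.com/sourcegraph/sourcegraph/internal/search/result"
)

type jsonMatchWriter struct {
	enc    *json.Encoder
	closer io.Closer
}

func newJSONMatchWriter(w io.WriteCloser) *jsonMatchWriter {
	return &jsonMatchWriter{enc: json.NewEncoder(w), closer: w}
}

// Write emits one JSON object per match, so the output can be streamed and
// chunked much like the CSV writer chunks rows today.
func (w *jsonMatchWriter) Write(match result.Match) error {
	switch m := match.(type) {
	case *result.FileMatch:
		return w.enc.Encode(map[string]any{
			"repository": m.File.Repo.Name,
			"revision":   m.File.CommitID,
			"path":       m.File.Path,
		})
	default:
		// Other match types are omitted in this sketch.
		return nil
	}
}

func (w *jsonMatchWriter) Close() error { return w.closer.Close() }
```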

Test plan:
- CI
- Note: I had to update `SearcherFake`, so some of the test data changed. This only affects a few tests, and the character of the tests remains the same.
Stefan Hengl 2024-01-11 14:46:23 +01:00 committed by GitHub
parent f666295b6f
commit 0eaa40528e
7 changed files with 61 additions and 35 deletions

@@ -72,7 +72,10 @@ func (h *exhaustiveSearchRepoRevHandler) Handle(ctx context.Context, logger log.
     return err
   }
-  csvWriter := service.NewBlobstoreCSVWriter(ctx, h.uploadStore, fmt.Sprintf("%d-%d", jobID, record.ID))
+  csvWriter, err := service.NewCSVWriter(ctx, h.uploadStore, fmt.Sprintf("%d-%d", jobID, record.ID))
+  if err != nil {
+    return err
+  }
   err = q.Search(ctx, repoRev, csvWriter)
   if closeErr := csvWriter.Close(); closeErr != nil {

@@ -118,9 +118,9 @@ func TestExhaustiveSearch(t *testing.T) {
   }
   sort.Strings(vals)
   require.Equal([]string{
-    "repo,revspec,revision\n1,spec,rev1\n",
-    "repo,revspec,revision\n1,spec,rev2\n",
-    "repo,revspec,revision\n2,spec,rev3\n",
+    "repository,revision,file_path,match_count,first_match_url\n1,rev1,path/to/file.go,0,/1@rev1/-/blob/path/to/file.go\n",
+    "repository,revision,file_path,match_count,first_match_url\n1,rev2,path/to/file.go,0,/1@rev2/-/blob/path/to/file.go\n",
+    "repository,revision,file_path,match_count,first_match_url\n2,rev3,path/to/file.go,0,/2@rev3/-/blob/path/to/file.go\n",
   }, vals)
 }
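For orientation, the updated fixtures follow the column layout the renamed `MatchCSVWriter` produces: repository name, revision, file path, match count, and a link to the first match (path-only here, since the tests run without an external URL configured). A heavily simplified sketch of how one such row relates to a `result.FileMatch`; `buildRow` and the hard-coded match count are illustrative assumptions, the authoritative logic lives in `writeFileMatch`, which this diff only touches for the rename:

```go
package service

import (
	"fmt"

	"github.com/sourcegraph/sourcegraph/internal/search/result"
)

// buildRow is a hypothetical helper, not code from this PR: it approximates
// how one fixture row above maps onto a result.FileMatch.
func buildRow(fm *result.FileMatch) []string {
	// Matches the "/1@rev1/-/blob/path/to/file.go" shape seen in the fixtures.
	firstMatchURL := fmt.Sprintf("/%s@%s/-/blob/%s", fm.File.Repo.Name, fm.File.CommitID, fm.File.Path)
	return []string{
		string(fm.File.Repo.Name), // repository
		string(fm.File.CommitID),  // revision
		fm.File.Path,              // file_path
		"0",                       // match_count (the fake emits no content matches)
		firstMatchURL,             // first_match_url
	}
}
```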

@@ -1,31 +1,42 @@
 package service

 import (
   "context"
   "fmt"
   "net/url"
   "strconv"

   "github.com/sourcegraph/sourcegraph/internal/conf"
   "github.com/sourcegraph/sourcegraph/internal/search/result"
+  "github.com/sourcegraph/sourcegraph/internal/uploadstore"
   "github.com/sourcegraph/sourcegraph/lib/errors"
 )

-type matchCSVWriter struct {
+type MatchCSVWriter struct {
   w         CSVWriter
   headerTyp string
   host      *url.URL
 }

-func newMatchCSVWriter(w CSVWriter) (*matchCSVWriter, error) {
+func NewCSVWriter(ctx context.Context, store uploadstore.Store, prefix string) (*MatchCSVWriter, error) {
+  csvWriter := newBlobstoreCSVWriter(ctx, store, prefix)
+  return newMatchCSVWriter(csvWriter)
+}
+
+func newMatchCSVWriter(w CSVWriter) (*MatchCSVWriter, error) {
   externalURL := conf.Get().ExternalURL
   u, err := url.Parse(externalURL)
   if err != nil {
     return nil, err
   }
-  return &matchCSVWriter{w: w, host: u}, nil
+  return &MatchCSVWriter{w: w, host: u}, nil
 }

-func (w *matchCSVWriter) Write(match result.Match) error {
+func (w *MatchCSVWriter) Close() error {
+  return w.w.Close()
+}
+
+func (w *MatchCSVWriter) Write(match result.Match) error {
   // TODO compare to logic used by the webapp to convert
   // results into csv. See
   // client/web/src/search/results/export/searchResultsExport.ts
@@ -38,7 +49,7 @@ func (w *matchCSVWriter) Write(match result.Match) error {
   }
 }

-func (w *matchCSVWriter) writeFileMatch(fm *result.FileMatch) error {
+func (w *MatchCSVWriter) writeFileMatch(fm *result.FileMatch) error {
   // Differences to "Export CSV" in webapp. We have removed columns since it
   // is easier to add columns than to remove them.
   //
@@ -159,7 +170,7 @@ func minRange(ranges result.Ranges) (result.Range, bool) {
   return min, true
 }

-func (w *matchCSVWriter) writeHeader(typ string) (bool, error) {
+func (w *MatchCSVWriter) writeHeader(typ string) (bool, error) {
   if w.headerTyp == "" {
     w.headerTyp = typ
     return true, nil

@@ -5,11 +5,15 @@ import (
   "context"
   "encoding/csv"
   "fmt"
+  "io"
   "strconv"
   "strings"

   "github.com/sourcegraph/sourcegraph/internal/actor"
+  "github.com/sourcegraph/sourcegraph/internal/api"
   "github.com/sourcegraph/sourcegraph/internal/search/exhaustive/types"
+  "github.com/sourcegraph/sourcegraph/internal/search/result"
+  types2 "github.com/sourcegraph/sourcegraph/internal/types"
   "github.com/sourcegraph/sourcegraph/internal/uploadstore"
   "github.com/sourcegraph/sourcegraph/lib/errors"
   "github.com/sourcegraph/sourcegraph/lib/iterator"
@@ -72,15 +76,15 @@ type SearchQuery interface {
   ResolveRepositoryRevSpec(context.Context, types.RepositoryRevSpecs) ([]types.RepositoryRevision, error)
-  Search(context.Context, types.RepositoryRevision, CSVWriter) error
+  Search(context.Context, types.RepositoryRevision, MatchWriter) error
 }

-// CSVWriter makes it so we can avoid caring about search types and leave it
+type MatchWriter interface {
+  Write(match result.Match) error
+}
+
+// CSVWriter makes it, so we can avoid caring about search types and leave it
 // up to the search job to decide the shape of data.
 //
 // Note: I expect the implementation of this to handle things like chunking up
 // the CSV/etc. EG once we hit 100MB of data it can write the data out then
 // start a new file. It takes care of remembering the header for the new file.
 type CSVWriter interface {
   // WriteHeader should be called first and only once.
   WriteHeader(...string) error
@@ -88,9 +92,11 @@ type CSVWriter interface {
   // WriteRow should have the same number of values as WriteHeader and can be
   // called zero or more times.
   WriteRow(...string) error
+
+  io.Closer
 }

-// NewBlobstoreCSVWriter creates a new BlobstoreCSVWriter which writes a CSV to
+// newBlobstoreCSVWriter creates a new BlobstoreCSVWriter which writes a CSV to
 // the store. BlobstoreCSVWriter takes care of chunking the CSV into blobs of
 // 100MiB, each with the same header row. Blobs are named {prefix}-{shard}
 // except for the first blob, which is named {prefix}.
@@ -100,7 +106,7 @@ type CSVWriter interface {
 //
 // The caller is expected to call Close() once and only once after the last call
 // to WriteRow.
-func NewBlobstoreCSVWriter(ctx context.Context, store uploadstore.Store, prefix string) *BlobstoreCSVWriter {
+func newBlobstoreCSVWriter(ctx context.Context, store uploadstore.Store, prefix string) *BlobstoreCSVWriter {
   c := &BlobstoreCSVWriter{
     maxBlobSizeBytes: 100 * 1024 * 1024,
@@ -312,15 +318,18 @@ func (s searcherFake) ResolveRepositoryRevSpec(ctx context.Context, repoRevSpec
   return repoRevs, nil
 }

-func (s searcherFake) Search(ctx context.Context, r types.RepositoryRevision, w CSVWriter) error {
+func (s searcherFake) Search(ctx context.Context, r types.RepositoryRevision, w MatchWriter) error {
   if err := isSameUser(ctx, s.userID); err != nil {
     return err
   }
-  if err := w.WriteHeader("repo", "revspec", "revision"); err != nil {
-    return err
-  }
-  return w.WriteRow(strconv.Itoa(int(r.Repository)), string(r.RevisionSpecifiers), string(r.Revision))
+  return w.Write(&result.FileMatch{
+    File: result.File{
+      Repo:     types2.MinimalRepo{Name: api.RepoName(strconv.Itoa(int(r.Repository)))},
+      CommitID: api.CommitID(r.Revision),
+      Path:     "path/to/file.go",
+    },
+  })
 }

 func isSameUser(ctx context.Context, userID int32) error {
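Because `Search` now depends only on the one-method `MatchWriter` interface rather than the full `CSVWriter`, test doubles can stay very small. A minimal in-memory collector (hypothetical, not part of this change; it assumes the same package and the `result` import shown above) would be enough:

```go
// collectingMatchWriter is a hypothetical test helper: it satisfies the new
// MatchWriter interface by simply recording every match it receives.
type collectingMatchWriter struct {
	matches []result.Match
}

func (w *collectingMatchWriter) Write(match result.Match) error {
	w.matches = append(w.matches, match)
	return nil
}
```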

@@ -63,10 +63,14 @@ func (c *csvBuffer) WriteRow(row ...string) error {
   return err
 }

+func (c *csvBuffer) Close() error {
+  return nil
+}
+
 func TestBlobstoreCSVWriter(t *testing.T) {
   mockStore := setupMockStore(t)
-  csvWriter := NewBlobstoreCSVWriter(context.Background(), mockStore, "blob")
+  csvWriter := newBlobstoreCSVWriter(context.Background(), mockStore, "blob")
   csvWriter.maxBlobSizeBytes = 12

   err := csvWriter.WriteHeader("h", "h", "h") // 3 bytes (letters) + 2 bytes (commas) + 1 byte (newline) = 6 bytes
@@ -116,7 +120,7 @@ func TestBlobstoreCSVWriter(t *testing.T) {
 func TestNoUploadIfNotData(t *testing.T) {
   mockStore := setupMockStore(t)
-  csvWriter := NewBlobstoreCSVWriter(context.Background(), mockStore, "blob")
+  csvWriter := newBlobstoreCSVWriter(context.Background(), mockStore, "blob")

   // No data written, so no upload should happen.
   err := csvWriter.Close()

@@ -161,7 +161,7 @@ func (s searchQuery) toRepoRevSpecs(ctx context.Context, repoRevSpec types.Repos
   }, nil
 }

-func (s searchQuery) Search(ctx context.Context, repoRev types.RepositoryRevision, w CSVWriter) error {
+func (s searchQuery) Search(ctx context.Context, repoRev types.RepositoryRevision, matchWriter MatchWriter) error {
   if err := isSameUser(ctx, s.userID); err != nil {
     return err
   }
@@ -181,10 +181,6 @@ func (s searchQuery) Search(ctx context.Context, repoRev types.RepositoryRevisio
   var mu sync.Mutex     // serialize writes to w
   var writeRowErr error // capture if w.Write fails

-  matchWriter, err := newMatchCSVWriter(w)
-  if err != nil {
-    return err
-  }
   // TODO currently ignoring returned Alert
   _, err = job.Run(ctx, s.clients, streaming.StreamFunc(func(se streaming.SearchEvent) {

@@ -38,10 +38,10 @@ func TestBackendFake(t *testing.T) {
     Query:        "1@rev1 1@rev2 2@rev3",
     WantRefSpecs: "RepositoryRevSpec{1@spec} RepositoryRevSpec{2@spec}",
     WantRepoRevs: "RepositoryRevision{1@rev1} RepositoryRevision{1@rev2} RepositoryRevision{2@rev3}",
-    WantCSV: autogold.Expect(`repo,revspec,revision
-1,spec,rev1
-1,spec,rev2
-2,spec,rev3
+    WantCSV: autogold.Expect(`repository,revision,file_path,match_count,first_match_url
+1,rev1,path/to/file.go,0,/1@rev1/-/blob/path/to/file.go
+1,rev2,path/to/file.go,0,/1@rev2/-/blob/path/to/file.go
+2,rev3,path/to/file.go,0,/2@rev3/-/blob/path/to/file.go
 `),
   })
 }
@@ -365,8 +365,11 @@ func testNewSearcher(t *testing.T, ctx context.Context, newSearcher NewSearcher,
   // Test Search
   var csv csvBuffer
+  matchWriter, err := newMatchCSVWriter(&csv)
+  assert.NoError(err)
   for _, repoRev := range repoRevs {
-    err := searcher.Search(ctx, repoRev, &csv)
+    err := searcher.Search(ctx, repoRev, matchWriter)
     assert.NoError(err)
   }

   if tc.WantCSV != nil {