search: simplify Deduper (#61139)

Apart from the expected deduping based on `Match.Key()`, `Deduper` kept track of the deduped results but none of the call sites ever read the results. On top of that, `Deduper` modified the matches passed to it via `Deduper.Add` which, looking at the call sites, I don't think was intentional, because all call sites keep track of results themselves. The test I fix in the second commit documents that behavior.

Notes:
- I also removed NewDedupingStream which was unused.

Test plan:
- updated unit test
- CI
This commit is contained in:
Stefan Hengl 2024-03-14 16:08:23 +01:00 committed by GitHub
parent abf9ce3ef2
commit ffbbccac49
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 140 additions and 276 deletions

View File

@ -45,73 +45,51 @@ func TestWithSelect(t *testing.T) {
}
autogold.Expect(`[
{
"Path": "pokeman/",
"PreciseLanguage": "",
"ChunkMatches": null,
"PathMatches": null,
"LimitHit": false
},
{
"Path": "digiman/",
"PreciseLanguage": "",
"ChunkMatches": null,
"PathMatches": null,
"LimitHit": false
}
]`).Equal(t, test("file.directory"))
{
"Path": "pokeman/",
"PreciseLanguage": "",
"ChunkMatches": null,
"PathMatches": null,
"LimitHit": false
},
{
"Path": "digiman/",
"PreciseLanguage": "",
"ChunkMatches": null,
"PathMatches": null,
"LimitHit": false
}
]`).Equal(t, test("file.directory"))
autogold.Expect(`[
{
"Path": "pokeman/charmandar",
"PreciseLanguage": "",
"ChunkMatches": null,
"PathMatches": null,
"LimitHit": false
},
{
"Path": "pokeman/bulbosaur",
"PreciseLanguage": "",
"ChunkMatches": null,
"PathMatches": null,
"LimitHit": false
},
{
"Path": "digiman/ummm",
"PreciseLanguage": "",
"ChunkMatches": null,
"PathMatches": null,
"LimitHit": false
}
]`).Equal(t, test("file"))
{
"Path": "pokeman/charmandar",
"PreciseLanguage": "",
"ChunkMatches": null,
"PathMatches": null,
"LimitHit": false
},
{
"Path": "pokeman/bulbosaur",
"PreciseLanguage": "",
"ChunkMatches": null,
"PathMatches": null,
"LimitHit": false
},
{
"Path": "digiman/ummm",
"PreciseLanguage": "",
"ChunkMatches": null,
"PathMatches": null,
"LimitHit": false
}
]`).Equal(t, test("file"))
autogold.Expect(`[
{
"Path": "pokeman/charmandar",
"PreciseLanguage": "",
"ChunkMatches": [
{
"Content": "",
"ContentStart": [
0,
0,
0
],
"Ranges": [
{
"start": [
0,
0,
0
],
"end": [
0,
0,
0
]
}
]
},
{
"Content": "",
"ContentStart": [

View File

@ -7,7 +7,6 @@ go_library(
"commit.go",
"commit_diff.go",
"commit_json.go",
"deduper.go",
"file.go",
"highlight.go",
"match.go",
@ -43,7 +42,6 @@ go_test(
"commit_diff_test.go",
"commit_json_test.go",
"commit_test.go",
"deduper_test.go",
"file_test.go",
"match_test.go",
"merger_test.go",

View File

@ -1,41 +0,0 @@
package result
// Deduper deduplicates matches added to it with Add(). Matches are deduplicated by their key,
// and the return value of Results() is ordered in the same order results are added with Add().
type Deduper struct {
results Matches
seen map[Key]Match
}
func NewDeduper() Deduper {
return Deduper{
seen: make(map[Key]Match),
}
}
func (d *Deduper) Add(m Match) {
prev, seen := d.seen[m.Key()]
if seen {
switch prevMatch := prev.(type) {
// key matches, so we know to convert to respective type
case *FileMatch:
prevMatch.AppendMatches(m.(*FileMatch))
case *CommitMatch:
prevMatch.AppendMatches(m.(*CommitMatch))
}
return
}
d.results = append(d.results, m)
d.seen[m.Key()] = m
}
func (d *Deduper) Seen(m Match) bool {
_, ok := d.seen[m.Key()]
return ok
}
func (d *Deduper) Results() Matches {
return d.results
}

View File

@ -1,125 +0,0 @@
package result
import (
"testing"
"github.com/stretchr/testify/require"
"github.com/sourcegraph/sourcegraph/internal/api"
"github.com/sourcegraph/sourcegraph/internal/gitserver/gitdomain"
"github.com/sourcegraph/sourcegraph/internal/types"
)
func TestDeduper(t *testing.T) {
commit := func(repo, id string) *CommitMatch {
return &CommitMatch{
Repo: types.MinimalRepo{
Name: api.RepoName(repo),
},
Commit: gitdomain.Commit{
ID: api.CommitID(id),
},
}
}
diff := func(repo, id string) *CommitMatch {
return &CommitMatch{
Repo: types.MinimalRepo{
Name: api.RepoName(repo),
},
Commit: gitdomain.Commit{
ID: api.CommitID(id),
},
DiffPreview: &MatchedString{},
}
}
repo := func(name, rev string) *RepoMatch {
return &RepoMatch{
Name: api.RepoName(name),
Rev: rev,
}
}
file := func(repo, commit, path string, hms ChunkMatches) *FileMatch {
return &FileMatch{
File: File{
Repo: types.MinimalRepo{
Name: api.RepoName(repo),
},
CommitID: api.CommitID(commit),
Path: path,
},
ChunkMatches: hms,
}
}
hm := func(s string) ChunkMatch {
return ChunkMatch{
Content: s,
}
}
cases := []struct {
name string
input Matches
expected Matches
}{
{
name: "no dups",
input: []Match{
commit("a", "b"),
diff("c", "d"),
repo("e", "f"),
file("g", "h", "i", nil),
},
expected: []Match{
commit("a", "b"),
diff("c", "d"),
repo("e", "f"),
file("g", "h", "i", nil),
},
},
{
name: "merge files",
input: []Match{
file("a", "b", "c", ChunkMatches{hm("a"), hm("b")}),
file("a", "b", "c", ChunkMatches{hm("c"), hm("d")}),
},
expected: []Match{
file("a", "b", "c", ChunkMatches{hm("a"), hm("b"), hm("c"), hm("d")}),
},
},
{
name: "diff and commit are not equal",
input: []Match{
commit("a", "b"),
diff("a", "b"),
},
expected: []Match{
commit("a", "b"),
diff("a", "b"),
},
},
{
name: "different revs not deduped",
input: []Match{
repo("a", "b"),
repo("a", "c"),
},
expected: []Match{
repo("a", "b"),
repo("a", "c"),
},
},
}
for _, tc := range cases {
dedup := NewDeduper()
for _, match := range tc.input {
dedup.Add(match)
}
require.Equal(t, tc.expected, dedup.Results())
}
}

View File

@ -146,3 +146,19 @@ func (m Matches) ResultCount() int {
}
return count
}
// Deduper is a convenience type to deduplicate matches
type Deduper map[Key]struct{}
func NewDeduper() Deduper {
return make(map[Key]struct{})
}
func (d Deduper) Add(m Match) {
d[m.Key()] = struct{}{}
}
func (d Deduper) Seen(m Match) bool {
_, ok := d[m.Key()]
return ok
}

View File

@ -8,6 +8,7 @@ import (
"github.com/hexops/autogold/v2"
"github.com/stretchr/testify/require"
"github.com/sourcegraph/sourcegraph/internal/api"
"github.com/sourcegraph/sourcegraph/internal/gitserver/gitdomain"
"github.com/sourcegraph/sourcegraph/internal/search/filter"
"github.com/sourcegraph/sourcegraph/internal/types"
@ -192,3 +193,89 @@ func TestKeyEquality(t *testing.T) {
})
}
}
func TestDeduper(t *testing.T) {
commit := func(repo, id string) *CommitMatch {
return &CommitMatch{
Repo: types.MinimalRepo{
Name: api.RepoName(repo),
},
Commit: gitdomain.Commit{
ID: api.CommitID(id),
},
}
}
diff := func(repo, id string) *CommitMatch {
return &CommitMatch{
Repo: types.MinimalRepo{
Name: api.RepoName(repo),
},
Commit: gitdomain.Commit{
ID: api.CommitID(id),
},
DiffPreview: &MatchedString{},
}
}
repo := func(name, rev string) *RepoMatch {
return &RepoMatch{
Name: api.RepoName(name),
Rev: rev,
}
}
file := func(repo, commit, path string, hms ChunkMatches) *FileMatch {
return &FileMatch{
File: File{
Repo: types.MinimalRepo{
Name: api.RepoName(repo),
},
CommitID: api.CommitID(commit),
Path: path,
},
ChunkMatches: hms,
}
}
cases := []struct {
name string
input Matches
seen bool
}{
{
name: "no dups",
input: []Match{
commit("a", "b"),
diff("c", "d"),
repo("e", "f"),
file("g", "h", "i", nil),
},
seen: false,
},
{
name: "diff and commit are not equal",
input: []Match{
commit("a", "b"),
diff("a", "b"),
},
seen: false,
},
{
name: "different revs not deduped",
input: []Match{
repo("a", "b"),
repo("a", "c"),
},
seen: false,
},
}
for _, tc := range cases {
dedup := NewDeduper()
for _, match := range tc.input {
require.Equal(t, tc.seen, dedup.Seen(match))
dedup.Add(match)
}
}
}

View File

@ -86,38 +86,6 @@ func (c *resultCountingStream) Count() int {
return int(c.count.Load())
}
// NewDedupingStream ensures only unique results are sent on the stream. Any
// result that has already been seen is discard. Note: using this function
// requires storing the result set of seen result.
func NewDedupingStream(s Sender) *dedupingStream {
return &dedupingStream{
parent: s,
deduper: result.NewDeduper(),
}
}
type dedupingStream struct {
parent Sender
sync.Mutex
deduper result.Deduper
}
func (d *dedupingStream) Send(event SearchEvent) {
d.Mutex.Lock()
results := event.Results[:0]
for _, match := range event.Results {
seen := d.deduper.Seen(match)
if seen {
continue
}
d.deduper.Add(match)
results = append(results, match)
}
event.Results = results
d.Mutex.Unlock()
d.parent.Send(event)
}
// NewBatchingStream returns a stream that batches results sent to it, holding
// delaying their forwarding by a max time of maxDelay, then sending the batched
// event to the parent stream. The first event is passed through without delay.

View File

@ -131,20 +131,3 @@ func TestBatchingStream(t *testing.T) {
require.Equal(t, count.Load(), int64(10))
})
}
func TestDedupingStream(t *testing.T) {
var sent []result.Match
s := NewDedupingStream(StreamFunc(func(e SearchEvent) {
sent = append(sent, e.Results...)
}))
for range 2 {
s.Send(SearchEvent{
Results: []result.Match{&result.FileMatch{
File: result.File{Path: "lombardy"},
}},
})
}
require.Equal(t, 1, len(sent))
}