mirror of
https://github.com/sourcegraph/sourcegraph.git
synced 2026-02-06 20:51:43 +00:00
search: simplify Deduper (#61139)
Apart from the expected deduping based on `Match.Key()`, `Deduper` kept track of the deduped results but none of the call sites ever read the results. On top of that, `Deduper` modified the matches passed to it via `Deduper.Add` which, looking at the call sites, I don't think was intentional, because all call sites keep track of results themselves. The test I fix in the second commit documents that behavior. Notes: - I also removed NewDedupingStream which was unused. Test plan: - updated unit test - CI
This commit is contained in:
parent
abf9ce3ef2
commit
ffbbccac49
@ -45,73 +45,51 @@ func TestWithSelect(t *testing.T) {
|
||||
}
|
||||
|
||||
autogold.Expect(`[
|
||||
{
|
||||
"Path": "pokeman/",
|
||||
"PreciseLanguage": "",
|
||||
"ChunkMatches": null,
|
||||
"PathMatches": null,
|
||||
"LimitHit": false
|
||||
},
|
||||
{
|
||||
"Path": "digiman/",
|
||||
"PreciseLanguage": "",
|
||||
"ChunkMatches": null,
|
||||
"PathMatches": null,
|
||||
"LimitHit": false
|
||||
}
|
||||
]`).Equal(t, test("file.directory"))
|
||||
{
|
||||
"Path": "pokeman/",
|
||||
"PreciseLanguage": "",
|
||||
"ChunkMatches": null,
|
||||
"PathMatches": null,
|
||||
"LimitHit": false
|
||||
},
|
||||
{
|
||||
"Path": "digiman/",
|
||||
"PreciseLanguage": "",
|
||||
"ChunkMatches": null,
|
||||
"PathMatches": null,
|
||||
"LimitHit": false
|
||||
}
|
||||
]`).Equal(t, test("file.directory"))
|
||||
|
||||
autogold.Expect(`[
|
||||
{
|
||||
"Path": "pokeman/charmandar",
|
||||
"PreciseLanguage": "",
|
||||
"ChunkMatches": null,
|
||||
"PathMatches": null,
|
||||
"LimitHit": false
|
||||
},
|
||||
{
|
||||
"Path": "pokeman/bulbosaur",
|
||||
"PreciseLanguage": "",
|
||||
"ChunkMatches": null,
|
||||
"PathMatches": null,
|
||||
"LimitHit": false
|
||||
},
|
||||
{
|
||||
"Path": "digiman/ummm",
|
||||
"PreciseLanguage": "",
|
||||
"ChunkMatches": null,
|
||||
"PathMatches": null,
|
||||
"LimitHit": false
|
||||
}
|
||||
]`).Equal(t, test("file"))
|
||||
{
|
||||
"Path": "pokeman/charmandar",
|
||||
"PreciseLanguage": "",
|
||||
"ChunkMatches": null,
|
||||
"PathMatches": null,
|
||||
"LimitHit": false
|
||||
},
|
||||
{
|
||||
"Path": "pokeman/bulbosaur",
|
||||
"PreciseLanguage": "",
|
||||
"ChunkMatches": null,
|
||||
"PathMatches": null,
|
||||
"LimitHit": false
|
||||
},
|
||||
{
|
||||
"Path": "digiman/ummm",
|
||||
"PreciseLanguage": "",
|
||||
"ChunkMatches": null,
|
||||
"PathMatches": null,
|
||||
"LimitHit": false
|
||||
}
|
||||
]`).Equal(t, test("file"))
|
||||
|
||||
autogold.Expect(`[
|
||||
{
|
||||
"Path": "pokeman/charmandar",
|
||||
"PreciseLanguage": "",
|
||||
"ChunkMatches": [
|
||||
{
|
||||
"Content": "",
|
||||
"ContentStart": [
|
||||
0,
|
||||
0,
|
||||
0
|
||||
],
|
||||
"Ranges": [
|
||||
{
|
||||
"start": [
|
||||
0,
|
||||
0,
|
||||
0
|
||||
],
|
||||
"end": [
|
||||
0,
|
||||
0,
|
||||
0
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"Content": "",
|
||||
"ContentStart": [
|
||||
|
||||
@ -7,7 +7,6 @@ go_library(
|
||||
"commit.go",
|
||||
"commit_diff.go",
|
||||
"commit_json.go",
|
||||
"deduper.go",
|
||||
"file.go",
|
||||
"highlight.go",
|
||||
"match.go",
|
||||
@ -43,7 +42,6 @@ go_test(
|
||||
"commit_diff_test.go",
|
||||
"commit_json_test.go",
|
||||
"commit_test.go",
|
||||
"deduper_test.go",
|
||||
"file_test.go",
|
||||
"match_test.go",
|
||||
"merger_test.go",
|
||||
|
||||
@ -1,41 +0,0 @@
|
||||
package result
|
||||
|
||||
// Deduper deduplicates matches added to it with Add(). Matches are deduplicated by their key,
|
||||
// and the return value of Results() is ordered in the same order results are added with Add().
|
||||
type Deduper struct {
|
||||
results Matches
|
||||
seen map[Key]Match
|
||||
}
|
||||
|
||||
func NewDeduper() Deduper {
|
||||
return Deduper{
|
||||
seen: make(map[Key]Match),
|
||||
}
|
||||
}
|
||||
|
||||
func (d *Deduper) Add(m Match) {
|
||||
prev, seen := d.seen[m.Key()]
|
||||
|
||||
if seen {
|
||||
switch prevMatch := prev.(type) {
|
||||
// key matches, so we know to convert to respective type
|
||||
case *FileMatch:
|
||||
prevMatch.AppendMatches(m.(*FileMatch))
|
||||
case *CommitMatch:
|
||||
prevMatch.AppendMatches(m.(*CommitMatch))
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
d.results = append(d.results, m)
|
||||
d.seen[m.Key()] = m
|
||||
}
|
||||
|
||||
func (d *Deduper) Seen(m Match) bool {
|
||||
_, ok := d.seen[m.Key()]
|
||||
return ok
|
||||
}
|
||||
|
||||
func (d *Deduper) Results() Matches {
|
||||
return d.results
|
||||
}
|
||||
@ -1,125 +0,0 @@
|
||||
package result
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/sourcegraph/sourcegraph/internal/api"
|
||||
"github.com/sourcegraph/sourcegraph/internal/gitserver/gitdomain"
|
||||
"github.com/sourcegraph/sourcegraph/internal/types"
|
||||
)
|
||||
|
||||
// TestDeduper checks Deduper's merge-and-order semantics via a table of
// input match lists and the exact Results() slice expected after adding them.
func TestDeduper(t *testing.T) {
	// commit builds a plain commit match keyed by repo name and commit ID.
	commit := func(repo, id string) *CommitMatch {
		return &CommitMatch{
			Repo: types.MinimalRepo{
				Name: api.RepoName(repo),
			},
			Commit: gitdomain.Commit{
				ID: api.CommitID(id),
			},
		}
	}

	// diff builds a commit match with a non-nil DiffPreview, which gives it
	// a different key than a plain commit match for the same repo/ID.
	diff := func(repo, id string) *CommitMatch {
		return &CommitMatch{
			Repo: types.MinimalRepo{
				Name: api.RepoName(repo),
			},
			Commit: gitdomain.Commit{
				ID: api.CommitID(id),
			},
			DiffPreview: &MatchedString{},
		}
	}

	// repo builds a repository match at a specific revision.
	repo := func(name, rev string) *RepoMatch {
		return &RepoMatch{
			Name: api.RepoName(name),
			Rev:  rev,
		}
	}

	// file builds a file match with the given chunk matches attached.
	file := func(repo, commit, path string, hms ChunkMatches) *FileMatch {
		return &FileMatch{
			File: File{
				Repo: types.MinimalRepo{
					Name: api.RepoName(repo),
				},
				CommitID: api.CommitID(commit),
				Path:     path,
			},
			ChunkMatches: hms,
		}
	}

	// hm builds a minimal chunk match distinguished only by its content.
	hm := func(s string) ChunkMatch {
		return ChunkMatch{
			Content: s,
		}
	}

	cases := []struct {
		name     string
		input    Matches
		expected Matches
	}{
		{
			// Distinct keys pass through unchanged, in insertion order.
			name: "no dups",
			input: []Match{
				commit("a", "b"),
				diff("c", "d"),
				repo("e", "f"),
				file("g", "h", "i", nil),
			},
			expected: []Match{
				commit("a", "b"),
				diff("c", "d"),
				repo("e", "f"),
				file("g", "h", "i", nil),
			},
		},
		{
			// Same file key: the second match's chunks are appended to the first.
			name: "merge files",
			input: []Match{
				file("a", "b", "c", ChunkMatches{hm("a"), hm("b")}),
				file("a", "b", "c", ChunkMatches{hm("c"), hm("d")}),
			},
			expected: []Match{
				file("a", "b", "c", ChunkMatches{hm("a"), hm("b"), hm("c"), hm("d")}),
			},
		},
		{
			// A diff match and a commit match for the same repo/ID have
			// different keys, so neither is dropped.
			name: "diff and commit are not equal",
			input: []Match{
				commit("a", "b"),
				diff("a", "b"),
			},
			expected: []Match{
				commit("a", "b"),
				diff("a", "b"),
			},
		},
		{
			// The revision is part of a repo match's key.
			name: "different revs not deduped",
			input: []Match{
				repo("a", "b"),
				repo("a", "c"),
			},
			expected: []Match{
				repo("a", "b"),
				repo("a", "c"),
			},
		},
	}

	// NOTE(review): cases run without t.Run(tc.name), so a failure message
	// will not include the case name — consider wrapping in subtests.
	for _, tc := range cases {
		dedup := NewDeduper()
		for _, match := range tc.input {
			dedup.Add(match)
		}

		require.Equal(t, tc.expected, dedup.Results())
	}
}
|
||||
@ -146,3 +146,19 @@ func (m Matches) ResultCount() int {
|
||||
}
|
||||
return count
|
||||
}
|
||||
|
||||
// Deduper is a convenience type to deduplicate matches
|
||||
type Deduper map[Key]struct{}
|
||||
|
||||
func NewDeduper() Deduper {
|
||||
return make(map[Key]struct{})
|
||||
}
|
||||
|
||||
func (d Deduper) Add(m Match) {
|
||||
d[m.Key()] = struct{}{}
|
||||
}
|
||||
|
||||
func (d Deduper) Seen(m Match) bool {
|
||||
_, ok := d[m.Key()]
|
||||
return ok
|
||||
}
|
||||
|
||||
@ -8,6 +8,7 @@ import (
|
||||
"github.com/hexops/autogold/v2"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/sourcegraph/sourcegraph/internal/api"
|
||||
"github.com/sourcegraph/sourcegraph/internal/gitserver/gitdomain"
|
||||
"github.com/sourcegraph/sourcegraph/internal/search/filter"
|
||||
"github.com/sourcegraph/sourcegraph/internal/types"
|
||||
@ -192,3 +193,89 @@ func TestKeyEquality(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestDeduper checks that distinct match keys are never reported as seen:
// for every input list of key-distinct matches, Seen must return false for
// each match before it is added.
func TestDeduper(t *testing.T) {
	// commit builds a plain commit match keyed by repo name and commit ID.
	commit := func(repo, id string) *CommitMatch {
		return &CommitMatch{
			Repo: types.MinimalRepo{
				Name: api.RepoName(repo),
			},
			Commit: gitdomain.Commit{
				ID: api.CommitID(id),
			},
		}
	}

	// diff builds a commit match with a non-nil DiffPreview, which gives it
	// a different key than a plain commit match for the same repo/ID.
	diff := func(repo, id string) *CommitMatch {
		return &CommitMatch{
			Repo: types.MinimalRepo{
				Name: api.RepoName(repo),
			},
			Commit: gitdomain.Commit{
				ID: api.CommitID(id),
			},
			DiffPreview: &MatchedString{},
		}
	}

	// repo builds a repository match at a specific revision.
	repo := func(name, rev string) *RepoMatch {
		return &RepoMatch{
			Name: api.RepoName(name),
			Rev:  rev,
		}
	}

	// file builds a file match with the given chunk matches attached.
	file := func(repo, commit, path string, hms ChunkMatches) *FileMatch {
		return &FileMatch{
			File: File{
				Repo: types.MinimalRepo{
					Name: api.RepoName(repo),
				},
				CommitID: api.CommitID(commit),
				Path:     path,
			},
			ChunkMatches: hms,
		}
	}

	cases := []struct {
		name  string
		input Matches
		// seen is the value Seen must return for each input match before it
		// is added; every case here uses distinct keys, so it is false.
		seen bool
	}{
		{
			name: "no dups",
			input: []Match{
				commit("a", "b"),
				diff("c", "d"),
				repo("e", "f"),
				file("g", "h", "i", nil),
			},
			seen: false,
		},
		{
			// DiffPreview participates in the key, so these differ.
			name: "diff and commit are not equal",
			input: []Match{
				commit("a", "b"),
				diff("a", "b"),
			},
			seen: false,
		},
		{
			// The revision is part of a repo match's key.
			name: "different revs not deduped",
			input: []Match{
				repo("a", "b"),
				repo("a", "c"),
			},
			seen: false,
		},
	}

	// NOTE(review): cases run without t.Run(tc.name), so a failure message
	// will not include the case name — consider wrapping in subtests.
	for _, tc := range cases {
		dedup := NewDeduper()
		for _, match := range tc.input {
			require.Equal(t, tc.seen, dedup.Seen(match))
			dedup.Add(match)
		}
	}
}
|
||||
|
||||
@ -86,38 +86,6 @@ func (c *resultCountingStream) Count() int {
|
||||
return int(c.count.Load())
|
||||
}
|
||||
|
||||
// NewDedupingStream ensures only unique results are sent on the stream. Any
// result that has already been seen is discarded. Note: using this function
// requires keeping the set of seen result keys in memory for the lifetime of
// the stream.
func NewDedupingStream(s Sender) *dedupingStream {
	return &dedupingStream{
		parent:  s,
		deduper: result.NewDeduper(),
	}
}

// dedupingStream wraps a parent Sender and drops any match whose key was
// already forwarded on an earlier (or the current) event. The embedded mutex
// guards deduper across concurrent Send calls.
type dedupingStream struct {
	parent Sender
	sync.Mutex
	deduper result.Deduper
}

// Send filters event.Results down to not-yet-seen matches, then forwards the
// (possibly shrunken) event to the parent stream.
func (d *dedupingStream) Send(event SearchEvent) {
	d.Mutex.Lock()
	// Filter in place: results shares event.Results' backing array, so no
	// extra allocation is needed. This mutates the caller's slice contents.
	results := event.Results[:0]
	for _, match := range event.Results {
		seen := d.deduper.Seen(match)
		if seen {
			continue
		}
		d.deduper.Add(match)
		results = append(results, match)
	}
	event.Results = results
	// Unlock before forwarding so a slow parent does not serialize dedup work.
	d.Mutex.Unlock()
	d.parent.Send(event)
}
|
||||
|
||||
// NewBatchingStream returns a stream that batches results sent to it, holding
|
||||
// delaying their forwarding by a max time of maxDelay, then sending the batched
|
||||
// event to the parent stream. The first event is passed through without delay.
|
||||
|
||||
@ -131,20 +131,3 @@ func TestBatchingStream(t *testing.T) {
|
||||
require.Equal(t, count.Load(), int64(10))
|
||||
})
|
||||
}
|
||||
|
||||
// TestDedupingStream sends the identical file match twice through a deduping
// stream and asserts the duplicate never reaches the parent.
func TestDedupingStream(t *testing.T) {
	var sent []result.Match
	s := NewDedupingStream(StreamFunc(func(e SearchEvent) {
		sent = append(sent, e.Results...)
	}))

	// Both events carry a match with the same key ("lombardy").
	for range 2 {
		s.Send(SearchEvent{
			Results: []result.Match{&result.FileMatch{
				File: result.File{Path: "lombardy"},
			}},
		})
	}

	// Only the first occurrence should have been forwarded.
	require.Equal(t, 1, len(sent))
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user