Context: match on filename (#61664)

Currently, Cody keyword context only matches on file contents. For many queries
it's critical to match the filename too. The reason we only matched contents is
purely historical -- I just chose the simplest implementation at first.

Examples of when this is important:
* "who owns third party licenses?" should match `third-party-licenses/CODEOWNERS`
* "grafana version deps.bzl" should clearly match `deps.bzl`

This PR extends the searches to match on filenames too. It doesn't immediately
improve results for most Ownership or Changelog queries, because Zoekt does not
count filename matches towards a result's score. I'll fix this in follow-up work.
This commit is contained in:
Julie Tibshirani 2024-04-09 08:27:22 -07:00 committed by GitHub
parent b5014ccda9
commit f63cda7b30
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 101 additions and 17 deletions

View File

@ -40,16 +40,21 @@ go_library(
go_test(
name = "codycontext_test",
srcs = ["filter_test.go"],
srcs = [
"context_test.go",
"filter_test.go",
],
embed = [":codycontext"],
deps = [
"//internal/api",
"//internal/conf",
"//internal/gitserver",
"//internal/search/result",
"//internal/types",
"//lib/errors",
"//lib/pointers",
"//schema",
"@com_github_google_go_cmp//cmp",
"@com_github_sourcegraph_log//logtest",
"@com_github_stretchr_testify//require",
],

View File

@ -3,7 +3,6 @@ package codycontext
import (
"context"
"fmt"
"strconv"
"strings"
"sync"
@ -268,8 +267,8 @@ func (c *CodyContextClient) getKeywordContext(ctx context.Context, args GetConte
regexEscapedRepoNames[i] = regexp.QuoteMeta(string(repo.Name))
}
textQuery := fmt.Sprintf(`repo:^%s$ %s content:%s`, query.UnionRegExps(regexEscapedRepoNames), textFileFilter, strconv.Quote(args.Query))
codeQuery := fmt.Sprintf(`repo:^%s$ -%s content:%s`, query.UnionRegExps(regexEscapedRepoNames), textFileFilter, strconv.Quote(args.Query))
textQuery := fmt.Sprintf(`repo:^%s$ type:file type:path %s %s`, query.UnionRegExps(regexEscapedRepoNames), textFileFilter, args.Query)
codeQuery := fmt.Sprintf(`repo:^%s$ type:file type:path -%s %s`, query.UnionRegExps(regexEscapedRepoNames), textFileFilter, args.Query)
doSearch := func(ctx context.Context, query string, limit int) ([]FileChunkContext, error) {
if limit == 0 {
@ -344,7 +343,15 @@ func (c *CodyContextClient) getKeywordContext(ctx context.Context, args GetConte
func fileMatchToContextMatches(fm *result.FileMatch) []FileChunkContext {
if len(fm.ChunkMatches) == 0 {
return nil
// If this is a filename-only match, we return the first 20 lines of the file.
return []FileChunkContext{{
RepoName: fm.Repo.Name,
RepoID: fm.Repo.ID,
CommitID: fm.CommitID,
Path: fm.Path,
StartLine: 0,
EndLine: 20,
}}
}
// To provide some context variety, we just use the top-ranked

View File

@ -0,0 +1,75 @@
package codycontext
import (
"testing"
"github.com/google/go-cmp/cmp"
"github.com/sourcegraph/sourcegraph/internal/search/result"
"github.com/sourcegraph/sourcegraph/internal/types"
)
// TestFileMatchToContextMatches checks how fileMatchToContextMatches turns a
// search FileMatch into FileChunkContext ranges, covering both a match with no
// chunk matches and a match that carries several chunk matches.
//
// Each case runs as a named subtest so a failure identifies the offending case.
func TestFileMatchToContextMatches(t *testing.T) {
	// Both cases describe the same file; only the chunk matches differ.
	file := result.File{
		Path:     "main.go",
		CommitID: "abc123",
		Repo: types.MinimalRepo{
			Name: "repo",
			ID:   1,
		},
	}

	cases := []struct {
		name      string
		fileMatch *result.FileMatch
		want      []FileChunkContext
	}{
		{
			// No chunk matches returns first 20 lines
			name: "no chunk matches returns first 20 lines",
			fileMatch: &result.FileMatch{
				File:         file,
				ChunkMatches: nil,
			},
			want: []FileChunkContext{{
				RepoName:  "repo",
				RepoID:    1,
				CommitID:  "abc123",
				Path:      "main.go",
				StartLine: 0,
				EndLine:   20,
			}},
		},
		{
			// With chunk match returns context around first chunk. The second
			// chunk starts earlier in the file (line 37) but is listed second;
			// the expected range 85-105 surrounds the first chunk (line 90).
			name: "chunk matches return context around first chunk",
			fileMatch: &result.FileMatch{
				File: file,
				ChunkMatches: []result.ChunkMatch{{
					Content:      "first chunk of content",
					ContentStart: result.Location{Line: 90, Column: 2},
				}, {
					Content:      "second chunk of content",
					ContentStart: result.Location{Line: 37, Column: 10},
				}},
			},
			want: []FileChunkContext{{
				RepoName:  "repo",
				RepoID:    1,
				CommitID:  "abc123",
				Path:      "main.go",
				StartLine: 85,
				EndLine:   105,
			}},
		},
	}

	for _, tc := range cases {
		// Subtests run synchronously (no t.Parallel), so capturing tc is safe
		// regardless of the Go version's loop-variable semantics.
		t.Run(tc.name, func(t *testing.T) {
			got := fileMatchToContextMatches(tc.fileMatch)
			if diff := cmp.Diff(tc.want, got); diff != "" {
				t.Errorf("mismatch (-want +got):\n%s", diff)
			}
		})
	}
}

View File

@ -31,11 +31,8 @@ func nodeToPatternsAndParameters(rootNode query.Node) ([]string, []query.Paramet
return nil, nil
}
patterns := []string{}
parameters := []query.Parameter{
// Only search file content
{Field: query.FieldType, Value: "file"},
}
var patterns []string
var parameters []query.Parameter
switch operator.Kind {
case query.And:
@ -49,7 +46,7 @@ func nodeToPatternsAndParameters(rootNode query.Node) ([]string, []query.Paramet
if op.Field == query.FieldContent {
// Split any content field on white space into a set of patterns
patterns = append(patterns, strings.Fields(op.Value)...)
} else if op.Field != query.FieldCase && op.Field != query.FieldType {
} else if op.Field != query.FieldCase {
parameters = append(parameters, op)
}
case query.Pattern:

View File

@ -50,32 +50,32 @@ func TestQueryStringToKeywordQuery(t *testing.T) {
}{
{
query: "context:global abc",
wantQuery: autogold.Expect("type:file context:global abc"),
wantQuery: autogold.Expect("context:global abc"),
wantPatterns: autogold.Expect([]string{"abc"}),
},
{
query: "abc def",
wantQuery: autogold.Expect("type:file (abc OR def)"),
wantQuery: autogold.Expect("(abc OR def)"),
wantPatterns: autogold.Expect([]string{"abc", "def"}),
},
{
query: "context:global lang:Go how to unzip file",
wantQuery: autogold.Expect("type:file context:global lang:Go (unzip OR file)"),
wantQuery: autogold.Expect("context:global lang:Go (unzip OR file)"),
wantPatterns: autogold.Expect([]string{"unzip", "file"}),
},
{
query: "K MEANS CLUSTERING in python",
wantQuery: autogold.Expect("type:file (cluster OR python)"),
wantQuery: autogold.Expect("(cluster OR python)"),
wantPatterns: autogold.Expect([]string{"cluster", "python"}),
},
{
query: "context:global the who",
wantQuery: autogold.Expect("type:file context:global"),
wantQuery: autogold.Expect("context:global"),
wantPatterns: autogold.Expect([]string{}),
},
{
query: `outer content:"inner {with} (special) ^characters$ and keywords like file or repo"`,
wantQuery: autogold.Expect("type:file (special OR ^characters$ OR keyword OR file OR repo OR outer)"),
wantQuery: autogold.Expect("(special OR ^characters$ OR keyword OR file OR repo OR outer)"),
wantPatterns: autogold.Expect([]string{
"special", "^characters$", "keyword", "file",
"repo",