Context: match on filename (#61664)

Currently, Cody keyword context only matches on file contents. For many queries it's critical to match the filename too. The reason we only matched contents is purely historical -- I just chose the simplest implementation at first.

Examples of when this is important:
* "who owns third party licenses?" should match `third-party-licenses/CODEOWNERS`
* "grafana version deps.bzl" should clearly match `deps.bzl`

This PR extends the searches to match filenames too. It doesn't immediately improve results for most Ownership or Changelog queries, because Zoekt does not count filename matches towards a result's score. I'll fix this in follow-up work.
2026-02-06 14:51:44 +00:00 · 2024-04-09 08:27:22 -07:00 · 2024-04-09 08:27:22 -07:00 · f63cda7b30
commit f63cda7b30
parent b5014ccda9
5 changed files with 101 additions and 17 deletions
--- a/internal/codycontext/BUILD.bazel
+++ b/internal/codycontext/BUILD.bazel
@ -40,16 +40,21 @@ go_library(

 go_test(
    name = "codycontext_test",
-    srcs = ["filter_test.go"],
+    srcs = [
+        "context_test.go",
+        "filter_test.go",
+    ],
    embed = [":codycontext"],
    deps = [
        "//internal/api",
        "//internal/conf",
        "//internal/gitserver",
+        "//internal/search/result",
        "//internal/types",
        "//lib/errors",
        "//lib/pointers",
        "//schema",
+        "@com_github_google_go_cmp//cmp",
        "@com_github_sourcegraph_log//logtest",
        "@com_github_stretchr_testify//require",
    ],
--- a/internal/codycontext/context.go
+++ b/internal/codycontext/context.go
@ -3,7 +3,6 @@ package codycontext
 import (
 	"context"
 	"fmt"
-	"strconv"
 	"strings"
 	"sync"

@ -268,8 +267,8 @@ func (c *CodyContextClient) getKeywordContext(ctx context.Context, args GetConte
 		regexEscapedRepoNames[i] = regexp.QuoteMeta(string(repo.Name))
 	}

-	textQuery := fmt.Sprintf(`repo:^%s$ %s content:%s`, query.UnionRegExps(regexEscapedRepoNames), textFileFilter, strconv.Quote(args.Query))
-	codeQuery := fmt.Sprintf(`repo:^%s$ -%s content:%s`, query.UnionRegExps(regexEscapedRepoNames), textFileFilter, strconv.Quote(args.Query))
+	textQuery := fmt.Sprintf(`repo:^%s$ type:file type:path %s %s`, query.UnionRegExps(regexEscapedRepoNames), textFileFilter, args.Query)
+	codeQuery := fmt.Sprintf(`repo:^%s$ type:file type:path -%s %s`, query.UnionRegExps(regexEscapedRepoNames), textFileFilter, args.Query)

 	doSearch := func(ctx context.Context, query string, limit int) ([]FileChunkContext, error) {
 		if limit == 0 {
@ -344,7 +343,15 @@ func (c *CodyContextClient) getKeywordContext(ctx context.Context, args GetConte

 func fileMatchToContextMatches(fm *result.FileMatch) []FileChunkContext {
 	if len(fm.ChunkMatches) == 0 {
-		return nil
+		// If this is a filename-only match, we return the first 20 lines of the file.
+		return []FileChunkContext{{
+			RepoName:  fm.Repo.Name,
+			RepoID:    fm.Repo.ID,
+			CommitID:  fm.CommitID,
+			Path:      fm.Path,
+			StartLine: 0,
+			EndLine:   20,
+		}}
 	}

 	// To provide some context variety, we just use the top-ranked
--- a/internal/codycontext/context_test.go
+++ b/internal/codycontext/context_test.go
@ -0,0 +1,75 @@
+package codycontext
+
+import (
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+
+	"github.com/sourcegraph/sourcegraph/internal/search/result"
+	"github.com/sourcegraph/sourcegraph/internal/types"
+)
+
+func TestFileMatchToContextMatches(t *testing.T) {
+	cases := []struct {
+		fileMatch *result.FileMatch
+		want      []FileChunkContext
+	}{
+		{
+			// No chunk matches returns first 20 lines
+			fileMatch: &result.FileMatch{
+				File: result.File{
+					Path:     "main.go",
+					CommitID: "abc123",
+					Repo: types.MinimalRepo{
+						Name: "repo",
+						ID:   1,
+					},
+				},
+				ChunkMatches: nil,
+			},
+			want: []FileChunkContext{{
+				RepoName:  "repo",
+				RepoID:    1,
+				CommitID:  "abc123",
+				Path:      "main.go",
+				StartLine: 0,
+				EndLine:   20,
+			}},
+		},
+		{
+			// With chunk match returns context around first chunk
+			fileMatch: &result.FileMatch{
+				File: result.File{
+					Path:     "main.go",
+					CommitID: "abc123",
+					Repo: types.MinimalRepo{
+						Name: "repo",
+						ID:   1,
+					},
+				},
+				ChunkMatches: []result.ChunkMatch{{
+					Content:      "first chunk of content",
+					ContentStart: result.Location{Line: 90, Column: 2},
+				}, {
+					Content:      "second chunk of content",
+					ContentStart: result.Location{Line: 37, Column: 10},
+				}},
+			},
+			want: []FileChunkContext{{
+				RepoName:  "repo",
+				RepoID:    1,
+				CommitID:  "abc123",
+				Path:      "main.go",
+				StartLine: 85,
+				EndLine:   105,
+			}},
+		},
+	}
+
+	for _, tc := range cases {
+		got := fileMatchToContextMatches(tc.fileMatch)
+		if diff := cmp.Diff(tc.want, got); diff != "" {
+			t.Errorf("mismatch (-want +got):\n%s", diff)
+		}
+	}
+}
--- a/internal/search/codycontext/query_transformer.go
+++ b/internal/search/codycontext/query_transformer.go
@ -31,11 +31,8 @@ func nodeToPatternsAndParameters(rootNode query.Node) ([]string, []query.Paramet
 		return nil, nil
 	}

-	patterns := []string{}
-	parameters := []query.Parameter{
-		// Only search file content
-		{Field: query.FieldType, Value: "file"},
-	}
+	var patterns []string
+	var parameters []query.Parameter

 	switch operator.Kind {
 	case query.And:
@ -49,7 +46,7 @@ func nodeToPatternsAndParameters(rootNode query.Node) ([]string, []query.Paramet
 				if op.Field == query.FieldContent {
 					// Split any content field on white space into a set of patterns
 					patterns = append(patterns, strings.Fields(op.Value)...)
-				} else if op.Field != query.FieldCase && op.Field != query.FieldType {
+				} else if op.Field != query.FieldCase {
 					parameters = append(parameters, op)
 				}
 			case query.Pattern:
--- a/internal/search/codycontext/query_transformer_test.go
+++ b/internal/search/codycontext/query_transformer_test.go
@ -50,32 +50,32 @@ func TestQueryStringToKeywordQuery(t *testing.T) {
 	}{
 		{
 			query:        "context:global abc",
-			wantQuery:    autogold.Expect("type:file context:global abc"),
+			wantQuery:    autogold.Expect("context:global abc"),
 			wantPatterns: autogold.Expect([]string{"abc"}),
 		},
 		{
 			query:        "abc def",
-			wantQuery:    autogold.Expect("type:file (abc OR def)"),
+			wantQuery:    autogold.Expect("(abc OR def)"),
 			wantPatterns: autogold.Expect([]string{"abc", "def"}),
 		},
 		{
 			query:        "context:global lang:Go how to unzip file",
-			wantQuery:    autogold.Expect("type:file context:global lang:Go (unzip OR file)"),
+			wantQuery:    autogold.Expect("context:global lang:Go (unzip OR file)"),
 			wantPatterns: autogold.Expect([]string{"unzip", "file"}),
 		},
 		{
 			query:        "K MEANS CLUSTERING in python",
-			wantQuery:    autogold.Expect("type:file (cluster OR python)"),
+			wantQuery:    autogold.Expect("(cluster OR python)"),
 			wantPatterns: autogold.Expect([]string{"cluster", "python"}),
 		},
 		{
 			query:        "context:global the who",
-			wantQuery:    autogold.Expect("type:file context:global"),
+			wantQuery:    autogold.Expect("context:global"),
 			wantPatterns: autogold.Expect([]string{}),
 		},
 		{
 			query:     `outer content:"inner {with} (special) ^characters$ and keywords like file or repo"`,
-			wantQuery: autogold.Expect("type:file (special OR ^characters$ OR keyword OR file OR repo OR outer)"),
+			wantQuery: autogold.Expect("(special OR ^characters$ OR keyword OR file OR repo OR outer)"),
 			wantPatterns: autogold.Expect([]string{
 				"special", "^characters$", "keyword", "file",
 				"repo",