diff --git a/deps.bzl b/deps.bzl index d01d9311daf..5af2f7639b5 100644 --- a/deps.bzl +++ b/deps.bzl @@ -5676,8 +5676,8 @@ def go_dependencies(): patches = [ "//third_party/com_github_sourcegraph_zoekt:x_defs_version.patch", ], - sum = "h1:aXHLpH1rhdvg4gQOiQWLkqVd3D/DG2li5Nnf6WE7mRs=", - version = "v0.0.0-20240501072156-72f95004e6d6", + sum = "h1:eQIFTvf8qZcSLhgu5NrprfwgtJEqU9tvGXG8sf1SSgU=", + version = "v0.0.0-20240507175742-4e674a49795c", ) go_repository( name = "com_github_spaolacci_murmur3", diff --git a/go.mod b/go.mod index d9c254c0c97..fd7253e2222 100644 --- a/go.mod +++ b/go.mod @@ -602,7 +602,7 @@ require ( github.com/scim2/filter-parser/v2 v2.2.0 github.com/sourcegraph/conc v0.3.1-0.20240108182409-4afefce20f9b github.com/sourcegraph/mountinfo v0.0.0-20240201124957-b314c0befab1 - github.com/sourcegraph/zoekt v0.0.0-20240501072156-72f95004e6d6 + github.com/sourcegraph/zoekt v0.0.0-20240507175742-4e674a49795c github.com/spf13/cobra v1.8.0 // indirect github.com/spf13/pflag v1.0.5 // indirect github.com/stretchr/objx v0.5.2 // indirect diff --git a/go.sum b/go.sum index d9b0a0b2140..c59b63b6683 100644 --- a/go.sum +++ b/go.sum @@ -1743,8 +1743,8 @@ github.com/sourcegraph/sourcegraph-accounts-sdk-go v0.0.0-20240426173441-db5b0a1 github.com/sourcegraph/sourcegraph-accounts-sdk-go v0.0.0-20240426173441-db5b0a145ceb/go.mod h1:xul4Fiph3Pvdx/1qsmhCUL2GBeYjTcnga0LXZEbKdGo= github.com/sourcegraph/yaml v1.0.1-0.20200714132230-56936252f152 h1:z/MpntplPaW6QW95pzcAR/72Z5TWDyDnSo0EOcyij9o= github.com/sourcegraph/yaml v1.0.1-0.20200714132230-56936252f152/go.mod h1:GIjDIg/heH5DOkXY3YJ/wNhfHsQHoXGjl8G8amsYQ1I= -github.com/sourcegraph/zoekt v0.0.0-20240501072156-72f95004e6d6 h1:aXHLpH1rhdvg4gQOiQWLkqVd3D/DG2li5Nnf6WE7mRs= -github.com/sourcegraph/zoekt v0.0.0-20240501072156-72f95004e6d6/go.mod h1:K7dYKxtKLPBRwu55Useje/JUZEuWgzlu5O1F8VFHfwE= +github.com/sourcegraph/zoekt v0.0.0-20240507175742-4e674a49795c h1:eQIFTvf8qZcSLhgu5NrprfwgtJEqU9tvGXG8sf1SSgU= 
+github.com/sourcegraph/zoekt v0.0.0-20240507175742-4e674a49795c/go.mod h1:K7dYKxtKLPBRwu55Useje/JUZEuWgzlu5O1F8VFHfwE= github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= github.com/spf13/afero v0.0.0-20170901052352-ee1bd8ee15a1/go.mod h1:j4pytiNVoe2o6bmDsKpLACNPDBIoEAkihy7loJ1B0CQ= github.com/spf13/afero v1.1.2/go.mod h1:j4pytiNVoe2o6bmDsKpLACNPDBIoEAkihy7loJ1B0CQ= diff --git a/internal/conf/computed.go b/internal/conf/computed.go index 18e9bfb17d6..e810982b443 100644 --- a/internal/conf/computed.go +++ b/internal/conf/computed.go @@ -476,13 +476,13 @@ func RankingMaxQueueSizeBytes() int { // SearchFlushWallTime controls the amount of time that Zoekt shards collect and rank results. For // larger codebases, it can be helpful to increase this to improve the ranking stability and quality. -func SearchFlushWallTime(keywordScoring bool) time.Duration { +func SearchFlushWallTime(bm25Scoring bool) time.Duration { ranking := ExperimentalFeatures().Ranking if ranking != nil && ranking.FlushWallTimeMS > 0 { return time.Duration(ranking.FlushWallTimeMS) * time.Millisecond } else { - if keywordScoring { - // Keyword scoring takes longer than standard searches, so use a higher FlushWallTime + if bm25Scoring { + // BM25 scoring takes longer than standard searches, so use a higher FlushWallTime // to help ensure ranking is stable return 2 * time.Second } else { diff --git a/internal/search/backend/metered_searcher.go b/internal/search/backend/metered_searcher.go index c747f8b3282..449f5800149 100644 --- a/internal/search/backend/metered_searcher.go +++ b/internal/search/backend/metered_searcher.go @@ -78,7 +78,7 @@ func (m *meteredSearcher) StreamSearch(ctx context.Context, q query.Q, opts *zoe attribute.Bool("opts.chunk_matches", opts.ChunkMatches), attribute.Bool("opts.use_document_ranks", opts.UseDocumentRanks), attribute.Float64("opts.document_ranks_weight", opts.DocumentRanksWeight), - 
attribute.Bool("opts.use_keyword_scoring", opts.UseKeywordScoring), + attribute.Bool("opts.use_bm25_scoring", opts.UseBM25Scoring), attribute.Bool("opts.debug_score", opts.DebugScore), )...) } diff --git a/internal/search/types.go b/internal/search/types.go index 9cb3a24aa8e..3a20befeef4 100644 --- a/internal/search/types.go +++ b/internal/search/types.go @@ -185,24 +185,21 @@ func (o *ZoektParameters) ToSearchOptions(ctx context.Context) (searchOpts *zoek defaultTimeout := 20 * time.Second searchOpts = &zoekt.SearchOptions{ - Trace: policy.ShouldTrace(ctx), - MaxWallTime: defaultTimeout, - ChunkMatches: true, - UseKeywordScoring: o.PatternType == query.SearchTypeCodyContext, - NumContextLines: o.NumContextLines, + Trace: policy.ShouldTrace(ctx), + MaxWallTime: defaultTimeout, + ChunkMatches: true, + UseBM25Scoring: o.PatternType == query.SearchTypeCodyContext, + NumContextLines: o.NumContextLines, } // These are reasonable default amounts of work to do per shard and // replica respectively. searchOpts.ShardMaxMatchCount = 10_000 searchOpts.TotalMaxMatchCount = 100_000 - // KeywordScoring and Features.UseZoektParser represent different approaches we - // are evaluating to deliver a better keyword-based search experience. For now - // these are separate, but we might combine them in the future. Both profit from - // higher defaults. - if searchOpts.UseKeywordScoring || o.PatternType == query.SearchTypeKeyword { - // Keyword searches tends to match much more broadly than code searches, so we need to - // consider more candidates to ensure we don't miss highly-ranked documents + // Keyword searches tend to match much more broadly than code searches, so we need to + // consider more candidates to ensure we don't miss highly-ranked documents. The same + // holds for BM25 scoring, which is used for Cody context searches. 
+ if searchOpts.UseBM25Scoring || o.PatternType == query.SearchTypeKeyword { searchOpts.ShardMaxMatchCount *= 10 searchOpts.TotalMaxMatchCount *= 10 } @@ -232,7 +229,7 @@ func (o *ZoektParameters) ToSearchOptions(ctx context.Context) (searchOpts *zoek // This enables our stream based ranking, where we wait a certain amount // of time to collect results before ranking. - searchOpts.FlushWallTime = conf.SearchFlushWallTime(searchOpts.UseKeywordScoring) + searchOpts.FlushWallTime = conf.SearchFlushWallTime(searchOpts.UseBM25Scoring) // Only use document ranks if the jobs to calculate the ranks are enabled. This // is to make sure we don't use outdated ranks for scoring in Zoekt. diff --git a/internal/search/types_test.go b/internal/search/types_test.go index 812c43f251d..3e429ad3bd0 100644 --- a/internal/search/types_test.go +++ b/internal/search/types_test.go @@ -151,7 +151,7 @@ func TestZoektParameters(t *testing.T) { }, }, { - name: "test keyword scoring", + name: "test bm25 scoring", context: context.Background(), params: &ZoektParameters{ FileMatchLimit: limits.DefaultMaxSearchResultsStreaming, @@ -161,11 +161,11 @@ func TestZoektParameters(t *testing.T) { ShardMaxMatchCount: 100000, TotalMaxMatchCount: 1000000, MaxWallTime: 20000000000, - FlushWallTime: 2000000000, // for keyword search, default is 2 sec + FlushWallTime: 2000000000, // for bm25 scoring, default is 2 sec MaxDocDisplayCount: 10000, ChunkMatches: true, DocumentRanksWeight: 4500, - UseKeywordScoring: true}, + UseBM25Scoring: true}, }, }