mirror of
https://github.com/sourcegraph/sourcegraph.git
synced 2026-02-06 18:51:59 +00:00
Cody: expand default excluded context files (#50844)
By default, we try not to embed data files and most config files. While testing context fetching, several data and config files were still coming up. This PR conservatively expands the list of excluded file types. This helps improve context quality:
* Config files like `.gitattributes`, `.prettierignore`, etc. tend to pop up a lot in top vector search results. My theory is that config files tend to contain several paths and references to the Sourcegraph repo, which inflate their similarity.
* The embedding model is designed for text and code, not large data files like CSV.
This commit is contained in:
parent
ab9110fa77
commit
ea1bf60cd6
@ -2,7 +2,6 @@ package embed
|
||||
|
||||
import (
|
||||
"path/filepath"
|
||||
|
||||
"strings"
|
||||
|
||||
"github.com/sourcegraph/sourcegraph/enterprise/internal/paths"
|
||||
@ -21,9 +20,16 @@ var textFileExtensions = map[string]struct{}{
|
||||
}
|
||||
|
||||
var defaultExcludedFilePathPatterns = []string{
|
||||
".*ignore", // Files like .gitignore, .eslintignore
|
||||
".gitattributes",
|
||||
".mailmap",
|
||||
"*.csv",
|
||||
"*.sql",
|
||||
"*.svg",
|
||||
"*.json",
|
||||
"*.jsonc",
|
||||
"*.jsonl",
|
||||
"*.xml",
|
||||
"*.yml",
|
||||
"*.yaml",
|
||||
"__fixtures__/",
|
||||
|
||||
@ -19,9 +19,13 @@ func TestExcludingFilePaths(t *testing.T) {
|
||||
"vendor/README.md",
|
||||
"LICENSE.txt",
|
||||
"nested/vendor/file.py",
|
||||
".prettierignore",
|
||||
"client/web/.gitattributes",
|
||||
"no_ignore",
|
||||
"data/names.csv",
|
||||
}
|
||||
|
||||
expectedFiles := []string{"cool.go", "Dockerfile", "README.md", "LICENSE.txt"}
|
||||
expectedFiles := []string{"cool.go", "Dockerfile", "README.md", "LICENSE.txt", "no_ignore"}
|
||||
gotFiles := []string{}
|
||||
|
||||
excludedGlobPatterns := GetDefaultExcludedFilePathPatterns()
|
||||
|
||||
Loading…
Reference in New Issue
Block a user