Cody: expand default excluded context files (#50844)

By default, we try not to embed data files and most config files. While
testing context fetching, several data and config files were still
coming up. This PR conservatively expands the list of excluded file
types.

This helps improve context quality: 
* config files like `.gitattributes`, `.prettierignore`, etc. tend to
pop up a lot in top vector search results. My theory is that config
files tend to contain several paths and references to the Sourcegraph
repo, which inflate their similarity.
* the embedding model is designed for text and code, not large data
files like CSV.
This commit is contained in:
Julie Tibshirani 2023-04-18 18:32:16 -07:00 committed by GitHub
parent ab9110fa77
commit ea1bf60cd6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 12 additions and 2 deletions

View File

@ -2,7 +2,6 @@ package embed
import (
"path/filepath"
"strings"
"github.com/sourcegraph/sourcegraph/enterprise/internal/paths"
@ -21,9 +20,16 @@ var textFileExtensions = map[string]struct{}{
}
var defaultExcludedFilePathPatterns = []string{
".*ignore", // Files like .gitignore, .eslintignore
".gitattributes",
".mailmap",
"*.csv",
"*.sql",
"*.svg",
"*.json",
"*.jsonc",
"*.jsonl",
"*.xml",
"*.yml",
"*.yaml",
"__fixtures__/",

View File

@ -19,9 +19,13 @@ func TestExcludingFilePaths(t *testing.T) {
"vendor/README.md",
"LICENSE.txt",
"nested/vendor/file.py",
".prettierignore",
"client/web/.gitattributes",
"no_ignore",
"data/names.csv",
}
expectedFiles := []string{"cool.go", "Dockerfile", "README.md", "LICENSE.txt"}
expectedFiles := []string{"cool.go", "Dockerfile", "README.md", "LICENSE.txt", "no_ignore"}
gotFiles := []string{}
excludedGlobPatterns := GetDefaultExcludedFilePathPatterns()