From ea1bf60cd61e41bd0150fe9801071a47bc32385e Mon Sep 17 00:00:00 2001 From: Julie Tibshirani Date: Tue, 18 Apr 2023 18:32:16 -0700 Subject: [PATCH] Cody: expand default excluded context files (#50844) By default, we try not to embed data files and most config files. While testing context fetching, several data and config files were still coming up. This PR conservatively expands the list of excluded file types. This helps improve context quality: * config files like `.gitattributes`, `.prettierignore`, etc. tend to pop up a lot in top vector search results. My theory is that config files tend to contain several paths and references to the sourcegraph repo which inflate their similarity * the embedding model is designed for text and code, not large data files like csv --- enterprise/internal/embeddings/embed/files.go | 8 +++++++- enterprise/internal/embeddings/embed/files_test.go | 6 +++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/enterprise/internal/embeddings/embed/files.go b/enterprise/internal/embeddings/embed/files.go index f08720634c8..dbf4daa5d57 100644 --- a/enterprise/internal/embeddings/embed/files.go +++ b/enterprise/internal/embeddings/embed/files.go @@ -2,7 +2,6 @@ package embed import ( "path/filepath" - "strings" "github.com/sourcegraph/sourcegraph/enterprise/internal/paths" @@ -21,9 +20,16 @@ var textFileExtensions = map[string]struct{}{ } var defaultExcludedFilePathPatterns = []string{ + ".*ignore", // Files like .gitignore, .eslintignore + ".gitattributes", + ".mailmap", + "*.csv", "*.sql", "*.svg", "*.json", + "*.jsonc", + "*.jsonl", + "*.xml", "*.yml", "*.yaml", "__fixtures__/", diff --git a/enterprise/internal/embeddings/embed/files_test.go b/enterprise/internal/embeddings/embed/files_test.go index 6c2feafcd01..91d325fd1e2 100644 --- a/enterprise/internal/embeddings/embed/files_test.go +++ b/enterprise/internal/embeddings/embed/files_test.go @@ -19,9 +19,13 @@ func TestExcludingFilePaths(t *testing.T) { 
"vendor/README.md", "LICENSE.txt", "nested/vendor/file.py", + ".prettierignore", + "client/web/.gitattributes", + "no_ignore", + "data/names.csv", } - expectedFiles := []string{"cool.go", "Dockerfile", "README.md", "LICENSE.txt"} + expectedFiles := []string{"cool.go", "Dockerfile", "README.md", "LICENSE.txt", "no_ignore"} gotFiles := []string{} excludedGlobPatterns := GetDefaultExcludedFilePathPatterns()