mirror of
https://github.com/sourcegraph/sourcegraph.git
synced 2026-02-06 18:51:59 +00:00
embeddings: replace newlines before embedding for OpenAI models only (#52183)
We replace newlines with spaces for OpenAI embedding models only. This allows using other (OSS) models, which do not have this requirement. The behavior follows OpenAI's Python library: https://sourcegraph.com/github.com/openai/openai-python/-/blob/openai/embeddings_utils.py?L46

## Test plan

* Manual testing
This commit is contained in:
parent
db2fee355b
commit
08bce28d41
@ -82,11 +82,19 @@ func (c *embeddingsClient) GetEmbeddingsWithRetries(ctx context.Context, texts [
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var MODELS_WITHOUT_NEWLINES = map[string]struct{}{
|
||||
"text-embedding-ada-002": {},
|
||||
}
|
||||
|
||||
func GetEmbeddings(ctx context.Context, texts []string, config *schema.Embeddings) ([]float32, error) {
|
||||
// Replace newlines, which can negatively affect performance.
|
||||
augmentedTexts := make([]string, len(texts))
|
||||
for idx, text := range texts {
|
||||
augmentedTexts[idx] = strings.ReplaceAll(text, "\n", " ")
|
||||
_, replaceNewlines := MODELS_WITHOUT_NEWLINES[config.Model]
|
||||
augmentedTexts := texts
|
||||
if replaceNewlines {
|
||||
augmentedTexts = make([]string, len(texts))
|
||||
// Replace newlines for certain (OpenAI) models, because they can negatively affect performance.
|
||||
for idx, text := range texts {
|
||||
augmentedTexts[idx] = strings.ReplaceAll(text, "\n", " ")
|
||||
}
|
||||
}
|
||||
|
||||
request := EmbeddingAPIRequest{Model: config.Model, Input: augmentedTexts}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user