Skip to content
This repository has been archived by the owner on Sep 30, 2024. It is now read-only.

Commit

Permalink
Embeddings: fail fast when embedding empty string (#53787)
Browse files Browse the repository at this point in the history
The OpenAI API fails when given an input array with any empty strings:

```
❯ curl https://api.openai.com/v1/embeddings \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer xxxxx" \
  -d '{
    "input": ["a", ""],
    "model": "text-embedding-ada-002"
  }'
{
  "error": {
    "message": "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.",
    "type": "invalid_request_error",
    "param": null,
    "code": null
  }
}
```

When using Cody Gateway, we have three layers of retries: 20 retries
from ExternalDoer in Cody Gateway, 20 retries from ExternalDoer in
`embeddings`, and 3 retries from `GetEmbeddingsWithRetries` for a total of
_1200 retries_. Instead of doing that, this just lets us fail fast. We
should separately fix the massive number of retries.
  • Loading branch information
camdencheek authored and ErikaRS committed Jun 22, 2023
1 parent 7850383 commit d015b2b
Show file tree
Hide file tree
Showing 6 changed files with 75 additions and 0 deletions.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,14 @@ type openaiClient struct {
const apiURL = "https://api.openai.com/v1/embeddings"

func (c *openaiClient) GenerateEmbeddings(ctx context.Context, input codygateway.EmbeddingsRequest) (*codygateway.EmbeddingsResponse, int, error) {
for _, s := range input.Input {
if s == "" {
// The OpenAI API will return an error if any of the strings in input.Input is an empty string,
// so fail fast to avoid making tons of retryable requests.
return nil, 0, response.NewHTTPStatusCodeError(http.StatusBadRequest, errors.New("cannot generate embeddings for an empty string"))
}
}

openAIModel, ok := openAIModelMappings[input.Model]
if !ok {
return nil, 0, response.NewHTTPStatusCodeError(http.StatusBadRequest, errors.Newf("no OpenAI model found for %q", input.Model))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
package embeddings

import (
"context"
"testing"

"github.com/sourcegraph/sourcegraph/internal/codygateway"
"github.com/stretchr/testify/require"
)

func TestOpenAI(t *testing.T) {
t.Run("errors on empty embedding string", func(t *testing.T) {
client := NewOpenAIClient("")
_, _, err := client.GenerateEmbeddings(context.Background(), codygateway.EmbeddingsRequest{
Input: []string{"a", ""}, // empty string is invalid
})
require.ErrorContains(t, err, "empty string")
})
}
11 changes: 11 additions & 0 deletions enterprise/internal/embeddings/embed/client/openai/BUILD.bazel

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions enterprise/internal/embeddings/embed/client/openai/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,14 @@ func (c *openaiEmbeddingsClient) GetModelIdentifier() string {
// In case of failure, it retries the embedding procedure up to maxRetries. This is due to the OpenAI API, which
// often hangs up when downloading large embedding responses.
func (c *openaiEmbeddingsClient) GetEmbeddingsWithRetries(ctx context.Context, texts []string, maxRetries int) ([]float32, error) {
for _, text := range texts {
if text == "" {
// The OpenAI API will return an error if any of the strings in texts is an empty string,
// so fail fast to avoid making tons of retryable requests.
return nil, errors.New("cannot generate embeddings for an empty string")
}
}

embeddings, err := c.getEmbeddings(ctx, texts)
if err == nil {
return embeddings, nil
Expand Down
18 changes: 18 additions & 0 deletions enterprise/internal/embeddings/embed/client/openai/client_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
package openai

import (
"context"
"testing"

"github.com/sourcegraph/sourcegraph/internal/conf/conftypes"
"github.com/stretchr/testify/require"
)

// TestOpenAI exercises the OpenAI embeddings client. It checks that asking for
// embeddings of a batch containing an empty string yields an error that
// mentions "empty string".
func TestOpenAI(t *testing.T) {
	t.Run("errors on empty embedding string", func(t *testing.T) {
		ctx := context.Background()
		embedder := NewClient(&conftypes.EmbeddingsConfig{})

		// The trailing empty string makes this batch invalid.
		_, err := embedder.GetEmbeddingsWithRetries(ctx, []string{"a", ""}, 10)
		require.ErrorContains(t, err, "empty string")
	})
}

0 comments on commit d015b2b

Please sign in to comment.