BUG: Correct the input bytes data by langchain_openai xorbitsai#2589
xiyuan-lee committed Nov 28, 2024
1 parent 0d4cb9c commit c08fd71
Showing 1 changed file with 24 additions and 0 deletions.
xinference/model/embedding/core.py (+24 −0)
@@ -19,8 +19,10 @@
from typing import Dict, List, Literal, Optional, Tuple, Union, no_type_check

import numpy as np
import tiktoken
import torch

from ..._compat import ROOT_KEY, ErrorWrapper, ValidationError
from ...device_utils import empty_cache
from ...types import Embedding, EmbeddingData, EmbeddingUsage
from ..core import CacheableModelSpec, ModelDescription
@@ -224,6 +226,28 @@ def to(self, *args, **kwargs):
)

    def create_embedding(self, sentences: Union[str, List[str]], **kwargs):
        # Check if sentences is a two-dimensional list of integers
        if isinstance(sentences, list) and all(
            isinstance(item, list) and all(isinstance(i, int) for i in item)
            for item in sentences
        ):
            enc = tiktoken.get_encoding("cl100k_base")
            lines_decoded = []

            for line in sentences:
                try:
                    # Decode each token into bytes, then join them into a complete string
                    output = b"".join(
                        enc.decode_single_token_bytes(token) for token in line
                    )
                    # Convert the byte sequence into a UTF-8 encoded string
                    decoded_line = output.decode("utf-8")
                    lines_decoded.append(decoded_line)
                except (ValueError, TypeError, UnicodeDecodeError) as e:
                    raise ValidationError([ErrorWrapper(e, loc=ROOT_KEY)], self)

            # Update sentences to be the list of decoded strings
            sentences = lines_decoded
        from FlagEmbedding import BGEM3FlagModel
        from sentence_transformers import SentenceTransformer

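For reference, below is a minimal standalone sketch (not part of this commit; the example strings and variable names are made up) of the round trip the fix relies on: OpenAI-compatible clients such as langchain_openai may submit embedding inputs as lists of cl100k_base token IDs rather than raw strings, and tiktoken can decode those IDs back into text in the same way the patch does.

import tiktoken

enc = tiktoken.get_encoding("cl100k_base")

# What a client might send: one list of token IDs per input sentence.
token_batches = [
    enc.encode("What is Xinference?"),
    enc.encode("Embed this sentence."),
]

decoded = []
for line in token_batches:
    # Mirror the patch: decode each token ID to bytes, join, then read as UTF-8.
    raw = b"".join(enc.decode_single_token_bytes(token) for token in line)
    decoded.append(raw.decode("utf-8"))

print(decoded)  # ['What is Xinference?', 'Embed this sentence.']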
