diff --git a/.github/actions/python-build/action.yml b/.github/actions/python-build/action.yml index a4f4595..f1968c1 100644 --- a/.github/actions/python-build/action.yml +++ b/.github/actions/python-build/action.yml @@ -34,6 +34,8 @@ runs: - name: Run test shell: bash + env: + WHYLABS_API_KEY: ${{ secrets.WHYLABS_API_KEY }} # FIXME(review): the 'secrets' context is NOT available inside composite actions — this resolves to empty; declare an action input and pass the secret from the calling workflow instead run: make test - name: Make dists diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index dd62462..6f42e35 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -84,4 +84,6 @@ jobs: uses: docker/setup-buildx-action@v3 - name: Run test + env: + WHYLABS_API_KEY: ${{ secrets.WHYLABS_API_KEY }} run: make test-cache-constraint diff --git a/langkit/core/metric.py b/langkit/core/metric.py index 150251b..1f826ab 100644 --- a/langkit/core/metric.py +++ b/langkit/core/metric.py @@ -41,7 +41,7 @@ def to_numpy(self, column_name: str) -> np.ndarray[Any, Any]: def to_list(self, column_name: str) -> List[Any]: if column_name not in self.text: - raise ValueError(f"Column {column_name} not found in {self.text}") + raise KeyError(f"Column {column_name} not found in {self.text}") if isinstance(self.text, pd.DataFrame): col = cast("pd.Series[Any]", self.text[column_name]) diff --git a/langkit/metrics/embeddings_types.py b/langkit/metrics/embeddings_types.py index 5459b11..f97d528 100644 --- a/langkit/metrics/embeddings_types.py +++ b/langkit/metrics/embeddings_types.py @@ -15,7 +15,8 @@ def encode(self, text: Tuple[str, ...]) -> "torch.Tensor": class EmbeddingEncoder(Protocol): - def encode(self, text: Tuple[str, ...]) -> "torch.Tensor": ... + def encode(self, text: Tuple[str, ...]) -> "torch.Tensor": + ... 
class CachingEmbeddingEncoder(EmbeddingEncoder): diff --git a/langkit/metrics/embeddings_utils.py b/langkit/metrics/embeddings_utils.py index 1f1bbbc..eb8e650 100644 --- a/langkit/metrics/embeddings_utils.py +++ b/langkit/metrics/embeddings_utils.py @@ -3,10 +3,10 @@ import torch import torch.nn.functional as F -from langkit.metrics.embeddings_types import TransformerEmbeddingAdapter +from langkit.metrics.embeddings_types import EmbeddingEncoder -def compute_embedding_similarity(encoder: TransformerEmbeddingAdapter, _in: List[str], _out: List[str]) -> torch.Tensor: +def compute_embedding_similarity(encoder: EmbeddingEncoder, _in: List[str], _out: List[str]) -> torch.Tensor: in_encoded = torch.as_tensor(encoder.encode(tuple(_in))) out_encoded = torch.as_tensor(encoder.encode(tuple(_out))) return F.cosine_similarity(in_encoded, out_encoded, dim=1) diff --git a/langkit/metrics/injections.py b/langkit/metrics/injections.py index bc94543..031dce7 100644 --- a/langkit/metrics/injections.py +++ b/langkit/metrics/injections.py @@ -81,11 +81,11 @@ def udf(text: pd.DataFrame) -> SingleMetricResult: input_series: "pd.Series[str]" = cast("pd.Series[str]", text[column_name]) if onnx: - _transformer = embedding_adapter() # onnx - target_embeddings: npt.NDArray[np.float32] = _transformer.encode(tuple(input_series)).numpy() # onnx + _transformer = embedding_adapter() + target_embeddings: npt.NDArray[np.float32] = _transformer.encode(tuple(input_series)).numpy() else: _transformer = sentence_transformer() - target_embeddings: npt.NDArray[np.float32] = _transformer.encode(tuple(input_series), show_progress_bar=False) + target_embeddings: npt.NDArray[np.float32] = _transformer.encode(list(input_series), show_progress_bar=False) # pyright: ignore[reportAssignmentType, reportUnknownMemberType] target_norms = target_embeddings / np.linalg.norm(target_embeddings, axis=1, keepdims=True) cosine_similarities = np.dot(_embeddings, target_norms.T) diff --git a/langkit/onnx_encoder.py 
b/langkit/onnx_encoder.py index eb85c41..7b6364b 100644 --- a/langkit/onnx_encoder.py +++ b/langkit/onnx_encoder.py @@ -1,14 +1,11 @@ # pyright: reportUnknownVariableType=false, reportUnknownMemberType=false, reportUnknownParameterType=false -import time from enum import Enum from functools import lru_cache -from os import environ from typing import Any, List, Tuple, cast import numpy as np import onnxruntime as ort # pyright: ignore[reportMissingImports] import torch -from psutil import cpu_count from transformers import BertTokenizerFast from langkit.asset_downloader import get_asset @@ -17,19 +14,7 @@ @lru_cache def _get_inference_session(onnx_file_path: str): - cpus = cpu_count(logical=True) - # environ["OMP_NUM_THREADS"] = str(cpus) - # environ["OMP_WAIT_POLICY"] = "ACTIVE" - sess_opts: ort.SessionOptions = ort.SessionOptions() - # sess_opts.enable_cpu_mem_arena = True - # sess_opts.inter_op_num_threads = cpus - # sess_opts.intra_op_num_threads = 1 - # sess_opts.execution_mode = ort.ExecutionMode.ORT_PARALLEL - - # sess_opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL - # sess_opts.enable_mem_pattern = True - - return ort.InferenceSession(onnx_file_path, providers=["CPUExecutionProvider"], sess_options=sess_opts) # pyright: ignore[reportUnknownArgumentType] + return ort.InferenceSession(onnx_file_path, providers=["CPUExecutionProvider"]) # pyright: ignore[reportUnknownArgumentType] class TransformerModel(Enum): @@ -40,9 +25,6 @@ def get_model_path(self): return f"{get_asset(name, tag)}/{name}.onnx" -# _times: List[float] = [] - - class OnnxSentenceTransformer(EmbeddingEncoder): def __init__(self, model: TransformerModel): self._tokenizer: BertTokenizerFast = cast(BertTokenizerFast, BertTokenizerFast.from_pretrained("bert-base-uncased")) @@ -50,15 +32,16 @@ def __init__(self, model: TransformerModel): self._session = _get_inference_session(model.get_model_path()) def encode(self, text: Tuple[str, ...]) -> "torch.Tensor": - model_inputs 
= self._tokenizer.batch_encode_plus(list(text), return_tensors="pt", padding=True, truncation=True) + # Pre-truncate the inputs to the model length for better performance + max_length_in_chars = self._tokenizer.model_max_length * 5 # approx limit + truncated_text = tuple(content[:max_length_in_chars] for content in text) + model_inputs = self._tokenizer.batch_encode_plus(list(truncated_text), return_tensors="pt", padding=True, truncation=True) + input_tensor: torch.Tensor = cast(torch.Tensor, model_inputs["input_ids"]) inputs_onnx = {"input_ids": input_tensor.cpu().numpy()} attention_mask: torch.Tensor = cast(torch.Tensor, model_inputs["attention_mask"]) inputs_onnx["attention_mask"] = attention_mask.cpu().detach().numpy().astype(np.float32) - start_time = time.perf_counter() onnx_output: List['np.ndarray["Any", "Any"]'] = cast(List['np.ndarray["Any", "Any"]'], self._session.run(None, inputs_onnx)) - # _times.append(time.perf_counter() - start_time) - # print(f"Average time: {sum(_times) / len(_times)}") embedding = OnnxSentenceTransformer.mean_pooling(onnx_output=onnx_output, attention_mask=attention_mask) return embedding[0] diff --git a/langkit/transformer.py b/langkit/transformer.py index e037f49..7e4e0bf 100644 --- a/langkit/transformer.py +++ b/langkit/transformer.py @@ -1,7 +1,7 @@ from functools import lru_cache -import torch from typing import Tuple +import torch from sentence_transformers import SentenceTransformer from langkit.metrics.embeddings_types import CachingEmbeddingEncoder, EmbeddingEncoder