From 2d8e5a23c95673e42afadca77568ac55dcec8391 Mon Sep 17 00:00:00 2001
From: Michael Feil <63565275+michaelfeil@users.noreply.github.com>
Date: Fri, 15 Mar 2024 19:47:52 -0700
Subject: [PATCH] update docs (#140)

---
 README.md                                     |  2 +-
 docs/docs/benchmarking.md                     | 14 ++--
 docs/docs/contribution.md                     |  4 +-
 docs/docs/deploy.md                           |  8 ++-
 docs/docs/index.md                            |  2 +-
 docs/docs/integrations.md                     | 12 ++--
 docs/docs/python_engine.md                    | 69 ++++++++++---------
 .../infinity_emb/fastapi_schemas/pymodels.py  |  2 +-
 .../infinity_emb/infinity_server.py           | 22 +++---
 .../end_to_end/test_api_with_dummymodel.py    |  4 +-
 .../tests/end_to_end/test_ct2_sentence.py     |  4 +-
 .../tests/end_to_end/test_fastembed.py        |  4 +-
 .../end_to_end/test_optimum_embedding.py      |  4 +-
 .../end_to_end/test_sentence_transformers.py  |  4 +-
 14 files changed, 83 insertions(+), 72 deletions(-)

diff --git a/README.md b/README.md
index b1420ed8..6196a8eb 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@
 [![ci][ci-shield]][ci-url]
 [![Downloads][pepa-shield]][pepa-url]
 
-Infinity is a high-throughput, low-latency REST API for serving vector embeddings, supporting all sentence-transformer models and frameworks. Infinity is developed under [MIT Licence](https://github.com/michaelfeil/infinity/blob/main/LICENSE). Infinity powers inference behind [Gradient.ai](https://gradient.ai).
+Infinity is a high-throughput, low-latency REST API for serving vector embeddings, supporting all sentence-transformer models and frameworks. Infinity is developed under [MIT License](https://github.com/michaelfeil/infinity/blob/main/LICENSE). Infinity powers inference behind [Gradient.ai](https://gradient.ai).
 
 ## Why Infinity:
 Infinity provides the following features:
diff --git a/docs/docs/benchmarking.md b/docs/docs/benchmarking.md
index ae78f81c..97eee8ff 100644
--- a/docs/docs/benchmarking.md
+++ b/docs/docs/benchmarking.md
@@ -1,12 +1,12 @@
 # Benchmarking details
 
-Benchmarks are always optionated. The goal of this benchmark is to find the best possible self-hosted backend for $/token:
+Benchmarks are always opinionated. The goal of this benchmark is to find the best possible self-hosted backend for $/token:
 
 1. end-to-end, including the RestAPI server
 2. multi-tenant: multiple clients will try to query your server
 3. fair batch size: You want to limit request size (sentences per requests) to something low, such that you can load balance requests, scale
-4. measured over throughput per token: Idle servers are bad for buissness (especially since ). This benchmark is NOT about the latency for a single request against an IDLE server. It partially evaluates the latency under a typical load scenario
-5. Bert Small / large - the most typical semantic search tasks require a small model (< 1B params)
+4. measured over throughput per token: Idle servers are bad for business. This benchmark is NOT about the latency for a single request against an IDLE server. It partially evaluates the latency under a typical load scenario
+5. Bert small / large - the most typical semantic search tasks require a small model (< 1B params)
 6. accuracy: each backend must have a ~1e-4 prevision over the torch fp32 embeddings.
 
 ## Benchmarking machines:
@@ -43,7 +43,7 @@ python ./docs/benchmarks/simple_app.py
 
 ### huggingface/text-embeddings-inference
 
-using the _cpu_ and _89-cuda_ container (note that cc-89 matches to Nvidia L4)
+using the _cpu_ and _cuda-89_ container (note that cc-89 corresponds to Nvidia L4)
 
 ```bash
 docker run -it -p 7997:80 --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-0.6 --model-id BAAI/bge-small-en-v1.5 --max-client-batch-size 256
@@ -69,8 +69,8 @@ make benchmark_embed
 ```
 
 Below are the following metrics:
-- Requests # / sec (1 request = 256 sentences / 115_000 tokens)
-- time to run benchmark (10 requests / 1_150_000)
+* Requests # / sec (1 request = 256 sentences / 115_000 tokens)
+* time to run benchmark (10 requests = 1_150_000 tokens)
 
 ### Results: CPU-only (_BAAI/bge-small-en-v1.5_ | _bert-small_)
 
@@ -80,7 +80,7 @@ Below are the following metrics:
 | infinity-optimum (onnx) | 125.342 | 0.08 |
 | fastembed (onnx) | 125.770 | 0.08 |
 | sentence-transformers (torch) | 256.884 | 0.04 |
-| infinity (torch / compile) | 353.065?? | 0.03??? |
+| infinity (torch) | 353.065?? | 0.03 (needs revision) |
 | huggingface/TEI (candle) | 1104.357 | 0.009 |
 
 
diff --git a/docs/docs/contribution.md b/docs/docs/contribution.md
index 7ccc4093..672f5ce2 100644
--- a/docs/docs/contribution.md
+++ b/docs/docs/contribution.md
@@ -10,14 +10,14 @@ cd libs/infinity_emb
 poetry install --extras all --with test
 ```
 
-To pass the CI:
+To ensure your contributions pass the Continuous Integration (CI) checks:
 ```bash
 cd libs/infinity_emb
 make format
 make lint
 poetry run pytest ./tests
 ```
-as alternative, you may also use:
+As an alternative, you can also use the following command:
 ```bash
 cd libs/infinity_emb
 make precommit
diff --git a/docs/docs/deploy.md b/docs/docs/deploy.md
index 7676227c..b7a3e179 100644
--- a/docs/docs/deploy.md
+++ b/docs/docs/deploy.md
@@ -1,6 +1,9 @@
 # Deployment
 
 ### Docker: Launch the CLI using a pre-built docker container
+
+Launch the Infinity model using a pre-built Docker container by running the following command. This command uses Docker to run the Infinity CLI with the specified model and port. The optional `HF_HOME` environment variable allows you to control the download path at runtime.
+
 ```bash
 model=BAAI/bge-small-en-v1.5
 port=7997
@@ -8,7 +11,7 @@ docker run \
 -it --gpus all -p $port:$port michaelf34/infinity:latest \
 --model-name-or-path $model --port $port
 ```
-The download path at runtime, can be controlled via the environment variable `HF_HOME`.
+
 
 ### dstack
 dstack allows you to provision a VM instance on the cloud of your choice.
@@ -25,8 +28,7 @@ commands:
 port: 80
 ```
 
-Then, simply run the following dstack command.
-After this, a prompt will appear to let you choose which VM instance to deploy the Infinity.
+To deploy the service, execute the following dstack command. A prompt will guide you through selecting the desired VM instance for deploying Infinity.
 
 ```shell
 dstack run . -f infinity/serve.dstack.yml --gpu 16GB
diff --git a/docs/docs/index.md b/docs/docs/index.md
index 5ac0d667..a7774ce3 100644
--- a/docs/docs/index.md
+++ b/docs/docs/index.md
@@ -1,4 +1,4 @@
-Infinity is a high-throughput, low-latency REST API for serving vector embeddings, supporting all sentence-transformer models and frameworks. Infinity is developed under [MIT Licence](https://github.com/michaelfeil/infinity/blob/main/LICENSE). Infinity powers inference behind [Gradient.ai](https://gradient.ai).
+Infinity is a high-throughput, low-latency REST API for serving vector embeddings, supporting all sentence-transformer models and frameworks. Infinity is developed under [MIT License](https://github.com/michaelfeil/infinity/blob/main/LICENSE). Infinity powers inference behind [Gradient.ai](https://gradient.ai).
 
 ## Why Infinity:
 
diff --git a/docs/docs/integrations.md b/docs/docs/integrations.md
index be3b627f..73266240 100644
--- a/docs/docs/integrations.md
+++ b/docs/docs/integrations.md
@@ -1,11 +1,11 @@
 # Python Integrations
 
-## Langchain (from runnig server)
-Infinity has a official integration into `pip install langchain>=0.342`.
+## Langchain (from running server)
+Infinity has an official integration with Langchain, available via `pip install langchain>=0.342`.
 You can find more documentation on that here:
 https://python.langchain.com/docs/integrations/text_embedding/infinity
 
-### Server-Client
+### Langchain integration with a running infinity API server
 This code snippet assumes you have a server running at `http://localhost:7997/v1`
 ```python
 from langchain.embeddings.infinity import InfinityEmbeddings
@@ -14,10 +14,10 @@ from langchain.docstore.document import Document
 
 documents = [Document(page_content="Hello world!", metadata={"source": "unknown"})]
 emb_model = InfinityEmbeddings(model="BAAI/bge-small", infinity_api_url="http://localhost:7997/v1")
-print(emb_model.embed_documents([doc.page_content for doc in docs]))
+print(emb_model.embed_documents([doc.page_content for doc in documents]))
 ```
 
-### from Python Engine
+### Langchain integration without a running infinity API server (local Python inference)
 ```python
 from langchain.embeddings.infinity import InfinityEmbeddings
 from langchain.docstore.document import Document
@@ -47,4 +47,4 @@ print(documents_embedded, query_result)
 ```
 
 ## LLama-Index
-To be announced
\ No newline at end of file
+Details regarding the LLama-Index integration will be announced soon. Contributions are welcome.
\ No newline at end of file
diff --git a/docs/docs/python_engine.md b/docs/docs/python_engine.md
index ea8ecc3e..a1538932 100644
--- a/docs/docs/python_engine.md
+++ b/docs/docs/python_engine.md
@@ -1,74 +1,79 @@
-# Python Engine
+# Python Engine Integration
 
-## Launch via Python
+## Launching Embedding generation with Python
 
-You can use in a async context with asyncio.
-This gives you most flexibility, but is a bit more advanced.
+Use asynchronous programming with `asyncio` for flexible and efficient embedding processing with Infinity. This advanced method allows for concurrent execution, making it ideal for high-throughput embedding generation.
 
 ```python
 import asyncio
 from infinity_emb import AsyncEmbeddingEngine, EngineArgs
 
-sentences = [
-    "Embed this is sentence via Infinity.",
-    "Paris is in France."
-]
+# Define sentences for embedding
+sentences = ["Embed this sentence via Infinity.", "Paris is in France."]
 
+# Initialize the embedding engine with model specifications
 engine = AsyncEmbeddingEngine.from_args(
-    EngineArgs(model_name_or_path = "BAAI/bge-small-en-v1.5", engine="torch")
+    EngineArgs(model_name_or_path="BAAI/bge-small-en-v1.5", engine="torch",
+               lengths_via_tokenize=True
+    )
 )
 
 async def main():
-    async with engine:
-        # entering context: engine starts with engine.astart()
-        embeddings, usage = await engine.embed(
-            sentences=sentences)
-        # engine stops with engine.astop()
+    async with engine:  # Context manager initializes and terminates the engine
+        # usage is total token count according to tokenizer.
+        embeddings, usage = await engine.embed(sentences=sentences)
+        # Embeddings are now available for use
 asyncio.run(main())
 ```
 
-# ReRanker
+## Reranker
 
-Reranking gives you a score for similarity between a query and multiple documents.
-Use it in conjunction with a VectorDB+Embeddings, or as standalone for small amount of documents.
-Please select a model from huggingface that is a AutoModelForSequenceClassification with one class classification.
+Enhance search results by reranking based on the similarity between a query and a set of documents. This feature is particularly useful in conjunction with vector databases and embeddings, or as a standalone solution for small datasets. Ensure you choose a Hugging Face model designed for sequence classification with a single output class, e.g. "BAAI/bge-reranker-base". Further suitable models can be found on Hugging Face by searching for `rerank`: https://huggingface.co/models?pipeline_tag=text-classification&sort=trending&search=rerank.
 
 ```python
 import asyncio
 from infinity_emb import AsyncEmbeddingEngine, EngineArgs
+
+# Define your query and documents
 query = "What is the python package infinity_emb?"
 docs = [
-    "This is a document not related to the python package infinity_emb, hence...",
+    "This document is unrelated to the python package infinity_emb.",
     "Paris is in France!",
-    "infinity_emb is a package for sentence embeddings"
+    "infinity_emb is a package for generating sentence embeddings."
 ]
-engine_args = EngineArgs(
-    model_name_or_path = "BAAI/bge-reranker-base",
-    engine="torch")
+# Configure the reranking engine
+engine_args = EngineArgs(model_name_or_path="BAAI/bge-reranker-base", engine="torch")
 engine = AsyncEmbeddingEngine.from_args(engine_args)
+
 async def main():
     async with engine:
-        ranking, usage = await engine.rerank(
-            query=query, docs=docs)
+        ranking, usage = await engine.rerank(query=query, docs=docs)
+        # Display ranked documents
         print(list(zip(ranking, docs)))
 asyncio.run(main())
 ```
 
-# Text-Classification (Beta)
-
+## Text Classification (Beta)
+
+Explore text classification with Infinity's `classify` feature, which allows for sentiment analysis, emotion detection, and other classification tasks. Apply pre-trained classification models to your text data.
+
 ```python
 import asyncio
 from infinity_emb import AsyncEmbeddingEngine, EngineArgs
 
+# Example sentences for classification
 sentences = ["This is awesome.", "I am bored."]
 
+# Setup engine with text classification model
 engine_args = EngineArgs(
-    model_name_or_path = "SamLowe/roberta-base-go_emotions",
+    model_name_or_path="SamLowe/roberta-base-go_emotions",
     engine="torch",
     model_warmup=True)
 engine = AsyncEmbeddingEngine.from_args(engine_args)
+
 async def main():
     async with engine:
-        predictions, usage = await engine.classify(
-            sentences=sentences)
+        predictions, usage = await engine.classify(sentences=sentences)
+        # Access classification predictions
         return predictions, usage
 asyncio.run(main())
-```
\ No newline at end of file
+```
+
diff --git a/libs/infinity_emb/infinity_emb/fastapi_schemas/pymodels.py b/libs/infinity_emb/infinity_emb/fastapi_schemas/pymodels.py
index 8c150b7e..c9d6bd8b 100644
--- a/libs/infinity_emb/infinity_emb/fastapi_schemas/pymodels.py
+++ b/libs/infinity_emb/infinity_emb/fastapi_schemas/pymodels.py
@@ -110,5 +110,5 @@ class ModelInfo(BaseModel):
 
 
 class OpenAIModelInfo(BaseModel):
-    data: ModelInfo
+    data: list[ModelInfo]
     object: str = "list"
diff --git a/libs/infinity_emb/infinity_emb/infinity_server.py b/libs/infinity_emb/infinity_emb/infinity_server.py
index 08a99766..4b8a79da 100644
--- a/libs/infinity_emb/infinity_emb/infinity_server.py
+++ b/libs/infinity_emb/infinity_emb/infinity_server.py
@@ -93,16 +93,18 @@ async def _models():
         """get models endpoint"""
         s = app.model.overload_status()  # type: ignore
         return dict(
-            data=dict(
-                id=engine_args.model_name_or_path,
-                stats=dict(
-                    queue_fraction=s.queue_fraction,
-                    queue_absolute=s.queue_absolute,
-                    results_pending=s.results_absolute,
-                    batch_size=engine_args.batch_size,
-                ),
-                backend=engine_args.engine.name,
-            )
+            data=[
+                dict(
+                    id=engine_args.model_name_or_path,
+                    stats=dict(
+                        queue_fraction=s.queue_fraction,
+                        queue_absolute=s.queue_absolute,
+                        results_pending=s.results_absolute,
+                        batch_size=engine_args.batch_size,
+                    ),
+                    backend=engine_args.engine.name,
+                )
+            ]
         )
 
     @app.post(
diff --git a/libs/infinity_emb/tests/end_to_end/test_api_with_dummymodel.py b/libs/infinity_emb/tests/end_to_end/test_api_with_dummymodel.py
index a3bf392d..19c9b1d8 100644
--- a/libs/infinity_emb/tests/end_to_end/test_api_with_dummymodel.py
+++ b/libs/infinity_emb/tests/end_to_end/test_api_with_dummymodel.py
@@ -39,8 +39,8 @@ async def test_model_route(client):
     assert response.status_code == 200
     rdata = response.json()
     assert "data" in rdata
-    assert rdata["data"].get("id", "") == MODEL_NAME
-    assert isinstance(rdata["data"].get("stats"), dict)
+    assert rdata["data"][0].get("id", "") == MODEL_NAME
+    assert isinstance(rdata["data"][0].get("stats"), dict)
 
     # ready test
     response = await client.get("/ready")
diff --git a/libs/infinity_emb/tests/end_to_end/test_ct2_sentence.py b/libs/infinity_emb/tests/end_to_end/test_ct2_sentence.py
index 16171d70..9cf05817 100644
--- a/libs/infinity_emb/tests/end_to_end/test_ct2_sentence.py
+++ b/libs/infinity_emb/tests/end_to_end/test_ct2_sentence.py
@@ -62,8 +62,8 @@ async def test_model_route(client):
     assert response.status_code == 200
     rdata = response.json()
     assert "data" in rdata
-    assert rdata["data"].get("id", "") == MODEL
-    assert isinstance(rdata["data"].get("stats"), dict)
+    assert rdata["data"][0].get("id", "") == MODEL
+    assert isinstance(rdata["data"][0].get("stats"), dict)
 
 
 @pytest.mark.anyio
diff --git a/libs/infinity_emb/tests/end_to_end/test_fastembed.py b/libs/infinity_emb/tests/end_to_end/test_fastembed.py
index 81496a9d..22fae4a1 100644
--- a/libs/infinity_emb/tests/end_to_end/test_fastembed.py
+++ b/libs/infinity_emb/tests/end_to_end/test_fastembed.py
@@ -43,8 +43,8 @@ async def test_model_route(client):
     assert response.status_code == 200
     rdata = response.json()
     assert "data" in rdata
-    assert rdata["data"].get("id", "") == MODEL
-    assert isinstance(rdata["data"].get("stats"), dict)
+    assert rdata["data"][0].get("id", "") == MODEL
+    assert isinstance(rdata["data"][0].get("stats"), dict)
 
 
 @pytest.mark.anyio
diff --git a/libs/infinity_emb/tests/end_to_end/test_optimum_embedding.py b/libs/infinity_emb/tests/end_to_end/test_optimum_embedding.py
index c1e1b05e..cb36b91a 100644
--- a/libs/infinity_emb/tests/end_to_end/test_optimum_embedding.py
+++ b/libs/infinity_emb/tests/end_to_end/test_optimum_embedding.py
@@ -44,8 +44,8 @@ async def test_model_route(client):
     assert response.status_code == 200
     rdata = response.json()
    assert "data" in rdata
-    assert rdata["data"].get("id", "") == MODEL
-    assert isinstance(rdata["data"].get("stats"), dict)
+    assert rdata["data"][0].get("id", "") == MODEL
+    assert isinstance(rdata["data"][0].get("stats"), dict)
 
 
 @pytest.mark.anyio
diff --git a/libs/infinity_emb/tests/end_to_end/test_sentence_transformers.py b/libs/infinity_emb/tests/end_to_end/test_sentence_transformers.py
index 7265d2d5..d6ce60fa 100644
--- a/libs/infinity_emb/tests/end_to_end/test_sentence_transformers.py
+++ b/libs/infinity_emb/tests/end_to_end/test_sentence_transformers.py
@@ -48,8 +48,8 @@ async def test_model_route(client):
     assert response.status_code == 200
     rdata = response.json()
     assert "data" in rdata
-    assert rdata["data"].get("id", "") == MODEL
-    assert isinstance(rdata["data"].get("stats"), dict)
+    assert rdata["data"][0].get("id", "") == MODEL
+    assert isinstance(rdata["data"][0].get("stats"), dict)
 
 
 @pytest.mark.anyio
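
The hunks above change the models endpoint so that `data` holds a list of `ModelInfo` entries instead of a single object. Below is a minimal sketch, outside the patch, of how a client reads the new payload shape; the field names follow `pymodels.py` and `infinity_server.py` in this diff, while the concrete values (model id, batch size, stats) are illustrative placeholders.

```python
# Sketch only (not part of the patch): parse the updated models response,
# where "data" is now a list of model-info dicts rather than a single dict.
response_json = {
    "object": "list",
    "data": [
        {
            "id": "BAAI/bge-small-en-v1.5",  # illustrative model id
            "stats": {
                "queue_fraction": 0.0,
                "queue_absolute": 0,
                "results_pending": 0,
                "batch_size": 64,  # illustrative value
            },
            "backend": "torch",
        }
    ],
}

# Clients now index the first entry instead of reading a single dict,
# mirroring the updated end-to-end tests.
assert isinstance(response_json["data"], list)
first_model = response_json["data"][0]
print(first_model["id"], first_model["backend"], first_model["stats"]["batch_size"])
```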