From 2d8e5a23c95673e42afadca77568ac55dcec8391 Mon Sep 17 00:00:00 2001
From: Michael Feil <63565275+michaelfeil@users.noreply.github.com>
Date: Fri, 15 Mar 2024 19:47:52 -0700
Subject: [PATCH] update docs (#140)

---
 README.md                                     |  2 +-
 docs/docs/benchmarking.md                     | 14 ++--
 docs/docs/contribution.md                     |  4 +-
 docs/docs/deploy.md                           |  8 ++-
 docs/docs/index.md                            |  2 +-
 docs/docs/integrations.md                     | 12 ++--
 docs/docs/python_engine.md                    | 69 ++++++++++---------
 .../infinity_emb/fastapi_schemas/pymodels.py  |  2 +-
 .../infinity_emb/infinity_server.py           | 22 +++---
 .../end_to_end/test_api_with_dummymodel.py    |  4 +-
 .../tests/end_to_end/test_ct2_sentence.py     |  4 +-
 .../tests/end_to_end/test_fastembed.py        |  4 +-
 .../end_to_end/test_optimum_embedding.py      |  4 +-
 .../end_to_end/test_sentence_transformers.py  |  4 +-
 14 files changed, 83 insertions(+), 72 deletions(-)

diff --git a/README.md b/README.md
index b1420ed8..6196a8eb 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@
 [![ci][ci-shield]][ci-url]
 [![Downloads][pepa-shield]][pepa-url]
 
-Infinity is a high-throughput, low-latency REST API for serving vector embeddings, supporting all sentence-transformer models and frameworks. Infinity is developed under [MIT Licence](https://github.com/michaelfeil/infinity/blob/main/LICENSE). Infinity powers inference behind [Gradient.ai](https://gradient.ai).
+Infinity is a high-throughput, low-latency REST API for serving vector embeddings, supporting all sentence-transformer models and frameworks. Infinity is developed under [MIT License](https://github.com/michaelfeil/infinity/blob/main/LICENSE). Infinity powers inference behind [Gradient.ai](https://gradient.ai).
 
 ## Why Infinity:
 Infinity provides the following features:
diff --git a/docs/docs/benchmarking.md b/docs/docs/benchmarking.md
index ae78f81c..97eee8ff 100644
--- a/docs/docs/benchmarking.md
+++ b/docs/docs/benchmarking.md
@@ -1,12 +1,12 @@
 # Benchmarking details
 
-Benchmarks are always optionated. The goal of this benchmark is to find the best possible self-hosted backend for $/token:
+Benchmarks are always opinionated. The goal of this benchmark is to find the best possible self-hosted backend for $/token:
 
 1. end-to-end, including the RestAPI server
 2. multi-tenant: multiple clients will try to query your server
 3. fair batch size: You want to limit request size (sentences per requests) to something low, such that you can load balance requests, scale
-4. measured over throughput per token: Idle servers are bad for buissness (especially since ). This benchmark is NOT about the latency for a single request against an IDLE server. It partially evaluates the latency under a typical load scenario
-5. Bert Small / large - the most typical semantic search tasks require a small model (< 1B params)
+4. measured over throughput per token: Idle servers are bad for business. This benchmark is NOT about the latency for a single request against an IDLE server. It partially evaluates the latency under a typical load scenario
+5. Bert small / large - the most typical semantic search tasks require a small model (< 1B params)
 6. accuracy: each backend must have a ~1e-4 prevision over the torch fp32 embeddings.
 
 ## Benchmarking machines:
@@ -43,7 +43,7 @@ python ./docs/benchmarks/simple_app.py
 
 ### huggingface/text-embeddings-inference
 
-using the _cpu_ and _89-cuda_ container (note that cc-89 matches to Nvidia L4)
+using the _cpu_ and _cuda-89_ container (note that cc-89 corresponds to Nvidia L4)
 
 ```bash
 docker run -it -p 7997:80 --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-0.6 --model-id BAAI/bge-small-en-v1.5 --max-client-batch-size 256
@@ -69,8 +69,8 @@ make benchmark_embed
 ```
 
 Below are the following metrics:
-- Requests # / sec (1 request = 256 sentences / 115_000 tokens)
-- time to run benchmark (10 requests / 1_150_000)
+* Requests # / sec (1 request = 256 sentences / 115_000 tokens)
+* time to run benchmark (10 requests = 1_150_000 tokens)
 
 ### Results: CPU-only (_BAAI/bge-small-en-v1.5_ | _bert-small_)
 
@@ -80,7 +80,7 @@ Below are the following metrics:
 | infinity-optimum (onnx) | 125.342 | 0.08 |
 | fastembed (onnx) | 125.770 | 0.08 |
 | sentence-transformers (torch) | 256.884 | 0.04 |
-| infinity (torch / compile) | 353.065?? | 0.03??? |
+| infinity (torch) | 353.065?? | 0.03 (needs revision) |
 | huggingface/TEI (candle) | 1104.357 | 0.009 |
 
 
diff --git a/docs/docs/contribution.md b/docs/docs/contribution.md
index 7ccc4093..672f5ce2 100644
--- a/docs/docs/contribution.md
+++ b/docs/docs/contribution.md
@@ -10,14 +10,14 @@ cd libs/infinity_emb
 poetry install --extras all --with test
 ```
 
-To pass the CI:
+To ensure your contributions pass the Continuous Integration (CI) checks:
 ```bash
 cd libs/infinity_emb
 make format
 make lint
 poetry run pytest ./tests
 ```
-as alternative, you may also use:
+As an alternative, you can also use the following command:
 ```bash
 cd libs/infinity_emb
 make precommit
diff --git a/docs/docs/deploy.md b/docs/docs/deploy.md
index 7676227c..b7a3e179 100644
--- a/docs/docs/deploy.md
+++ b/docs/docs/deploy.md
@@ -1,6 +1,9 @@
 # Deployment
 
 ### Docker: Launch the CLI using a pre-built docker container
+
+Launch the Infinity model using a pre-built Docker container by running the following command. This command uses Docker to run the Infinity CLI with the specified model and port. The optional `HF_HOME` environment variable allows you to control the download path at runtime.
+
 ```bash
 model=BAAI/bge-small-en-v1.5
 port=7997
@@ -8,7 +11,7 @@ docker run \
 -it --gpus all -p $port:$port michaelf34/infinity:latest \
 --model-name-or-path $model --port $port
 ```
-The download path at runtime, can be controlled via the environment variable `HF_HOME`.
+
 
 ### dstack
 dstack allows you to provision a VM instance on the cloud of your choice.
@@ -25,8 +28,7 @@ commands:
 port: 80
 ```
 
-Then, simply run the following dstack command.
-After this, a prompt will appear to let you choose which VM instance to deploy the Infinity.
+To deploy the service, execute the following dstack command. A prompt will guide you through selecting the desired VM instance for deploying Infinity.
 
 ```shell
 dstack run . -f infinity/serve.dstack.yml --gpu 16GB
diff --git a/docs/docs/index.md b/docs/docs/index.md
index 5ac0d667..a7774ce3 100644
--- a/docs/docs/index.md
+++ b/docs/docs/index.md
@@ -1,4 +1,4 @@
-Infinity is a high-throughput, low-latency REST API for serving vector embeddings, supporting all sentence-transformer models and frameworks. Infinity is developed under [MIT Licence](https://github.com/michaelfeil/infinity/blob/main/LICENSE). Infinity powers inference behind [Gradient.ai](https://gradient.ai).
+Infinity is a high-throughput, low-latency REST API for serving vector embeddings, supporting all sentence-transformer models and frameworks. Infinity is developed under [MIT License](https://github.com/michaelfeil/infinity/blob/main/LICENSE). Infinity powers inference behind [Gradient.ai](https://gradient.ai).
 
 ## Why Infinity:
 
diff --git a/docs/docs/integrations.md b/docs/docs/integrations.md
index be3b627f..73266240 100644
--- a/docs/docs/integrations.md
+++ b/docs/docs/integrations.md
@@ -1,11 +1,11 @@
 # Python Integrations
 
-## Langchain (from runnig server)
-Infinity has a official integration into `pip install langchain>=0.342`.
+## Langchain (from running server)
+Infinity has an official integration with Langchain, available via `pip install langchain>=0.342`.
 You can find more documentation on that here:
 https://python.langchain.com/docs/integrations/text_embedding/infinity
 
-### Server-Client
+### Langchain integration with a running infinity API server
 This code snippet assumes you have a server running at `http://localhost:7997/v1`
 ```python
 from langchain.embeddings.infinity import InfinityEmbeddings
@@ -14,10 +14,10 @@ from langchain.docstore.document import Document
 
 documents = [Document(page_content="Hello world!", metadata={"source": "unknown"})]
 emb_model = InfinityEmbeddings(model="BAAI/bge-small", infinity_api_url="http://localhost:7997/v1")
-print(emb_model.embed_documents([doc.page_content for doc in docs]))
+print(emb_model.embed_documents([doc.page_content for doc in documents]))
 ```
 
-### from Python Engine
+### Langchain integration without a running infinity API server (local Python inference)
 ```python
 from langchain.embeddings.infinity import InfinityEmbeddings
 from langchain.docstore.document import Document
@@ -47,4 +47,4 @@ print(documents_embedded, query_result)
 ```
 
 ## LLama-Index
-To be announced
\ No newline at end of file
+Details regarding the LLama-Index integration will be announced soon. Contributions are welcome.
\ No newline at end of file
diff --git a/docs/docs/python_engine.md b/docs/docs/python_engine.md
index ea8ecc3e..a1538932 100644
--- a/docs/docs/python_engine.md
+++ b/docs/docs/python_engine.md
@@ -1,74 +1,79 @@
-# Python Engine
+# Python Engine Integration
 
-## Launch via Python
+## Launching Embedding generation with Python
 
-You can use in a async context with asyncio.
-This gives you most flexibility, but is a bit more advanced.
+Use asynchronous programming with `asyncio` for flexible and efficient embedding processing with Infinity. This advanced method allows for concurrent execution, making it ideal for high-throughput embedding generation.
 
 ```python
 import asyncio
 from infinity_emb import AsyncEmbeddingEngine, EngineArgs
 
-sentences = [
-    "Embed this is sentence via Infinity.",
-    "Paris is in France."
-]
+# Define sentences for embedding
+sentences = ["Embed this sentence via Infinity.", "Paris is in France."]
 
+# Initialize the embedding engine with model specifications
 engine = AsyncEmbeddingEngine.from_args(
-    EngineArgs(model_name_or_path = "BAAI/bge-small-en-v1.5", engine="torch")
+    EngineArgs(model_name_or_path="BAAI/bge-small-en-v1.5", engine="torch",
+               lengths_via_tokenize=True
+    )
 )
 
 async def main():
-    async with engine:
-        # entering context: engine starts with engine.astart()
-        embeddings, usage = await engine.embed(
-            sentences=sentences)
-        # engine stops with engine.astop()
+    async with engine:  # Context manager initializes and terminates the engine
+        # usage is total token count according to tokenizer.
+        embeddings, usage = await engine.embed(sentences=sentences)
+        # Embeddings are now available for use
 asyncio.run(main())
 ```
 
-# ReRanker
+## Reranker
 
-Reranking gives you a score for similarity between a query and multiple documents.
-Use it in conjunction with a VectorDB+Embeddings, or as standalone for small amount of documents.
-Please select a model from huggingface that is a AutoModelForSequenceClassification with one class classification.
+Enhance search results by reranking based on the similarity between a query and a set of documents. This feature is particularly useful in conjunction with vector databases and embeddings, or as a standalone solution for small datasets. Ensure you choose a Hugging Face model designed for sequence classification with a single output class, e.g. "BAAI/bge-reranker-base". Further suitable models can be found on Hugging Face by searching for `rerank`: https://huggingface.co/models?pipeline_tag=text-classification&sort=trending&search=rerank.
 
 ```python
 import asyncio
 from infinity_emb import AsyncEmbeddingEngine, EngineArgs
+
+# Define your query and documents
 query = "What is the python package infinity_emb?"
 docs = [
-    "This is a document not related to the python package infinity_emb, hence...",
+    "This document is unrelated to the python package infinity_emb.",
     "Paris is in France!",
-    "infinity_emb is a package for sentence embeddings"
+    "infinity_emb is a package for generating sentence embeddings."
 ]
-engine_args = EngineArgs(
-    model_name_or_path = "BAAI/bge-reranker-base",
-    engine="torch")
+# Configure the reranking engine
+engine_args = EngineArgs(model_name_or_path="BAAI/bge-reranker-base", engine="torch")
 engine = AsyncEmbeddingEngine.from_args(engine_args)
+
 async def main():
     async with engine:
-        ranking, usage = await engine.rerank(
-            query=query, docs=docs)
+        ranking, usage = await engine.rerank(query=query, docs=docs)
+        # Display ranked documents
         print(list(zip(ranking, docs)))
 asyncio.run(main())
 ```
 
-# Text-Classification (Beta)
-
+## Text Classification (Beta)
+
+Explore text classification with Infinity's `classify` feature, which allows for sentiment analysis, emotion detection, and other classification tasks. Apply pre-trained classification models to your text data.
+
 ```python
 import asyncio
 from infinity_emb import AsyncEmbeddingEngine, EngineArgs
 
+# Example sentences for classification
 sentences = ["This is awesome.", "I am bored."]
 
+# Setup engine with text classification model
 engine_args = EngineArgs(
-    model_name_or_path = "SamLowe/roberta-base-go_emotions",
+    model_name_or_path="SamLowe/roberta-base-go_emotions",
     engine="torch",
     model_warmup=True)
 engine = AsyncEmbeddingEngine.from_args(engine_args)
+
 async def main():
     async with engine:
-        predictions, usage = await engine.classify(
-            sentences=sentences)
+        predictions, usage = await engine.classify(sentences=sentences)
+        # Access classification predictions
         return predictions, usage
 asyncio.run(main())
-```
\ No newline at end of file
+```
+
diff --git a/libs/infinity_emb/infinity_emb/fastapi_schemas/pymodels.py b/libs/infinity_emb/infinity_emb/fastapi_schemas/pymodels.py
index 8c150b7e..c9d6bd8b 100644
--- a/libs/infinity_emb/infinity_emb/fastapi_schemas/pymodels.py
+++ b/libs/infinity_emb/infinity_emb/fastapi_schemas/pymodels.py
@@ -110,5 +110,5 @@ class ModelInfo(BaseModel):
 
 
 class OpenAIModelInfo(BaseModel):
-    data: ModelInfo
+    data: list[ModelInfo]
     object: str = "list"
diff --git a/libs/infinity_emb/infinity_emb/infinity_server.py b/libs/infinity_emb/infinity_emb/infinity_server.py
index 08a99766..4b8a79da 100644
--- a/libs/infinity_emb/infinity_emb/infinity_server.py
+++ b/libs/infinity_emb/infinity_emb/infinity_server.py
@@ -93,16 +93,18 @@ async def _models():
         """get models endpoint"""
         s = app.model.overload_status()  # type: ignore
         return dict(
-            data=dict(
-                id=engine_args.model_name_or_path,
-                stats=dict(
-                    queue_fraction=s.queue_fraction,
-                    queue_absolute=s.queue_absolute,
-                    results_pending=s.results_absolute,
-                    batch_size=engine_args.batch_size,
-                ),
-                backend=engine_args.engine.name,
-            )
+            data=[
+                dict(
+                    id=engine_args.model_name_or_path,
+                    stats=dict(
+                        queue_fraction=s.queue_fraction,
+                        queue_absolute=s.queue_absolute,
+                        results_pending=s.results_absolute,
+                        batch_size=engine_args.batch_size,
+                    ),
+                    backend=engine_args.engine.name,
+                )
+            ]
         )
 
     @app.post(
diff --git a/libs/infinity_emb/tests/end_to_end/test_api_with_dummymodel.py b/libs/infinity_emb/tests/end_to_end/test_api_with_dummymodel.py
index a3bf392d..19c9b1d8 100644
--- a/libs/infinity_emb/tests/end_to_end/test_api_with_dummymodel.py
+++ b/libs/infinity_emb/tests/end_to_end/test_api_with_dummymodel.py
@@ -39,8 +39,8 @@ async def test_model_route(client):
     assert response.status_code == 200
     rdata = response.json()
     assert "data" in rdata
-    assert rdata["data"].get("id", "") == MODEL_NAME
-    assert isinstance(rdata["data"].get("stats"), dict)
+    assert rdata["data"][0].get("id", "") == MODEL_NAME
+    assert isinstance(rdata["data"][0].get("stats"), dict)
 
     # ready test
     response = await client.get("/ready")
diff --git a/libs/infinity_emb/tests/end_to_end/test_ct2_sentence.py b/libs/infinity_emb/tests/end_to_end/test_ct2_sentence.py
index 16171d70..9cf05817 100644
--- a/libs/infinity_emb/tests/end_to_end/test_ct2_sentence.py
+++ b/libs/infinity_emb/tests/end_to_end/test_ct2_sentence.py
@@ -62,8 +62,8 @@ async def test_model_route(client):
     assert response.status_code == 200
     rdata = response.json()
     assert "data" in rdata
-    assert rdata["data"].get("id", "") == MODEL
-    assert isinstance(rdata["data"].get("stats"), dict)
+    assert rdata["data"][0].get("id", "") == MODEL
+    assert isinstance(rdata["data"][0].get("stats"), dict)
 
 
 @pytest.mark.anyio
diff --git a/libs/infinity_emb/tests/end_to_end/test_fastembed.py b/libs/infinity_emb/tests/end_to_end/test_fastembed.py
index 81496a9d..22fae4a1 100644
--- a/libs/infinity_emb/tests/end_to_end/test_fastembed.py
+++ b/libs/infinity_emb/tests/end_to_end/test_fastembed.py
@@ -43,8 +43,8 @@ async def test_model_route(client):
     assert response.status_code == 200
     rdata = response.json()
     assert "data" in rdata
-    assert rdata["data"].get("id", "") == MODEL
-    assert isinstance(rdata["data"].get("stats"), dict)
+    assert rdata["data"][0].get("id", "") == MODEL
+    assert isinstance(rdata["data"][0].get("stats"), dict)
 
 
 @pytest.mark.anyio
diff --git a/libs/infinity_emb/tests/end_to_end/test_optimum_embedding.py b/libs/infinity_emb/tests/end_to_end/test_optimum_embedding.py
index c1e1b05e..cb36b91a 100644
--- a/libs/infinity_emb/tests/end_to_end/test_optimum_embedding.py
+++ b/libs/infinity_emb/tests/end_to_end/test_optimum_embedding.py
@@ -44,8 +44,8 @@ async def test_model_route(client):
     assert response.status_code == 200
     rdata = response.json()
    assert "data" in rdata
-    assert rdata["data"].get("id", "") == MODEL
-    assert isinstance(rdata["data"].get("stats"), dict)
+    assert rdata["data"][0].get("id", "") == MODEL
+    assert isinstance(rdata["data"][0].get("stats"), dict)
 
 
 @pytest.mark.anyio
diff --git a/libs/infinity_emb/tests/end_to_end/test_sentence_transformers.py b/libs/infinity_emb/tests/end_to_end/test_sentence_transformers.py
index 7265d2d5..d6ce60fa 100644
--- a/libs/infinity_emb/tests/end_to_end/test_sentence_transformers.py
+++ b/libs/infinity_emb/tests/end_to_end/test_sentence_transformers.py
@@ -48,8 +48,8 @@ async def test_model_route(client):
     assert response.status_code == 200
     rdata = response.json()
     assert "data" in rdata
-    assert rdata["data"].get("id", "") == MODEL
-    assert isinstance(rdata["data"].get("stats"), dict)
+    assert rdata["data"][0].get("id", "") == MODEL
+    assert isinstance(rdata["data"][0].get("stats"), dict)
 
 
 @pytest.mark.anyio
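
The hunks above change the models endpoint so that `data` holds a list of `ModelInfo` entries instead of a single object. Below is a minimal sketch, outside the patch, of how a client reads the new payload shape; the field names follow `pymodels.py` and `infinity_server.py` in this diff, while the concrete values (model id, batch size, stats) are illustrative placeholders.

```python
# Sketch only (not part of the patch): parse the updated models response,
# where "data" is now a list of model-info dicts rather than a single dict.
response_json = {
    "object": "list",
    "data": [
        {
            "id": "BAAI/bge-small-en-v1.5",  # illustrative model id
            "stats": {
                "queue_fraction": 0.0,
                "queue_absolute": 0,
                "results_pending": 0,
                "batch_size": 64,  # illustrative value
            },
            "backend": "torch",
        }
    ],
}

# Clients now index the first entry instead of reading a single dict,
# mirroring the updated end-to-end tests.
assert isinstance(response_json["data"], list)
first_model = response_json["data"][0]
print(first_model["id"], first_model["backend"], first_model["stats"]["batch_size"])
```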