OpenAI compatible servers benchmark based from the anyscale and exlla…

…ma benchmarks.
ivanbaldo · Apr 8, 2024 · 10ee48e · 10ee48e
1 parent e97035d
commit 10ee48e
Show file tree

Hide file tree

Showing 5 changed files with 127 additions and 0 deletions.
diff --git a/openai/Dockerfile b/openai/Dockerfile
@@ -0,0 +1,28 @@
+# syntax=docker/dockerfile:1
+
+# From the parent directory (main directory of this repo) run:
+#
+# docker build --build-arg USERID=$(id -u) -t localhost/openai-bench openai
+#
+# docker run --rm -it --name openai-bench -e OPENAI_API_KEY='secret' \
+#   -v"$PWD":/home/user/llama-inference localhost/openai-bench \
+#   --baseurl https://api.openai.com:443/v1 --model gpt-3.5-turbo \
+#   --filename gpt-3.5-turbo.bench.csv --note "gpt-3.5-turbo"
+#
+# The container isn't self-contained: it is designed to be run with the repo mounted as a volume.
+
+# Look for newer tags here https://hub.docker.com/_/python:
+FROM docker.io/python:3.12-bookworm
+ARG USERID=1000
+COPY requirements.txt /tmp
+RUN pip3 install --no-cache-dir -r /tmp/requirements.txt
+RUN adduser --disabled-password --gecos '' -u $USERID user
+USER user
+WORKDIR /home/user/llama-inference/openai/
+ENTRYPOINT ["python3", "bench.py"]
+CMD [ \
+    "--baseurl", "https://api.openai.com:443/v1", \
+    "--model", "gpt-3.5-turbo", \
+    "--filename", "/dev/stdout", \
+    "--note", "gpt-3.5-turbo" \
+]
diff --git a/openai/README.md b/openai/README.md
@@ -0,0 +1,16 @@
+# OpenAI Chat API compatible benchmark
+
+Benchmark for servers compatible with the [OpenAI Chat Completions API](https://platform.openai.com/docs/guides/text-generation/chat-completions-api).
+
+You can follow the instructions in the [Dockerfile](Dockerfile) to build a container to run the benchmark.
+
+Or alternatively on your own machine:
+```sh
+cd openai
+pip3 install -r requirements.txt
+export OPENAI_API_KEY='secret' #obtain from https://platform.openai.com/api-keys
+python3 client.py #to test a single run
+python3 bench.py --help #get usage help
+python3 bench.py --baseurl https://api.openai.com:443/v1 --model gpt-3.5-turbo \
+    --filename gpt-3.5-turbo.bench.csv --note "gpt-3.5-turbo"
+```
diff --git a/openai/bench.py b/openai/bench.py
@@ -0,0 +1,17 @@
+from client import chat,args
+import sys
+sys.path.append('../common/')
+from questions import questions
+import pandas as pd
+
+if __name__ == '__main__':
+    counter = 1
+    responses = []
+    for q in questions:
+        response = chat(q)
+        if counter >= 2: # allow for a warmup
+            responses.append(response)
+        counter +=1
+
+    df = pd.DataFrame(responses)
+    df.to_csv(args.filename, index=False)
diff --git a/openai/client.py b/openai/client.py
@@ -0,0 +1,44 @@
+from openai import OpenAI
+import argparse
+import time
+
+#TODO: should use a parent parser without --filename here.
+parser = argparse.ArgumentParser(description='Run LLM inference requests and save to a CSV.')
+parser.add_argument('--filename', type=str, default='/dev/stdout',
+                    help='Path to the output CSV file (stdout by default).')
+parser.add_argument('--note', type=str,
+                    help='Note to add to the rows of the file (--model by default).')
+parser.add_argument('--model', type=str, default='gpt-3.5-turbo',
+                    help='Model to use (gpt-3.5-turbo by default).')
+parser.add_argument('--baseurl', type=str, default='https://api.openai.com:443/v1',
+                    help='Endpoint base URL (https://api.openai.com:443/v1 by default).')
+args = parser.parse_args()
+if ('note' not in args):
+    args.note = args.model
+
+client = OpenAI(base_url=args.baseurl)
+
+def chat(prompt:str):
+    start = time.perf_counter()
+    result = client.chat.completions.create(
+        model=args.model,
+        max_tokens=200,
+        messages=[
+            {"role": "system", "content": "You are a very verbose and helpful assistant"},
+            {"role": "user", "content": prompt}
+        ]
+    )
+    request_time = time.perf_counter() - start
+    return {'tok_count': result.usage.completion_tokens,
+        'time': request_time,
+        'question': prompt,
+        'answer': result.choices[0].message.content,
+        'note': args.note}
+
+if __name__ == '__main__':
+    prompt = "San Francisco is a city in"
+    result = chat(prompt)
+    tokPerSec = result['tok_count']/result['time']
+    print(f"User: {prompt}\n"
+          f"Chatbot in {result['time']}s with {result['tok_count']} tokens ({tokPerSec} t/s):\n"
+          f"{result['answer']}")
diff --git a/openai/requirements.txt b/openai/requirements.txt
@@ -0,0 +1,22 @@
+annotated-types==0.6.0
+anyio==4.3.0
+certifi==2024.2.2
+distro==1.9.0
+h11==0.14.0
+httpcore==1.0.5
+httpx==0.27.0
+idna==3.6
+numpy==1.26.4
+openai==1.16.2
+pandas==2.2.1
+pydantic==2.6.4
+pydantic_core==2.16.3
+python-dateutil==2.9.0.post0
+pytz==2024.1
+setuptools==69.1.1
+six==1.16.0
+sniffio==1.3.1
+tqdm==4.66.2
+typing_extensions==4.11.0
+tzdata==2024.1
+wheel==0.43.0