Commit

Merge branch 'powerinfer' into ibaldoall
ibaldonl committed Jan 31, 2024
2 parents 26494c7 + c6d2ea5 commit dbecbb2
Showing 4 changed files with 110 additions and 0 deletions.
53 changes: 53 additions & 0 deletions powerinfer/Dockerfile
@@ -0,0 +1,53 @@
# From the parent directory (main directory of this repo) run:
#
# docker build --build-arg USERID=$(id -u) -t local/powerinfer-bench powerinfer
#
# mkdir -p $HOME/.cache/{huggingface,PowerInfer}
# docker run --rm -it -v$HOME/.cache/huggingface/:/home/user/.cache/huggingface/:Z \
# -v$HOME/.cache/PowerInfer/:/home/user/.cache/PowerInfer/:Z \
# local/powerinfer-bench \
# sh -c 'cd /home/user/.cache/PowerInfer/ && \
# huggingface-cli download --resume-download --local-dir ReluLLaMA-7B \
# --local-dir-use-symlinks False PowerInfer/ReluLLaMA-7B-PowerInfer-GGUF'
#
# docker run --rm -it --gpus all -v$HOME/.cache/PowerInfer/:/home/user/.cache/PowerInfer/:Z \
# local/powerinfer-bench \
# build/bin/main --log-disable -m \
# /home/user/.cache/PowerInfer/ReluLLaMA-7B/llama-7b-relu.powerinfer.gguf \
# -p "Once upon a time"
#
# docker run --rm -it --name powerinfer-bench --gpus all \
# -v$HOME/.cache/PowerInfer/:/home/user/.cache/PowerInfer/:Z \
# -v$(pwd):/home/user/llama-inference/:Z \
# local/powerinfer-bench \
# build/bin/server -v -m \
# /home/user/.cache/PowerInfer/ReluLLaMA-7B/llama-7b-relu.powerinfer.gguf
#
# In another terminal:
#
# docker exec -it powerinfer-bench curl --request POST --url http://localhost:8080/completion \
# --header "Content-Type: application/json" --data \
# '{"prompt": "Building a website can be done in 10 simple steps:", "n_predict": 128}'
#
# docker exec -it powerinfer-bench sh -c 'cd /home/user/llama-inference/powerinfer && \
# python3 bench.py'
#
# If using Podman with CDI, replace
#   --gpus all
# with
#   --device nvidia.com/gpu=all --security-opt=label=disable
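#
# Optional sanity check: confirm the container can see the GPU before
# benchmarking, using the standard nvidia-smi tool:
#
# docker run --rm --gpus all local/powerinfer-bench nvidia-smi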

# Select an available version from
# https://gitlab.com/nvidia/container-images/cuda/blob/master/doc/supported-tags.md:
FROM nvidia/cuda:12.3.1-devel-rockylinux9
ARG USERID=1000
RUN yum install -y python3-pip cmake libcudnn8 git && yum clean all && rm -rf /var/cache/yum/*
RUN git clone https://github.com/SJTU-IPADS/PowerInfer
WORKDIR PowerInfer
RUN pip install --no-cache-dir -r requirements.txt
RUN cmake -S . -B build -DLLAMA_CUBLAS=ON
RUN cmake --build build --config Release -j $(nproc)
RUN pip install --no-cache-dir pandas  # for the benchmark
RUN adduser -u $USERID user
USER user

19 changes: 19 additions & 0 deletions powerinfer/README.md
@@ -0,0 +1,19 @@
[PowerInfer](https://github.com/SJTU-IPADS/PowerInfer)

Note that the model trades some inference quality for speed, as shown at https://huggingface.co/SparseLLM/ReluLLaMA-7B.

You can compile PowerInfer following the project's instructions and then run the server:

```bash
build/bin/server -v -m ReluLLaMA-7B/llama-7b-relu.powerinfer.gguf
```
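
You can smoke-test the server with the same request used in the [Dockerfile](Dockerfile) instructions:

```bash
curl --request POST --url http://localhost:8080/completion \
  --header "Content-Type: application/json" \
  --data '{"prompt": "Building a website can be done in 10 simple steps:", "n_predict": 128}'
```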

Then, in another terminal, run:
```bash
python3 bench.py
```

The results will be written to `bench-powerinfer.csv`.
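
A quick way to summarize the results (a minimal sketch, assuming only the columns that `bench.py` writes: `tok_count`, `time`, `question`, `answer`, `note`):

```python
import pandas as pd

# Load the benchmark output written by bench.py.
df = pd.read_csv('bench-powerinfer.csv')

# Per-question throughput, then the overall mean.
df['tok_per_sec'] = df['tok_count'] / df['time']
print(df[['question', 'tok_count', 'time', 'tok_per_sec']])
print(f"Mean throughput: {df['tok_per_sec'].mean():.1f} tok/s")
```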

Alternatively, you can follow the instructions in the [Dockerfile](Dockerfile) to build a container and run the server and the benchmark inside it.

17 changes: 17 additions & 0 deletions powerinfer/bench.py
@@ -0,0 +1,17 @@
import pandas as pd
import sys
sys.path.append('../common/')
from client import chat
from questions import questions

if __name__ == '__main__':
    counter = 1
    responses = []
    for q in questions:
        response = chat(q)
        if counter >= 2:  # allow for a warmup
            responses.append(response)
        counter += 1

    df = pd.DataFrame(responses)
    df.to_csv('bench-powerinfer.csv', index=False)
21 changes: 21 additions & 0 deletions powerinfer/client.py
@@ -0,0 +1,21 @@
import requests, time

api_base = "http://localhost:8080"

def chat(prompt: str):
    """Send one completion request, time it, and return a benchmark row."""
    payload = {"prompt": f"{prompt}", "n_predict": 200}
    headers = {"Content-Type": "application/json"}
    start = time.perf_counter()
    response = requests.post(f"{api_base}/completion", headers=headers, json=payload)
    response = response.json()
    # The request is non-streaming, so this covers the full generation.
    request_time = time.perf_counter() - start

    return {'tok_count': response['tokens_predicted'],
            'time': request_time,
            'question': prompt,
            'answer': response['content'],
            'note': 'PowerInfer ' + response['model']}

if __name__ == '__main__':
    prompt = "San Francisco is a city in"
    print(f"User: {prompt}\nPowerInfer: {chat(prompt)['answer']}")
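
For reference, the upstream llama.cpp server (which PowerInfer forks) also accepts a `"stream": true` field on `/completion` and then emits server-sent events; a minimal sketch, assuming PowerInfer's fork kept that upstream behavior (unverified here):

```python
import json
import requests

def chat_stream(prompt: str):
    # Assumes llama.cpp's SSE format: lines prefixed "data: " carrying
    # JSON chunks with a "content" field. Hypothetical for PowerInfer.
    payload = {"prompt": prompt, "n_predict": 200, "stream": True}
    with requests.post("http://localhost:8080/completion", json=payload, stream=True) as r:
        for line in r.iter_lines():
            if line.startswith(b"data: "):
                chunk = json.loads(line[len(b"data: "):])
                print(chunk.get("content", ""), end="", flush=True)
    print()
```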
