forked from hamelsmu/llama-inference
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'powerinfer' into ibaldoall
- Loading branch information
Showing
4 changed files
with
110 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
# From the parent directory (main directory of this repo) run: | ||
# | ||
# docker build --build-arg USERID=$(id -u) -t local/powerinfer-bench powerinfer | ||
# | ||
# mkdir $HOME/.cache/{huggingface,PowerInfer} | ||
# docker run --rm -it -v$HOME/.cache/huggingface/:/home/user/.cache/huggingface/:Z \ | ||
# -v$HOME/.cache/PowerInfer/:/home/user/.cache/PowerInfer/:Z \ | ||
# local/powerinfer-bench \ | ||
# sh -c 'cd /home/user/.cache/PowerInfer/ && \ | ||
# huggingface-cli download --resume-download --local-dir ReluLLaMA-7B \ | ||
# --local-dir-use-symlinks False PowerInfer/ReluLLaMA-7B-PowerInfer-GGUF' | ||
# | ||
# docker run --rm -it --gpus all -v$HOME/.cache/PowerInfer/:/home/user/.cache/PowerInfer/:Z \ | ||
# local/powerinfer-bench \ | ||
# build/bin/main --log-disable -m \ | ||
# /home/user/.cache/PowerInfer/ReluLLaMA-7B/llama-7b-relu.powerinfer.gguf \ | ||
# -p "Once upon a time" | ||
# | ||
# docker run --rm -it --name powerinfer-bench --gpus all \ | ||
# -v$HOME/.cache/PowerInfer/:/home/user/.cache/PowerInfer/:Z \ | ||
# -v$(pwd):/home/user/llama-inference/:Z \ | ||
# local/powerinfer-bench \ | ||
# build/bin/server -v -m \ | ||
# /home/user/.cache/PowerInfer/ReluLLaMA-7B/llama-7b-relu.powerinfer.gguf | ||
# | ||
# In another terminal: | ||
# | ||
# docker exec -it powerinfer-bench curl --request POST --url http://localhost:8080/completion \ | ||
# --header "Content-Type: application/json" --data \ | ||
# '{"prompt": "Building a website can be done in 10 simple steps:", "n_predict": 128}' | ||
# | ||
# docker exec -it powerinfer-bench sh -c 'cd /home/user/llama-inference/powerinfer && \ | ||
# python3 bench.py' | ||
# | ||
# If using Podman with CDI substitute | ||
# --gpus all | ||
# for | ||
# --device nvidia.com/gpu=all --security-opt=label=disable | ||
|
||
# Select an available version from
# https://gitlab.com/nvidia/container-images/cuda/blob/master/doc/supported-tags.md:
FROM nvidia/cuda:12.3.1-devel-rockylinux9
# Build-time UID so files written into bind-mounted caches stay owned by the host user.
ARG USERID=1000
# Install toolchain + Python in one layer; clean yum caches in the same RUN to keep the layer small.
RUN yum install -y python3-pip cmake libcudnn8 git && yum clean all && rm -rf /var/cache/yum/*
RUN git clone https://github.com/SJTU-IPADS/PowerInfer
WORKDIR PowerInfer
RUN pip install --no-cache-dir -r requirements.txt
# Configure with cuBLAS GPU offloading enabled, then build in Release mode using all cores.
RUN cmake -S . -B build -DLLAMA_CUBLAS=ON
RUN cmake --build build --config Release -j $(nproc)
RUN pip install --no-cache-dir pandas #for the benchmark.
# Drop root: run as an unprivileged user whose UID matches the host (see USERID above).
RUN adduser -u $USERID user
USER user
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
[PowerInfer](https://github.com/SJTU-IPADS/PowerInfer) | ||
|
||
Note that the model loses some inference quality in exchange for speed as shown in https://huggingface.co/SparseLLM/ReluLLaMA-7B. | ||
|
||
You can compile PowerInfer following their instructions and then run the server: | ||
|
||
```bash | ||
build/bin/server -v -m ReluLLaMA-7B/llama-7b-relu.powerinfer.gguf | ||
``` | ||
|
||
And in another terminal run: | ||
```bash | ||
python3 bench.py | ||
``` | ||
|
||
The results will be written to the `bench-powerinfer.csv` file.
|
||
Or alternatively you can follow the instructions in the [Dockerfile](Dockerfile) to build a container to run the server and the benchmark inside it. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
import pandas as pd | ||
import sys | ||
sys.path.append('../common/') | ||
from client import chat | ||
from questions import questions | ||
|
||
if __name__ == '__main__':
    # Run every benchmark question against the local server, discarding the
    # first response as a server warmup, and dump the timings to CSV.
    responses = []
    for i, q in enumerate(questions):
        response = chat(q)
        if i > 0:  # skip the first response: it warms up the server
            responses.append(response)

    df = pd.DataFrame(responses)
    df.to_csv('bench-powerinfer.csv', index=False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
import os, requests, time, json | ||
|
||
api_base = "http://localhost:8080" | ||
|
||
def chat(prompt:str):
    """POST *prompt* to the local PowerInfer completion endpoint and time the call.

    Returns a dict with the predicted-token count, wall-clock request time,
    the original question, the generated answer, and a note naming the model.
    """
    headers = {"Content-Type": "application/json"}
    payload = {"prompt": f"{prompt}", "n_predict": 200}

    t0 = time.perf_counter()
    resp = requests.post(f"{api_base}/completion", headers = headers, json = payload)
    data = resp.json()
    elapsed = time.perf_counter() - t0

    result = {
        'tok_count': data['tokens_predicted'],
        'time': elapsed,
        'question': prompt,
        'answer': data['content'],
        'note': 'PowerInfer ' + data['model'],
    }
    return result
|
||
if __name__ == '__main__':
    # Quick smoke test: one hard-coded prompt, print the model's completion.
    prompt = "San Francisco is a city in"
    answer = chat(prompt)['answer']
    print(f"User: {prompt}\nPowerInfer: {answer}")