diff --git a/powerinfer/Dockerfile b/powerinfer/Dockerfile
new file mode 100644
index 0000000..5d68019
--- /dev/null
+++ b/powerinfer/Dockerfile
@@ -0,0 +1,53 @@
+# From the parent directory (the main directory of this repo) run:
+#
+# docker build --build-arg USERID=$(id -u) -t local/powerinfer-bench powerinfer
+#
+# mkdir -p $HOME/.cache/{huggingface,PowerInfer}
+# docker run --rm -it -v$HOME/.cache/huggingface/:/home/user/.cache/huggingface/:Z \
+#   -v$HOME/.cache/PowerInfer/:/home/user/.cache/PowerInfer/:Z \
+#   local/powerinfer-bench \
+#   sh -c 'cd /home/user/.cache/PowerInfer/ && \
+#   huggingface-cli download --resume-download --local-dir ReluLLaMA-7B \
+#   --local-dir-use-symlinks False PowerInfer/ReluLLaMA-7B-PowerInfer-GGUF'
+#
+# docker run --rm -it --gpus all -v$HOME/.cache/PowerInfer/:/home/user/.cache/PowerInfer/:Z \
+#   local/powerinfer-bench \
+#   build/bin/main --log-disable -m \
+#   /home/user/.cache/PowerInfer/ReluLLaMA-7B/llama-7b-relu.powerinfer.gguf \
+#   -p "Once upon a time"
+#
+# docker run --rm -it --name powerinfer-bench --gpus all \
+#   -v$HOME/.cache/PowerInfer/:/home/user/.cache/PowerInfer/:Z \
+#   -v$(pwd):/home/user/llama-inference/:Z \
+#   local/powerinfer-bench \
+#   build/bin/server -v -m \
+#   /home/user/.cache/PowerInfer/ReluLLaMA-7B/llama-7b-relu.powerinfer.gguf
+#
+# In another terminal:
+#
+# docker exec -it powerinfer-bench curl --request POST --url http://localhost:8080/completion \
+#   --header "Content-Type: application/json" --data \
+#   '{"prompt": "Building a website can be done in 10 simple steps:", "n_predict": 128}'
+#
+# docker exec -it powerinfer-bench sh -c 'cd /home/user/llama-inference/powerinfer && \
+#   python3 bench.py'
+#
+# If using Podman with CDI, replace
+#   --gpus all
+# with
+#   --device nvidia.com/gpu=all --security-opt=label=disable
+
+# Select an available version from
+# https://gitlab.com/nvidia/container-images/cuda/blob/master/doc/supported-tags.md:
+FROM nvidia/cuda:12.3.1-devel-rockylinux9
+ARG USERID=1000
+RUN yum install -y python3-pip cmake libcudnn8 git && yum clean all && rm -rf /var/cache/yum/*
+RUN git clone https://github.com/SJTU-IPADS/PowerInfer
+WORKDIR PowerInfer
+RUN pip install --no-cache-dir -r requirements.txt
+RUN cmake -S . -B build -DLLAMA_CUBLAS=ON
+RUN cmake --build build --config Release -j $(nproc)
+RUN pip install --no-cache-dir pandas  # for the benchmark
+RUN adduser -u $USERID user
+USER user
+
diff --git a/powerinfer/README.md b/powerinfer/README.md
new file mode 100644
index 0000000..a72ad06
--- /dev/null
+++ b/powerinfer/README.md
@@ -0,0 +1,19 @@
+[PowerInfer](https://github.com/SJTU-IPADS/PowerInfer)
+
+Note that the model trades some inference quality for speed, as shown at https://huggingface.co/SparseLLM/ReluLLaMA-7B.
+
+You can compile PowerInfer following their instructions and then run the server:
+
+```bash
+build/bin/server -v -m ReluLLaMA-7B/llama-7b-relu.powerinfer.gguf
+```
+
+And in another terminal run:
+```bash
+python3 bench.py
+```
+
+The results will be written to the bench-powerinfer.csv file.
+
+Alternatively, you can follow the instructions in the [Dockerfile](Dockerfile) to build a container and run the server and the benchmark inside it.
+
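Once bench.py has produced bench-powerinfer.csv, the run can be summarized with a few lines of pandas. This is only a sketch and not part of the patch; it assumes the tok_count and time columns that client.py writes, where time is the wall-clock duration of each request in seconds:

```python
import pandas as pd

# Load the benchmark results written by bench.py.
df = pd.read_csv("bench-powerinfer.csv")

# Per-request throughput plus an aggregate figure over the whole run.
df["tok_per_sec"] = df["tok_count"] / df["time"]
print(df[["tok_count", "time", "tok_per_sec"]].describe())
print(f"Overall: {df['tok_count'].sum() / df['time'].sum():.1f} tokens/s")
```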
diff --git a/powerinfer/bench.py b/powerinfer/bench.py
new file mode 100755
index 0000000..9c17be5
--- /dev/null
+++ b/powerinfer/bench.py
@@ -0,0 +1,17 @@
+import pandas as pd
+import sys
+sys.path.append('../common/')
+from client import chat
+from questions import questions
+
+if __name__ == '__main__':
+    counter = 1
+    responses = []
+    for q in questions:
+        response = chat(q)
+        if counter >= 2:  # skip the first response to allow for a warmup
+            responses.append(response)
+        counter += 1
+
+    df = pd.DataFrame(responses)
+    df.to_csv('bench-powerinfer.csv', index=False)
diff --git a/powerinfer/client.py b/powerinfer/client.py
new file mode 100755
index 0000000..a932c8f
--- /dev/null
+++ b/powerinfer/client.py
@@ -0,0 +1,21 @@
+import requests, time
+
+api_base = "http://localhost:8080"
+
+def chat(prompt: str):
+    payload = {"prompt": prompt, "n_predict": 200}
+    headers = {"Content-Type": "application/json"}
+    start = time.perf_counter()
+    response = requests.post(f"{api_base}/completion", headers=headers, json=payload)
+    response = response.json()
+    request_time = time.perf_counter() - start
+
+    return {'tok_count': response['tokens_predicted'],
+            'time': request_time,
+            'question': prompt,
+            'answer': response['content'],
+            'note': 'PowerInfer ' + response['model']}
+
+if __name__ == '__main__':
+    prompt = "San Francisco is a city in"
+    print(f"User: {prompt}\nPowerInfer: {chat(prompt)['answer']}")
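As a small usage sketch (not included in the patch), the dictionary returned by chat() already contains what is needed to estimate per-request throughput. The example below assumes the PowerInfer server from the README is listening on http://localhost:8080 and uses a made-up prompt:

```python
from client import chat  # run from the powerinfer/ directory

# One request against the running server; tok_count and time come from chat().
result = chat("Explain the difference between a CPU and a GPU.")
tokens_per_second = result["tok_count"] / result["time"]
print(f"{result['tok_count']} tokens in {result['time']:.2f}s "
      f"({tokens_per_second:.1f} tokens/s) via {result['note']}")
```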