From 5f584cebd2c4f51bd215a9b664e668d05f8b22f4 Mon Sep 17 00:00:00 2001 From: Jiahao Li Date: Mon, 29 Apr 2024 18:35:23 +0800 Subject: [PATCH] Separate folder for ggml models & Fix dockerfile (#296) --- .dockerignore | 4 +- Dockerfile | 1 + README.md | 125 ++++++++++++++++++----------------- chatglm_cpp/convert.py | 2 +- chatglm_cpp/langchain_api.py | 2 +- chatglm_cpp/openai_api.py | 2 +- chatglm_test.cpp | 42 ++++++------ examples/chatglm3_demo.py | 2 +- examples/cli_demo.py | 2 +- examples/web_demo.py | 2 +- main.cpp | 4 +- models/.gitignore | 1 + tests/perf.sh | 6 +- tests/perplexity.cpp | 2 +- tests/ppl.sh | 6 +- tests/test_chatglm_cpp.py | 32 ++++----- 16 files changed, 119 insertions(+), 116 deletions(-) create mode 100644 models/.gitignore diff --git a/.dockerignore b/.dockerignore index 4fd4d33e..49f0e15f 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,4 +1,4 @@ -.git/ +**/.git/ .github/ .hypothesis/ .pytest_cache/ @@ -6,6 +6,6 @@ build/ chatglm_cpp.egg-info/ dist/ .dockerignore -*.bin +models/ Dockerfile **/__pycache__/ diff --git a/Dockerfile b/Dockerfile index 3b23d1f9..115082e4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -47,6 +47,7 @@ RUN \ rm -rf /var/lib/apt/lists/* COPY --from=build /chatglm.cpp/build/bin/main /chatglm.cpp/build/bin/main +COPY --from=build /chatglm.cpp/build/lib/*.so /chatglm.cpp/build/lib/ COPY --from=build /chatglm.cpp/dist/ /chatglm.cpp/dist/ ADD examples examples diff --git a/README.md b/README.md index 4e7ddb82..b21fda01 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ python3 -m pip install torch tabulate tqdm transformers accelerate sentencepiece Use `convert.py` to transform ChatGLM-6B into quantized GGML format. For example, to convert the fp16 original model to q4_0 (quantized int4) GGML model, run: ```sh -python3 chatglm_cpp/convert.py -i THUDM/chatglm-6b -t q4_0 -o chatglm-ggml.bin +python3 chatglm_cpp/convert.py -i THUDM/chatglm-6b -t q4_0 -o models/chatglm-ggml.bin ``` The original model (`-i `) can be a Hugging Face model name or a local path to your pre-downloaded model. Currently supported models are: @@ -69,7 +69,7 @@ You are free to try any of the below quantization types by specifying `-t * `f16`: half precision floating point weights without quantization. * `f32`: single precision floating point weights without quantization. -For LoRA models, add `-l ` flag to merge your LoRA weights into the base model. For example, run `python3 chatglm_cpp/convert.py -i THUDM/chatglm3-6b -t q4_0 -o chatglm3-ggml-lora.bin -l shibing624/chatglm3-6b-csc-chinese-lora` to merge public LoRA weights from Hugging Face. +For LoRA models, add `-l ` flag to merge your LoRA weights into the base model. For example, run `python3 chatglm_cpp/convert.py -i THUDM/chatglm3-6b -t q4_0 -o models/chatglm3-ggml-lora.bin -l shibing624/chatglm3-6b-csc-chinese-lora` to merge public LoRA weights from Hugging Face. For P-Tuning v2 models using the [official finetuning script](https://github.com/THUDM/ChatGLM3/tree/main/finetune_demo), additional weights are automatically detected by `convert.py`. If `past_key_values` is on the output weight list, the P-Tuning checkpoint is successfully converted. @@ -83,13 +83,13 @@ cmake --build build -j --config Release Now you may chat with the quantized ChatGLM-6B model by running: ```sh -./build/bin/main -m chatglm-ggml.bin -p 你好 +./build/bin/main -m models/chatglm-ggml.bin -p 你好 # 你好👋!我是人工智能助手 ChatGLM-6B,很高兴见到你,欢迎问我任何问题。 ``` To run the model in interactive mode, add the `-i` flag. 
For example: ```sh -./build/bin/main -m chatglm-ggml.bin -i +./build/bin/main -m models/chatglm-ggml.bin -i ``` In interactive mode, your chat history will serve as the context for the next-round conversation. @@ -101,8 +101,8 @@ Run `./build/bin/main -h` to explore more options! ChatGLM2-6B ```sh -python3 chatglm_cpp/convert.py -i THUDM/chatglm2-6b -t q4_0 -o chatglm2-ggml.bin -./build/bin/main -m chatglm2-ggml.bin -p 你好 --top_p 0.8 --temp 0.8 +python3 chatglm_cpp/convert.py -i THUDM/chatglm2-6b -t q4_0 -o models/chatglm2-ggml.bin +./build/bin/main -m models/chatglm2-ggml.bin -p 你好 --top_p 0.8 --temp 0.8 # 你好👋!我是人工智能助手 ChatGLM2-6B,很高兴见到你,欢迎问我任何问题。 ``` @@ -114,20 +114,20 @@ ChatGLM3-6B further supports function call and code interpreter in addition to c Chat mode: ```sh -python3 chatglm_cpp/convert.py -i THUDM/chatglm3-6b -t q4_0 -o chatglm3-ggml.bin -./build/bin/main -m chatglm3-ggml.bin -p 你好 --top_p 0.8 --temp 0.8 +python3 chatglm_cpp/convert.py -i THUDM/chatglm3-6b -t q4_0 -o models/chatglm3-ggml.bin +./build/bin/main -m models/chatglm3-ggml.bin -p 你好 --top_p 0.8 --temp 0.8 # 你好👋!我是人工智能助手 ChatGLM3-6B,很高兴见到你,欢迎问我任何问题。 ``` Setting system prompt: ```sh -./build/bin/main -m chatglm3-ggml.bin -p 你好 -s "You are ChatGLM3, a large language model trained by Zhipu.AI. Follow the user's instructions carefully. Respond using markdown." +./build/bin/main -m models/chatglm3-ggml.bin -p 你好 -s "You are ChatGLM3, a large language model trained by Zhipu.AI. Follow the user's instructions carefully. Respond using markdown." # 你好👋!我是 ChatGLM3,有什么问题可以帮您解答吗? ``` Function call: ~~~ -$ ./build/bin/main -m chatglm3-ggml.bin --top_p 0.8 --temp 0.8 --sp examples/system/function_call.txt -i +$ ./build/bin/main -m models/chatglm3-ggml.bin --top_p 0.8 --temp 0.8 --sp examples/system/function_call.txt -i System > Answer the following questions as best as you can. You have access to the following tools: ... 
Prompt > 生成一个随机数 ChatGLM3 > random_number_generator @@ -141,7 +141,7 @@ ChatGLM3 > 根据您的要求,我使用随机数生成器API生成了一个随 Code interpreter: ~~~ -$ ./build/bin/main -m chatglm3-ggml.bin --top_p 0.8 --temp 0.8 --sp examples/system/code_interpreter.txt -i +$ ./build/bin/main -m models/chatglm3-ggml.bin --top_p 0.8 --temp 0.8 --sp examples/system/code_interpreter.txt -i System > 你是一位智能AI助手,你叫ChatGLM,你连接着一台电脑,但请注意不能联网。在使用Python解决任务时,你可以运行代码并得到结果,如果运行结果有错误,你需要尽可能对代码进行改进。你可以处理用户上传到电脑上的文件,文件默认存储路径是/mnt/data/。 Prompt > 列出100以内的所有质数 ChatGLM3 > 好的,我会为您列出100以内的所有质数。 @@ -180,19 +180,19 @@ $$ CodeGeeX2 ```sh -$ python3 chatglm_cpp/convert.py -i THUDM/codegeex2-6b -t q4_0 -o codegeex2-ggml.bin -$ ./build/bin/main -m codegeex2-ggml.bin --temp 0 --mode generate -p "\ +$ python3 chatglm_cpp/convert.py -i THUDM/codegeex2-6b -t q4_0 -o models/codegeex2-ggml.bin +$ ./build/bin/main -m models/codegeex2-ggml.bin --temp 0 --mode generate -p "\ # language: Python # write a bubble sort function " -def bubble_sort(list): - for i in range(len(list) - 1): - for j in range(len(list) - 1): - if list[j] > list[j + 1]: - list[j], list[j + 1] = list[j + 1], list[j] - return list +def bubble_sort(lst): + for i in range(len(lst) - 1): + for j in range(len(lst) - 1 - i): + if lst[j] > lst[j + 1]: + lst[j], lst[j + 1] = lst[j + 1], lst[j] + return lst print(bubble_sort([5, 4, 3, 2, 1])) @@ -203,8 +203,8 @@ print(bubble_sort([5, 4, 3, 2, 1])) Baichuan-13B-Chat ```sh -python3 chatglm_cpp/convert.py -i baichuan-inc/Baichuan-13B-Chat -t q4_0 -o baichuan-13b-chat-ggml.bin -./build/bin/main -m baichuan-13b-chat-ggml.bin -p 你好 --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.1 +python3 chatglm_cpp/convert.py -i baichuan-inc/Baichuan-13B-Chat -t q4_0 -o models/baichuan-13b-chat-ggml.bin +./build/bin/main -m models/baichuan-13b-chat-ggml.bin -p 你好 --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.1 # 你好!有什么我可以帮助你的吗? ``` @@ -213,8 +213,8 @@ python3 chatglm_cpp/convert.py -i baichuan-inc/Baichuan-13B-Chat -t q4_0 -o baic Baichuan2-7B-Chat ```sh -python3 chatglm_cpp/convert.py -i baichuan-inc/Baichuan2-7B-Chat -t q4_0 -o baichuan2-7b-chat-ggml.bin -./build/bin/main -m baichuan2-7b-chat-ggml.bin -p 你好 --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.05 +python3 chatglm_cpp/convert.py -i baichuan-inc/Baichuan2-7B-Chat -t q4_0 -o models/baichuan2-7b-chat-ggml.bin +./build/bin/main -m models/baichuan2-7b-chat-ggml.bin -p 你好 --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.05 # 你好!很高兴为您提供帮助。请问有什么问题我可以帮您解答? ``` @@ -223,8 +223,8 @@ python3 chatglm_cpp/convert.py -i baichuan-inc/Baichuan2-7B-Chat -t q4_0 -o baic Baichuan2-13B-Chat ```sh -python3 chatglm_cpp/convert.py -i baichuan-inc/Baichuan2-13B-Chat -t q4_0 -o baichuan2-13b-chat-ggml.bin -./build/bin/main -m baichuan2-13b-chat-ggml.bin -p 你好 --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.05 +python3 chatglm_cpp/convert.py -i baichuan-inc/Baichuan2-13B-Chat -t q4_0 -o models/baichuan2-13b-chat-ggml.bin +./build/bin/main -m models/baichuan2-13b-chat-ggml.bin -p 你好 --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.05 # 你好!今天我能为您提供什么帮助? 
``` @@ -233,8 +233,8 @@ python3 chatglm_cpp/convert.py -i baichuan-inc/Baichuan2-13B-Chat -t q4_0 -o bai InternLM-Chat-7B ```sh -python3 chatglm_cpp/convert.py -i internlm/internlm-chat-7b-v1_1 -t q4_0 -o internlm-chat-7b-ggml.bin -./build/bin/main -m internlm-chat-7b-ggml.bin -p 你好 --top_p 0.8 --temp 0.8 +python3 chatglm_cpp/convert.py -i internlm/internlm-chat-7b -t q4_0 -o models/internlm-chat-7b-ggml.bin +./build/bin/main -m models/internlm-chat-7b-ggml.bin -p 你好 --top_p 0.8 --temp 0.8 # 你好,我是书生·浦语,有什么可以帮助你的吗? ``` @@ -243,8 +243,8 @@ python3 chatglm_cpp/convert.py -i internlm/internlm-chat-7b-v1_1 -t q4_0 -o inte InternLM-Chat-20B ```sh -python3 chatglm_cpp/convert.py -i internlm/internlm-chat-20b -t q4_0 -o internlm-chat-20b-ggml.bin -./build/bin/main -m internlm-chat-20b-ggml.bin -p 你好 --top_p 0.8 --temp 0.8 +python3 chatglm_cpp/convert.py -i internlm/internlm-chat-20b -t q4_0 -o models/internlm-chat-20b-ggml.bin +./build/bin/main -m models/internlm-chat-20b-ggml.bin -p 你好 --top_p 0.8 --temp 0.8 # 你好!有什么我可以帮到你的吗? ``` @@ -323,19 +323,19 @@ Here is a simple demo that uses `chatglm_cpp.Pipeline` to load the GGML model an ```python >>> import chatglm_cpp >>> ->>> pipeline = chatglm_cpp.Pipeline("../chatglm-ggml.bin") +>>> pipeline = chatglm_cpp.Pipeline("../models/chatglm-ggml.bin") >>> pipeline.chat([chatglm_cpp.ChatMessage(role="user", content="你好")]) ChatMessage(role="assistant", content="你好👋!我是人工智能助手 ChatGLM-6B,很高兴见到你,欢迎问我任何问题。", tool_calls=[]) ``` To chat in stream, run the below Python example: ```sh -python3 cli_demo.py -m ../chatglm-ggml.bin -i +python3 cli_demo.py -m ../models/chatglm-ggml.bin -i ``` Launch a web demo to chat in your browser: ```sh -python3 web_demo.py -m ../chatglm-ggml.bin +python3 web_demo.py -m ../models/chatglm-ggml.bin ``` ![web_demo](docs/web_demo.jpg) @@ -346,8 +346,8 @@ For other models: ChatGLM2-6B ```sh -python3 cli_demo.py -m ../chatglm2-ggml.bin -p 你好 --temp 0.8 --top_p 0.8 # CLI demo -python3 web_demo.py -m ../chatglm2-ggml.bin --temp 0.8 --top_p 0.8 # web demo +python3 cli_demo.py -m ../models/chatglm2-ggml.bin -p 你好 --temp 0.8 --top_p 0.8 # CLI demo +python3 web_demo.py -m ../models/chatglm2-ggml.bin --temp 0.8 --top_p 0.8 # web demo ``` @@ -358,17 +358,17 @@ python3 web_demo.py -m ../chatglm2-ggml.bin --temp 0.8 --top_p 0.8 # web demo Chat mode: ```sh -python3 cli_demo.py -m ../chatglm3-ggml.bin -p 你好 --temp 0.8 --top_p 0.8 +python3 cli_demo.py -m ../models/chatglm3-ggml.bin -p 你好 --temp 0.8 --top_p 0.8 ``` Function call: ```sh -python3 cli_demo.py -m ../chatglm3-ggml.bin --temp 0.8 --top_p 0.8 --sp system/function_call.txt -i +python3 cli_demo.py -m ../models/chatglm3-ggml.bin --temp 0.8 --top_p 0.8 --sp system/function_call.txt -i ``` Code interpreter: ```sh -python3 cli_demo.py -m ../chatglm3-ggml.bin --temp 0.8 --top_p 0.8 --sp system/code_interpreter.txt -i +python3 cli_demo.py -m ../models/chatglm3-ggml.bin --temp 0.8 --top_p 0.8 --sp system/code_interpreter.txt -i ``` **Web Demo** @@ -395,12 +395,12 @@ streamlit run chatglm3_demo.py ```sh # CLI demo -python3 cli_demo.py -m ../codegeex2-ggml.bin --temp 0 --mode generate -p "\ +python3 cli_demo.py -m ../models/codegeex2-ggml.bin --temp 0 --mode generate -p "\ # language: Python # write a bubble sort function " # web demo -python3 web_demo.py -m ../codegeex2-ggml.bin --temp 0 --max_length 512 --mode generate --plain +python3 web_demo.py -m ../models/codegeex2-ggml.bin --temp 0 --max_length 512 --mode generate --plain ``` @@ -408,8 +408,8 @@ python3 web_demo.py -m ../codegeex2-ggml.bin --temp 
0 --max_length 512 --mode ge Baichuan-13B-Chat ```sh -python3 cli_demo.py -m ../baichuan-13b-chat-ggml.bin -p 你好 --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.1 # CLI demo -python3 web_demo.py -m ../baichuan-13b-chat-ggml.bin --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.1 # web demo +python3 cli_demo.py -m ../models/baichuan-13b-chat-ggml.bin -p 你好 --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.1 # CLI demo +python3 web_demo.py -m ../models/baichuan-13b-chat-ggml.bin --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.1 # web demo ``` @@ -417,8 +417,8 @@ python3 web_demo.py -m ../baichuan-13b-chat-ggml.bin --top_k 5 --top_p 0.85 --te Baichuan2-7B-Chat ```sh -python3 cli_demo.py -m ../baichuan2-7b-chat-ggml.bin -p 你好 --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.05 # CLI demo -python3 web_demo.py -m ../baichuan2-7b-chat-ggml.bin --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.05 # web demo +python3 cli_demo.py -m ../models/baichuan2-7b-chat-ggml.bin -p 你好 --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.05 # CLI demo +python3 web_demo.py -m ../models/baichuan2-7b-chat-ggml.bin --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.05 # web demo ``` @@ -426,8 +426,8 @@ python3 web_demo.py -m ../baichuan2-7b-chat-ggml.bin --top_k 5 --top_p 0.85 --te Baichuan2-13B-Chat ```sh -python3 cli_demo.py -m ../baichuan2-13b-chat-ggml.bin -p 你好 --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.05 # CLI demo -python3 web_demo.py -m ../baichuan2-13b-chat-ggml.bin --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.05 # web demo +python3 cli_demo.py -m ../models/baichuan2-13b-chat-ggml.bin -p 你好 --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.05 # CLI demo +python3 web_demo.py -m ../models/baichuan2-13b-chat-ggml.bin --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.05 # web demo ``` @@ -435,8 +435,8 @@ python3 web_demo.py -m ../baichuan2-13b-chat-ggml.bin --top_k 5 --top_p 0.85 --t InternLM-Chat-7B ```sh -python3 cli_demo.py -m ../internlm-chat-7b-ggml.bin -p 你好 --top_p 0.8 --temp 0.8 # CLI demo -python3 web_demo.py -m ../internlm-chat-7b-ggml.bin --top_p 0.8 --temp 0.8 # web demo +python3 cli_demo.py -m ../models/internlm-chat-7b-ggml.bin -p 你好 --top_p 0.8 --temp 0.8 # CLI demo +python3 web_demo.py -m ../models/internlm-chat-7b-ggml.bin --top_p 0.8 --temp 0.8 # web demo ``` @@ -444,8 +444,8 @@ python3 web_demo.py -m ../internlm-chat-7b-ggml.bin --top_p 0.8 --temp 0.8 # we InternLM-Chat-20B ```sh -python3 cli_demo.py -m ../internlm-chat-20b-ggml.bin -p 你好 --top_p 0.8 --temp 0.8 # CLI demo -python3 web_demo.py -m ../internlm-chat-20b-ggml.bin --top_p 0.8 --temp 0.8 # web demo +python3 cli_demo.py -m ../models/internlm-chat-20b-ggml.bin -p 你好 --top_p 0.8 --temp 0.8 # CLI demo +python3 web_demo.py -m ../models/internlm-chat-20b-ggml.bin --top_p 0.8 --temp 0.8 # web demo ``` @@ -480,7 +480,7 @@ Remember to add the corresponding `CMAKE_ARGS` to enable acceleration. 
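Before moving on to the API servers, the multi-turn behavior described earlier (chat history serving as the context for the next round) can be reproduced directly from the Python binding. The sketch below uses only the calls already shown in this README (`chatglm_cpp.Pipeline`, `chatglm_cpp.ChatMessage`, `pipeline.chat`); the follow-up question and the relative path under the new `models/` folder are illustrative assumptions, not part of this patch.

```python
# Minimal multi-turn sketch of the Python binding; API names are taken from the
# README example above, and the follow-up question is illustrative only.
import chatglm_cpp

# Path assumes the models/ layout introduced by this patch (adjust to your checkout).
pipeline = chatglm_cpp.Pipeline("../models/chatglm-ggml.bin")

messages = [chatglm_cpp.ChatMessage(role="user", content="你好")]
reply = pipeline.chat(messages)
print(reply.content)

# Append the assistant reply so the next question is answered with the history as context.
messages.append(reply)
messages.append(chatglm_cpp.ChatMessage(role="user", content="你能做什么?"))
print(pipeline.chat(messages).content)
```
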
Start the api server for LangChain: ```sh -MODEL=./chatglm2-ggml.bin uvicorn chatglm_cpp.langchain_api:app --host 127.0.0.1 --port 8000 +MODEL=./models/chatglm2-ggml.bin uvicorn chatglm_cpp.langchain_api:app --host 127.0.0.1 --port 8000 ``` Test the api endpoint with `curl`: @@ -503,7 +503,7 @@ For more options, please refer to [examples/langchain_client.py](examples/langch Start an API server compatible with [OpenAI chat completions protocol](https://platform.openai.com/docs/api-reference/chat): ```sh -MODEL=./chatglm3-ggml.bin uvicorn chatglm_cpp.openai_api:app --host 127.0.0.1 --port 8000 +MODEL=./models/chatglm3-ggml.bin uvicorn chatglm_cpp.openai_api:app --host 127.0.0.1 --port 8000 ``` Test your endpoint with `curl`: @@ -542,14 +542,14 @@ Building docker image locally and start a container to run inference on CPU: ```sh docker build . --network=host -t chatglm.cpp # cpp demo -docker run -it --rm -v $PWD:/opt chatglm.cpp ./build/bin/main -m /opt/chatglm-ggml.bin -p "你好" +docker run -it --rm -v $PWD/models:/chatglm.cpp/models chatglm.cpp ./build/bin/main -m models/chatglm-ggml.bin -p "你好" # python demo -docker run -it --rm -v $PWD:/opt chatglm.cpp python3 examples/cli_demo.py -m /opt/chatglm-ggml.bin -p "你好" +docker run -it --rm -v $PWD/models:/chatglm.cpp/models chatglm.cpp python3 examples/cli_demo.py -m models/chatglm-ggml.bin -p "你好" # langchain api server -docker run -it --rm -v $PWD:/opt -p 8000:8000 -e MODEL=/opt/chatglm-ggml.bin chatglm.cpp \ +docker run -it --rm -v $PWD/models:/chatglm.cpp/models -p 8000:8000 -e MODEL=models/chatglm-ggml.bin chatglm.cpp \ uvicorn chatglm_cpp.langchain_api:app --host 0.0.0.0 --port 8000 # openai api server -docker run -it --rm -v $PWD:/opt -p 8000:8000 -e MODEL=/opt/chatglm-ggml.bin chatglm.cpp \ +docker run -it --rm -v $PWD/models:/chatglm.cpp/models -p 8000:8000 -e MODEL=models/chatglm-ggml.bin chatglm.cpp \ uvicorn chatglm_cpp.openai_api:app --host 0.0.0.0 --port 8000 ``` @@ -557,8 +557,9 @@ For CUDA support, make sure [nvidia-docker](https://github.com/NVIDIA/nvidia-doc ```sh docker build . --network=host -t chatglm.cpp-cuda \ --build-arg BASE_IMAGE=nvidia/cuda:12.2.0-devel-ubuntu20.04 \ - --build-arg CMAKE_ARGS="-DGGML_CUBLAS=ON" -docker run -it --rm --gpus all -v $PWD:/chatglm.cpp/models chatglm.cpp-cuda ./build/bin/main -m models/chatglm-ggml.bin -p "你好" + --build-arg CMAKE_ARGS="-DGGML_CUBLAS=ON -DCUDA_ARCHITECTURES=80" +docker run -it --rm --gpus all -v $PWD/models:/chatglm.cpp/models chatglm.cpp-cuda \ + ./build/bin/main -m models/chatglm-ggml.bin -p "你好" ``` **Option 2: Using Pre-built Image** @@ -567,14 +568,14 @@ The pre-built image for CPU inference is published on both [Docker Hub](https:// To pull from Docker Hub and run demo: ```sh -docker run -it --rm -v $PWD:/opt liplusx/chatglm.cpp:main \ - ./build/bin/main -m /opt/chatglm-ggml.bin -p "你好" +docker run -it --rm -v $PWD/models:/chatglm.cpp/models liplusx/chatglm.cpp:main \ + ./build/bin/main -m models/chatglm-ggml.bin -p "你好" ``` To pull from GHCR and run demo: ```sh -docker run -it --rm -v $PWD:/opt ghcr.io/li-plus/chatglm.cpp:main \ - ./build/bin/main -m /opt/chatglm-ggml.bin -p "你好" +docker run -it --rm -v $PWD/models:/chatglm.cpp/models ghcr.io/li-plus/chatglm.cpp:main \ + ./build/bin/main -m models/chatglm-ggml.bin -p "你好" ``` Python demo and API servers are also supported in pre-built image. Use it in the same way as **Option 1**. 
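However the OpenAI-compatible server is started (locally with `uvicorn` or inside the container above), it can also be exercised from Python instead of `curl`. The client sketch below assumes the standard `/v1/chat/completions` route of the OpenAI chat completions protocol referenced earlier and uses the third-party `requests` package; the `model` field value is a placeholder, not something defined by this patch.

```python
# Hedged client sketch for the OpenAI-compatible server on 127.0.0.1:8000.
# The route and payload shape follow the OpenAI chat completions protocol;
# the "model" value below is a placeholder assumption.
import requests

resp = requests.post(
    "http://127.0.0.1:8000/v1/chat/completions",
    json={
        "model": "default-model",
        "messages": [{"role": "user", "content": "你好"}],
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```
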
@@ -646,7 +647,7 @@ We measure model quality by evaluating the perplexity over the WikiText-2 test d Download and unzip the dataset from [link](https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip). Measure the perplexity with a stride of 512 and max input length of 2048: ```sh -./build/bin/perplexity -m -f wikitext-2-raw/wiki.test.raw -s 512 -l 2048 +./build/bin/perplexity -m models/chatglm3-ggml.bin -f wikitext-2-raw/wiki.test.raw -s 512 -l 2048 ``` | | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | F16 | diff --git a/chatglm_cpp/convert.py b/chatglm_cpp/convert.py index a98168c1..78ea912d 100644 --- a/chatglm_cpp/convert.py +++ b/chatglm_cpp/convert.py @@ -562,7 +562,7 @@ def main(): help="Lora model name or path used in PeftModel.from_pretrained", ) parser.add_argument( - "-o", "--save_path", default="chatglm-ggml.bin", type=Path, help="Path to save the generated GGML model" + "-o", "--save_path", default="models/chatglm-ggml.bin", type=Path, help="Path to save the generated GGML model" ) parser.add_argument( "-t", diff --git a/chatglm_cpp/langchain_api.py b/chatglm_cpp/langchain_api.py index ceea5988..12de66a5 100644 --- a/chatglm_cpp/langchain_api.py +++ b/chatglm_cpp/langchain_api.py @@ -11,7 +11,7 @@ class Settings(BaseSettings): - model: str = "chatglm-ggml.bin" + model: str = "models/chatglm-ggml.bin" class ChatRequest(BaseModel): diff --git a/chatglm_cpp/openai_api.py b/chatglm_cpp/openai_api.py index 42c0a479..ad5115b8 100644 --- a/chatglm_cpp/openai_api.py +++ b/chatglm_cpp/openai_api.py @@ -16,7 +16,7 @@ class Settings(BaseSettings): - model: str = "chatglm3-ggml.bin" + model: str = "models/chatglm3-ggml.bin" num_threads: int = 0 diff --git a/chatglm_test.cpp b/chatglm_test.cpp index 1b0befe1..c0ebd13b 100644 --- a/chatglm_test.cpp +++ b/chatglm_test.cpp @@ -995,7 +995,7 @@ static void check_chat_format(const Pipeline &pipeline) { } TEST(Pipeline, ChatGLM) { - fs::path model_path = fs::path(__FILE__).parent_path() / "chatglm-ggml.bin"; + fs::path model_path = fs::path(__FILE__).parent_path() / "models/chatglm-ggml.bin"; if (!fs::exists(model_path)) { GTEST_SKIP() << "Skipping ChatGLM e2e test (ggml model not found)"; } @@ -1057,7 +1057,7 @@ TEST(Pipeline, ChatGLM) { } TEST(Pipeline, ChatGLM2) { - fs::path model_path = fs::path(__FILE__).parent_path() / "chatglm2-ggml.bin"; + fs::path model_path = fs::path(__FILE__).parent_path() / "models/chatglm2-ggml.bin"; if (!fs::exists(model_path)) { GTEST_SKIP() << "Skipping ChatGLM2 e2e test (ggml model not found)"; } @@ -1127,7 +1127,7 @@ static inline std::string read_text(const fs::path &path) { } TEST(Pipeline, ChatGLM3) { - fs::path model_path = fs::path(__FILE__).parent_path() / "chatglm3-ggml.bin"; + fs::path model_path = fs::path(__FILE__).parent_path() / "models/chatglm3-ggml.bin"; if (!fs::exists(model_path)) { GTEST_SKIP() << "Skipping ChatGLM3 e2e test (ggml model not found)"; } @@ -1296,7 +1296,7 @@ primes_up_to_100 } TEST(Pipeline, CodeGeeX2) { - fs::path model_path = fs::path(__FILE__).parent_path() / "codegeex2-ggml.bin"; + fs::path model_path = fs::path(__FILE__).parent_path() / "models/codegeex2-ggml.bin"; if (!fs::exists(model_path)) { GTEST_SKIP() << "Skipping CodeGeeX2 e2e test (ggml model not found)"; } @@ -1320,12 +1320,12 @@ TEST(Pipeline, CodeGeeX2) { std::string prompt = "# language: Python\n# write a bubble sort function\n"; std::string target = R"( -def bubble_sort(list): - for i in range(len(list) - 1): - for j in range(len(list) - 1): - if list[j] > list[j + 1]: - list[j], list[j + 1] = list[j 
+ 1], list[j] - return list +def bubble_sort(lst): + for i in range(len(lst) - 1): + for j in range(len(lst) - 1 - i): + if lst[j] > lst[j + 1]: + lst[j], lst[j + 1] = lst[j + 1], lst[j] + return lst print(bubble_sort([5, 4, 3, 2, 1])))"; @@ -1336,7 +1336,7 @@ print(bubble_sort([5, 4, 3, 2, 1])))"; } TEST(Pipeline, Baichuan13B) { - fs::path model_path = fs::path(__FILE__).parent_path() / "baichuan-13b-chat-ggml.bin"; + fs::path model_path = fs::path(__FILE__).parent_path() / "models/baichuan-13b-chat-ggml.bin"; if (!fs::exists(model_path)) { GTEST_SKIP() << "Skipping Baichuan13B e2e test (ggml model not found)"; } @@ -1391,7 +1391,7 @@ TEST(Pipeline, Baichuan13B) { } TEST(Pipeline, Baichuan2_7B) { - fs::path model_path = fs::path(__FILE__).parent_path() / "baichuan2-7b-chat-ggml.bin"; + fs::path model_path = fs::path(__FILE__).parent_path() / "models/baichuan2-7b-chat-ggml.bin"; if (!fs::exists(model_path)) { GTEST_SKIP() << "Skipping Baichuan2-7B e2e test (ggml model not found)"; } @@ -1446,7 +1446,7 @@ TEST(Pipeline, Baichuan2_7B) { } TEST(Pipeline, Baichuan2_13B) { - fs::path model_path = fs::path(__FILE__).parent_path() / "baichuan2-13b-chat-ggml.bin"; + fs::path model_path = fs::path(__FILE__).parent_path() / "models/baichuan2-13b-chat-ggml.bin"; if (!fs::exists(model_path)) { GTEST_SKIP() << "Skipping Baichuan2-13B e2e test (ggml model not found)"; } @@ -1489,7 +1489,7 @@ TEST(Pipeline, Baichuan2_13B) { } TEST(Pipeline, InternLM) { - fs::path model_path = fs::path(__FILE__).parent_path() / "internlm-chat-7b-ggml.bin"; + fs::path model_path = fs::path(__FILE__).parent_path() / "models/internlm-chat-7b-ggml.bin"; if (!fs::exists(model_path)) { GTEST_SKIP() << "Skipping InternLM e2e test (ggml model not found)"; } @@ -1539,7 +1539,7 @@ TEST(Pipeline, InternLM) { gen_config.do_sample = false; std::vector messages{{ChatMessage::ROLE_USER, "你好"}}; ChatMessage output = pipeline.chat(messages, gen_config); - EXPECT_EQ(output.content, "你好,有什么我可以帮助你的吗?"); + EXPECT_EQ(output.content, "你好!有什么我可以帮助你的吗?"); } } @@ -1578,32 +1578,32 @@ static void run_benchmark(const fs::path &model_path) { } TEST(Benchmark, ChatGLM) { - fs::path model_path = fs::path(__FILE__).parent_path() / "chatglm-ggml.bin"; + fs::path model_path = fs::path(__FILE__).parent_path() / "models/chatglm-ggml.bin"; run_benchmark(model_path); } TEST(Benchmark, ChatGLM2) { - fs::path model_path = fs::path(__FILE__).parent_path() / "chatglm2-ggml.bin"; + fs::path model_path = fs::path(__FILE__).parent_path() / "models/chatglm2-ggml.bin"; run_benchmark(model_path); } TEST(Benchmark, Baichuan2_7B) { - fs::path model_path = fs::path(__FILE__).parent_path() / "baichuan2-7b-chat-ggml.bin"; + fs::path model_path = fs::path(__FILE__).parent_path() / "models/baichuan2-7b-chat-ggml.bin"; run_benchmark(model_path); } TEST(Benchmark, Baichuan2_13B) { - fs::path model_path = fs::path(__FILE__).parent_path() / "baichuan2-13b-chat-ggml.bin"; + fs::path model_path = fs::path(__FILE__).parent_path() / "models/baichuan2-13b-chat-ggml.bin"; run_benchmark(model_path); } TEST(Benchmark, InternLM7B) { - fs::path model_path = fs::path(__FILE__).parent_path() / "internlm-chat-7b-ggml.bin"; + fs::path model_path = fs::path(__FILE__).parent_path() / "models/internlm-chat-7b-ggml.bin"; run_benchmark(model_path); } TEST(Benchmark, InternLM20B) { - fs::path model_path = fs::path(__FILE__).parent_path() / "internlm-chat-20b-ggml.bin"; + fs::path model_path = fs::path(__FILE__).parent_path() / "models/internlm-chat-20b-ggml.bin"; run_benchmark(model_path); } diff 
--git a/examples/chatglm3_demo.py b/examples/chatglm3_demo.py index 5c200bb8..783b58ad 100644 --- a/examples/chatglm3_demo.py +++ b/examples/chatglm3_demo.py @@ -17,7 +17,7 @@ from PIL import Image IPYKERNEL = "chatglm3-demo" -MODEL_PATH = Path(__file__).resolve().parent.parent / "chatglm3-ggml.bin" +MODEL_PATH = Path(__file__).resolve().parent.parent / "models/chatglm3-ggml.bin" CHAT_SYSTEM_PROMPT = "You are ChatGLM3, a large language model trained by Zhipu.AI. Follow the user's instructions carefully. Respond using markdown." diff --git a/examples/cli_demo.py b/examples/cli_demo.py index c134b7dd..d4532b97 100644 --- a/examples/cli_demo.py +++ b/examples/cli_demo.py @@ -4,7 +4,7 @@ import chatglm_cpp -DEFAULT_MODEL_PATH = Path(__file__).resolve().parent.parent / "chatglm-ggml.bin" +DEFAULT_MODEL_PATH = Path(__file__).resolve().parent.parent / "models/chatglm-ggml.bin" BANNER = """ ________ __ ________ __ ___ diff --git a/examples/web_demo.py b/examples/web_demo.py index 7fe74cdf..96b39a7f 100644 --- a/examples/web_demo.py +++ b/examples/web_demo.py @@ -6,7 +6,7 @@ import chatglm_cpp import gradio as gr -DEFAULT_MODEL_PATH = Path(__file__).resolve().parent.parent / "chatglm-ggml.bin" +DEFAULT_MODEL_PATH = Path(__file__).resolve().parent.parent / "models/chatglm-ggml.bin" parser = argparse.ArgumentParser() parser.add_argument("-m", "--model", default=DEFAULT_MODEL_PATH, type=Path, help="model path") diff --git a/main.cpp b/main.cpp index a909746a..6a14765e 100644 --- a/main.cpp +++ b/main.cpp @@ -22,7 +22,7 @@ static inline InferenceMode to_inference_mode(const std::string &s) { } struct Args { - std::string model_path = "chatglm-ggml.bin"; + std::string model_path = "models/chatglm-ggml.bin"; InferenceMode mode = INFERENCE_MODE_CHAT; bool sync = false; std::string prompt = "你好"; @@ -44,7 +44,7 @@ static void usage(const std::string &prog) { options: -h, --help show this help message and exit - -m, --model PATH model path (default: chatglm-ggml.bin) + -m, --model PATH model path (default: models/chatglm-ggml.bin) --mode inference mode chosen from {chat, generate} (default: chat) --sync synchronized generation without streaming -p, --prompt PROMPT prompt to start generation with (default: 你好) diff --git a/models/.gitignore b/models/.gitignore new file mode 100644 index 00000000..a8a0dcec --- /dev/null +++ b/models/.gitignore @@ -0,0 +1 @@ +*.bin diff --git a/tests/perf.sh b/tests/perf.sh index 3412f58c..84e614ef 100644 --- a/tests/perf.sh +++ b/tests/perf.sh @@ -3,13 +3,13 @@ export CUDA_VISIBLE_DEVICES=0 # InternLM-7B -hf_model=internlm/internlm-chat-7b-v1_1 -ggml_model=internlm-chat-7b-ggml.bin +hf_model=internlm/internlm-chat-7b +ggml_model=models/internlm-chat-7b-ggml.bin benchmark=Benchmark.InternLM7B # InternLM-20B # hf_model=internlm/internlm-chat-20b -# ggml_model=internlm-chat-20b-ggml.bin +# ggml_model=models/internlm-chat-20b-ggml.bin # benchmark=Benchmark.InternLM20B for dtype in q4_0 q4_1 q5_0 q5_1 q8_0 f16; do diff --git a/tests/perplexity.cpp b/tests/perplexity.cpp index a98e624c..5bb41c30 100644 --- a/tests/perplexity.cpp +++ b/tests/perplexity.cpp @@ -5,7 +5,7 @@ #include struct Args { - std::string model_path = "chatglm-ggml.bin"; + std::string model_path = "models/chatglm-ggml.bin"; std::string corpus_path = "data/wikitext-2-raw/wiki.test.raw"; int max_length = 1024; int stride = 512; diff --git a/tests/ppl.sh b/tests/ppl.sh index 206025ea..71a1ac8a 100644 --- a/tests/ppl.sh +++ b/tests/ppl.sh @@ -4,15 +4,15 @@ export CUDA_VISIBLE_DEVICES=0 # ChatGLM3-6B-Base 
 hf_model=THUDM/chatglm3-6b-base
-ggml_model=chatglm3-base-ggml.bin
+ggml_model=models/chatglm3-base-ggml.bin

 # Baichuan2-7B-Base
 # hf_model=baichuan-inc/Baichuan2-7B-Base
-# ggml_model=baichuan2-7b-base-ggml.bin
+# ggml_model=models/baichuan2-7b-base-ggml.bin

 # InternLM
 # hf_model=internlm/internlm-7b
-# ggml_model=internlm-7b-base-ggml.bin
+# ggml_model=models/internlm-7b-base-ggml.bin

 for dtype in f16; do
     python3 chatglm_cpp/convert.py -i $hf_model -o $ggml_model -t $dtype
diff --git a/tests/test_chatglm_cpp.py b/tests/test_chatglm_cpp.py
index 45386bbd..2456ffd3 100644
--- a/tests/test_chatglm_cpp.py
+++ b/tests/test_chatglm_cpp.py
@@ -5,15 +5,15 @@

 PROJECT_ROOT = Path(__file__).resolve().parent.parent

-CHATGLM_MODEL_PATH = PROJECT_ROOT / "chatglm-ggml.bin"
-CHATGLM2_MODEL_PATH = PROJECT_ROOT / "chatglm2-ggml.bin"
-CHATGLM3_MODEL_PATH = PROJECT_ROOT / "chatglm3-ggml.bin"
-CODEGEEX2_MODEL_PATH = PROJECT_ROOT / "codegeex2-ggml.bin"
-BAICHUAN13B_MODEL_PATH = PROJECT_ROOT / "baichuan-13b-chat-ggml.bin"
-BAICHUAN2_7B_MODEL_PATH = PROJECT_ROOT / "baichuan2-7b-chat-ggml.bin"
-BAICHUAN2_13B_MODEL_PATH = PROJECT_ROOT / "baichuan2-13b-chat-ggml.bin"
-INTERNLM7B_MODEL_PATH = PROJECT_ROOT / "internlm-chat-7b-ggml.bin"
-INTERNLM20B_MODEL_PATH = PROJECT_ROOT / "internlm-chat-20b-ggml.bin"
+CHATGLM_MODEL_PATH = PROJECT_ROOT / "models/chatglm-ggml.bin"
+CHATGLM2_MODEL_PATH = PROJECT_ROOT / "models/chatglm2-ggml.bin"
+CHATGLM3_MODEL_PATH = PROJECT_ROOT / "models/chatglm3-ggml.bin"
+CODEGEEX2_MODEL_PATH = PROJECT_ROOT / "models/codegeex2-ggml.bin"
+BAICHUAN13B_MODEL_PATH = PROJECT_ROOT / "models/baichuan-13b-chat-ggml.bin"
+BAICHUAN2_7B_MODEL_PATH = PROJECT_ROOT / "models/baichuan2-7b-chat-ggml.bin"
+BAICHUAN2_13B_MODEL_PATH = PROJECT_ROOT / "models/baichuan2-13b-chat-ggml.bin"
+INTERNLM7B_MODEL_PATH = PROJECT_ROOT / "models/internlm-chat-7b-ggml.bin"
+INTERNLM20B_MODEL_PATH = PROJECT_ROOT / "models/internlm-chat-20b-ggml.bin"


 def test_chatglm_version():
@@ -80,12 +80,12 @@ def test_codegeex2_pipeline():
     prompt = "# language: Python\n# write a bubble sort function\n"

     target = """
-def bubble_sort(list):
-    for i in range(len(list) - 1):
-        for j in range(len(list) - 1):
-            if list[j] > list[j + 1]:
-                list[j], list[j + 1] = list[j + 1], list[j]
-    return list
+def bubble_sort(lst):
+    for i in range(len(lst) - 1):
+        for j in range(len(lst) - 1 - i):
+            if lst[j] > lst[j + 1]:
+                lst[j], lst[j + 1] = lst[j + 1], lst[j]
+    return lst


 print(bubble_sort([5, 4, 3, 2, 1]))"""

@@ -131,7 +131,7 @@ def test_baichuan2_13b_pipeline():

 @pytest.mark.skipif(not INTERNLM7B_MODEL_PATH.exists(), reason="model file not found")
 def test_internlm7b_pipeline():
-    check_pipeline(model_path=INTERNLM7B_MODEL_PATH, prompt="你好", target="你好,有什么我可以帮助你的吗?")
+    check_pipeline(model_path=INTERNLM7B_MODEL_PATH, prompt="你好", target="你好!有什么我可以帮助你的吗?")


 @pytest.mark.skipif(not INTERNLM20B_MODEL_PATH.exists(), reason="model file not found")
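As a closing note, the tests above rely on a `check_pipeline` helper that is not shown in this hunk. A plausible shape for it, modeled on the greedy (sampling disabled) checks in `chatglm_test.cpp` earlier in this patch, is sketched below; the keyword argument accepted by `chat` is an assumption about the Python binding, not something this patch defines.

```python
# Hypothetical sketch of the check_pipeline helper used by the tests above,
# modeled on the C++ tests: greedy decoding, then an exact match on the reply.
import chatglm_cpp


def check_pipeline(model_path, prompt, target):
    messages = [chatglm_cpp.ChatMessage(role="user", content=prompt)]
    # do_sample=False is assumed to mirror gen_config.do_sample = false in the C++ tests.
    output = chatglm_cpp.Pipeline(str(model_path)).chat(messages, do_sample=False)
    assert output.content == target
```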