From 5f584cebd2c4f51bd215a9b664e668d05f8b22f4 Mon Sep 17 00:00:00 2001 From: Jiahao Li Date: Mon, 29 Apr 2024 18:35:23 +0800 Subject: [PATCH] Separate folder for ggml models & Fix dockerfile (#296) --- .dockerignore | 4 +- Dockerfile | 1 + README.md | 125 ++++++++++++++++++----------------- chatglm_cpp/convert.py | 2 +- chatglm_cpp/langchain_api.py | 2 +- chatglm_cpp/openai_api.py | 2 +- chatglm_test.cpp | 42 ++++++------ examples/chatglm3_demo.py | 2 +- examples/cli_demo.py | 2 +- examples/web_demo.py | 2 +- main.cpp | 4 +- models/.gitignore | 1 + tests/perf.sh | 6 +- tests/perplexity.cpp | 2 +- tests/ppl.sh | 6 +- tests/test_chatglm_cpp.py | 32 ++++----- 16 files changed, 119 insertions(+), 116 deletions(-) create mode 100644 models/.gitignore diff --git a/.dockerignore b/.dockerignore index 4fd4d33e..49f0e15f 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,4 +1,4 @@ -.git/ +**/.git/ .github/ .hypothesis/ .pytest_cache/ @@ -6,6 +6,6 @@ build/ chatglm_cpp.egg-info/ dist/ .dockerignore -*.bin +models/ Dockerfile **/__pycache__/ diff --git a/Dockerfile b/Dockerfile index 3b23d1f9..115082e4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -47,6 +47,7 @@ RUN \ rm -rf /var/lib/apt/lists/* COPY --from=build /chatglm.cpp/build/bin/main /chatglm.cpp/build/bin/main +COPY --from=build /chatglm.cpp/build/lib/*.so /chatglm.cpp/build/lib/ COPY --from=build /chatglm.cpp/dist/ /chatglm.cpp/dist/ ADD examples examples diff --git a/README.md b/README.md index 4e7ddb82..b21fda01 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ python3 -m pip install torch tabulate tqdm transformers accelerate sentencepiece Use `convert.py` to transform ChatGLM-6B into quantized GGML format. For example, to convert the fp16 original model to q4_0 (quantized int4) GGML model, run: ```sh -python3 chatglm_cpp/convert.py -i THUDM/chatglm-6b -t q4_0 -o chatglm-ggml.bin +python3 chatglm_cpp/convert.py -i THUDM/chatglm-6b -t q4_0 -o models/chatglm-ggml.bin ``` The original model (`-i `) can be a Hugging Face model name or a local path to your pre-downloaded model. Currently supported models are: @@ -69,7 +69,7 @@ You are free to try any of the below quantization types by specifying `-t * `f16`: half precision floating point weights without quantization. * `f32`: single precision floating point weights without quantization. -For LoRA models, add `-l ` flag to merge your LoRA weights into the base model. For example, run `python3 chatglm_cpp/convert.py -i THUDM/chatglm3-6b -t q4_0 -o chatglm3-ggml-lora.bin -l shibing624/chatglm3-6b-csc-chinese-lora` to merge public LoRA weights from Hugging Face. +For LoRA models, add `-l ` flag to merge your LoRA weights into the base model. For example, run `python3 chatglm_cpp/convert.py -i THUDM/chatglm3-6b -t q4_0 -o models/chatglm3-ggml-lora.bin -l shibing624/chatglm3-6b-csc-chinese-lora` to merge public LoRA weights from Hugging Face. For P-Tuning v2 models using the [official finetuning script](https://github.com/THUDM/ChatGLM3/tree/main/finetune_demo), additional weights are automatically detected by `convert.py`. If `past_key_values` is on the output weight list, the P-Tuning checkpoint is successfully converted. @@ -83,13 +83,13 @@ cmake --build build -j --config Release Now you may chat with the quantized ChatGLM-6B model by running: ```sh -./build/bin/main -m chatglm-ggml.bin -p 你好 +./build/bin/main -m models/chatglm-ggml.bin -p 你好 # 你好👋!我是人工智能助手 ChatGLM-6B,很高兴见到你,欢迎问我任何问题。 ``` To run the model in interactive mode, add the `-i` flag. 
For example: ```sh -./build/bin/main -m chatglm-ggml.bin -i +./build/bin/main -m models/chatglm-ggml.bin -i ``` In interactive mode, your chat history will serve as the context for the next-round conversation. @@ -101,8 +101,8 @@ Run `./build/bin/main -h` to explore more options! ChatGLM2-6B ```sh -python3 chatglm_cpp/convert.py -i THUDM/chatglm2-6b -t q4_0 -o chatglm2-ggml.bin -./build/bin/main -m chatglm2-ggml.bin -p 你好 --top_p 0.8 --temp 0.8 +python3 chatglm_cpp/convert.py -i THUDM/chatglm2-6b -t q4_0 -o models/chatglm2-ggml.bin +./build/bin/main -m models/chatglm2-ggml.bin -p 你好 --top_p 0.8 --temp 0.8 # 你好👋!我是人工智能助手 ChatGLM2-6B,很高兴见到你,欢迎问我任何问题。 ``` @@ -114,20 +114,20 @@ ChatGLM3-6B further supports function call and code interpreter in addition to c Chat mode: ```sh -python3 chatglm_cpp/convert.py -i THUDM/chatglm3-6b -t q4_0 -o chatglm3-ggml.bin -./build/bin/main -m chatglm3-ggml.bin -p 你好 --top_p 0.8 --temp 0.8 +python3 chatglm_cpp/convert.py -i THUDM/chatglm3-6b -t q4_0 -o models/chatglm3-ggml.bin +./build/bin/main -m models/chatglm3-ggml.bin -p 你好 --top_p 0.8 --temp 0.8 # 你好👋!我是人工智能助手 ChatGLM3-6B,很高兴见到你,欢迎问我任何问题。 ``` Setting system prompt: ```sh -./build/bin/main -m chatglm3-ggml.bin -p 你好 -s "You are ChatGLM3, a large language model trained by Zhipu.AI. Follow the user's instructions carefully. Respond using markdown." +./build/bin/main -m models/chatglm3-ggml.bin -p 你好 -s "You are ChatGLM3, a large language model trained by Zhipu.AI. Follow the user's instructions carefully. Respond using markdown." # 你好👋!我是 ChatGLM3,有什么问题可以帮您解答吗? ``` Function call: ~~~ -$ ./build/bin/main -m chatglm3-ggml.bin --top_p 0.8 --temp 0.8 --sp examples/system/function_call.txt -i +$ ./build/bin/main -m models/chatglm3-ggml.bin --top_p 0.8 --temp 0.8 --sp examples/system/function_call.txt -i System > Answer the following questions as best as you can. You have access to the following tools: ... 
Prompt > 生成一个随机数 ChatGLM3 > random_number_generator @@ -141,7 +141,7 @@ ChatGLM3 > 根据您的要求,我使用随机数生成器API生成了一个随 Code interpreter: ~~~ -$ ./build/bin/main -m chatglm3-ggml.bin --top_p 0.8 --temp 0.8 --sp examples/system/code_interpreter.txt -i +$ ./build/bin/main -m models/chatglm3-ggml.bin --top_p 0.8 --temp 0.8 --sp examples/system/code_interpreter.txt -i System > 你是一位智能AI助手,你叫ChatGLM,你连接着一台电脑,但请注意不能联网。在使用Python解决任务时,你可以运行代码并得到结果,如果运行结果有错误,你需要尽可能对代码进行改进。你可以处理用户上传到电脑上的文件,文件默认存储路径是/mnt/data/。 Prompt > 列出100以内的所有质数 ChatGLM3 > 好的,我会为您列出100以内的所有质数。 @@ -180,19 +180,19 @@ $$ CodeGeeX2 ```sh -$ python3 chatglm_cpp/convert.py -i THUDM/codegeex2-6b -t q4_0 -o codegeex2-ggml.bin -$ ./build/bin/main -m codegeex2-ggml.bin --temp 0 --mode generate -p "\ +$ python3 chatglm_cpp/convert.py -i THUDM/codegeex2-6b -t q4_0 -o models/codegeex2-ggml.bin +$ ./build/bin/main -m models/codegeex2-ggml.bin --temp 0 --mode generate -p "\ # language: Python # write a bubble sort function " -def bubble_sort(list): - for i in range(len(list) - 1): - for j in range(len(list) - 1): - if list[j] > list[j + 1]: - list[j], list[j + 1] = list[j + 1], list[j] - return list +def bubble_sort(lst): + for i in range(len(lst) - 1): + for j in range(len(lst) - 1 - i): + if lst[j] > lst[j + 1]: + lst[j], lst[j + 1] = lst[j + 1], lst[j] + return lst print(bubble_sort([5, 4, 3, 2, 1])) @@ -203,8 +203,8 @@ print(bubble_sort([5, 4, 3, 2, 1])) Baichuan-13B-Chat ```sh -python3 chatglm_cpp/convert.py -i baichuan-inc/Baichuan-13B-Chat -t q4_0 -o baichuan-13b-chat-ggml.bin -./build/bin/main -m baichuan-13b-chat-ggml.bin -p 你好 --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.1 +python3 chatglm_cpp/convert.py -i baichuan-inc/Baichuan-13B-Chat -t q4_0 -o models/baichuan-13b-chat-ggml.bin +./build/bin/main -m models/baichuan-13b-chat-ggml.bin -p 你好 --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.1 # 你好!有什么我可以帮助你的吗? ``` @@ -213,8 +213,8 @@ python3 chatglm_cpp/convert.py -i baichuan-inc/Baichuan-13B-Chat -t q4_0 -o baic Baichuan2-7B-Chat ```sh -python3 chatglm_cpp/convert.py -i baichuan-inc/Baichuan2-7B-Chat -t q4_0 -o baichuan2-7b-chat-ggml.bin -./build/bin/main -m baichuan2-7b-chat-ggml.bin -p 你好 --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.05 +python3 chatglm_cpp/convert.py -i baichuan-inc/Baichuan2-7B-Chat -t q4_0 -o models/baichuan2-7b-chat-ggml.bin +./build/bin/main -m models/baichuan2-7b-chat-ggml.bin -p 你好 --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.05 # 你好!很高兴为您提供帮助。请问有什么问题我可以帮您解答? ``` @@ -223,8 +223,8 @@ python3 chatglm_cpp/convert.py -i baichuan-inc/Baichuan2-7B-Chat -t q4_0 -o baic Baichuan2-13B-Chat ```sh -python3 chatglm_cpp/convert.py -i baichuan-inc/Baichuan2-13B-Chat -t q4_0 -o baichuan2-13b-chat-ggml.bin -./build/bin/main -m baichuan2-13b-chat-ggml.bin -p 你好 --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.05 +python3 chatglm_cpp/convert.py -i baichuan-inc/Baichuan2-13B-Chat -t q4_0 -o models/baichuan2-13b-chat-ggml.bin +./build/bin/main -m models/baichuan2-13b-chat-ggml.bin -p 你好 --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.05 # 你好!今天我能为您提供什么帮助? 
``` @@ -233,8 +233,8 @@ python3 chatglm_cpp/convert.py -i baichuan-inc/Baichuan2-13B-Chat -t q4_0 -o bai InternLM-Chat-7B ```sh -python3 chatglm_cpp/convert.py -i internlm/internlm-chat-7b-v1_1 -t q4_0 -o internlm-chat-7b-ggml.bin -./build/bin/main -m internlm-chat-7b-ggml.bin -p 你好 --top_p 0.8 --temp 0.8 +python3 chatglm_cpp/convert.py -i internlm/internlm-chat-7b -t q4_0 -o models/internlm-chat-7b-ggml.bin +./build/bin/main -m models/internlm-chat-7b-ggml.bin -p 你好 --top_p 0.8 --temp 0.8 # 你好,我是书生·浦语,有什么可以帮助你的吗? ``` @@ -243,8 +243,8 @@ python3 chatglm_cpp/convert.py -i internlm/internlm-chat-7b-v1_1 -t q4_0 -o inte InternLM-Chat-20B ```sh -python3 chatglm_cpp/convert.py -i internlm/internlm-chat-20b -t q4_0 -o internlm-chat-20b-ggml.bin -./build/bin/main -m internlm-chat-20b-ggml.bin -p 你好 --top_p 0.8 --temp 0.8 +python3 chatglm_cpp/convert.py -i internlm/internlm-chat-20b -t q4_0 -o models/internlm-chat-20b-ggml.bin +./build/bin/main -m models/internlm-chat-20b-ggml.bin -p 你好 --top_p 0.8 --temp 0.8 # 你好!有什么我可以帮到你的吗? ``` @@ -323,19 +323,19 @@ Here is a simple demo that uses `chatglm_cpp.Pipeline` to load the GGML model an ```python >>> import chatglm_cpp >>> ->>> pipeline = chatglm_cpp.Pipeline("../chatglm-ggml.bin") +>>> pipeline = chatglm_cpp.Pipeline("../models/chatglm-ggml.bin") >>> pipeline.chat([chatglm_cpp.ChatMessage(role="user", content="你好")]) ChatMessage(role="assistant", content="你好👋!我是人工智能助手 ChatGLM-6B,很高兴见到你,欢迎问我任何问题。", tool_calls=[]) ``` To chat in stream, run the below Python example: ```sh -python3 cli_demo.py -m ../chatglm-ggml.bin -i +python3 cli_demo.py -m ../models/chatglm-ggml.bin -i ``` Launch a web demo to chat in your browser: ```sh -python3 web_demo.py -m ../chatglm-ggml.bin +python3 web_demo.py -m ../models/chatglm-ggml.bin ``` ![web_demo](docs/web_demo.jpg) @@ -346,8 +346,8 @@ For other models: ChatGLM2-6B ```sh -python3 cli_demo.py -m ../chatglm2-ggml.bin -p 你好 --temp 0.8 --top_p 0.8 # CLI demo -python3 web_demo.py -m ../chatglm2-ggml.bin --temp 0.8 --top_p 0.8 # web demo +python3 cli_demo.py -m ../models/chatglm2-ggml.bin -p 你好 --temp 0.8 --top_p 0.8 # CLI demo +python3 web_demo.py -m ../models/chatglm2-ggml.bin --temp 0.8 --top_p 0.8 # web demo ``` @@ -358,17 +358,17 @@ python3 web_demo.py -m ../chatglm2-ggml.bin --temp 0.8 --top_p 0.8 # web demo Chat mode: ```sh -python3 cli_demo.py -m ../chatglm3-ggml.bin -p 你好 --temp 0.8 --top_p 0.8 +python3 cli_demo.py -m ../models/chatglm3-ggml.bin -p 你好 --temp 0.8 --top_p 0.8 ``` Function call: ```sh -python3 cli_demo.py -m ../chatglm3-ggml.bin --temp 0.8 --top_p 0.8 --sp system/function_call.txt -i +python3 cli_demo.py -m ../models/chatglm3-ggml.bin --temp 0.8 --top_p 0.8 --sp system/function_call.txt -i ``` Code interpreter: ```sh -python3 cli_demo.py -m ../chatglm3-ggml.bin --temp 0.8 --top_p 0.8 --sp system/code_interpreter.txt -i +python3 cli_demo.py -m ../models/chatglm3-ggml.bin --temp 0.8 --top_p 0.8 --sp system/code_interpreter.txt -i ``` **Web Demo** @@ -395,12 +395,12 @@ streamlit run chatglm3_demo.py ```sh # CLI demo -python3 cli_demo.py -m ../codegeex2-ggml.bin --temp 0 --mode generate -p "\ +python3 cli_demo.py -m ../models/codegeex2-ggml.bin --temp 0 --mode generate -p "\ # language: Python # write a bubble sort function " # web demo -python3 web_demo.py -m ../codegeex2-ggml.bin --temp 0 --max_length 512 --mode generate --plain +python3 web_demo.py -m ../models/codegeex2-ggml.bin --temp 0 --max_length 512 --mode generate --plain ``` @@ -408,8 +408,8 @@ python3 web_demo.py -m ../codegeex2-ggml.bin --temp 
0 --max_length 512 --mode ge Baichuan-13B-Chat ```sh -python3 cli_demo.py -m ../baichuan-13b-chat-ggml.bin -p 你好 --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.1 # CLI demo -python3 web_demo.py -m ../baichuan-13b-chat-ggml.bin --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.1 # web demo +python3 cli_demo.py -m ../models/baichuan-13b-chat-ggml.bin -p 你好 --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.1 # CLI demo +python3 web_demo.py -m ../models/baichuan-13b-chat-ggml.bin --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.1 # web demo ``` @@ -417,8 +417,8 @@ python3 web_demo.py -m ../baichuan-13b-chat-ggml.bin --top_k 5 --top_p 0.85 --te Baichuan2-7B-Chat ```sh -python3 cli_demo.py -m ../baichuan2-7b-chat-ggml.bin -p 你好 --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.05 # CLI demo -python3 web_demo.py -m ../baichuan2-7b-chat-ggml.bin --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.05 # web demo +python3 cli_demo.py -m ../models/baichuan2-7b-chat-ggml.bin -p 你好 --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.05 # CLI demo +python3 web_demo.py -m ../models/baichuan2-7b-chat-ggml.bin --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.05 # web demo ``` @@ -426,8 +426,8 @@ python3 web_demo.py -m ../baichuan2-7b-chat-ggml.bin --top_k 5 --top_p 0.85 --te Baichuan2-13B-Chat ```sh -python3 cli_demo.py -m ../baichuan2-13b-chat-ggml.bin -p 你好 --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.05 # CLI demo -python3 web_demo.py -m ../baichuan2-13b-chat-ggml.bin --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.05 # web demo +python3 cli_demo.py -m ../models/baichuan2-13b-chat-ggml.bin -p 你好 --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.05 # CLI demo +python3 web_demo.py -m ../models/baichuan2-13b-chat-ggml.bin --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.05 # web demo ``` @@ -435,8 +435,8 @@ python3 web_demo.py -m ../baichuan2-13b-chat-ggml.bin --top_k 5 --top_p 0.85 --t InternLM-Chat-7B ```sh -python3 cli_demo.py -m ../internlm-chat-7b-ggml.bin -p 你好 --top_p 0.8 --temp 0.8 # CLI demo -python3 web_demo.py -m ../internlm-chat-7b-ggml.bin --top_p 0.8 --temp 0.8 # web demo +python3 cli_demo.py -m ../models/internlm-chat-7b-ggml.bin -p 你好 --top_p 0.8 --temp 0.8 # CLI demo +python3 web_demo.py -m ../models/internlm-chat-7b-ggml.bin --top_p 0.8 --temp 0.8 # web demo ``` @@ -444,8 +444,8 @@ python3 web_demo.py -m ../internlm-chat-7b-ggml.bin --top_p 0.8 --temp 0.8 # we InternLM-Chat-20B ```sh -python3 cli_demo.py -m ../internlm-chat-20b-ggml.bin -p 你好 --top_p 0.8 --temp 0.8 # CLI demo -python3 web_demo.py -m ../internlm-chat-20b-ggml.bin --top_p 0.8 --temp 0.8 # web demo +python3 cli_demo.py -m ../models/internlm-chat-20b-ggml.bin -p 你好 --top_p 0.8 --temp 0.8 # CLI demo +python3 web_demo.py -m ../models/internlm-chat-20b-ggml.bin --top_p 0.8 --temp 0.8 # web demo ``` @@ -480,7 +480,7 @@ Remember to add the corresponding `CMAKE_ARGS` to enable acceleration. 
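Before moving on to the API servers, the multi-turn behavior described earlier (chat history serving as the context for the next round) can be reproduced directly from the Python binding. The sketch below uses only the calls already shown in this README (`chatglm_cpp.Pipeline`, `chatglm_cpp.ChatMessage`, `pipeline.chat`); the follow-up question and the relative path under the new `models/` folder are illustrative assumptions, not part of this patch.

```python
# Minimal multi-turn sketch of the Python binding; API names are taken from the
# README example above, and the follow-up question is illustrative only.
import chatglm_cpp

# Path assumes the models/ layout introduced by this patch (adjust to your checkout).
pipeline = chatglm_cpp.Pipeline("../models/chatglm-ggml.bin")

messages = [chatglm_cpp.ChatMessage(role="user", content="你好")]
reply = pipeline.chat(messages)
print(reply.content)

# Append the assistant reply so the next question is answered with the history as context.
messages.append(reply)
messages.append(chatglm_cpp.ChatMessage(role="user", content="你能做什么?"))
print(pipeline.chat(messages).content)
```
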
Start the api server for LangChain: ```sh -MODEL=./chatglm2-ggml.bin uvicorn chatglm_cpp.langchain_api:app --host 127.0.0.1 --port 8000 +MODEL=./models/chatglm2-ggml.bin uvicorn chatglm_cpp.langchain_api:app --host 127.0.0.1 --port 8000 ``` Test the api endpoint with `curl`: @@ -503,7 +503,7 @@ For more options, please refer to [examples/langchain_client.py](examples/langch Start an API server compatible with [OpenAI chat completions protocol](https://platform.openai.com/docs/api-reference/chat): ```sh -MODEL=./chatglm3-ggml.bin uvicorn chatglm_cpp.openai_api:app --host 127.0.0.1 --port 8000 +MODEL=./models/chatglm3-ggml.bin uvicorn chatglm_cpp.openai_api:app --host 127.0.0.1 --port 8000 ``` Test your endpoint with `curl`: @@ -542,14 +542,14 @@ Building docker image locally and start a container to run inference on CPU: ```sh docker build . --network=host -t chatglm.cpp # cpp demo -docker run -it --rm -v $PWD:/opt chatglm.cpp ./build/bin/main -m /opt/chatglm-ggml.bin -p "你好" +docker run -it --rm -v $PWD/models:/chatglm.cpp/models chatglm.cpp ./build/bin/main -m models/chatglm-ggml.bin -p "你好" # python demo -docker run -it --rm -v $PWD:/opt chatglm.cpp python3 examples/cli_demo.py -m /opt/chatglm-ggml.bin -p "你好" +docker run -it --rm -v $PWD/models:/chatglm.cpp/models chatglm.cpp python3 examples/cli_demo.py -m models/chatglm-ggml.bin -p "你好" # langchain api server -docker run -it --rm -v $PWD:/opt -p 8000:8000 -e MODEL=/opt/chatglm-ggml.bin chatglm.cpp \ +docker run -it --rm -v $PWD/models:/chatglm.cpp/models -p 8000:8000 -e MODEL=models/chatglm-ggml.bin chatglm.cpp \ uvicorn chatglm_cpp.langchain_api:app --host 0.0.0.0 --port 8000 # openai api server -docker run -it --rm -v $PWD:/opt -p 8000:8000 -e MODEL=/opt/chatglm-ggml.bin chatglm.cpp \ +docker run -it --rm -v $PWD/models:/chatglm.cpp/models -p 8000:8000 -e MODEL=models/chatglm-ggml.bin chatglm.cpp \ uvicorn chatglm_cpp.openai_api:app --host 0.0.0.0 --port 8000 ``` @@ -557,8 +557,9 @@ For CUDA support, make sure [nvidia-docker](https://github.com/NVIDIA/nvidia-doc ```sh docker build . --network=host -t chatglm.cpp-cuda \ --build-arg BASE_IMAGE=nvidia/cuda:12.2.0-devel-ubuntu20.04 \ - --build-arg CMAKE_ARGS="-DGGML_CUBLAS=ON" -docker run -it --rm --gpus all -v $PWD:/chatglm.cpp/models chatglm.cpp-cuda ./build/bin/main -m models/chatglm-ggml.bin -p "你好" + --build-arg CMAKE_ARGS="-DGGML_CUBLAS=ON -DCUDA_ARCHITECTURES=80" +docker run -it --rm --gpus all -v $PWD/models:/chatglm.cpp/models chatglm.cpp-cuda \ + ./build/bin/main -m models/chatglm-ggml.bin -p "你好" ``` **Option 2: Using Pre-built Image** @@ -567,14 +568,14 @@ The pre-built image for CPU inference is published on both [Docker Hub](https:// To pull from Docker Hub and run demo: ```sh -docker run -it --rm -v $PWD:/opt liplusx/chatglm.cpp:main \ - ./build/bin/main -m /opt/chatglm-ggml.bin -p "你好" +docker run -it --rm -v $PWD/models:/chatglm.cpp/models liplusx/chatglm.cpp:main \ + ./build/bin/main -m models/chatglm-ggml.bin -p "你好" ``` To pull from GHCR and run demo: ```sh -docker run -it --rm -v $PWD:/opt ghcr.io/li-plus/chatglm.cpp:main \ - ./build/bin/main -m /opt/chatglm-ggml.bin -p "你好" +docker run -it --rm -v $PWD/models:/chatglm.cpp/models ghcr.io/li-plus/chatglm.cpp:main \ + ./build/bin/main -m models/chatglm-ggml.bin -p "你好" ``` Python demo and API servers are also supported in pre-built image. Use it in the same way as **Option 1**. 
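However the OpenAI-compatible server is started (locally with `uvicorn` or inside the container above), it can also be exercised from Python instead of `curl`. The client sketch below assumes the standard `/v1/chat/completions` route of the OpenAI chat completions protocol referenced earlier and uses the third-party `requests` package; the `model` field value is a placeholder, not something defined by this patch.

```python
# Hedged client sketch for the OpenAI-compatible server on 127.0.0.1:8000.
# The route and payload shape follow the OpenAI chat completions protocol;
# the "model" value below is a placeholder assumption.
import requests

resp = requests.post(
    "http://127.0.0.1:8000/v1/chat/completions",
    json={
        "model": "default-model",
        "messages": [{"role": "user", "content": "你好"}],
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```
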
@@ -646,7 +647,7 @@ We measure model quality by evaluating the perplexity over the WikiText-2 test d Download and unzip the dataset from [link](https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip). Measure the perplexity with a stride of 512 and max input length of 2048: ```sh -./build/bin/perplexity -m -f wikitext-2-raw/wiki.test.raw -s 512 -l 2048 +./build/bin/perplexity -m models/chatglm3-ggml.bin -f wikitext-2-raw/wiki.test.raw -s 512 -l 2048 ``` | | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | F16 | diff --git a/chatglm_cpp/convert.py b/chatglm_cpp/convert.py index a98168c1..78ea912d 100644 --- a/chatglm_cpp/convert.py +++ b/chatglm_cpp/convert.py @@ -562,7 +562,7 @@ def main(): help="Lora model name or path used in PeftModel.from_pretrained", ) parser.add_argument( - "-o", "--save_path", default="chatglm-ggml.bin", type=Path, help="Path to save the generated GGML model" + "-o", "--save_path", default="models/chatglm-ggml.bin", type=Path, help="Path to save the generated GGML model" ) parser.add_argument( "-t", diff --git a/chatglm_cpp/langchain_api.py b/chatglm_cpp/langchain_api.py index ceea5988..12de66a5 100644 --- a/chatglm_cpp/langchain_api.py +++ b/chatglm_cpp/langchain_api.py @@ -11,7 +11,7 @@ class Settings(BaseSettings): - model: str = "chatglm-ggml.bin" + model: str = "models/chatglm-ggml.bin" class ChatRequest(BaseModel): diff --git a/chatglm_cpp/openai_api.py b/chatglm_cpp/openai_api.py index 42c0a479..ad5115b8 100644 --- a/chatglm_cpp/openai_api.py +++ b/chatglm_cpp/openai_api.py @@ -16,7 +16,7 @@ class Settings(BaseSettings): - model: str = "chatglm3-ggml.bin" + model: str = "models/chatglm3-ggml.bin" num_threads: int = 0 diff --git a/chatglm_test.cpp b/chatglm_test.cpp index 1b0befe1..c0ebd13b 100644 --- a/chatglm_test.cpp +++ b/chatglm_test.cpp @@ -995,7 +995,7 @@ static void check_chat_format(const Pipeline &pipeline) { } TEST(Pipeline, ChatGLM) { - fs::path model_path = fs::path(__FILE__).parent_path() / "chatglm-ggml.bin"; + fs::path model_path = fs::path(__FILE__).parent_path() / "models/chatglm-ggml.bin"; if (!fs::exists(model_path)) { GTEST_SKIP() << "Skipping ChatGLM e2e test (ggml model not found)"; } @@ -1057,7 +1057,7 @@ TEST(Pipeline, ChatGLM) { } TEST(Pipeline, ChatGLM2) { - fs::path model_path = fs::path(__FILE__).parent_path() / "chatglm2-ggml.bin"; + fs::path model_path = fs::path(__FILE__).parent_path() / "models/chatglm2-ggml.bin"; if (!fs::exists(model_path)) { GTEST_SKIP() << "Skipping ChatGLM2 e2e test (ggml model not found)"; } @@ -1127,7 +1127,7 @@ static inline std::string read_text(const fs::path &path) { } TEST(Pipeline, ChatGLM3) { - fs::path model_path = fs::path(__FILE__).parent_path() / "chatglm3-ggml.bin"; + fs::path model_path = fs::path(__FILE__).parent_path() / "models/chatglm3-ggml.bin"; if (!fs::exists(model_path)) { GTEST_SKIP() << "Skipping ChatGLM3 e2e test (ggml model not found)"; } @@ -1296,7 +1296,7 @@ primes_up_to_100 } TEST(Pipeline, CodeGeeX2) { - fs::path model_path = fs::path(__FILE__).parent_path() / "codegeex2-ggml.bin"; + fs::path model_path = fs::path(__FILE__).parent_path() / "models/codegeex2-ggml.bin"; if (!fs::exists(model_path)) { GTEST_SKIP() << "Skipping CodeGeeX2 e2e test (ggml model not found)"; } @@ -1320,12 +1320,12 @@ TEST(Pipeline, CodeGeeX2) { std::string prompt = "# language: Python\n# write a bubble sort function\n"; std::string target = R"( -def bubble_sort(list): - for i in range(len(list) - 1): - for j in range(len(list) - 1): - if list[j] > list[j + 1]: - list[j], list[j + 1] = list[j 
+ 1], list[j] - return list +def bubble_sort(lst): + for i in range(len(lst) - 1): + for j in range(len(lst) - 1 - i): + if lst[j] > lst[j + 1]: + lst[j], lst[j + 1] = lst[j + 1], lst[j] + return lst print(bubble_sort([5, 4, 3, 2, 1])))"; @@ -1336,7 +1336,7 @@ print(bubble_sort([5, 4, 3, 2, 1])))"; } TEST(Pipeline, Baichuan13B) { - fs::path model_path = fs::path(__FILE__).parent_path() / "baichuan-13b-chat-ggml.bin"; + fs::path model_path = fs::path(__FILE__).parent_path() / "models/baichuan-13b-chat-ggml.bin"; if (!fs::exists(model_path)) { GTEST_SKIP() << "Skipping Baichuan13B e2e test (ggml model not found)"; } @@ -1391,7 +1391,7 @@ TEST(Pipeline, Baichuan13B) { } TEST(Pipeline, Baichuan2_7B) { - fs::path model_path = fs::path(__FILE__).parent_path() / "baichuan2-7b-chat-ggml.bin"; + fs::path model_path = fs::path(__FILE__).parent_path() / "models/baichuan2-7b-chat-ggml.bin"; if (!fs::exists(model_path)) { GTEST_SKIP() << "Skipping Baichuan2-7B e2e test (ggml model not found)"; } @@ -1446,7 +1446,7 @@ TEST(Pipeline, Baichuan2_7B) { } TEST(Pipeline, Baichuan2_13B) { - fs::path model_path = fs::path(__FILE__).parent_path() / "baichuan2-13b-chat-ggml.bin"; + fs::path model_path = fs::path(__FILE__).parent_path() / "models/baichuan2-13b-chat-ggml.bin"; if (!fs::exists(model_path)) { GTEST_SKIP() << "Skipping Baichuan2-13B e2e test (ggml model not found)"; } @@ -1489,7 +1489,7 @@ TEST(Pipeline, Baichuan2_13B) { } TEST(Pipeline, InternLM) { - fs::path model_path = fs::path(__FILE__).parent_path() / "internlm-chat-7b-ggml.bin"; + fs::path model_path = fs::path(__FILE__).parent_path() / "models/internlm-chat-7b-ggml.bin"; if (!fs::exists(model_path)) { GTEST_SKIP() << "Skipping InternLM e2e test (ggml model not found)"; } @@ -1539,7 +1539,7 @@ TEST(Pipeline, InternLM) { gen_config.do_sample = false; std::vector messages{{ChatMessage::ROLE_USER, "你好"}}; ChatMessage output = pipeline.chat(messages, gen_config); - EXPECT_EQ(output.content, "你好,有什么我可以帮助你的吗?"); + EXPECT_EQ(output.content, "你好!有什么我可以帮助你的吗?"); } } @@ -1578,32 +1578,32 @@ static void run_benchmark(const fs::path &model_path) { } TEST(Benchmark, ChatGLM) { - fs::path model_path = fs::path(__FILE__).parent_path() / "chatglm-ggml.bin"; + fs::path model_path = fs::path(__FILE__).parent_path() / "models/chatglm-ggml.bin"; run_benchmark(model_path); } TEST(Benchmark, ChatGLM2) { - fs::path model_path = fs::path(__FILE__).parent_path() / "chatglm2-ggml.bin"; + fs::path model_path = fs::path(__FILE__).parent_path() / "models/chatglm2-ggml.bin"; run_benchmark(model_path); } TEST(Benchmark, Baichuan2_7B) { - fs::path model_path = fs::path(__FILE__).parent_path() / "baichuan2-7b-chat-ggml.bin"; + fs::path model_path = fs::path(__FILE__).parent_path() / "models/baichuan2-7b-chat-ggml.bin"; run_benchmark(model_path); } TEST(Benchmark, Baichuan2_13B) { - fs::path model_path = fs::path(__FILE__).parent_path() / "baichuan2-13b-chat-ggml.bin"; + fs::path model_path = fs::path(__FILE__).parent_path() / "models/baichuan2-13b-chat-ggml.bin"; run_benchmark(model_path); } TEST(Benchmark, InternLM7B) { - fs::path model_path = fs::path(__FILE__).parent_path() / "internlm-chat-7b-ggml.bin"; + fs::path model_path = fs::path(__FILE__).parent_path() / "models/internlm-chat-7b-ggml.bin"; run_benchmark(model_path); } TEST(Benchmark, InternLM20B) { - fs::path model_path = fs::path(__FILE__).parent_path() / "internlm-chat-20b-ggml.bin"; + fs::path model_path = fs::path(__FILE__).parent_path() / "models/internlm-chat-20b-ggml.bin"; run_benchmark(model_path); } diff 
--git a/examples/chatglm3_demo.py b/examples/chatglm3_demo.py index 5c200bb8..783b58ad 100644 --- a/examples/chatglm3_demo.py +++ b/examples/chatglm3_demo.py @@ -17,7 +17,7 @@ from PIL import Image IPYKERNEL = "chatglm3-demo" -MODEL_PATH = Path(__file__).resolve().parent.parent / "chatglm3-ggml.bin" +MODEL_PATH = Path(__file__).resolve().parent.parent / "models/chatglm3-ggml.bin" CHAT_SYSTEM_PROMPT = "You are ChatGLM3, a large language model trained by Zhipu.AI. Follow the user's instructions carefully. Respond using markdown." diff --git a/examples/cli_demo.py b/examples/cli_demo.py index c134b7dd..d4532b97 100644 --- a/examples/cli_demo.py +++ b/examples/cli_demo.py @@ -4,7 +4,7 @@ import chatglm_cpp -DEFAULT_MODEL_PATH = Path(__file__).resolve().parent.parent / "chatglm-ggml.bin" +DEFAULT_MODEL_PATH = Path(__file__).resolve().parent.parent / "models/chatglm-ggml.bin" BANNER = """ ________ __ ________ __ ___ diff --git a/examples/web_demo.py b/examples/web_demo.py index 7fe74cdf..96b39a7f 100644 --- a/examples/web_demo.py +++ b/examples/web_demo.py @@ -6,7 +6,7 @@ import chatglm_cpp import gradio as gr -DEFAULT_MODEL_PATH = Path(__file__).resolve().parent.parent / "chatglm-ggml.bin" +DEFAULT_MODEL_PATH = Path(__file__).resolve().parent.parent / "models/chatglm-ggml.bin" parser = argparse.ArgumentParser() parser.add_argument("-m", "--model", default=DEFAULT_MODEL_PATH, type=Path, help="model path") diff --git a/main.cpp b/main.cpp index a909746a..6a14765e 100644 --- a/main.cpp +++ b/main.cpp @@ -22,7 +22,7 @@ static inline InferenceMode to_inference_mode(const std::string &s) { } struct Args { - std::string model_path = "chatglm-ggml.bin"; + std::string model_path = "models/chatglm-ggml.bin"; InferenceMode mode = INFERENCE_MODE_CHAT; bool sync = false; std::string prompt = "你好"; @@ -44,7 +44,7 @@ static void usage(const std::string &prog) { options: -h, --help show this help message and exit - -m, --model PATH model path (default: chatglm-ggml.bin) + -m, --model PATH model path (default: models/chatglm-ggml.bin) --mode inference mode chosen from {chat, generate} (default: chat) --sync synchronized generation without streaming -p, --prompt PROMPT prompt to start generation with (default: 你好) diff --git a/models/.gitignore b/models/.gitignore new file mode 100644 index 00000000..a8a0dcec --- /dev/null +++ b/models/.gitignore @@ -0,0 +1 @@ +*.bin diff --git a/tests/perf.sh b/tests/perf.sh index 3412f58c..84e614ef 100644 --- a/tests/perf.sh +++ b/tests/perf.sh @@ -3,13 +3,13 @@ export CUDA_VISIBLE_DEVICES=0 # InternLM-7B -hf_model=internlm/internlm-chat-7b-v1_1 -ggml_model=internlm-chat-7b-ggml.bin +hf_model=internlm/internlm-chat-7b +ggml_model=models/internlm-chat-7b-ggml.bin benchmark=Benchmark.InternLM7B # InternLM-20B # hf_model=internlm/internlm-chat-20b -# ggml_model=internlm-chat-20b-ggml.bin +# ggml_model=models/internlm-chat-20b-ggml.bin # benchmark=Benchmark.InternLM20B for dtype in q4_0 q4_1 q5_0 q5_1 q8_0 f16; do diff --git a/tests/perplexity.cpp b/tests/perplexity.cpp index a98e624c..5bb41c30 100644 --- a/tests/perplexity.cpp +++ b/tests/perplexity.cpp @@ -5,7 +5,7 @@ #include struct Args { - std::string model_path = "chatglm-ggml.bin"; + std::string model_path = "models/chatglm-ggml.bin"; std::string corpus_path = "data/wikitext-2-raw/wiki.test.raw"; int max_length = 1024; int stride = 512; diff --git a/tests/ppl.sh b/tests/ppl.sh index 206025ea..71a1ac8a 100644 --- a/tests/ppl.sh +++ b/tests/ppl.sh @@ -4,15 +4,15 @@ export CUDA_VISIBLE_DEVICES=0 # ChatGLM3-6B-Base 
 hf_model=THUDM/chatglm3-6b-base
-ggml_model=chatglm3-base-ggml.bin
+ggml_model=models/chatglm3-base-ggml.bin

 # Baichuan2-7B-Base
 # hf_model=baichuan-inc/Baichuan2-7B-Base
-# ggml_model=baichuan2-7b-base-ggml.bin
+# ggml_model=models/baichuan2-7b-base-ggml.bin

 # InternLM
 # hf_model=internlm/internlm-7b
-# ggml_model=internlm-7b-base-ggml.bin
+# ggml_model=models/internlm-7b-base-ggml.bin

 for dtype in f16; do
     python3 chatglm_cpp/convert.py -i $hf_model -o $ggml_model -t $dtype
diff --git a/tests/test_chatglm_cpp.py b/tests/test_chatglm_cpp.py
index 45386bbd..2456ffd3 100644
--- a/tests/test_chatglm_cpp.py
+++ b/tests/test_chatglm_cpp.py
@@ -5,15 +5,15 @@

 PROJECT_ROOT = Path(__file__).resolve().parent.parent

-CHATGLM_MODEL_PATH = PROJECT_ROOT / "chatglm-ggml.bin"
-CHATGLM2_MODEL_PATH = PROJECT_ROOT / "chatglm2-ggml.bin"
-CHATGLM3_MODEL_PATH = PROJECT_ROOT / "chatglm3-ggml.bin"
-CODEGEEX2_MODEL_PATH = PROJECT_ROOT / "codegeex2-ggml.bin"
-BAICHUAN13B_MODEL_PATH = PROJECT_ROOT / "baichuan-13b-chat-ggml.bin"
-BAICHUAN2_7B_MODEL_PATH = PROJECT_ROOT / "baichuan2-7b-chat-ggml.bin"
-BAICHUAN2_13B_MODEL_PATH = PROJECT_ROOT / "baichuan2-13b-chat-ggml.bin"
-INTERNLM7B_MODEL_PATH = PROJECT_ROOT / "internlm-chat-7b-ggml.bin"
-INTERNLM20B_MODEL_PATH = PROJECT_ROOT / "internlm-chat-20b-ggml.bin"
+CHATGLM_MODEL_PATH = PROJECT_ROOT / "models/chatglm-ggml.bin"
+CHATGLM2_MODEL_PATH = PROJECT_ROOT / "models/chatglm2-ggml.bin"
+CHATGLM3_MODEL_PATH = PROJECT_ROOT / "models/chatglm3-ggml.bin"
+CODEGEEX2_MODEL_PATH = PROJECT_ROOT / "models/codegeex2-ggml.bin"
+BAICHUAN13B_MODEL_PATH = PROJECT_ROOT / "models/baichuan-13b-chat-ggml.bin"
+BAICHUAN2_7B_MODEL_PATH = PROJECT_ROOT / "models/baichuan2-7b-chat-ggml.bin"
+BAICHUAN2_13B_MODEL_PATH = PROJECT_ROOT / "models/baichuan2-13b-chat-ggml.bin"
+INTERNLM7B_MODEL_PATH = PROJECT_ROOT / "models/internlm-chat-7b-ggml.bin"
+INTERNLM20B_MODEL_PATH = PROJECT_ROOT / "models/internlm-chat-20b-ggml.bin"


 def test_chatglm_version():
@@ -80,12 +80,12 @@ def test_codegeex2_pipeline():
     prompt = "# language: Python\n# write a bubble sort function\n"

     target = """
-def bubble_sort(list):
-    for i in range(len(list) - 1):
-        for j in range(len(list) - 1):
-            if list[j] > list[j + 1]:
-                list[j], list[j + 1] = list[j + 1], list[j]
-    return list
+def bubble_sort(lst):
+    for i in range(len(lst) - 1):
+        for j in range(len(lst) - 1 - i):
+            if lst[j] > lst[j + 1]:
+                lst[j], lst[j + 1] = lst[j + 1], lst[j]
+    return lst


 print(bubble_sort([5, 4, 3, 2, 1]))"""

@@ -131,7 +131,7 @@ def test_baichuan2_13b_pipeline():

 @pytest.mark.skipif(not INTERNLM7B_MODEL_PATH.exists(), reason="model file not found")
 def test_internlm7b_pipeline():
-    check_pipeline(model_path=INTERNLM7B_MODEL_PATH, prompt="你好", target="你好,有什么我可以帮助你的吗?")
+    check_pipeline(model_path=INTERNLM7B_MODEL_PATH, prompt="你好", target="你好!有什么我可以帮助你的吗?")


 @pytest.mark.skipif(not INTERNLM20B_MODEL_PATH.exists(), reason="model file not found")
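As a closing note, the tests above rely on a `check_pipeline` helper that is not shown in this hunk. A plausible shape for it, modeled on the greedy (sampling disabled) checks in `chatglm_test.cpp` earlier in this patch, is sketched below; the keyword argument accepted by `chat` is an assumption about the Python binding, not something this patch defines.

```python
# Hypothetical sketch of the check_pipeline helper used by the tests above,
# modeled on the C++ tests: greedy decoding, then an exact match on the reply.
import chatglm_cpp


def check_pipeline(model_path, prompt, target):
    messages = [chatglm_cpp.ChatMessage(role="user", content=prompt)]
    # do_sample=False is assumed to mirror gen_config.do_sample = false in the C++ tests.
    output = chatglm_cpp.Pipeline(str(model_path)).chat(messages, do_sample=False)
    assert output.content == target
```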