FEAT: Support CodeLlama-Instruct (#414)
jiayini1119 authored Sep 5, 2023
1 parent ce528cc commit 28475c4
Showing 10 changed files with 188 additions and 12 deletions.
3 changes: 3 additions & 0 deletions README.md
@@ -201,6 +201,9 @@ $ xinference list --all
| chatglm | ['en', 'zh'] | ['embed', 'chat'] |
| chatglm2 | ['en', 'zh'] | ['embed', 'chat'] |
| chatglm2-32k | ['en', 'zh'] | ['embed', 'chat'] |
| code-llama | ['en'] | ['generate'] |
| code-llama-python | ['en'] | ['generate'] |
| code-llama-instruct | ['en'] | ['chat'] |
| falcon | ['en'] | ['embed', 'generate'] |
| falcon-instruct | ['en'] | ['embed', 'chat'] |
| gpt-2 | ['en'] | ['generate'] |
3 changes: 3 additions & 0 deletions README_ja_JP.md
@@ -180,6 +180,9 @@ $ xinference list --all
| chatglm | ['en', 'zh'] | ['embed', 'chat'] |
| chatglm2 | ['en', 'zh'] | ['embed', 'chat'] |
| chatglm2-32k | ['en', 'zh'] | ['embed', 'chat'] |
| code-llama | ['en'] | ['generate'] |
| code-llama-python | ['en'] | ['generate'] |
| code-llama-instruct | ['en'] | ['chat'] |
| falcon | ['en'] | ['embed', 'generate'] |
| falcon-instruct | ['en'] | ['embed', 'chat'] |
| gpt-2 | ['en'] | ['generate'] |
3 changes: 3 additions & 0 deletions README_zh_CN.md
@@ -183,6 +183,9 @@ $ xinference list --all
| chatglm | ['en', 'zh'] | ['embed', 'chat'] |
| chatglm2 | ['en', 'zh'] | ['embed', 'chat'] |
| chatglm2-32k | ['en', 'zh'] | ['embed', 'chat'] |
| code-llama | ['en'] | ['generate'] |
| code-llama-python | ['en'] | ['generate'] |
| code-llama-instruct | ['en'] | ['chat'] |
| falcon | ['en'] | ['embed', 'generate'] |
| falcon-instruct | ['en'] | ['embed', 'chat'] |
| gpt-2 | ['en'] | ['generate'] |
67 changes: 67 additions & 0 deletions doc/source/models/builtin/code-llama-instruct.rst
@@ -0,0 +1,67 @@
.. _models_builtin_code_llama_instruct:


===================
Code-Llama-Instruct
===================

- **Context Length:** 100000
- **Model Name:** code-llama-instruct
- **Languages:** en
- **Abilities:** chat

Specifications
^^^^^^^^^^^^^^

Model Spec 1 (pytorch, 7 Billion)
+++++++++++++++++++++++++++++++++

- **Model Format:** pytorch
- **Model Size (in billions):** 7
- **Quantizations:** 4-bit, 8-bit, none
- **Model ID:** codellama/CodeLlama-7b-Instruct-hf

Execute the following command to launch the model, remembering to replace `${quantization}` with your
chosen quantization method from the options listed above::

xinference launch --model-name code-llama-instruct --size-in-billions 7 --model-format pytorch --quantization ${quantization}

.. note::

4-bit quantization is not supported on macOS.

Model Spec 2 (pytorch, 13 Billion)
++++++++++++++++++++++++++++++++++

- **Model Format:** pytorch
- **Model Size (in billions):** 13
- **Quantizations:** 4-bit, 8-bit, none
- **Model ID:** codellama/CodeLlama-13b-Instruct-hf

Execute the following command to launch the model, remembering to replace `${quantization}` with your
chosen quantization method from the options listed above::

xinference launch --model-name code-llama-instruct --size-in-billions 13 --model-format pytorch --quantization ${quantization}


.. note::

4-bit quantization is not supported on macOS.

Model Spec 3 (pytorch, 34 Billion)
++++++++++++++++++++++++++++++++++

- **Model Format:** pytorch
- **Model Size (in billions):** 34
- **Quantizations:** 4-bit, 8-bit, none
- **Model ID:** codellama/CodeLlama-34b-Instruct-hf

Execute the following command to launch the model, remembering to replace `${quantization}` with your
chosen quantization method from the options listed above::

xinference launch --model-name code-llama-instruct --size-in-billions 34 --model-format pytorch --quantization ${quantization}


.. note::

4-bit quantization is not supported on macOS.
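
Once launched, the model can also be driven programmatically. The sketch below uses
xinference's RESTful client; the endpoint URL, quantization choice, and generate
settings are assumptions for illustration, not fixed values::

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")  # assumed local endpoint
    model_uid = client.launch_model(
        model_name="code-llama-instruct",
        model_format="pytorch",
        model_size_in_billions=7,
        quantization="none",
    )
    model = client.get_model(model_uid)

    # `chat` matches the "chat" ability declared for code-llama-instruct.
    response = model.chat(
        prompt="Write a Python function that reverses a linked list.",
        generate_config={"max_tokens": 512},
    )
    print(response["choices"][0]["message"]["content"])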
21 changes: 18 additions & 3 deletions doc/source/models/builtin/code-llama-python.rst
@@ -14,37 +14,52 @@ Specifications
^^^^^^^^^^^^^^

Model Spec 1 (pytorch, 7 Billion)
+++++++++++++++++++++++++++++++
+++++++++++++++++++++++++++++++++

- **Model Format:** pytorch
- **Model Size (in billions):** 7
- **Quantizations:** 4-bit, 8-bit, none
- **Model ID:** TheBloke/CodeLlama-7B-Python-fp16

Execute the following command to launch the model, remembering to replace `${quantization}` with your
chosen quantization method from the options listed above::

xinference launch --model-name code-llama-python --size-in-billions 7 --model-format pytorch --quantization ${quantization}

.. note::

4-bit quantization is not supported on macOS.

Model Spec 2 (pytorch, 13 Billion)
+++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++

- **Model Format:** pytorch
- **Model Size (in billions):** 13
- **Quantizations:** 4-bit, 8-bit, none
- **Model ID:** TheBloke/CodeLlama-13B-Python-fp16

Execute the following command to launch the model, remembering to replace `${quantization}` with your
chosen quantization method from the options listed above::

xinference launch --model-name code-llama-python --size-in-billions 13 --model-format pytorch --quantization ${quantization}

.. note::

4-bit quantization is not supported on macOS.

Model Spec 3 (pytorch, 34 Billion)
+++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++

- **Model Format:** pytorch
- **Model Size (in billions):** 34
- **Quantizations:** 4-bit, 8-bit, none
- **Model ID:** TheBloke/CodeLlama-34B-Python-fp16

Execute the following command to launch the model, remembering to replace `${quantization}` with your
chosen quantization method from the options listed above::

xinference launch --model-name code-llama-python --size-in-billions 34 --model-format pytorch --quantization ${quantization}

.. note::

4-bit quantization is not supported on macOS.
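
Unlike the instruct variant, this model exposes the `generate` ability, so it continues
whatever code it is given. A minimal sketch with the RESTful client, again assuming a
local endpoint and illustrative generate settings::

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")  # assumed local endpoint
    model_uid = client.launch_model(
        model_name="code-llama-python",
        model_format="pytorch",
        model_size_in_billions=7,
        quantization="none",
    )
    model = client.get_model(model_uid)

    # Plain completion: the model extends the given prefix.
    completion = model.generate(
        'def fibonacci(n):\n    """Return the n-th Fibonacci number."""\n',
        generate_config={"max_tokens": 256},
    )
    print(completion["choices"][0]["text"])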
21 changes: 18 additions & 3 deletions doc/source/models/builtin/code-llama.rst
@@ -13,37 +13,52 @@ Specifications
^^^^^^^^^^^^^^

Model Spec 1 (pytorch, 7 Billion)
+++++++++++++++++++++++++++++++
+++++++++++++++++++++++++++++++++

- **Model Format:** pytorch
- **Model Size (in billions):** 7
- **Quantizations:** 4-bit, 8-bit, none
- **Model ID:** TheBloke/CodeLlama-7B-fp16

Execute the following command to launch the model, remembering to replace `${quantization}` with your
chosen quantization method from the options listed above::

xinference launch --model-name code-llama --size-in-billions 7 --model-format pytorch --quantization ${quantization}

.. note::

4-bit quantization is not supported on macOS.

Model Spec 2 (pytorch, 13 Billion)
+++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++

- **Model Format:** pytorch
- **Model Size (in billions):** 13
- **Quantizations:** 4-bit, 8-bit, none
- **Model ID:** TheBloke/CodeLlama-13B-fp16

Execute the following command to launch the model, remembering to replace `${quantization}` with your
chosen quantization method from the options listed above::

xinference launch --model-name code-llama --size-in-billions 13 --model-format pytorch --quantization ${quantization}

.. note::

4-bit quantization is not supported on macOS.

Model Spec 3 (pytorch, 34 Billion)
+++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++

- **Model Format:** pytorch
- **Model Size (in billions):** 34
- **Quantizations:** 4-bit, 8-bit, none
- **Model ID:** TheBloke/CodeLlama-34B-fp16

Execute the following command to launch the model, remembering to replace `${quantization}` with your
chosen quantization method from the options listed above::

xinference launch --model-name code-llama --size-in-billions 34 --model-format pytorch --quantization ${quantization}

.. note::

4-bit quantization is not supported on macOS.
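
A launched model can also be reached over plain HTTP. The sketch below assumes the
server exposes its OpenAI-compatible completions route at the default local address,
and that the model UID is the one reported by `xinference launch`; both are
illustrative assumptions::

    import requests

    ENDPOINT = "http://127.0.0.1:9997"  # assumed local endpoint
    model_uid = "my-code-llama-uid"  # hypothetical; use the UID from `xinference launch`

    resp = requests.post(
        f"{ENDPOINT}/v1/completions",
        json={
            "model": model_uid,
            "prompt": "# A function that checks whether a number is prime\n",
            "max_tokens": 128,
        },
        timeout=120,
    )
    resp.raise_for_status()
    print(resp.json()["choices"][0]["text"])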
5 changes: 5 additions & 0 deletions doc/source/models/builtin/index.rst
@@ -24,6 +24,7 @@ Chat & Instruction-following Models
- :ref:`ChatGLM <models_builtin_chatglm>`
- :ref:`ChatGLM2 <models_builtin_chatglm2>`
- :ref:`ChatGLM2-32k <models_builtin_chatglm2_32k>`
- :ref:`Code-Llama-Instruct <models_builtin_code_llama_instruct>`
- :ref:`Falcon Instruct <models_builtin_falcon_instruct>`
- :ref:`InternLM Chat <models_builtin_internlm_chat>`
- :ref:`InternLM Chat 8K <models_builtin_internlm_chat_8k>`
@@ -42,6 +43,7 @@ Code Generation Models
- :ref:`Starcoder <models_builtin_starcoder>`
- :ref:`StarCoderPlus <models_builtin_starcoderplus>`
- :ref:`Code-Llama <models_builtin_code_llama>`
- :ref:`Code-Llama-Instruct <models_builtin_code_llama_instruct>`
- :ref:`Code-Llama-Python <models_builtin_code_llama_python>`


@@ -59,6 +61,9 @@ Code Assistant Models
chatglm
chatglm2-32k
chatglm2
code-llama
code-llama-instruct
code-llama-python
falcon-instruct
falcon
internlm
9 changes: 6 additions & 3 deletions doc/source/models/builtin/llama-2-chat.rst
@@ -64,7 +64,8 @@ chosen quantization method from the options listed above::

xinference launch --model-name llama-2-chat --size-in-billions 7 --model-format pytorch --quantization ${quantization}

Note
.. note::

4-bit quantization is not supported on macOS.


@@ -81,7 +82,8 @@ chosen quantization method from the options listed above::

xinference launch --model-name llama-2-chat --size-in-billions 13 --model-format pytorch --quantization ${quantization}

Note
.. note::

4-bit quantization is not supported on macOS.

Model Spec 6 (pytorch, 70 Billion)
@@ -97,5 +99,6 @@ chosen quantization method from the options listed above::

xinference launch --model-name llama-2-chat --size-in-billions 70 --model-format pytorch --quantization ${quantization}

Note
.. note::

4-bit quantization is not supported on macOS.
9 changes: 6 additions & 3 deletions doc/source/models/builtin/llama-2.rst
@@ -63,7 +63,8 @@ chosen quantization method from the options listed above::

xinference launch --model-name llama-2 --size-in-billions 7 --model-format pytorch --quantization ${quantization}

Note
.. note::

4-bit quantization is not supported on macOS.

Model Spec 5 (pytorch, 13 Billion)
@@ -79,7 +80,8 @@ chosen quantization method from the options listed above::

xinference launch --model-name llama-2 --size-in-billions 13 --model-format pytorch --quantization ${quantization}

Note
.. note::

4-bit quantization is not supported on macOS.

Model Spec 6 (pytorch, 70 Billion)
@@ -95,5 +97,6 @@ chosen quantization method from the options listed above::

xinference launch --model-name llama-2 --size-in-billions 70 --model-format pytorch --quantization ${quantization}

Note
.. note::

4-bit quantization is not supported on macOS.
59 changes: 59 additions & 0 deletions xinference/model/llm/llm_family.json
@@ -1371,5 +1371,64 @@
"model_revision": "875f9d97fb6c9619d8867887dd1d80918ff0f593"
}
]
},
{
"version": 1,
"context_length": 100000,
"model_name": "code-llama-instruct",
"model_lang": [
"en"
],
"model_ability": [
"chat"
],
"model_specs": [
{
"model_format": "pytorch",
"model_size_in_billions": 7,
"quantizations": [
"4-bit",
"8-bit",
"none"
],
"model_id": "codellama/CodeLlama-7b-Instruct-hf",
"model_revision": "6114dd1e16f69e0765ccbd7a64d33d04b265fbd2"
},
{
"model_format": "pytorch",
"model_size_in_billions": 13,
"quantizations": [
"4-bit",
"8-bit",
"none"
],
"model_id": "codellama/CodeLlama-13b-Instruct-hf",
"model_revision": "ff0983bc4267bb98ead4fb5168fe2f049b442787"
},
{
"model_format": "pytorch",
"model_size_in_billions": 34,
"quantizations": [
"4-bit",
"8-bit",
"none"
],
"model_id": "codellama/CodeLlama-34b-Instruct-hf",
"model_revision": "38a1e15d8524a1f0a7760a7acf8242b81ae4eb87"
}
],
"prompt_style": {
"style_name": "LLAMA2",
"system_prompt": "<s>[INST] <<SYS>>\nWrite code to solve the following coding problem that obeys the constraints and passes the example test cases. Please wrap your code answer using ```:\n<</SYS>>\n\n",
"roles": [
"[INST]",
"[/INST]"
],
"intra_message_sep": " ",
"inter_message_sep": " </s><s>",
"stop_token_ids": [
2
]
}
}
]
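
The `prompt_style` block above is what turns a chat history into the single string the
model actually sees. The following sketch approximates that assembly from the fields in
this entry; it is illustrative only, and xinference's real prompt builder may differ in
details::

    # Approximate rendering of the LLAMA2 prompt style defined above.
    SYSTEM_PROMPT = (
        "<s>[INST] <<SYS>>\n"
        "Write code to solve the following coding problem that obeys the "
        "constraints and passes the example test cases. "
        "Please wrap your code answer using ```:\n"
        "<</SYS>>\n\n"
    )
    ROLES = ("[INST]", "[/INST]")
    SEPS = (" ", " </s><s>")  # intra_message_sep, inter_message_sep

    def build_prompt(history):
        """history: (role, text) pairs; text=None leaves that turn open."""
        ret = SYSTEM_PROMPT
        for i, (role, text) in enumerate(history):
            if text is None:
                ret += role  # cue the model to produce this turn
            elif i == 0:
                ret += text + SEPS[0]  # the system prompt already opened [INST]
            else:
                ret += role + " " + text + SEPS[i % 2]
        return ret

    print(build_prompt([("[INST]", "Reverse a string in Python."), ("[/INST]", None)]))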
