ENH: quantization for glm-4v (#1610)
Minamiyama authored Jun 14, 2024
1 parent 29b7337 commit 34a57df
Showing 3 changed files with 17 additions and 3 deletions.
2 changes: 2 additions & 0 deletions xinference/model/llm/llm_family.json
@@ -939,6 +939,8 @@
         "model_format": "pytorch",
         "model_size_in_billions": 9,
         "quantizations": [
+            "4-bit",
+            "8-bit",
             "none"
         ],
         "model_id": "THUDM/glm-4v-9b",
2 changes: 2 additions & 0 deletions xinference/model/llm/llm_family_modelscope.json
@@ -632,6 +632,8 @@
         "model_format": "pytorch",
         "model_size_in_billions": 9,
         "quantizations": [
+            "4-bit",
+            "8-bit",
             "none"
         ],
         "model_hub": "modelscope",
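
With both registries updated, the 4-bit and 8-bit options become selectable when launching glm-4v. A minimal usage sketch, assuming a locally running Xinference server reachable through its standard Python client (the endpoint URL and port are placeholders):

from xinference.client import Client

client = Client("http://localhost:9997")  # local Xinference supervisor (placeholder URL)
model_uid = client.launch_model(
    model_name="glm-4v",
    model_format="pytorch",
    model_size_in_billions=9,
    quantization="4-bit",  # newly added option; "8-bit" and "none" also work
)
model = client.get_model(model_uid)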
16 changes: 13 additions & 3 deletions xinference/model/llm/pytorch/glm4v.py
@@ -56,19 +56,29 @@ def match(
             return True
         return False
 
-    def load(self, **kwargs):
+    def load(self):
         from transformers import AutoModelForCausalLM, AutoTokenizer
 
         device = self._pytorch_model_config.get("device", "auto")
         self._device = select_device(device)
         self._device = "auto" if self._device == "cuda" else self._device
 
+        kwargs = {"device_map": self._device}
+        quantization = self.quantization
+        if quantization != "none":
+            if self._device == "cuda" and self._is_linux():
+                kwargs["device_map"] = "auto"
+                self._device = "auto"
+            if quantization == "4-bit":
+                kwargs["load_in_4bit"] = True
+            elif quantization == "8-bit":
+                kwargs["load_in_8bit"] = True
+
         model = AutoModelForCausalLM.from_pretrained(
             self.model_path,
             low_cpu_mem_usage=True,
             trust_remote_code=True,
             torch_dtype=torch.float16,
-            device_map=self._device,
+            **kwargs,
         )
         self._model = model.eval()
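
The load_in_4bit / load_in_8bit keyword arguments are forwarded through from_pretrained to bitsandbytes. As a rough standalone sketch of what the quantized load amounts to, the same idea can be expressed with the explicit BitsAndBytesConfig API that newer transformers releases prefer (illustrative only, not the committed code):

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Explicit quantization config; equivalent in spirit to load_in_4bit=True.
bnb_config = BitsAndBytesConfig(load_in_4bit=True)

model = AutoModelForCausalLM.from_pretrained(
    "THUDM/glm-4v-9b",       # same model id the registry entries point at
    trust_remote_code=True,  # glm-4v ships custom modeling code
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="auto",       # let accelerate place the quantized weights
    quantization_config=bnb_config,
).eval()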
