diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
index b5806199b6..5f24917fcc 100644
--- a/xinference/model/llm/llm_family.json
+++ b/xinference/model/llm/llm_family.json
@@ -939,6 +939,8 @@
         "model_format": "pytorch",
         "model_size_in_billions": 9,
         "quantizations": [
+          "4-bit",
+          "8-bit",
           "none"
         ],
         "model_id": "THUDM/glm-4v-9b",
diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json
index 635630028d..7675378a97 100644
--- a/xinference/model/llm/llm_family_modelscope.json
+++ b/xinference/model/llm/llm_family_modelscope.json
@@ -632,6 +632,8 @@
         "model_format": "pytorch",
         "model_size_in_billions": 9,
         "quantizations": [
+          "4-bit",
+          "8-bit",
           "none"
         ],
         "model_hub": "modelscope",
diff --git a/xinference/model/llm/pytorch/glm4v.py b/xinference/model/llm/pytorch/glm4v.py
index 09b4d98491..0b4bde2e09 100644
--- a/xinference/model/llm/pytorch/glm4v.py
+++ b/xinference/model/llm/pytorch/glm4v.py
@@ -56,19 +56,29 @@ def match(
             return True
         return False

-    def load(self, **kwargs):
+    def load(self):
         from transformers import AutoModelForCausalLM, AutoTokenizer

         device = self._pytorch_model_config.get("device", "auto")
         self._device = select_device(device)
-        self._device = "auto" if self._device == "cuda" else self._device
+
+        kwargs = {"device_map": self._device}
+        quantization = self.quantization
+        if quantization != "none":
+            if self._device == "cuda" and self._is_linux():
+                kwargs["device_map"] = "auto"
+                self._device = "auto"
+            if quantization == "4-bit":
+                kwargs["load_in_4bit"] = True
+            elif quantization == "8-bit":
+                kwargs["load_in_8bit"] = True

         model = AutoModelForCausalLM.from_pretrained(
             self.model_path,
             low_cpu_mem_usage=True,
             trust_remote_code=True,
             torch_dtype=torch.float16,
-            device_map=self._device,
+            **kwargs,
         )
         self._model = model.eval()