diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
index b5806199b6..5f24917fcc 100644
--- a/xinference/model/llm/llm_family.json
+++ b/xinference/model/llm/llm_family.json
@@ -939,6 +939,8 @@
         "model_format": "pytorch",
         "model_size_in_billions": 9,
         "quantizations": [
+          "4-bit",
+          "8-bit",
           "none"
         ],
         "model_id": "THUDM/glm-4v-9b",
diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json
index 635630028d..7675378a97 100644
--- a/xinference/model/llm/llm_family_modelscope.json
+++ b/xinference/model/llm/llm_family_modelscope.json
@@ -632,6 +632,8 @@
         "model_format": "pytorch",
         "model_size_in_billions": 9,
         "quantizations": [
+          "4-bit",
+          "8-bit",
           "none"
         ],
         "model_hub": "modelscope",
diff --git a/xinference/model/llm/pytorch/glm4v.py b/xinference/model/llm/pytorch/glm4v.py
index 09b4d98491..0b4bde2e09 100644
--- a/xinference/model/llm/pytorch/glm4v.py
+++ b/xinference/model/llm/pytorch/glm4v.py
@@ -56,19 +56,29 @@ def match(
             return True
         return False

-    def load(self, **kwargs):
+    def load(self):
         from transformers import AutoModelForCausalLM, AutoTokenizer

         device = self._pytorch_model_config.get("device", "auto")
         self._device = select_device(device)
-        self._device = "auto" if self._device == "cuda" else self._device
+
+        kwargs = {"device_map": self._device}
+        quantization = self.quantization
+        if quantization != "none":
+            if self._device == "cuda" and self._is_linux():
+                kwargs["device_map"] = "auto"
+                self._device = "auto"
+            if quantization == "4-bit":
+                kwargs["load_in_4bit"] = True
+            elif quantization == "8-bit":
+                kwargs["load_in_8bit"] = True

         model = AutoModelForCausalLM.from_pretrained(
             self.model_path,
             low_cpu_mem_usage=True,
             trust_remote_code=True,
             torch_dtype=torch.float16,
-            device_map=self._device,
+            **kwargs,
         )
         self._model = model.eval()