diff --git a/README.md b/README.md
index 2d407e2..9b16321 100644
--- a/README.md
+++ b/README.md
@@ -320,14 +320,23 @@ docker build . --network=host -t chatglm.cpp-cuda \
 docker run -it --rm --gpus all -v $PWD:/chatglm.cpp/models chatglm.cpp-cuda ./build/bin/main -m models/chatglm-ggml.bin -p "你好"
 ```
 
-**Option 2: Pulling from GHCR**
+**Option 2: Using Pre-built Image**
 
-Pre-built image for CPU inference is published on GitHub Container Registry (GHCR). Download it with the below script and use it in the same way:
+The pre-built image for CPU inference is published on both [Docker Hub](https://hub.docker.com/repository/docker/liplusx/chatglm.cpp) and [GitHub Container Registry (GHCR)](https://github.com/li-plus/chatglm.cpp/pkgs/container/chatglm.cpp).
+
+To pull from Docker Hub and run the demo:
+```sh
+docker run -it --rm -v $PWD:/opt liplusx/chatglm.cpp:main \
+    ./build/bin/main -m /opt/chatglm-ggml.bin -p "你好"
+```
+
+To pull from GHCR and run the demo:
 ```sh
-docker pull ghcr.io/li-plus/chatglm.cpp:main
+docker run -it --rm -v $PWD:/opt ghcr.io/li-plus/chatglm.cpp:main \
+    ./build/bin/main -m /opt/chatglm-ggml.bin -p "你好"
 ```
 
-Visit [container/chatglm.cpp](https://github.com/li-plus/chatglm.cpp/pkgs/container/chatglm.cpp) for more information.
+The Python demo and API servers are also supported in the pre-built image. Use it in the same way as in **Option 1**.
 
 ## Performance
 
diff --git a/api_Dockerfile b/api_Dockerfile
deleted file mode 100644
index 48fa864..0000000
--- a/api_Dockerfile
+++ /dev/null
@@ -1,10 +0,0 @@
-FROM python:3.9
-
-RUN pip3 install uvicorn fastapi==0.92.0 sse_starlette chatglm-cpp -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn --no-cache-dir
-
-ADD ./examples/api_demo.py /chatglm/api_demo.py
-
-WORKDIR /chatglm
-
-CMD ["python", "api_demo.py"]
-
diff --git a/examples/api_demo.py b/examples/api_demo.py
deleted file mode 100644
index 55de23c..0000000
--- a/examples/api_demo.py
+++ /dev/null
@@ -1,223 +0,0 @@
-# Adapted from https://github.com/lloydzhou/rwkv.cpp/blob/master/rwkv/api.py
-import functools
-import json
-import logging
-from pathlib import Path
-from threading import Lock
-from typing import List
-
-import chatglm_cpp
-import uvicorn
-from fastapi import FastAPI, HTTPException, Request, status
-from fastapi.middleware.cors import CORSMiddleware
-from pydantic import BaseModel, BaseSettings, Field
-from sse_starlette.sse import EventSourceResponse
-
-DEFAULT_MODEL_PATH = Path(__file__).resolve().parent.parent / "chatglm-ggml.bin"
-
-
-class Settings(BaseSettings):
-    server_name: str = "ChatGLM CPP API Server"
-    model: str = str(DEFAULT_MODEL_PATH)  # Path to chatglm model in ggml format
-    host: str = "0.0.0.0"
-    port: int = 8000
-
-
-settings = Settings()
-app = FastAPI()
-app.add_middleware(
-    CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"]
-)
-pipeline = None
-completion_lock = Lock()
-requests_num = 0
-
-
-def run_with_lock(method):
-    @functools.wraps(method)
-    async def wrapper(request, *args, **kwargs):
-        global requests_num
-        requests_num = requests_num + 1
-        logging.debug("Start Waiting. RequestsNum: %r", requests_num)
-        while completion_lock.locked():
-            if await request.is_disconnected():
-                logging.debug("Stop Waiting (Lock). RequestsNum: %r", requests_num)
-                return
-            # 等待
-            logging.debug("Waiting. RequestsNum: %r", requests_num)
-            time.sleep(0.1)
-        else:
-            with completion_lock:
-                if await request.is_disconnected():
-                    logging.debug("Stop Waiting (Lock). RequestsNum: %r", requests_num)
-                    return
-                return method(request, *args, **kwargs)
-
-    return wrapper
-
-
-@app.on_event("startup")
-async def startup_event():
-    global pipeline
-    pipeline = chatglm_cpp.Pipeline(settings.model)
-    logging.info("End Loading chatglm model")
-
-
-@run_with_lock
-async def stream_chat(request, history, body):
-    for piece in pipeline.stream_chat(
-        history,
-        max_length=body.max_tokens,
-        max_context_length=body.max_context_length,
-        do_sample=body.temperature > 0,
-        top_k=body.top_k,
-        top_p=body.top_p,
-        temperature=body.temperature,
-        num_threads=16,
-    ):
-        # debug log
-        print(piece, end="", flush=True)
-        yield piece
-
-
-async def process_generate(history, chat_model, body, request):
-    # TODO calc tokens
-    usage = {}
-
-    if len(history) % 2 == 0:
-        history = ["hi"] + history
-
-    async def generate():
-        response = ""
-        async for delta in await stream_chat(request, history, body):
-            response += delta
-            if body.stream:
-                chunk = format_message("", delta, chunk=True, chat_model=chat_model)
-                yield json.dumps(chunk)
-        if body.stream:
-            result = format_message(response, "", chunk=True, chat_model=chat_model, finish_reason="stop")
-            result.update(usage=usage)
-            yield json.dumps(result)
-        else:
-            result = format_message(response, response, chunk=False, chat_model=chat_model, finish_reason="stop")
-            result.update(usage=usage)
-            yield result
-
-    if body.stream:
-        return EventSourceResponse(generate())
-    return await generate().__anext__()
-
-
-def format_message(response, delta, chunk=False, chat_model=False, model_name="chatglm2-6b", finish_reason=None):
-    if not chat_model:
-        object = "text_completion"
-    else:
-        if chunk:
-            object = "chat.completion.chunk"
-        else:
-            object = "chat.completion"
-
-    return {
-        "object": object,
-        "response": response,
-        "model": model_name,
-        "choices": [
-            {
-                "delta": {"content": delta},
-                "index": 0,
-                "finish_reason": finish_reason,
-            }
-            if chat_model
-            else {
-                "text": delta,
-                "index": 0,
-                "finish_reason": finish_reason,
-            }
-        ],
-    }
-
-
-class ModelConfigBody(BaseModel):
-    max_tokens: int = Field(default=2048, gt=0, le=102400)
-    max_context_length: int = Field(default=512, gt=0, le=102400)
-    temperature: float = Field(default=0.95, ge=0, le=2)
-    top_p: float = Field(default=0.7, ge=0, le=1)
-    top_k: float = Field(default=0, ge=0, le=1)
-
-    class Config:
-        schema_extra = {
-            "example": {
-                "max_tokens": 2048,
-                "max_context_length": 512,
-                "temperature": 0.95,
-                "top_p": 0.7,
-                "top_k": 0,
-            }
-        }
-
-
-class Message(BaseModel):
-    role: str
-    content: str
-
-
-class ChatCompletionBody(ModelConfigBody):
-    messages: List[Message]
-    model: str = "chatglm2-6b"
-    stream: bool = False
-
-    class Config:
-        schema_extra = {
-            "example": {
-                "messages": [{"role": "user", "content": "hello"}],
-                "model": "chatglm2-6b",
-                "stream": False,
-                "max_tokens": 2048,
-                "max_context_length": 512,
-                "temperature": 0.95,
-                "top_p": 0.7,
-                "top_k": 0,
-            }
-        }
-
-
-class CompletionBody(ModelConfigBody):
-    prompt: str or List[str]
-    model: str = "chatglm2-6b"
-    stream: bool = False
-
-    class Config:
-        schema_extra = {
-            "example": {
-                "prompt": "The following is an epic science fiction masterpiece that is immortalized, "
-                + "with delicate descriptions and grand depictions of interstellar civilization wars.\nChapter 1.\n",
-                "model": "chatglm2-6b",
-                "stream": False,
-                "max_tokens": 2048,
-                "max_context_length": 512,
-                "temperature": 0.95,
-                "top_p": 0.7,
-                "top_k": 0,
-            }
-        }
-
-
-@app.post("/v1/completions")
-@app.post("/completions")
-async def completions(body: CompletionBody, request: Request):
-    return await process_generate([body.prompt], False, body, request)
-
-
-@app.post("/v1/chat/completions")
-@app.post("/chat/completions")
-async def chat_completions(body: ChatCompletionBody, request: Request):
-    if len(body.messages) == 0 or body.messages[-1].role != "user":
-        raise HTTPException(status.HTTP_400_BAD_REQUEST, "no question found")
-
-    # history = [f'{message.role}: {message.content}' for message in body.messages]
-    history = [message.content for message in body.messages]
-    return await process_generate(history, True, body, request)
-
-
-if __name__ == "__main__":
-    uvicorn.run("api_demo:app", host=settings.host, port=settings.port)
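
The deleted `examples/api_demo.py` served OpenAI-style `/completions` and `/chat/completions` routes (with `/v1/` aliases) on port 8000. As a rough reference for anyone still targeting that interface, here is a minimal client sketch against the request schema defined in that file; the base URL, model name, and sampling values are assumptions copied from the file's defaults and `ChatCompletionBody` example, not a documented contract of the new pre-built image.

```python
# Minimal client sketch for the /v1/chat/completions route that the removed
# examples/api_demo.py exposed. Host/port (0.0.0.0:8000) and the field values
# below are assumptions taken from the defaults in that file.
import requests

payload = {
    "model": "chatglm2-6b",
    "stream": False,
    "messages": [{"role": "user", "content": "hello"}],
    "max_tokens": 2048,
    "max_context_length": 512,
    "temperature": 0.95,
    "top_p": 0.7,
    "top_k": 0,
}

resp = requests.post("http://127.0.0.1:8000/v1/chat/completions", json=payload, timeout=300)
resp.raise_for_status()
data = resp.json()

# For non-streaming chat requests, format_message() in the demo placed the full
# reply both in "response" and in choices[0]["delta"]["content"].
print(data["response"])
```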