diff --git a/README.md b/README.md
index 2d407e2..9b16321 100644
--- a/README.md
+++ b/README.md
@@ -320,14 +320,23 @@ docker build . --network=host -t chatglm.cpp-cuda \
 docker run -it --rm --gpus all -v $PWD:/chatglm.cpp/models chatglm.cpp-cuda ./build/bin/main -m models/chatglm-ggml.bin -p "你好"
 ```
 
-**Option 2: Pulling from GHCR**
+**Option 2: Using Pre-built Image**
 
-Pre-built image for CPU inference is published on GitHub Container Registry (GHCR). Download it with the below script and use it in the same way:
+The pre-built image for CPU inference is published on both [Docker Hub](https://hub.docker.com/repository/docker/liplusx/chatglm.cpp) and [GitHub Container Registry (GHCR)](https://github.com/li-plus/chatglm.cpp/pkgs/container/chatglm.cpp).
+
+To pull from Docker Hub and run the demo:
+```sh
+docker run -it --rm -v $PWD:/opt liplusx/chatglm.cpp:main \
+    ./build/bin/main -m /opt/chatglm-ggml.bin -p "你好"
+```
+
+To pull from GHCR and run the demo:
 ```sh
-docker pull ghcr.io/li-plus/chatglm.cpp:main
+docker run -it --rm -v $PWD:/opt ghcr.io/li-plus/chatglm.cpp:main \
+    ./build/bin/main -m /opt/chatglm-ggml.bin -p "你好"
 ```
 
-Visit [container/chatglm.cpp](https://github.com/li-plus/chatglm.cpp/pkgs/container/chatglm.cpp) for more information.
+The Python demo and API servers are also supported in the pre-built image. Use it in the same way as in **Option 1**.
 
 ## Performance
 
diff --git a/api_Dockerfile b/api_Dockerfile
deleted file mode 100644
index 48fa864..0000000
--- a/api_Dockerfile
+++ /dev/null
@@ -1,10 +0,0 @@
-FROM python:3.9
-
-RUN pip3 install uvicorn fastapi==0.92.0 sse_starlette chatglm-cpp -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn --no-cache-dir
-
-ADD ./examples/api_demo.py /chatglm/api_demo.py
-
-WORKDIR /chatglm
-
-CMD ["python", "api_demo.py"]
-
diff --git a/examples/api_demo.py b/examples/api_demo.py
deleted file mode 100644
index 55de23c..0000000
--- a/examples/api_demo.py
+++ /dev/null
@@ -1,223 +0,0 @@
-# Adapted from https://github.com/lloydzhou/rwkv.cpp/blob/master/rwkv/api.py
-import functools
-import json
-import logging
-from pathlib import Path
-from threading import Lock
-from typing import List
-
-import chatglm_cpp
-import uvicorn
-from fastapi import FastAPI, HTTPException, Request, status
-from fastapi.middleware.cors import CORSMiddleware
-from pydantic import BaseModel, BaseSettings, Field
-from sse_starlette.sse import EventSourceResponse
-
-DEFAULT_MODEL_PATH = Path(__file__).resolve().parent.parent / "chatglm-ggml.bin"
-
-
-class Settings(BaseSettings):
-    server_name: str = "ChatGLM CPP API Server"
-    model: str = str(DEFAULT_MODEL_PATH)  # Path to chatglm model in ggml format
-    host: str = "0.0.0.0"
-    port: int = 8000
-
-
-settings = Settings()
-app = FastAPI()
-app.add_middleware(
-    CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"]
-)
-pipeline = None
-completion_lock = Lock()
-requests_num = 0
-
-
-def run_with_lock(method):
-    @functools.wraps(method)
-    async def wrapper(request, *args, **kwargs):
-        global requests_num
-        requests_num = requests_num + 1
-        logging.debug("Start Waiting. RequestsNum: %r", requests_num)
-        while completion_lock.locked():
-            if await request.is_disconnected():
-                logging.debug("Stop Waiting (Lock). RequestsNum: %r", requests_num)
-                return
-            # 等待
-            logging.debug("Waiting. RequestsNum: %r", requests_num)
-            time.sleep(0.1)
-        else:
-            with completion_lock:
-                if await request.is_disconnected():
-                    logging.debug("Stop Waiting (Lock). RequestsNum: %r", requests_num)
-                    return
-                return method(request, *args, **kwargs)
-
-    return wrapper
-
-
-@app.on_event("startup")
-async def startup_event():
-    global pipeline
-    pipeline = chatglm_cpp.Pipeline(settings.model)
-    logging.info("End Loading chatglm model")
-
-
-@run_with_lock
-async def stream_chat(request, history, body):
-    for piece in pipeline.stream_chat(
-        history,
-        max_length=body.max_tokens,
-        max_context_length=body.max_context_length,
-        do_sample=body.temperature > 0,
-        top_k=body.top_k,
-        top_p=body.top_p,
-        temperature=body.temperature,
-        num_threads=16,
-    ):
-        # debug log
-        print(piece, end="", flush=True)
-        yield piece
-
-
-async def process_generate(history, chat_model, body, request):
-    # TODO calc tokens
-    usage = {}
-
-    if len(history) % 2 == 0:
-        history = ["hi"] + history
-
-    async def generate():
-        response = ""
-        async for delta in await stream_chat(request, history, body):
-            response += delta
-            if body.stream:
-                chunk = format_message("", delta, chunk=True, chat_model=chat_model)
-                yield json.dumps(chunk)
-        if body.stream:
-            result = format_message(response, "", chunk=True, chat_model=chat_model, finish_reason="stop")
-            result.update(usage=usage)
-            yield json.dumps(result)
-        else:
-            result = format_message(response, response, chunk=False, chat_model=chat_model, finish_reason="stop")
-            result.update(usage=usage)
-            yield result
-
-    if body.stream:
-        return EventSourceResponse(generate())
-    return await generate().__anext__()
-
-
-def format_message(response, delta, chunk=False, chat_model=False, model_name="chatglm2-6b", finish_reason=None):
-    if not chat_model:
-        object = "text_completion"
-    else:
-        if chunk:
-            object = "chat.completion.chunk"
-        else:
-            object = "chat.completion"
-
-    return {
-        "object": object,
-        "response": response,
-        "model": model_name,
-        "choices": [
-            {
-                "delta": {"content": delta},
-                "index": 0,
-                "finish_reason": finish_reason,
-            }
-            if chat_model
-            else {
-                "text": delta,
-                "index": 0,
-                "finish_reason": finish_reason,
-            }
-        ],
-    }
-
-
-class ModelConfigBody(BaseModel):
-    max_tokens: int = Field(default=2048, gt=0, le=102400)
-    max_context_length: int = Field(default=512, gt=0, le=102400)
-    temperature: float = Field(default=0.95, ge=0, le=2)
-    top_p: float = Field(default=0.7, ge=0, le=1)
-    top_k: float = Field(default=0, ge=0, le=1)
-
-    class Config:
-        schema_extra = {
-            "example": {
-                "max_tokens": 2048,
-                "max_context_length": 512,
-                "temperature": 0.95,
-                "top_p": 0.7,
-                "top_k": 0,
-            }
-        }
-
-
-class Message(BaseModel):
-    role: str
-    content: str
-
-
-class ChatCompletionBody(ModelConfigBody):
-    messages: List[Message]
-    model: str = "chatglm2-6b"
-    stream: bool = False
-
-    class Config:
-        schema_extra = {
-            "example": {
-                "messages": [{"role": "user", "content": "hello"}],
-                "model": "chatglm2-6b",
-                "stream": False,
-                "max_tokens": 2048,
-                "max_context_length": 512,
-                "temperature": 0.95,
-                "top_p": 0.7,
-                "top_k": 0,
-            }
-        }
-
-
-class CompletionBody(ModelConfigBody):
-    prompt: str or List[str]
-    model: str = "chatglm2-6b"
-    stream: bool = False
-
-    class Config:
-        schema_extra = {
-            "example": {
-                "prompt": "The following is an epic science fiction masterpiece that is immortalized, "
-                + "with delicate descriptions and grand depictions of interstellar civilization wars.\nChapter 1.\n",
-                "model": "chatglm2-6b",
-                "stream": False,
-                "max_tokens": 2048,
-                "max_context_length": 512,
-                "temperature": 0.95,
-                "top_p": 0.7,
-                "top_k": 0,
-            }
-        }
-
-
-@app.post("/v1/completions")
-@app.post("/completions")
-async def completions(body: CompletionBody, request: Request):
-    return await process_generate([body.prompt], False, body, request)
-
-
-@app.post("/v1/chat/completions")
-@app.post("/chat/completions")
-async def chat_completions(body: ChatCompletionBody, request: Request):
-    if len(body.messages) == 0 or body.messages[-1].role != "user":
-        raise HTTPException(status.HTTP_400_BAD_REQUEST, "no question found")
-
-    # history = [f'{message.role}: {message.content}' for message in body.messages]
-    history = [message.content for message in body.messages]
-    return await process_generate(history, True, body, request)
-
-
-if __name__ == "__main__":
-    uvicorn.run("api_demo:app", host=settings.host, port=settings.port)
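
The deleted `examples/api_demo.py` served OpenAI-style `/completions` and `/chat/completions` routes (with `/v1/` aliases) on port 8000. As a rough reference for anyone still targeting that interface, here is a minimal client sketch against the request schema defined in that file; the base URL, model name, and sampling values are assumptions copied from the file's defaults and `ChatCompletionBody` example, not a documented contract of the new pre-built image.

```python
# Minimal client sketch for the /v1/chat/completions route that the removed
# examples/api_demo.py exposed. Host/port (0.0.0.0:8000) and the field values
# below are assumptions taken from the defaults in that file.
import requests

payload = {
    "model": "chatglm2-6b",
    "stream": False,
    "messages": [{"role": "user", "content": "hello"}],
    "max_tokens": 2048,
    "max_context_length": 512,
    "temperature": 0.95,
    "top_p": 0.7,
    "top_k": 0,
}

resp = requests.post("http://127.0.0.1:8000/v1/chat/completions", json=payload, timeout=300)
resp.raise_for_status()
data = resp.json()

# For non-streaming chat requests, format_message() in the demo placed the full
# reply both in "response" and in choices[0]["delta"]["content"].
print(data["response"])
```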