From 61f196c20b5080cdcf28243d23d4b040a129dcc2 Mon Sep 17 00:00:00 2001
From: Jiri Podivin
Date: Tue, 5 Nov 2024 10:39:59 +0100
Subject: [PATCH] Setting up production deployment

* new compose file for production pod
* development pod now uses images and same volume permissions
* config for llama-cpp-python runtime added

Signed-off-by: Jiri Podivin
---
 .env                         |  1 +
 docker-compose-prod.yaml     | 34 ++++++++++++++++++++++++++++++++++
 docker-compose.yaml          |  4 +++-
 llama_cpp_server_config.json | 16 ++++++++++++++++
 logdetective/server.py       |  6 ++++--
 5 files changed, 58 insertions(+), 3 deletions(-)
 create mode 100644 docker-compose-prod.yaml
 create mode 100644 llama_cpp_server_config.json

diff --git a/.env b/.env
index 0aedbda..eefcb6c 100644
--- a/.env
+++ b/.env
@@ -7,3 +7,4 @@ MODEL_FILEPATH=/models/mistral-7b-instruct-v0.2.Q4_K_S.gguf
 # for some reason, fastapi cripples sys.path and some deps cannot be found
 PYTHONPATH=/src:/usr/local/lib64/python3.12/site-packages:/usr/lib64/python312.zip:/usr/lib64/python3.12/:/usr/lib64/python3.12/lib-dynload:/usr/local/lib/python3.12/site-packages:/usr/lib64/python3.12/site-packages:/usr/lib/python3.12/site-packages
 LLM_NGPUS=-1
+LLAMA_CPP_CONFIG=llama_cpp_server_config.json
diff --git a/docker-compose-prod.yaml b/docker-compose-prod.yaml
new file mode 100644
index 0000000..a7090cd
--- /dev/null
+++ b/docker-compose-prod.yaml
@@ -0,0 +1,34 @@
+version: "3"
+services:
+  llama-cpp:
+    image: logdetective/runtime:latest-cuda
+    build:
+      context: .
+      dockerfile: ./Containerfile.cuda
+    hostname: "${LLAMA_CPP_HOST}"
+    command: "python3 -m llama_cpp.server --model ${MODEL_FILEPATH} --host 0.0.0.0 --port ${LLAMA_CPP_SERVER_PORT} --n_gpu_layers ${LLM_NGPUS:-0}"
+    stdin_open: true
+    tty: true
+    env_file: .env
+    ports:
+      - "${LLAMA_CPP_SERVER_PORT:-8000}:${LLAMA_CPP_SERVER_PORT:-8000}"
+    volumes:
+      - ${MODELS_PATH-./models}:/models:Z
+    # these lines are needed for CUDA acceleration
+    devices:
+      - nvidia.com/gpu=all
+  server:
+    image: logdetective/runtime:latest
+    build:
+      context: .
+      dockerfile: ./Containerfile
+    hostname: logdetective-server
+    stdin_open: true
+    tty: true
+    volumes:
+      - .:/src/:Z
+    ports:
+      - "${LOGDETECTIVE_SERVER_PORT:-8080}:${LOGDETECTIVE_SERVER_PORT:-8080}"
+    env_file: .env
+    # --no-reload: doesn't work in a container - `PermissionError: Permission denied (os error 13) about ["/proc"]`
+    command: fastapi run /src/logdetective/server.py --host 0.0.0.0 --port $LOGDETECTIVE_SERVER_PORT --no-reload
diff --git a/docker-compose.yaml b/docker-compose.yaml
index ec1002c..90ef6bc 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -6,7 +6,7 @@ services:
       context: .
       dockerfile: ./Containerfile.cuda
     hostname: "${LLAMA_CPP_HOST}"
-    command: "python3 -m llama_cpp.server --model ${MODEL_FILEPATH} --host 0.0.0.0 --port ${LLAMA_CPP_SERVER_PORT} --n_gpu_layers ${LLM_NGPUS:-0}"
+    command: "python3 -m llama_cpp.server --model ${MODEL_FILEPATH} --host 0.0.0.0 --port ${LLAMA_CPP_SERVER_PORT} --n_gpu_layers ${LLM_NGPUS:-0} --config /${LLAMA_CPP_CONFIG}"
     stdin_open: true
     tty: true
     env_file: .env
@@ -14,12 +14,14 @@ services:
       - "${LLAMA_CPP_SERVER_PORT:-8000}:${LLAMA_CPP_SERVER_PORT:-8000}"
     volumes:
       - ${MODELS_PATH-./models}:/models:Z
+      - ./${LLAMA_CPP_CONFIG}:/${LLAMA_CPP_CONFIG}:Z
     # these 4 lines are needed for CUDA acceleration
     # devices:
     #   - nvidia.com/gpu=all
     # security_opt:
     #   - "label=disable"
   server:
+    image: logdetective/runtime:latest
     build:
       context: .
       dockerfile: ./Containerfile
diff --git a/llama_cpp_server_config.json b/llama_cpp_server_config.json
new file mode 100644
index 0000000..695de0d
--- /dev/null
+++ b/llama_cpp_server_config.json
@@ -0,0 +1,16 @@
+{
+    "host": "0.0.0.0",
+    "port": 8000,
+    "models": [
+        {
+            "model": "models/mistral-7b-instruct-v0.2.Q4_K_S.gguf",
+            "model_alias": "default-model",
+            "chat_format": "mistral-instruct",
+            "n_gpu_layers": -1,
+            "offload_kqv": true,
+            "n_threads": 12,
+            "n_batch": 512,
+            "n_ctx": 32768
+        }
+    ]
+}
diff --git a/logdetective/server.py b/logdetective/server.py
index bf3eb69..399aced 100644
--- a/logdetective/server.py
+++ b/logdetective/server.py
@@ -121,7 +121,8 @@ def mine_logs(log: str) -> List[str]:
     return log_summary
 
 
-async def submit_text(text: str, max_tokens: int = 0, log_probs: int = 1, stream: bool = False):
+async def submit_text(text: str, max_tokens: int = 0, log_probs: int = 1, stream: bool = False,
+                      model: str = "default-model"):
     """Submit prompt to LLM.
     max_tokens: number of tokens to be produces, 0 indicates run until encountering EOS
     log_probs: number of token choices to produce log probs for
@@ -131,7 +132,8 @@ async def submit_text(text: str, max_tokens: int = 0, log_probs: int = 1, stream
         "prompt": text,
         "max_tokens": str(max_tokens),
         "logprobs": str(log_probs),
-        "stream": stream}
+        "stream": stream,
+        "model": model}
     try:
         # Expects llama-cpp server to run on LLM_CPP_SERVER_ADDRESS:LLM_CPP_SERVER_PORT
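
For reference, a minimal sketch of how a client could exercise the new "default-model" alias once the llama-cpp-python server is up. It uses the OpenAI-compatible completions endpoint that llama_cpp.server exposes; the host, port, and prompt text are assumptions based on llama_cpp_server_config.json above and may differ in a real deployment.

    import requests

    # Query the llama-cpp server through its OpenAI-compatible completions endpoint,
    # selecting the model by the alias defined in llama_cpp_server_config.json.
    payload = {
        "prompt": "Explain this build failure: 'undefined reference to main'",  # example prompt, not from the patch
        "max_tokens": 64,
        "model": "default-model",  # matches model_alias in the new config file
    }

    # Host and port assumed from the config above (0.0.0.0:8000); adjust as needed.
    response = requests.post("http://localhost:8000/v1/completions", json=payload, timeout=60)
    response.raise_for_status()
    print(response.json()["choices"][0]["text"])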