Setting up production deployment
* new compose file for production pod
* development pod now uses images and same volume permissions
* config for llama-cpp-python runtime added

Signed-off-by: Jiri Podivin <[email protected]>
jpodivin committed Nov 8, 2024
1 parent 3891ddf commit 61f196c
Showing 5 changed files with 58 additions and 3 deletions.
1 change: 1 addition & 0 deletions .env
@@ -7,3 +7,4 @@ MODEL_FILEPATH=/models/mistral-7b-instruct-v0.2.Q4_K_S.gguf
# for some reason, fastapi cripples sys.path and some deps cannot be found
PYTHONPATH=/src:/usr/local/lib64/python3.12/site-packages:/usr/lib64/python312.zip:/usr/lib64/python3.12/:/usr/lib64/python3.12/lib-dynload:/usr/local/lib/python3.12/site-packages:/usr/lib64/python3.12/site-packages:/usr/lib/python3.12/site-packages
LLM_NGPUS=-1
LLAMA_CPP_CONFIG=llama_cpp_server_config.json
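Both new variables are interpolated by compose and exported into the containers through `env_file: .env`. A minimal sketch (hypothetical helper, not part of this commit) of reading them from inside a container with `os.getenv`:

import os

# Hypothetical helper, not part of this commit: read the new settings from the
# environment, falling back to the defaults used in the compose files.
llama_cpp_config = os.getenv("LLAMA_CPP_CONFIG", "llama_cpp_server_config.json")
n_gpu_layers = int(os.getenv("LLM_NGPUS", "0"))  # -1 offloads all layers to the GPU

print(f"config file: /{llama_cpp_config}, n_gpu_layers: {n_gpu_layers}")
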
34 changes: 34 additions & 0 deletions docker-compose-prod.yaml
@@ -0,0 +1,34 @@
version: "3"
services:
llama-cpp:
image: logdetective/runtime:latest-cuda
build:
context: .
dockerfile: ./Containerfile.cuda
hostname: "${LLAMA_CPP_HOST}"
command: "python3 -m llama_cpp.server --model ${MODEL_FILEPATH} --host 0.0.0.0 --port ${LLAMA_CPP_SERVER_PORT} --n_gpu_layers ${LLM_NGPUS:-0}"
stdin_open: true
tty: true
env_file: .env
ports:
- "${LLAMA_CPP_SERVER_PORT:-8000}:${LLAMA_CPP_SERVER_PORT:-8000}"
volumes:
- ${MODELS_PATH-./models}:/models:Z
# these lines are needed for CUDA acceleration
devices:
- nvidia.com/gpu=all
server:
image: logdetective/runtime:latest
build:
context: .
dockerfile: ./Containerfile
hostname: logdetective-server
stdin_open: true
tty: true
volumes:
- .:/src/:Z
ports:
- "${LOGDETECTIVE_SERVER_PORT:-8080}:${LOGDETECTIVE_SERVER_PORT:-8080}"
env_file: .env
# --no-reload: doesn't work in a container - `PermissionError: Permission denied (os error 13) about ["/proc"]`
command: fastapi run /src/logdetective/server.py --host 0.0.0.0 --port $LOGDETECTIVE_SERVER_PORT --no-reload
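Once the production pod is started with `docker-compose -f docker-compose-prod.yaml up`, both published ports should answer. A hedged smoke-test sketch (assumes the default ports above and the OpenAI-compatible `/v1/models` route served by llama-cpp-python; not part of this commit):

import urllib.request

# Hypothetical smoke test, assuming the default ports from .env
# (llama-cpp on 8000, logdetective server on 8080).
LLAMA_CPP_URL = "http://localhost:8000/v1/models"
LOGDETECTIVE_URL = "http://localhost:8080/docs"  # FastAPI serves interactive docs here by default

for url in (LLAMA_CPP_URL, LOGDETECTIVE_URL):
    with urllib.request.urlopen(url, timeout=10) as response:
        print(url, "->", response.status)
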
4 changes: 3 additions & 1 deletion docker-compose.yaml
@@ -6,20 +6,22 @@ services:
      context: .
      dockerfile: ./Containerfile.cuda
    hostname: "${LLAMA_CPP_HOST}"
    command: "python3 -m llama_cpp.server --model ${MODEL_FILEPATH} --host 0.0.0.0 --port ${LLAMA_CPP_SERVER_PORT} --n_gpu_layers ${LLM_NGPUS:-0}"
    command: "python3 -m llama_cpp.server --model ${MODEL_FILEPATH} --host 0.0.0.0 --port ${LLAMA_CPP_SERVER_PORT} --n_gpu_layers ${LLM_NGPUS:-0} --config /${LLAMA_CPP_CONFIG}"
    stdin_open: true
    tty: true
    env_file: .env
    ports:
      - "${LLAMA_CPP_SERVER_PORT:-8000}:${LLAMA_CPP_SERVER_PORT:-8000}"
    volumes:
      - ${MODELS_PATH-./models}:/models:Z
      - ./${LLAMA_CPP_CONFIG}:/${LLAMA_CPP_CONFIG}:Z
    # these 4 lines are needed for CUDA acceleration
    # devices:
    #   - nvidia.com/gpu=all
    # security_opt:
    #   - "label=disable"
  server:
    image: logdetective/runtime:latest
    build:
      context: .
      dockerfile: ./Containerfile
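The development pod now mounts the config file at the container root and hands it to the runtime through `--config`. A hedged pre-flight sketch (hypothetical helper, not part of this commit) that checks the file compose will mount exists and is valid JSON:

import json
import os

# Hypothetical pre-flight check: verify the file named by LLAMA_CPP_CONFIG
# (mounted into the container at /<name>) parses before bringing the pod up.
config_name = os.getenv("LLAMA_CPP_CONFIG", "llama_cpp_server_config.json")
with open(config_name, encoding="utf-8") as config_file:
    config = json.load(config_file)
print("models defined:", [m.get("model_alias", m["model"]) for m in config["models"]])
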
16 changes: 16 additions & 0 deletions llama_cpp_server_config.json
@@ -0,0 +1,16 @@
{
  "host": "0.0.0.0",
  "port": 8000,
  "models": [
    {
      "model": "models/mistral-7b-instruct-v0.2.Q4_K_S.gguf",
      "model_alias": "default-model",
      "chat_format": "mistral-instruct",
      "n_gpu_layers": -1,
      "offload_kqv": true,
      "n_threads": 12,
      "n_batch": 512,
      "n_ctx": 32768
    }
  ]
}
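Clients select this model by sending the `model_alias` value in the `model` field of a request, which is what the `server.py` change below starts doing. A hedged sketch of such a completion request against llama-cpp-python's OpenAI-compatible endpoint (not part of this commit):

import json
import urllib.request

# Hypothetical request against the llama-cpp server started with the config above.
payload = {
    "model": "default-model",  # matches model_alias in llama_cpp_server_config.json
    "prompt": "Summarize: build failed with a missing dependency.",
    "max_tokens": 64,
}
request = urllib.request.Request(
    "http://localhost:8000/v1/completions",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(request, timeout=60) as response:
    print(json.load(response)["choices"][0]["text"])
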
6 changes: 4 additions & 2 deletions logdetective/server.py
Expand Up @@ -121,7 +121,8 @@ def mine_logs(log: str) -> List[str]:

return log_summary

async def submit_text(text: str, max_tokens: int = 0, log_probs: int = 1, stream: bool = False):
async def submit_text(text: str, max_tokens: int = 0, log_probs: int = 1, stream: bool = False,
model: str = "default-model"):
"""Submit prompt to LLM.
max_tokens: number of tokens to be produces, 0 indicates run until encountering EOS
log_probs: number of token choices to produce log probs for
@@ -131,7 +132,8 @@ async def submit_text(text: str, max_tokens: int = 0, log_probs: int = 1, stream
        "prompt": text,
        "max_tokens": str(max_tokens),
        "logprobs": str(log_probs),
        "stream": stream}
        "stream": stream,
        "model": model}

    try:
        # Expects llama-cpp server to run on LLM_CPP_SERVER_ADDRESS:LLM_CPP_SERVER_PORT
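With the new keyword argument a caller can target a specific alias from the llama-cpp config. A hedged usage sketch (the repository invokes `submit_text` from its FastAPI handlers; this standalone call is only illustrative):

import asyncio

from logdetective.server import submit_text

# Illustrative only: requires the llama-cpp server from the compose pod to be reachable.
response = asyncio.run(
    submit_text("Explain this build failure: ...", max_tokens=256, model="default-model")
)
print(response)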