Setting up production deployment
* new compose file for production pod
* development pod now uses images and same volume permissions
* config for llama-cpp-python runtime added

Signed-off-by: Jiri Podivin <[email protected]>
jpodivin committed Nov 8, 2024
1 parent 3891ddf commit 61f196c
Showing 5 changed files with 58 additions and 3 deletions.
1 change: 1 addition & 0 deletions .env
@@ -7,3 +7,4 @@ MODEL_FILEPATH=/models/mistral-7b-instruct-v0.2.Q4_K_S.gguf
# for some reason, fastapi cripples sys.path and some deps cannot be found
PYTHONPATH=/src:/usr/local/lib64/python3.12/site-packages:/usr/lib64/python312.zip:/usr/lib64/python3.12/:/usr/lib64/python3.12/lib-dynload:/usr/local/lib/python3.12/site-packages:/usr/lib64/python3.12/site-packages:/usr/lib/python3.12/site-packages
LLM_NGPUS=-1
LLAMA_CPP_CONFIG=llama_cpp_server_config.json
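Both new variables are interpolated by compose and exported into the containers through `env_file: .env`. A minimal sketch (hypothetical helper, not part of this commit) of reading them from inside a container with `os.getenv`:

import os

# Hypothetical helper, not part of this commit: read the new settings from the
# environment, falling back to the defaults used in the compose files.
llama_cpp_config = os.getenv("LLAMA_CPP_CONFIG", "llama_cpp_server_config.json")
n_gpu_layers = int(os.getenv("LLM_NGPUS", "0"))  # -1 offloads all layers to the GPU

print(f"config file: /{llama_cpp_config}, n_gpu_layers: {n_gpu_layers}")
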
34 changes: 34 additions & 0 deletions docker-compose-prod.yaml
@@ -0,0 +1,34 @@
version: "3"
services:
llama-cpp:
image: logdetective/runtime:latest-cuda
build:
context: .
dockerfile: ./Containerfile.cuda
hostname: "${LLAMA_CPP_HOST}"
command: "python3 -m llama_cpp.server --model ${MODEL_FILEPATH} --host 0.0.0.0 --port ${LLAMA_CPP_SERVER_PORT} --n_gpu_layers ${LLM_NGPUS:-0}"
stdin_open: true
tty: true
env_file: .env
ports:
- "${LLAMA_CPP_SERVER_PORT:-8000}:${LLAMA_CPP_SERVER_PORT:-8000}"
volumes:
- ${MODELS_PATH-./models}:/models:Z
# these lines are needed for CUDA acceleration
devices:
- nvidia.com/gpu=all
server:
image: logdetective/runtime:latest
build:
context: .
dockerfile: ./Containerfile
hostname: logdetective-server
stdin_open: true
tty: true
volumes:
- .:/src/:Z
ports:
- "${LOGDETECTIVE_SERVER_PORT:-8080}:${LOGDETECTIVE_SERVER_PORT:-8080}"
env_file: .env
# --no-reload: doesn't work in a container - `PermissionError: Permission denied (os error 13) about ["/proc"]`
command: fastapi run /src/logdetective/server.py --host 0.0.0.0 --port $LOGDETECTIVE_SERVER_PORT --no-reload
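Once the production pod is started with `docker-compose -f docker-compose-prod.yaml up`, both published ports should answer. A hedged smoke-test sketch (assumes the default ports above and the OpenAI-compatible `/v1/models` route served by llama-cpp-python; not part of this commit):

import urllib.request

# Hypothetical smoke test, assuming the default ports from .env
# (llama-cpp on 8000, logdetective server on 8080).
LLAMA_CPP_URL = "http://localhost:8000/v1/models"
LOGDETECTIVE_URL = "http://localhost:8080/docs"  # FastAPI serves interactive docs here by default

for url in (LLAMA_CPP_URL, LOGDETECTIVE_URL):
    with urllib.request.urlopen(url, timeout=10) as response:
        print(url, "->", response.status)
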
4 changes: 3 additions & 1 deletion docker-compose.yaml
@@ -6,20 +6,22 @@ services:
      context: .
      dockerfile: ./Containerfile.cuda
    hostname: "${LLAMA_CPP_HOST}"
    command: "python3 -m llama_cpp.server --model ${MODEL_FILEPATH} --host 0.0.0.0 --port ${LLAMA_CPP_SERVER_PORT} --n_gpu_layers ${LLM_NGPUS:-0}"
    command: "python3 -m llama_cpp.server --model ${MODEL_FILEPATH} --host 0.0.0.0 --port ${LLAMA_CPP_SERVER_PORT} --n_gpu_layers ${LLM_NGPUS:-0} --config /${LLAMA_CPP_CONFIG}"
    stdin_open: true
    tty: true
    env_file: .env
    ports:
      - "${LLAMA_CPP_SERVER_PORT:-8000}:${LLAMA_CPP_SERVER_PORT:-8000}"
    volumes:
      - ${MODELS_PATH-./models}:/models:Z
      - ./${LLAMA_CPP_CONFIG}:/${LLAMA_CPP_CONFIG}:Z
    # these 4 lines are needed for CUDA acceleration
    # devices:
    #   - nvidia.com/gpu=all
    # security_opt:
    #   - "label=disable"
  server:
    image: logdetective/runtime:latest
    build:
      context: .
      dockerfile: ./Containerfile
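The development pod now mounts the config file at the container root and hands it to the runtime through `--config`. A hedged pre-flight sketch (hypothetical helper, not part of this commit) that checks the file compose will mount exists and is valid JSON:

import json
import os

# Hypothetical pre-flight check: verify the file named by LLAMA_CPP_CONFIG
# (mounted into the container at /<name>) parses before bringing the pod up.
config_name = os.getenv("LLAMA_CPP_CONFIG", "llama_cpp_server_config.json")
with open(config_name, encoding="utf-8") as config_file:
    config = json.load(config_file)
print("models defined:", [m.get("model_alias", m["model"]) for m in config["models"]])
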
16 changes: 16 additions & 0 deletions llama_cpp_server_config.json
@@ -0,0 +1,16 @@
{
  "host": "0.0.0.0",
  "port": 8000,
  "models": [
    {
      "model": "models/mistral-7b-instruct-v0.2.Q4_K_S.gguf",
      "model_alias": "default-model",
      "chat_format": "mistral-instruct",
      "n_gpu_layers": -1,
      "offload_kqv": true,
      "n_threads": 12,
      "n_batch": 512,
      "n_ctx": 32768
    }
  ]
}
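Clients select this model by sending the `model_alias` value in the `model` field of a request, which is what the `server.py` change below starts doing. A hedged sketch of such a completion request against llama-cpp-python's OpenAI-compatible endpoint (not part of this commit):

import json
import urllib.request

# Hypothetical request against the llama-cpp server started with the config above.
payload = {
    "model": "default-model",  # matches model_alias in llama_cpp_server_config.json
    "prompt": "Summarize: build failed with a missing dependency.",
    "max_tokens": 64,
}
request = urllib.request.Request(
    "http://localhost:8000/v1/completions",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(request, timeout=60) as response:
    print(json.load(response)["choices"][0]["text"])
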
6 changes: 4 additions & 2 deletions logdetective/server.py
Expand Up @@ -121,7 +121,8 @@ def mine_logs(log: str) -> List[str]:

return log_summary

async def submit_text(text: str, max_tokens: int = 0, log_probs: int = 1, stream: bool = False):
async def submit_text(text: str, max_tokens: int = 0, log_probs: int = 1, stream: bool = False,
model: str = "default-model"):
"""Submit prompt to LLM.
max_tokens: number of tokens to be produces, 0 indicates run until encountering EOS
log_probs: number of token choices to produce log probs for
@@ -131,7 +132,8 @@ async def submit_text(text: str, max_tokens: int = 0, log_probs: int = 1, stream
        "prompt": text,
        "max_tokens": str(max_tokens),
        "logprobs": str(log_probs),
        "stream": stream}
        "stream": stream,
        "model": model}

    try:
        # Expects llama-cpp server to run on LLM_CPP_SERVER_ADDRESS:LLM_CPP_SERVER_PORT
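With the new keyword argument a caller can target a specific alias from the llama-cpp config. A hedged usage sketch (the repository invokes `submit_text` from its FastAPI handlers; this standalone call is only illustrative):

import asyncio

from logdetective.server import submit_text

# Illustrative only: requires the llama-cpp server from the compose pod to be reachable.
response = asyncio.run(
    submit_text("Explain this build failure: ...", max_tokens=256, model="default-model")
)
print(response)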