diff --git a/distributions/README.md b/distributions/README.md new file mode 100644 index 000000000..92640210b --- /dev/null +++ b/distributions/README.md @@ -0,0 +1,11 @@ +# Llama Stack Distribution + +A Distribution is where APIs and Providers are assembled together to provide a consistent whole to the end application developer. You can mix-and-match providers -- some could be backed by local code and some could be remote. As a hobbyist, you can serve a small model locally while choosing a cloud provider for a large model. Regardless, the higher-level APIs your app needs to work with don't need to change at all. You can even imagine moving across the server / mobile-device boundary as well, always using the same uniform set of APIs for developing Generative AI applications. + + +## Quick Start Llama Stack Distributions Guide +| **Distribution** | **Llama Stack Docker** | **Start This Distribution** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** | +|:----------------: |:------------------------------------------: |:-----------------------: |:------------------: |:------------------: |:------------------: |:------------------: |:------------------: | +| Meta Reference | llamastack/distribution-meta-reference-gpu | [Guide](./meta-reference-gpu/) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | +| Ollama | llamastack/distribution-ollama | [Guide](./ollama/) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | +| TGI | llamastack/distribution-tgi | [Guide](./tgi/) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | diff --git a/llama_stack/distribution/templates/build_configs/local-bedrock-conda-example-build.yaml b/distributions/bedrock/build.yaml similarity index 87% rename from llama_stack/distribution/templates/build_configs/local-bedrock-conda-example-build.yaml rename to distributions/bedrock/build.yaml index 50d5e7048..ae7b27d49 100644 --- a/llama_stack/distribution/templates/build_configs/local-bedrock-conda-example-build.yaml +++ b/distributions/bedrock/build.yaml @@ -1,4 +1,4 @@ -name: local-bedrock-conda-example +name: bedrock distribution_spec: description: Use Amazon Bedrock APIs.
providers: diff --git a/llama_stack/distribution/templates/build_configs/local-databricks-build.yaml b/distributions/databricks/build.yaml similarity index 85% rename from llama_stack/distribution/templates/build_configs/local-databricks-build.yaml rename to distributions/databricks/build.yaml index 754af7668..2188dd0a0 100644 --- a/llama_stack/distribution/templates/build_configs/local-databricks-build.yaml +++ b/distributions/databricks/build.yaml @@ -1,4 +1,4 @@ -name: local-databricks +name: databricks distribution_spec: description: Use Databricks for running LLM inference providers: @@ -7,4 +7,4 @@ distribution_spec: safety: meta-reference agents: meta-reference telemetry: meta-reference -image_type: conda \ No newline at end of file +image_type: conda diff --git a/llama_stack/distribution/templates/build_configs/local-fireworks-build.yaml b/distributions/fireworks/build.yaml similarity index 91% rename from llama_stack/distribution/templates/build_configs/local-fireworks-build.yaml rename to distributions/fireworks/build.yaml index 33bdee3b5..831643ff1 100644 --- a/llama_stack/distribution/templates/build_configs/local-fireworks-build.yaml +++ b/distributions/fireworks/build.yaml @@ -1,4 +1,4 @@ -name: local-fireworks +name: fireworks distribution_spec: description: Use Fireworks.ai for running LLM inference providers: diff --git a/llama_stack/distribution/templates/build_configs/local-hf-endpoint-build.yaml b/distributions/hf-endpoint/build.yaml similarity index 93% rename from llama_stack/distribution/templates/build_configs/local-hf-endpoint-build.yaml rename to distributions/hf-endpoint/build.yaml index e5c4ae8cc..750bebcb5 100644 --- a/llama_stack/distribution/templates/build_configs/local-hf-endpoint-build.yaml +++ b/distributions/hf-endpoint/build.yaml @@ -1,4 +1,4 @@ -name: local-hf-endpoint +name: hf-endpoint distribution_spec: description: "Like local, but use Hugging Face Inference Endpoints for running LLM inference.\nSee https://hf.co/docs/api-endpoints." providers: diff --git a/llama_stack/distribution/templates/build_configs/local-hf-serverless-build.yaml b/distributions/hf-serverless/build.yaml similarity index 92% rename from llama_stack/distribution/templates/build_configs/local-hf-serverless-build.yaml rename to distributions/hf-serverless/build.yaml index 752390b40..f6da3ad4d 100644 --- a/llama_stack/distribution/templates/build_configs/local-hf-serverless-build.yaml +++ b/distributions/hf-serverless/build.yaml @@ -1,4 +1,4 @@ -name: local-hf-serverless +name: hf-serverless distribution_spec: description: "Like local, but use Hugging Face Inference API (serverless) for running LLM inference.\nSee https://hf.co/docs/api-inference." providers: diff --git a/distributions/meta-reference-gpu/README.md b/distributions/meta-reference-gpu/README.md new file mode 100644 index 000000000..951120da5 --- /dev/null +++ b/distributions/meta-reference-gpu/README.md @@ -0,0 +1,33 @@ +# Meta Reference Distribution + +The `llamastack/distribution-meta-reference-gpu` distribution consists of the following provider configurations. 
+ + +| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** | +|----------------- |--------------- |---------------- |-------------------------------------------------- |---------------- |---------------- | +| **Provider(s)** | meta-reference | meta-reference | meta-reference, remote::pgvector, remote::chroma | meta-reference | meta-reference | + + +### Start the Distribution (Single Node GPU) + +> [!NOTE] +> This assumes you have access to a GPU for running model inference locally. + +> [!NOTE] +> For GPU inference, you need to set an environment variable specifying the local directory containing your model checkpoints, and enable GPU access when starting the docker container. +``` +export LLAMA_CHECKPOINT_DIR=~/.llama +``` + +> [!NOTE] +> `~/.llama` should be the path containing downloaded weights of Llama models. + + +To download and start running a pre-built docker container, you may use the following command: + +``` +docker run -it -p 5000:5000 -v ~/.llama:/root/.llama --gpus=all llamastack/llamastack-local-gpu +``` + +### Alternative (Build and start distribution locally via conda) +- You may check out the [Getting Started](../../docs/getting_started.md) guide for more details on starting up a meta-reference distribution. diff --git a/llama_stack/distribution/templates/build_configs/local-gpu-docker-build.yaml b/distributions/meta-reference-gpu/build.yaml similarity index 67% rename from llama_stack/distribution/templates/build_configs/local-gpu-docker-build.yaml rename to distributions/meta-reference-gpu/build.yaml index 01af1021e..ca786c51c 100644 --- a/llama_stack/distribution/templates/build_configs/local-gpu-docker-build.yaml +++ b/distributions/meta-reference-gpu/build.yaml @@ -1,9 +1,12 @@ -name: local-gpu +name: distribution-meta-reference-gpu distribution_spec: description: Use code from `llama_stack` itself to serve all llama stack APIs providers: inference: meta-reference - memory: meta-reference + memory: + - meta-reference + - remote::chromadb + - remote::pgvector safety: meta-reference agents: meta-reference telemetry: meta-reference diff --git a/llama_stack/distribution/templates/run_configs/local-run.yaml b/distributions/meta-reference-gpu/run.yaml similarity index 85% rename from llama_stack/distribution/templates/run_configs/local-run.yaml rename to distributions/meta-reference-gpu/run.yaml index 7abf2b4dc..724ca030a 100644 --- a/llama_stack/distribution/templates/run_configs/local-run.yaml +++ b/distributions/meta-reference-gpu/run.yaml @@ -13,7 +13,7 @@ apis: - safety providers: inference: - - provider_id: meta-reference + - provider_id: meta0 provider_type: meta-reference config: model: Llama3.1-8B-Instruct @@ -22,7 +22,7 @@ max_seq_len: 4096 max_batch_size: 1 safety: - - provider_id: meta-reference + - provider_id: meta0 provider_type: meta-reference config: llama_guard_shield: @@ -33,11 +33,11 @@ prompt_guard_shield: model: Prompt-Guard-86M memory: - - provider_id: meta-reference + - provider_id: meta0 provider_type: meta-reference config: {} agents: - - provider_id: meta-reference + - provider_id: meta0 provider_type: meta-reference config: persistence_store: @@ -45,6 +45,6 @@ type: sqlite db_path: ~/.llama/runtime/kvstore.db telemetry: - - provider_id: meta-reference + - provider_id: meta0 provider_type: meta-reference config: {} diff --git a/distributions/ollama/README.md b/distributions/ollama/README.md new file mode 100644 index 000000000..43c764cbe --- /dev/null +++
b/distributions/ollama/README.md @@ -0,0 +1,91 @@ +# Ollama Distribution + +The `llamastack/distribution-ollama` distribution consists of the following provider configurations. + +| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** | +|----------------- |---------------- |---------------- |---------------------------------- |---------------- |---------------- | +| **Provider(s)** | remote::ollama | meta-reference | remote::pgvector, remote::chroma | remote::ollama | meta-reference | + + +### Start a Distribution (Single Node GPU) + +> [!NOTE] +> This assumes you have access to a GPU to start an Ollama server with GPU acceleration. + +``` +$ cd llama-stack/distribution/ollama/gpu +$ ls +compose.yaml run.yaml +$ docker compose up +``` + +You will see output similar to the following: +``` +[ollama] | [GIN] 2024/10/18 - 21:19:41 | 200 | 226.841µs | ::1 | GET "/api/ps" +[ollama] | [GIN] 2024/10/18 - 21:19:42 | 200 | 60.908µs | ::1 | GET "/api/ps" +INFO: Started server process [1] +INFO: Waiting for application startup. +INFO: Application startup complete. +INFO: Uvicorn running on http://[::]:5000 (Press CTRL+C to quit) +[llamastack] | Resolved 12 providers +[llamastack] | inner-inference => ollama0 +[llamastack] | models => __routing_table__ +[llamastack] | inference => __autorouted__ +``` + +To kill the server +``` +docker compose down +``` + +### Start the Distribution (Single Node CPU) + +> [!NOTE] +> This will start an Ollama server with CPU only; please see the [Ollama documentation](https://github.com/ollama/ollama) for serving models on CPU. + +``` +$ cd llama-stack/distribution/ollama/cpu +$ ls +compose.yaml run.yaml +$ docker compose up +``` + +### (Alternative) ollama run + llama stack run + +If you wish to separately spin up an Ollama server and connect it with Llama Stack, you may use the following commands. + +#### Start the Ollama server +- Please check the [Ollama documentation](https://github.com/ollama/ollama) for more details. + +**Via Docker** +``` +docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama +``` + +**Via CLI** +``` +ollama run +``` + +#### Start the Llama Stack server pointing to the Ollama server + +**Via Docker** +``` +docker run --network host -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./ollama-run.yaml:/root/llamastack-run-ollama.yaml --gpus=all llamastack-local-cpu --yaml_config /root/llamastack-run-ollama.yaml +``` + +Make sure the inference provider in your `ollama-run.yaml` file points to the correct Ollama endpoint, e.g.
+``` +inference: + - provider_id: ollama0 + provider_type: remote::ollama + config: + url: http://127.0.0.1:14343 +``` + +**Via Conda** + +``` +llama stack build --config ./build.yaml +llama stack run ./gpu/run.yaml +``` diff --git a/distributions/ollama/build.yaml b/distributions/ollama/build.yaml new file mode 100644 index 000000000..d14091814 --- /dev/null +++ b/distributions/ollama/build.yaml @@ -0,0 +1,13 @@ +name: distribution-ollama +distribution_spec: + description: Use ollama for running LLM inference + providers: + inference: remote::ollama + memory: + - meta-reference + - remote::chromadb + - remote::pgvector + safety: meta-reference + agents: meta-reference + telemetry: meta-reference +image_type: conda diff --git a/distributions/ollama/cpu/compose.yaml b/distributions/ollama/cpu/compose.yaml new file mode 100644 index 000000000..841b0b88c --- /dev/null +++ b/distributions/ollama/cpu/compose.yaml @@ -0,0 +1,30 @@ +services: + ollama: + image: ollama/ollama:latest + network_mode: "host" + volumes: + - ollama:/root/.ollama # this solution synchronizes with the docker volume and loads the model rocket fast + ports: + - "11434:11434" + command: [] + llamastack: + depends_on: + - ollama + image: llamastack/llamastack-local-cpu + network_mode: "host" + volumes: + - ~/.llama:/root/.llama + # Link to ollama run.yaml file + - ./run.yaml:/root/my-run.yaml + ports: + - "5000:5000" + # Hack: wait for ollama server to start before starting docker + entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml" + deploy: + restart_policy: + condition: on-failure + delay: 3s + max_attempts: 5 + window: 60s +volumes: + ollama: diff --git a/distributions/ollama/cpu/run.yaml b/distributions/ollama/cpu/run.yaml new file mode 100644 index 000000000..798dabc0b --- /dev/null +++ b/distributions/ollama/cpu/run.yaml @@ -0,0 +1,46 @@ +version: '2' +built_at: '2024-10-08T17:40:45.325529' +image_name: local +docker_image: null +conda_env: local +apis: +- shields +- agents +- models +- memory +- memory_banks +- inference +- safety +providers: + inference: + - provider_id: ollama0 + provider_type: remote::ollama + config: + url: http://127.0.0.1:14343 + safety: + - provider_id: meta0 + provider_type: meta-reference + config: + llama_guard_shield: + model: Llama-Guard-3-1B + excluded_categories: [] + disable_input_check: false + disable_output_check: false + prompt_guard_shield: + model: Prompt-Guard-86M + memory: + - provider_id: meta0 + provider_type: meta-reference + config: {} + agents: + - provider_id: meta0 + provider_type: meta-reference + config: + persistence_store: + namespace: null + type: sqlite + db_path: ~/.llama/runtime/kvstore.db + telemetry: + - provider_id: meta0 + provider_type: meta-reference + config: {} diff --git a/distributions/ollama/gpu/compose.yaml b/distributions/ollama/gpu/compose.yaml new file mode 100644 index 000000000..2e3f85e45 --- /dev/null +++ b/distributions/ollama/gpu/compose.yaml @@ -0,0 +1,48 @@ +services: + ollama: + image: ollama/ollama:latest + network_mode: "host" + volumes: + - ollama:/root/.ollama # this solution synchronizes with the docker volume and loads the model rocket fast + ports: + - "11434:11434" + devices: + - nvidia.com/gpu=all + environment: + - CUDA_VISIBLE_DEVICES=0 + command: [] + deploy: + resources: + reservations: + devices: + - driver: nvidia + # that's the closest analogue to --gpus; provide + # an integer amount of devices or 'all' + count: 1 + # Devices are reserved using a list of 
capabilities, making + # capabilities the only required field. A device MUST + # satisfy all the requested capabilities for a successful + # reservation. + capabilities: [gpu] + runtime: nvidia + llamastack-local-cpu: + depends_on: + - ollama + image: llamastack/llamastack-local-cpu + network_mode: "host" + volumes: + - ~/.llama:/root/.llama + # Link to ollama run.yaml file + - ./ollama-run.yaml:/root/llamastack-run-ollama.yaml + ports: + - "5000:5000" + # Hack: wait for ollama server to start before starting docker + entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-ollama.yaml" + deploy: + restart_policy: + condition: on-failure + delay: 3s + max_attempts: 5 + window: 60s +volumes: + ollama: diff --git a/distributions/ollama/gpu/run.yaml b/distributions/ollama/gpu/run.yaml new file mode 100644 index 000000000..798dabc0b --- /dev/null +++ b/distributions/ollama/gpu/run.yaml @@ -0,0 +1,46 @@ +version: '2' +built_at: '2024-10-08T17:40:45.325529' +image_name: local +docker_image: null +conda_env: local +apis: +- shields +- agents +- models +- memory +- memory_banks +- inference +- safety +providers: + inference: + - provider_id: ollama0 + provider_type: remote::ollama + config: + url: http://127.0.0.1:14343 + safety: + - provider_id: meta0 + provider_type: meta-reference + config: + llama_guard_shield: + model: Llama-Guard-3-1B + excluded_categories: [] + disable_input_check: false + disable_output_check: false + prompt_guard_shield: + model: Prompt-Guard-86M + memory: + - provider_id: meta0 + provider_type: meta-reference + config: {} + agents: + - provider_id: meta0 + provider_type: meta-reference + config: + persistence_store: + namespace: null + type: sqlite + db_path: ~/.llama/runtime/kvstore.db + telemetry: + - provider_id: meta0 + provider_type: meta-reference + config: {} diff --git a/distributions/tgi/README.md b/distributions/tgi/README.md new file mode 100644 index 000000000..86d2636d7 --- /dev/null +++ b/distributions/tgi/README.md @@ -0,0 +1,94 @@ +# TGI Distribution + +The `llamastack/distribution-tgi` distribution consists of the following provider configurations. + + +| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** | +|----------------- |--------------- |---------------- |-------------------------------------------------- |---------------- |---------------- | +| **Provider(s)** | remote::tgi | meta-reference | meta-reference, remote::pgvector, remote::chroma | meta-reference | meta-reference | + + +### Start the Distribution (Single Node GPU) + +> [!NOTE] +> This assumes you have access to GPU to start a TGI server with access to your GPU. + + +``` +$ cd llama_stack/distribution/docker/tgi +$ ls +compose.yaml tgi-run.yaml +$ docker compose up +``` + +The script will first start up TGI server, then start up Llama Stack distribution server hooking up to the remote TGI provider for inference. You should be able to see the following outputs -- +``` +[text-generation-inference] | 2024-10-15T18:56:33.810397Z INFO text_generation_router::server: router/src/server.rs:1813: Using config Some(Llama) +[text-generation-inference] | 2024-10-15T18:56:33.810448Z WARN text_generation_router::server: router/src/server.rs:1960: Invalid hostname, defaulting to 0.0.0.0 +[text-generation-inference] | 2024-10-15T18:56:33.864143Z INFO text_generation_router::server: router/src/server.rs:2353: Connected +INFO: Started server process [1] +INFO: Waiting for application startup. 
+INFO: Application startup complete. +INFO: Uvicorn running on http://[::]:5000 (Press CTRL+C to quit) +``` + +To kill the server +``` +docker compose down +``` + +### Start the Distribution (Single Node CPU) + +> [!NOTE] +> This assumes you have a hosted endpoint compatible with the TGI server. + +``` +$ cd llama-stack/distribution/tgi/cpu +$ ls +compose.yaml run.yaml +$ docker compose up +``` + +Replace the `url` field in the `run.yaml` file with your TGI endpoint. +``` +inference: + - provider_id: tgi0 + provider_type: remote::tgi + config: + url: +``` + +### (Alternative) TGI server + llama stack run (Single Node GPU) + +If you wish to separately spin up a TGI server and connect it with Llama Stack, you may use the following commands. + +#### (optional) Start TGI server locally +- Please check the [TGI Getting Started Guide](https://github.com/huggingface/text-generation-inference?tab=readme-ov-file#get-started) to get a TGI endpoint. + +``` +docker run --rm -it -v $HOME/.cache/huggingface:/data -p 5009:5009 --gpus all ghcr.io/huggingface/text-generation-inference:latest --dtype bfloat16 --usage-stats on --sharded false --model-id meta-llama/Llama-3.1-8B-Instruct --port 5009 +``` + + +#### Start the Llama Stack server pointing to the TGI server + +``` +docker run --network host -it -p 5000:5000 -v ./run.yaml:/root/my-run.yaml --gpus=all llamastack-local-cpu --yaml_config /root/my-run.yaml +``` + +Make sure the inference provider in your `run.yaml` file points to the correct TGI server endpoint, e.g. +``` +inference: + - provider_id: tgi0 + provider_type: remote::tgi + config: + url: http://127.0.0.1:5009 +``` + +**Via Conda** + +```bash +llama stack build --config ./build.yaml +# -- start a TGI server endpoint +llama stack run ./gpu/run.yaml +``` diff --git a/llama_stack/distribution/templates/build_configs/local-tgi-build.yaml b/distributions/tgi/build.yaml similarity index 51% rename from llama_stack/distribution/templates/build_configs/local-tgi-build.yaml rename to distributions/tgi/build.yaml index d4752539d..c3950e900 100644 --- a/llama_stack/distribution/templates/build_configs/local-tgi-build.yaml +++ b/distributions/tgi/build.yaml @@ -1,9 +1,12 @@ -name: local-tgi +name: distribution-tgi distribution_spec: - description: Like local, but use a TGI server for running LLM inference.
+ description: Use TGI for running LLM inference providers: inference: remote::tgi - memory: meta-reference + memory: + - meta-reference + - remote::chromadb + - remote::pgvector safety: meta-reference agents: meta-reference telemetry: meta-reference diff --git a/distributions/tgi/cpu/compose.yaml b/distributions/tgi/cpu/compose.yaml new file mode 100644 index 000000000..df7c74489 --- /dev/null +++ b/distributions/tgi/cpu/compose.yaml @@ -0,0 +1,54 @@ +services: + text-generation-inference: + image: ghcr.io/huggingface/text-generation-inference:latest + network_mode: "host" + volumes: + - $HOME/.cache/huggingface:/data + ports: + - "5009:5009" + devices: + - nvidia.com/gpu=all + environment: + - CUDA_VISIBLE_DEVICES=0 + - HF_HOME=/data + - HF_DATASETS_CACHE=/data + - HF_MODULES_CACHE=/data + - HF_HUB_CACHE=/data + command: ["--dtype", "bfloat16", "--usage-stats", "on", "--sharded", "false", "--model-id", "meta-llama/Llama-3.1-8B-Instruct", "--port", "5009", "--cuda-memory-fraction", "0.3"] + deploy: + resources: + reservations: + devices: + - driver: nvidia + # that's the closest analogue to --gpus; provide + # an integer amount of devices or 'all' + count: 1 + # Devices are reserved using a list of capabilities, making + # capabilities the only required field. A device MUST + # satisfy all the requested capabilities for a successful + # reservation. + capabilities: [gpu] + runtime: nvidia + healthcheck: + test: ["CMD", "curl", "-f", "http://text-generation-inference:5009/health"] + interval: 5s + timeout: 5s + retries: 30 + llamastack: + depends_on: + text-generation-inference: + condition: service_healthy + image: llamastack/llamastack-local-cpu + network_mode: "host" + volumes: + - ~/.llama:/root/.llama + # Link to run.yaml file + - ./run.yaml:/root/my-run.yaml + ports: + - "5000:5000" + entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml" + restart_policy: + condition: on-failure + delay: 3s + max_attempts: 5 + window: 60s diff --git a/llama_stack/distribution/templates/run_configs/local-tgi-run.yaml b/distributions/tgi/cpu/run.yaml similarity index 83% rename from llama_stack/distribution/templates/run_configs/local-tgi-run.yaml rename to distributions/tgi/cpu/run.yaml index ec3af742c..bf46391b4 100644 --- a/llama_stack/distribution/templates/run_configs/local-tgi-run.yaml +++ b/distributions/tgi/cpu/run.yaml @@ -16,9 +16,9 @@ providers: - provider_id: tgi0 provider_type: remote::tgi config: - url: http://127.0.0.1:5009 + url: safety: - - provider_id: meta-reference + - provider_id: meta0 provider_type: meta-reference config: llama_guard_shield: @@ -29,11 +29,11 @@ providers: prompt_guard_shield: model: Prompt-Guard-86M memory: - - provider_id: meta-reference + - provider_id: meta0 provider_type: meta-reference config: {} agents: - - provider_id: meta-reference + - provider_id: meta0 provider_type: meta-reference config: persistence_store: @@ -41,6 +41,6 @@ providers: type: sqlite db_path: ~/.llama/runtime/kvstore.db telemetry: - - provider_id: meta-reference + - provider_id: meta0 provider_type: meta-reference config: {} diff --git a/llama_stack/distribution/docker/tgi/compose.yaml b/distributions/tgi/gpu/compose.yaml similarity index 90% rename from llama_stack/distribution/docker/tgi/compose.yaml rename to distributions/tgi/gpu/compose.yaml index d5bcd50f3..60dbe4938 100644 --- a/llama_stack/distribution/docker/tgi/compose.yaml +++ b/distributions/tgi/gpu/compose.yaml @@ -34,20 +34,20 @@ services: interval: 5s timeout: 5s retries: 
30 - llamastack-local-cpu: + llamastack: depends_on: text-generation-inference: condition: service_healthy - image: llamastack-local-cpu + image: llamastack/llamastack-local-cpu network_mode: "host" volumes: - ~/.llama:/root/.llama # Link to TGI run.yaml file - - ./tgi-run.yaml:/root/llamastack-run-tgi.yaml + - ./run.yaml:/root/my-run.yaml ports: - "5000:5000" # Hack: wait for TGI server to start before starting docker - entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-tgi.yaml" + entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml" restart_policy: condition: on-failure delay: 3s diff --git a/llama_stack/distribution/docker/tgi/tgi-run.yaml b/distributions/tgi/gpu/run.yaml similarity index 100% rename from llama_stack/distribution/docker/tgi/tgi-run.yaml rename to distributions/tgi/gpu/run.yaml diff --git a/llama_stack/distribution/templates/build_configs/local-together-build.yaml b/distributions/together/build.yaml similarity index 92% rename from llama_stack/distribution/templates/build_configs/local-together-build.yaml rename to distributions/together/build.yaml index ebf0bf1fb..67ba2eefa 100644 --- a/llama_stack/distribution/templates/build_configs/local-together-build.yaml +++ b/distributions/together/build.yaml @@ -1,4 +1,4 @@ -name: local-together +name: together distribution_spec: description: Use Together.ai for running LLM inference providers: diff --git a/llama_stack/distribution/templates/build_configs/local-ollama-build.yaml b/distributions/vllm/build.yaml similarity index 51% rename from llama_stack/distribution/templates/build_configs/local-ollama-build.yaml rename to distributions/vllm/build.yaml index d9116b4b1..f41352eb1 100644 --- a/llama_stack/distribution/templates/build_configs/local-ollama-build.yaml +++ b/distributions/vllm/build.yaml @@ -1,10 +1,10 @@ -name: local-ollama +name: vllm distribution_spec: - description: Like local, but use ollama for running LLM inference + description: Like local, but use vLLM for running LLM inference providers: - inference: remote::ollama + inference: vllm memory: meta-reference safety: meta-reference agents: meta-reference telemetry: meta-reference -image_type: conda +image_type: conda \ No newline at end of file diff --git a/docs/building_distro.md b/docs/building_distro.md new file mode 100644 index 000000000..05e5c09bb --- /dev/null +++ b/docs/building_distro.md @@ -0,0 +1,270 @@ +# Building a Llama Stack Distribution + +This guide will walk you through the steps to get started with building a Llama Stack distribution from scratch with your choice of API providers. Please see the [Getting Started Guide](./getting_started.md) if you just want the basic steps to start a Llama Stack distribution. + +## Step 1. Build +In the following steps, imagine we'll be working with a `Meta-Llama3.1-8B-Instruct` model. We will name our build `8b-instruct` to help us remember the config. We will start building our distribution (in the form of a Conda environment or Docker image). In this step, we will specify: +- `name`: the name for our distribution (e.g.
`8b-instruct`) +- `image_type`: our build image type (`conda | docker`) +- `distribution_spec`: our distribution specs for specifying API providers + - `description`: a short description of the configurations for the distribution + - `providers`: specifies the underlying implementation for serving each API endpoint + - `image_type`: `conda` | `docker` to specify whether to build the distribution as a Docker image or a Conda environment. + + +At the end of the build command, we will generate a `-build.yaml` file storing the build configurations. + +After this step is complete, a file named `-build.yaml` will be generated and saved at the output file path specified at the end of the command. + +#### Building from scratch +- For a new user, we could start off with running `llama stack build`, which will launch an interactive wizard where you will be prompted to enter build configurations. +``` +llama stack build +``` + +Running the command above will allow you to fill in the configuration to build your Llama Stack distribution; you will see the following output. + +``` +> Enter an unique name for identifying your Llama Stack build distribution (e.g. my-local-stack): 8b-instruct +> Enter the image type you want your distribution to be built with (docker or conda): conda + + Llama Stack is composed of several APIs working together. Let's configure the providers (implementations) you want to use for these APIs. +> Enter the API provider for the inference API: (default=meta-reference): meta-reference +> Enter the API provider for the safety API: (default=meta-reference): meta-reference +> Enter the API provider for the agents API: (default=meta-reference): meta-reference +> Enter the API provider for the memory API: (default=meta-reference): meta-reference +> Enter the API provider for the telemetry API: (default=meta-reference): meta-reference + + > (Optional) Enter a short description for your Llama Stack distribution: + +Build spec configuration saved at ~/.conda/envs/llamastack-my-local-llama-stack/8b-instruct-build.yaml +``` + +**Ollama (optional)** + +If you plan to use Ollama for inference, you'll need to install the server [via these instructions](https://ollama.com/download). + + +#### Building from templates +- To build from alternative API providers, we provide distribution templates for users to get started building a distribution backed by different providers. + +The following command will allow you to see the available templates and their corresponding providers. +``` +llama stack build --list-templates +``` + +![alt text](resources/list-templates.png) + +You may then pick a template to build your distribution with providers suited to your liking. + +``` +llama stack build --template local-tgi --name my-tgi-stack +``` + +``` +$ llama stack build --template local-tgi --name my-tgi-stack +... +... +Build spec configuration saved at ~/.conda/envs/llamastack-my-tgi-stack/my-tgi-stack-build.yaml +You may now run `llama stack configure my-tgi-stack` or `llama stack configure ~/.conda/envs/llamastack-my-tgi-stack/my-tgi-stack-build.yaml` +``` + +#### Building from config file +- In addition to templates, you may customize the build to your liking by editing a config file and building from it with the following command. + +- The config file will have contents like the ones in `llama_stack/distributions/templates/`.
+ +``` +$ cat llama_stack/distribution/templates/local-ollama-build.yaml + +name: local-ollama +distribution_spec: + description: Like local, but use ollama for running LLM inference + providers: + inference: remote::ollama + memory: meta-reference + safety: meta-reference + agents: meta-reference + telemetry: meta-reference +image_type: conda +``` + +``` +llama stack build --config llama_stack/distribution/templates/local-ollama-build.yaml +``` + +#### How to build a distribution with a Docker image + +> [!TIP] +> Podman is supported as an alternative to Docker. Set `DOCKER_BINARY` to `podman` in your environment to use Podman. + +To build a docker image, you may start off from a template and use the `--image-type docker` flag to specify `docker` as the build image type. + +``` +llama stack build --template local --image-type docker --name docker-0 +``` + +Alternatively, you may use a config file and set `image_type` to `docker` in your `-build.yaml` file, and run `llama stack build -build.yaml`. The `-build.yaml` will have contents like: + +``` +name: local-docker-example +distribution_spec: + description: Use code from `llama_stack` itself to serve all llama stack APIs + docker_image: null + providers: + inference: meta-reference + memory: meta-reference-faiss + safety: meta-reference + agentic_system: meta-reference + telemetry: console +image_type: docker +``` + +The following command allows you to build a Docker image with the name `` +``` +llama stack build --config -build.yaml + +Dockerfile created successfully in /tmp/tmp.I0ifS2c46A/DockerfileFROM python:3.10-slim +WORKDIR /app +... +... +You can run it with: podman run -p 8000:8000 llamastack-docker-local +Build spec configuration saved at ~/.llama/distributions/docker/docker-local-build.yaml +``` + + +## Step 2. Configure +After our distribution is built (either in the form of a docker image or a conda environment), we will run the following command to configure it: +``` +llama stack configure [ | | ] +``` +- For `conda` environments: the argument would be the generated build spec saved from Step 1. +- For `docker` images downloaded from Dockerhub, you could also use the docker image name as the argument. + - Run `docker images` to check the list of available images on your machine. + +``` +$ llama stack configure 8b-instruct + +Configuring API: inference (meta-reference) +Enter value for model (existing: Meta-Llama3.1-8B-Instruct) (required): +Enter value for quantization (optional): +Enter value for torch_seed (optional): +Enter value for max_seq_len (existing: 4096) (required): +Enter value for max_batch_size (existing: 1) (required): + +Configuring API: memory (meta-reference-faiss) + +Configuring API: safety (meta-reference) +Do you want to configure llama_guard_shield? (y/n): y +Entering sub-configuration for llama_guard_shield: +Enter value for model (default: Llama-Guard-3-1B) (required): +Enter value for excluded_categories (default: []) (required): +Enter value for disable_input_check (default: False) (required): +Enter value for disable_output_check (default: False) (required): +Do you want to configure prompt_guard_shield?
(y/n): y +Entering sub-configuration for prompt_guard_shield: +Enter value for model (default: Prompt-Guard-86M) (required): + +Configuring API: agentic_system (meta-reference) +Enter value for brave_search_api_key (optional): +Enter value for bing_search_api_key (optional): +Enter value for wolfram_api_key (optional): + +Configuring API: telemetry (console) + +YAML configuration has been written to ~/.llama/builds/conda/8b-instruct-run.yaml +``` + +After this step is successful, you should be able to find a run configuration spec in `~/.llama/builds/conda/8b-instruct-run.yaml` with the following contents. You may edit this file to change the settings. + +As you can see, we did basic configuration above and configured: +- inference to run on model `Meta-Llama3.1-8B-Instruct` (obtained from `llama model list`) +- Llama Guard safety shield with model `Llama-Guard-3-1B` +- Prompt Guard safety shield with model `Prompt-Guard-86M` + +For how these configurations are stored as yaml, checkout the file printed at the end of the configuration. + +Note that all configurations as well as models are stored in `~/.llama` + + +## Step 3. Run +Now, let's start the Llama Stack Distribution Server. You will need the YAML configuration file which was written out at the end by the `llama stack configure` step. + +``` +llama stack run 8b-instruct +``` + +You should see the Llama Stack server start and print the APIs that it is supporting + +``` +$ llama stack run 8b-instruct + +> initializing model parallel with size 1 +> initializing ddp with size 1 +> initializing pipeline with size 1 +Loaded in 19.28 seconds +NCCL version 2.20.5+cuda12.4 +Finished model load YES READY +Serving POST /inference/batch_chat_completion +Serving POST /inference/batch_completion +Serving POST /inference/chat_completion +Serving POST /inference/completion +Serving POST /safety/run_shield +Serving POST /agentic_system/memory_bank/attach +Serving POST /agentic_system/create +Serving POST /agentic_system/session/create +Serving POST /agentic_system/turn/create +Serving POST /agentic_system/delete +Serving POST /agentic_system/session/delete +Serving POST /agentic_system/memory_bank/detach +Serving POST /agentic_system/session/get +Serving POST /agentic_system/step/get +Serving POST /agentic_system/turn/get +Listening on :::5000 +INFO: Started server process [453333] +INFO: Waiting for application startup. +INFO: Application startup complete. +INFO: Uvicorn running on http://[::]:5000 (Press CTRL+C to quit) +``` + +> [!NOTE] +> Configuration is in `~/.llama/builds/local/conda/8b-instruct-run.yaml`. Feel free to increase `max_seq_len`. + +> [!IMPORTANT] +> The "local" distribution inference server currently only supports CUDA. It will not work on Apple Silicon machines. + +> [!TIP] +> You might need to use the flag `--disable-ipv6` to Disable IPv6 support + +This server is running a Llama model locally. + +## Step 4. Test with Client +Once the server is setup, we can test it with a client to see the example outputs. +``` +cd /path/to/llama-stack +conda activate # any environment containing the llama-stack pip package will work + +python -m llama_stack.apis.inference.client localhost 5000 +``` + +This will run the chat completion client and query the distribution’s /inference/chat_completion API. + +Here is an example output: +``` +User>hello world, write me a 2 sentence poem about the moon +Assistant> Here's a 2-sentence poem about the moon: + +The moon glows softly in the midnight sky, +A beacon of wonder, as it passes by. 
+``` + +Similarly, you can test safety (if you configured llama-guard and/or prompt-guard shields) by: + +``` +python -m llama_stack.apis.safety.client localhost 5000 +``` + + +Check out our client SDKs for connecting to the Llama Stack server in your preferred language; you can choose from the [python](https://github.com/meta-llama/llama-stack-client-python), [node](https://github.com/meta-llama/llama-stack-client-node), [swift](https://github.com/meta-llama/llama-stack-client-swift), and [kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) programming languages to quickly build your applications. + +You can find more example scripts with client SDKs to talk with the Llama Stack server in our [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repo. diff --git a/docs/developer_cookbook.md b/docs/developer_cookbook.md new file mode 100644 index 000000000..eed1aca3d --- /dev/null +++ b/docs/developer_cookbook.md @@ -0,0 +1,41 @@ +# Llama Stack Developer Cookbook + +Based on your developer needs, below are references to guides to help you get started. + +### Hosted Llama Stack Endpoint +* Developer Need: I want to connect to a Llama Stack endpoint to build my applications. +* Effort: 1min +* Guide: + - Check out our [DeepLearning course](https://www.deeplearning.ai/short-courses/introducing-multimodal-llama-3-2) on building Llama Stack apps on a pre-hosted Llama Stack endpoint. + + +### Local meta-reference Llama Stack Server +* Developer Need: I want to start a local Llama Stack server with my GPU using meta-reference implementations. +* Effort: 5min +* Guide: + - Please see our [Getting Started Guide](./getting_started.md) on starting up a meta-reference Llama Stack server. + +### Llama Stack Server with Remote Providers +* Developer Need: I want a Llama Stack distribution with a remote provider. +* Effort: 10min +* Guide: + - Please see our [Distributions Guide](../distributions/) on starting up distributions with remote providers. + + +### On-Device (iOS) Llama Stack +* Developer Need: I want to use Llama Stack on-device. +* Effort: 1.5hr +* Guide: + - Please see our [iOS Llama Stack SDK](../llama_stack/providers/impls/ios/inference) implementation. + +### Assemble your own Llama Stack Distribution +* Developer Need: I want to assemble my own distribution with API providers to my liking. +* Effort: 30min +* Guide: + - Please see our [Building Distribution](./building_distro.md) guide for assembling your own Llama Stack distribution with your choice of API providers. + +### Adding a New API Provider +* Developer Need: I want to add a new API provider to Llama Stack. +* Effort: 3hr +* Guide: + - Please see our [Adding a New API Provider](./new_api_provider.md) guide for adding a new API provider. diff --git a/docs/getting_started.md b/docs/getting_started.md index 3eebf8bbc..e3db908a7 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -23,8 +23,7 @@ $CONDA_PREFIX/bin/pip install -e . For what you can do with the Llama CLI, please refer to [CLI Reference](./cli_reference.md). -## Quick Starting Llama Stack Server - +## Starting Up Llama Stack Server #### Starting up server via docker We provide 2 pre-built Docker image of Llama Stack distribution, which can be found in the following links. @@ -50,7 +49,7 @@ docker run -it -p 5000:5000 -v ~/.llama:/root/.llama --gpus=all llamastack/llama ``` > [!TIP] -> Pro Tip: We may use `docker compose up` for starting up a distribution with remote providers (e.g.
TGI) using [llamastack-local-cpu](https://hub.docker.com/repository/docker/llamastack/llamastack-local-cpu/general). You can checkout [these scripts](../llama_stack/distribution/docker/README.md) to help you get started. +> Pro Tip: We may use `docker compose up` for starting up a distribution with remote providers (e.g. TGI) using [llamastack-local-cpu](https://hub.docker.com/repository/docker/llamastack/llamastack-local-cpu/general). You can checkout [these scripts](../distributions/) to help you get started. #### Build->Configure->Run Llama Stack server via conda You may also build a LlamaStack distribution from scratch, configure it, and start running the distribution. This is useful for developing on LlamaStack. @@ -160,245 +159,8 @@ INFO: Application startup complete. INFO: Uvicorn running on http://[::]:5000 (Press CTRL+C to quit) ``` -## Building a Distribution - -## Step 1. Build -In the following steps, imagine we'll be working with a `Meta-Llama3.1-8B-Instruct` model. We will name our build `8b-instruct` to help us remember the config. We will start build our distribution (in the form of a Conda environment, or Docker image). In this step, we will specify: -- `name`: the name for our distribution (e.g. `8b-instruct`) -- `image_type`: our build image type (`conda | docker`) -- `distribution_spec`: our distribution specs for specifying API providers - - `description`: a short description of the configurations for the distribution - - `providers`: specifies the underlying implementation for serving each API endpoint - - `image_type`: `conda` | `docker` to specify whether to build the distribution in the form of Docker image or Conda environment. - - -At the end of build command, we will generate `-build.yaml` file storing the build configurations. - -After this step is complete, a file named `-build.yaml` will be generated and saved at the output file path specified at the end of the command. - -#### Building from scratch -- For a new user, we could start off with running `llama stack build` which will allow you to a interactively enter wizard where you will be prompted to enter build configurations. -``` -llama stack build -``` - -Running the command above will allow you to fill in the configuration to build your Llama Stack distribution, you will see the following outputs. - -``` -> Enter an unique name for identifying your Llama Stack build distribution (e.g. my-local-stack): 8b-instruct -> Enter the image type you want your distribution to be built with (docker or conda): conda - - Llama Stack is composed of several APIs working together. Let's configure the providers (implementations) you want to use for these APIs. -> Enter the API provider for the inference API: (default=meta-reference): meta-reference -> Enter the API provider for the safety API: (default=meta-reference): meta-reference -> Enter the API provider for the agents API: (default=meta-reference): meta-reference -> Enter the API provider for the memory API: (default=meta-reference): meta-reference -> Enter the API provider for the telemetry API: (default=meta-reference): meta-reference - - > (Optional) Enter a short description for your Llama Stack distribution: - -Build spec configuration saved at ~/.conda/envs/llamastack-my-local-llama-stack/8b-instruct-build.yaml -``` - -**Ollama (optional)** - -If you plan to use Ollama for inference, you'll need to install the server [via these instructions](https://ollama.com/download). 
- - -#### Building from templates -- To build from alternative API providers, we provide distribution templates for users to get started building a distribution backed by different providers. - -The following command will allow you to see the available templates and their corresponding providers. -``` -llama stack build --list-templates -``` - -![alt text](resources/list-templates.png) - -You may then pick a template to build your distribution with providers fitted to your liking. - -``` -llama stack build --template local-tgi --name my-tgi-stack -``` - -``` -$ llama stack build --template local-tgi --name my-tgi-stack -... -... -Build spec configuration saved at ~/.conda/envs/llamastack-my-tgi-stack/my-tgi-stack-build.yaml -You may now run `llama stack configure my-tgi-stack` or `llama stack configure ~/.conda/envs/llamastack-my-tgi-stack/my-tgi-stack-build.yaml` -``` - -#### Building from config file -- In addition to templates, you may customize the build to your liking through editing config files and build from config files with the following command. - -- The config file will be of contents like the ones in `llama_stack/distributions/templates/`. - -``` -$ cat llama_stack/distribution/templates/local-ollama-build.yaml - -name: local-ollama -distribution_spec: - description: Like local, but use ollama for running LLM inference - providers: - inference: remote::ollama - memory: meta-reference - safety: meta-reference - agents: meta-reference - telemetry: meta-reference -image_type: conda -``` - -``` -llama stack build --config llama_stack/distribution/templates/local-ollama-build.yaml -``` - -#### How to build distribution with Docker image - -> [!TIP] -> Podman is supported as an alternative to Docker. Set `DOCKER_BINARY` to `podman` in your environment to use Podman. - -To build a docker image, you may start off from a template and use the `--image-type docker` flag to specify `docker` as the build image type. - -``` -llama stack build --template local --image-type docker --name docker-0 -``` - -Alternatively, you may use a config file and set `image_type` to `docker` in our `-build.yaml` file, and run `llama stack build -build.yaml`. The `-build.yaml` will be of contents like: - -``` -name: local-docker-example -distribution_spec: - description: Use code from `llama_stack` itself to serve all llama stack APIs - docker_image: null - providers: - inference: meta-reference - memory: meta-reference-faiss - safety: meta-reference - agentic_system: meta-reference - telemetry: console -image_type: docker -``` - -The following command allows you to build a Docker image with the name `` -``` -llama stack build --config -build.yaml - -Dockerfile created successfully in /tmp/tmp.I0ifS2c46A/DockerfileFROM python:3.10-slim -WORKDIR /app -... -... -You can run it with: podman run -p 8000:8000 llamastack-docker-local -Build spec configuration saved at ~/.llama/distributions/docker/docker-local-build.yaml -``` - - -## Step 2. Configure -After our distribution is built (either in form of docker or conda environment), we will run the following command to -``` -llama stack configure [ | | ] -``` -- For `conda` environments: would be the generated build spec saved from Step 1. -- For `docker` images downloaded from Dockerhub, you could also use as the argument. - - Run `docker images` to check list of available images on your machine. 
- -``` -$ llama stack configure 8b-instruct - -Configuring API: inference (meta-reference) -Enter value for model (existing: Meta-Llama3.1-8B-Instruct) (required): -Enter value for quantization (optional): -Enter value for torch_seed (optional): -Enter value for max_seq_len (existing: 4096) (required): -Enter value for max_batch_size (existing: 1) (required): - -Configuring API: memory (meta-reference-faiss) - -Configuring API: safety (meta-reference) -Do you want to configure llama_guard_shield? (y/n): y -Entering sub-configuration for llama_guard_shield: -Enter value for model (default: Llama-Guard-3-1B) (required): -Enter value for excluded_categories (default: []) (required): -Enter value for disable_input_check (default: False) (required): -Enter value for disable_output_check (default: False) (required): -Do you want to configure prompt_guard_shield? (y/n): y -Entering sub-configuration for prompt_guard_shield: -Enter value for model (default: Prompt-Guard-86M) (required): - -Configuring API: agentic_system (meta-reference) -Enter value for brave_search_api_key (optional): -Enter value for bing_search_api_key (optional): -Enter value for wolfram_api_key (optional): - -Configuring API: telemetry (console) - -YAML configuration has been written to ~/.llama/builds/conda/8b-instruct-run.yaml -``` - -After this step is successful, you should be able to find a run configuration spec in `~/.llama/builds/conda/8b-instruct-run.yaml` with the following contents. You may edit this file to change the settings. - -As you can see, we did basic configuration above and configured: -- inference to run on model `Meta-Llama3.1-8B-Instruct` (obtained from `llama model list`) -- Llama Guard safety shield with model `Llama-Guard-3-1B` -- Prompt Guard safety shield with model `Prompt-Guard-86M` - -For how these configurations are stored as yaml, checkout the file printed at the end of the configuration. - -Note that all configurations as well as models are stored in `~/.llama` - - -## Step 3. Run -Now, let's start the Llama Stack Distribution Server. You will need the YAML configuration file which was written out at the end by the `llama stack configure` step. - -``` -llama stack run 8b-instruct -``` - -You should see the Llama Stack server start and print the APIs that it is supporting - -``` -$ llama stack run 8b-instruct - -> initializing model parallel with size 1 -> initializing ddp with size 1 -> initializing pipeline with size 1 -Loaded in 19.28 seconds -NCCL version 2.20.5+cuda12.4 -Finished model load YES READY -Serving POST /inference/batch_chat_completion -Serving POST /inference/batch_completion -Serving POST /inference/chat_completion -Serving POST /inference/completion -Serving POST /safety/run_shield -Serving POST /agentic_system/memory_bank/attach -Serving POST /agentic_system/create -Serving POST /agentic_system/session/create -Serving POST /agentic_system/turn/create -Serving POST /agentic_system/delete -Serving POST /agentic_system/session/delete -Serving POST /agentic_system/memory_bank/detach -Serving POST /agentic_system/session/get -Serving POST /agentic_system/step/get -Serving POST /agentic_system/turn/get -Listening on :::5000 -INFO: Started server process [453333] -INFO: Waiting for application startup. -INFO: Application startup complete. -INFO: Uvicorn running on http://[::]:5000 (Press CTRL+C to quit) -``` - -> [!NOTE] -> Configuration is in `~/.llama/builds/local/conda/8b-instruct-run.yaml`. Feel free to increase `max_seq_len`. 
- - > [!IMPORTANT] -> The "local" distribution inference server currently only supports CUDA. It will not work on Apple Silicon machines. - -> [!TIP] -> You might need to use the flag `--disable-ipv6` to Disable IPv6 support -This server is running a Llama model locally. - -## Step 4. Test with Client +## Testing with client Once the server is setup, we can test it with a client to see the example outputs. ``` cd /path/to/llama-stack @@ -428,3 +190,7 @@ python -m llama_stack.apis.safety.client localhost 5000 Check out our client SDKs for connecting to Llama Stack server in your preferred language, you can choose from [python](https://github.com/meta-llama/llama-stack-client-python), [node](https://github.com/meta-llama/llama-stack-client-node), [swift](https://github.com/meta-llama/llama-stack-client-swift), and [kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) programming languages to quickly build your applications. You can find more example scripts with client SDKs to talk with the Llama Stack server in our [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repo. + + +## Advanced Guides +Please see our [Building a Llama Stack Distribution](./building_distro.md) guide for more details on how to assemble your own Llama Stack Distribution. diff --git a/docs/new_api_provider.md b/docs/new_api_provider.md new file mode 100644 index 000000000..bfef3a6b3 --- /dev/null +++ b/docs/new_api_provider.md @@ -0,0 +1,20 @@ +# Developer Guide: Adding a New API Provider + +This guide contains references to walk you through adding a new API provider. + +### Adding a new API provider +1. First, decide which API your provider falls into (e.g. Inference, Safety, Agents, Memory). +2. Decide whether your provider is a remote provider, or an inline implementation. A remote provider is a provider that makes a remote request to a service. An inline provider is a provider whose implementation is executed locally. Check out the examples, and follow the structure to add your own API provider (see also the registry sketch at the end of this guide). Please find the following code pointers: + - [Inference Remote Adapter](../llama_stack/providers/adapters/inference/) + - [Inference Inline Provider](../llama_stack/providers/impls/) +3. [Build a Llama Stack distribution](./building_distro.md) with your API provider. +4. Test your code! + +### Testing your newly added API providers +1. Start the Llama Stack server with your new API provider. +2. Test by sending a client request to the server. +3. Add tests for your newly added provider. See [tests/](../tests/) for example unit tests. +4. Test the supported functionalities for your provider using our provider test infra. See [llama_stack/providers/tests//test_](../llama_stack/providers/tests/inference/test_inference.py). + +### Submit your PR +After you have fully tested your newly added API provider, submit a PR with the attached test plan, and we will help you verify the necessary requirements.
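
As a rough illustration of step 2 above, a remote adapter is ultimately registered in the provider registry for its API, i.e. added to the list returned by `available_providers()` (for inference, see `llama_stack/providers/registry/inference.py`). The sketch below simply mirrors the existing registry entries; the `my_inference` adapter name, pip package, module path, and config class are hypothetical placeholders for whatever your provider actually ships.

```python
# Hypothetical registry entry for a new remote inference adapter, modeled on the
# existing entries in llama_stack/providers/registry/inference.py. Everything named
# "my_inference" / "my-inference-sdk" is a placeholder for your provider's real code.
remote_provider_spec(
    api=Api.inference,
    adapter=AdapterSpec(
        adapter_type="my_inference",        # how the provider is selected, e.g. remote::my_inference in run configs
        pip_packages=["my-inference-sdk"],  # extra pip dependencies the adapter needs at runtime
        module="llama_stack.providers.adapters.inference.my_inference",
        config_class="llama_stack.providers.adapters.inference.my_inference.MyInferenceImplConfig",
    ),
),
```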
diff --git a/llama_stack/cli/stack/build.py b/llama_stack/cli/stack/build.py index 3c59e8c20..26aa35e16 100644 --- a/llama_stack/cli/stack/build.py +++ b/llama_stack/cli/stack/build.py @@ -13,7 +13,7 @@ from pathlib import Path TEMPLATES_PATH = ( - Path(os.path.relpath(__file__)).parent.parent.parent / "distribution" / "templates" + Path(os.path.relpath(__file__)).parent.parent.parent.parent / "distributions" ) diff --git a/llama_stack/distribution/docker/README.md b/llama_stack/distribution/docker/README.md deleted file mode 100644 index 962a07def..000000000 --- a/llama_stack/distribution/docker/README.md +++ /dev/null @@ -1,28 +0,0 @@ -# Docker Compose Scripts - -This folder contains scripts to enable starting a distribution using `docker compose`. - - -#### Example: TGI Inference Adapter -``` -$ cd llama_stack/distribution/docker/tgi -$ ls -compose.yaml tgi-run.yaml -$ docker compose up -``` - -The script will first start up TGI server, then start up Llama Stack distribution server hooking up to the remote TGI provider for inference. You should be able to see the following outputs -- -``` -[text-generation-inference] | 2024-10-15T18:56:33.810397Z INFO text_generation_router::server: router/src/server.rs:1813: Using config Some(Llama) -[text-generation-inference] | 2024-10-15T18:56:33.810448Z WARN text_generation_router::server: router/src/server.rs:1960: Invalid hostname, defaulting to 0.0.0.0 -[text-generation-inference] | 2024-10-15T18:56:33.864143Z INFO text_generation_router::server: router/src/server.rs:2353: Connected -INFO: Started server process [1] -INFO: Waiting for application startup. -INFO: Application startup complete. -INFO: Uvicorn running on http://[::]:5000 (Press CTRL+C to quit) -``` - -To kill the server -``` -docker compose down -``` diff --git a/llama_stack/distribution/templates/build_configs/local-cpu-docker-build.yaml b/llama_stack/distribution/templates/build_configs/local-cpu-docker-build.yaml deleted file mode 100644 index 9db019454..000000000 --- a/llama_stack/distribution/templates/build_configs/local-cpu-docker-build.yaml +++ /dev/null @@ -1,15 +0,0 @@ -name: local-cpu -distribution_spec: - description: remote inference + local safety/agents/memory - docker_image: null - providers: - inference: - - remote::ollama - - remote::tgi - - remote::together - - remote::fireworks - safety: meta-reference - agents: meta-reference - memory: meta-reference - telemetry: meta-reference -image_type: docker diff --git a/llama_stack/distribution/templates/build_configs/local-tgi-chroma-docker-build.yaml b/llama_stack/distribution/templates/build_configs/local-tgi-chroma-docker-build.yaml deleted file mode 100644 index 30715c551..000000000 --- a/llama_stack/distribution/templates/build_configs/local-tgi-chroma-docker-build.yaml +++ /dev/null @@ -1,11 +0,0 @@ -name: local-tgi-chroma -distribution_spec: - description: remote tgi inference + chromadb memory - docker_image: null - providers: - inference: remote::tgi - safety: meta-reference - agents: meta-reference - memory: remote::chromadb - telemetry: meta-reference -image_type: docker diff --git a/llama_stack/providers/registry/inference.py b/llama_stack/providers/registry/inference.py index c3370bfd9..c54cf5939 100644 --- a/llama_stack/providers/registry/inference.py +++ b/llama_stack/providers/registry/inference.py @@ -60,15 +60,15 @@ def available_providers() -> List[ProviderSpec]: module="llama_stack.providers.adapters.inference.ollama", ), ), -# remote_provider_spec( -# api=Api.inference, -# adapter=AdapterSpec( -# 
adapter_type="vllm", -# pip_packages=["openai"], -# module="llama_stack.providers.adapters.inference.vllm", -# config_class="llama_stack.providers.adapters.inference.vllm.VLLMImplConfig", -# ), -# ), + # remote_provider_spec( + # api=Api.inference, + # adapter=AdapterSpec( + # adapter_type="vllm", + # pip_packages=["openai"], + # module="llama_stack.providers.adapters.inference.vllm", + # config_class="llama_stack.providers.adapters.inference.vllm.VLLMImplConfig", + # ), + # ), remote_provider_spec( api=Api.inference, adapter=AdapterSpec(