From c19e1723240f1bedcc44d79b8336b87b6fff7ac1 Mon Sep 17 00:00:00 2001 From: samiksha <20553087+spal1@users.noreply.github.com> Date: Thu, 24 Oct 2024 15:24:09 +0000 Subject: [PATCH 1/4] add /metrics to truss server --- truss/templates/server/requirements.txt | 1 + truss/templates/server/truss_server.py | 6 ++++++ truss/tests/test_model_inference.py | 25 +++++++++++++++++++++++++ 3 files changed, 32 insertions(+) diff --git a/truss/templates/server/requirements.txt b/truss/templates/server/requirements.txt index 328a99149..cfb9b55a8 100644 --- a/truss/templates/server/requirements.txt +++ b/truss/templates/server/requirements.txt @@ -18,3 +18,4 @@ requests==2.31.0 uvicorn==0.24.0 uvloop==0.19.0 aiofiles==24.1.0 +prometheus-client==0.15.0 diff --git a/truss/templates/server/truss_server.py b/truss/templates/server/truss_server.py index 6cc5fad9a..af2ecfa1f 100644 --- a/truss/templates/server/truss_server.py +++ b/truss/templates/server/truss_server.py @@ -24,6 +24,7 @@ from opentelemetry import propagate as otel_propagate from opentelemetry import trace from opentelemetry.sdk import trace as sdk_trace +from prometheus_client import make_asgi_app from shared import serialization, util from shared.logging import setup_logging from shared.secrets_resolver import SecretsResolver @@ -342,6 +343,11 @@ def exit_self(): on_term=exit_self, ) app.add_middleware(BaseHTTPMiddleware, dispatch=termination_handler_middleware) + + # Add prometheus asgi middleware to route /metrics requests + metrics_app = make_asgi_app() + app.mount("/metrics", metrics_app) + return app def start(self): diff --git a/truss/tests/test_model_inference.py b/truss/tests/test_model_inference.py index 04db1b7fa..0bd54131b 100644 --- a/truss/tests/test_model_inference.py +++ b/truss/tests/test_model_inference.py @@ -760,6 +760,31 @@ def _test_invocations(expected_code): _test_invocations(200) +@pytest.mark.integration +def test_metrics(): + model = """ + from fastapi.responses import Response + from prometheus_client import Counter + + class Model: + def __init__(self): + self.counter = Counter('my_really_cool_metric', 'my really cool metric description') + + def predict(self, model_input): + self.counter.inc(10) + return model_input + """ + config = "model_name: metrics-truss" + with ensure_kill_all(), temp_truss(model, config) as tr: + _ = tr.docker_run(local_port=8090, detach=True, wait_for_server_ready=True) + metrics_url = "http://localhost:8090/metrics" + requests.post(PREDICT_URL, json={}) + resp = requests.get(metrics_url) + assert resp.status_code == 200 + assert "my_really_cool_metric_total 10.0" in resp.text + assert "my_really_cool_metric_created" in resp.text + + @pytest.mark.integration def test_setup_environment(): # Test truss that uses setup_environment() without load() From 9c7e9ae1cf8f8923e3c9950828c3c9e60fb62141 Mon Sep 17 00:00:00 2001 From: samiksha <20553087+spal1@users.noreply.github.com> Date: Thu, 24 Oct 2024 15:28:22 +0000 Subject: [PATCH 2/4] update ctx builder version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 4410500fa..0a3e679c2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "truss" -version = "0.9.45rc009" +version = "0.9.45rc010" description = "A seamless bridge from model development to model delivery" license = "MIT" readme = "README.md" From d92d2ff96de6d9c28dcede627a3d29171e0e4a8f Mon Sep 17 00:00:00 2001 From: samiksha <20553087+spal1@users.noreply.github.com> Date: Thu, 24 Oct 2024 22:04:17 +0000 Subject: [PATCH 3/4] add metrics to config spec --- truss/tests/test_config.py | 61 ++++++++++++++++++++++++++++++++++++++ truss/truss_config.py | 35 +++++++++++++++++++++- 2 files changed, 95 insertions(+), 1 deletion(-) diff --git a/truss/tests/test_config.py b/truss/tests/test_config.py index ffc4d6ae0..40f4c0ef2 100644 --- a/truss/tests/test_config.py +++ b/truss/tests/test_config.py @@ -15,6 +15,7 @@ Accelerator, AcceleratorSpec, BaseImage, + CustomMetricConfig, DockerAuthSettings, DockerAuthType, ModelCache, @@ -126,6 +127,66 @@ def test_parse_resources(input_dict, expect_resources, output_dict): assert parsed_result.to_dict() == output_dict +@pytest.mark.parametrize( + "input_dict, expect_metrics, output_dict", + [ + ( + { + "name": "metric_name", + "display_name": "Metric Name", + "type": "histogram", + "unit": "ms", + }, + CustomMetricConfig( + name="metric_name", + display_name="Metric Name", + type="histogram", + unit="ms", + ), + { + "name": "metric_name", + "display_name": "Metric Name", + "type": "histogram", + "unit": "ms", + }, + ), + ], +) +def test_parse_custom_metric(input_dict, expect_metrics, output_dict): + parsed_result = CustomMetricConfig.from_dict(input_dict) + assert parsed_result == expect_metrics + assert parsed_result.to_dict() == output_dict + + +def test_config_metrics(default_config): + default_config["metrics"] = [ + { + "name": "metric_name", + "display_name": "Metric Name", + "type": "histogram", + "unit": "ms", + }, + { + "name": "metric_name2", + "display_name": "Metric Name 2", + "type": "counter", + "unit": "count", + }, + ] + config = TrussConfig.from_dict(default_config) + assert config.metrics == [ + CustomMetricConfig( + name="metric_name", display_name="Metric Name", type="histogram", unit="ms" + ), + CustomMetricConfig( + name="metric_name2", + display_name="Metric Name 2", + type="counter", + unit="count", + ), + ] + + @pytest.mark.parametrize( "input_str, expected_acc", [ diff --git a/truss/truss_config.py b/truss/truss_config.py index e7a53b739..bda058da8 100644 --- a/truss/truss_config.py +++ b/truss/truss_config.py @@ -243,6 +243,31 @@ def to_dict(self): } +@dataclass +class CustomMetricConfig: + name: str + display_name: str + type: str + unit: str + + @staticmethod + def from_dict(d): + return CustomMetricConfig( + name=d.get("name"), + display_name=d.get("display_name"), + type=d.get("type"), + unit=d.get("unit"), + ) + + def to_dict(self) -> dict: + return { + "name": self.name, + "display_name": self.display_name, + "type": self.type, + "unit": self.unit, + } + + @dataclass class ExternalDataItem: """A piece of remote data, to be made available to the Truss at serving time. @@ -546,6 +571,7 @@ class TrussConfig: model_cache: ModelCache = field(default_factory=ModelCache) trt_llm: Optional[TRTLLMConfiguration] = None build_commands: List[str] = field(default_factory=list) + metrics: List[CustomMetricConfig] = field(default_factory=list) @property def canonical_python_version(self) -> str: @@ -605,6 +631,9 @@ def from_dict(d): d.get("trt_llm"), lambda x: TRTLLMConfiguration(**x) ), build_commands=d.get("build_commands", []), + metrics=transform_optional( + d.get("metrics") or [], lambda x: [CustomMetricConfig(**m) for m in x] + ), ) config.validate() return config @@ -780,7 +809,11 @@ def obj_to_dict(obj, verbose: bool = False): d["docker_auth"] = transform_optional( field_curr_value, lambda data: data.to_dict() ) + elif field_name == "metrics": + d["metrics"] = transform_optional( + field_curr_value, + lambda data: [metric.to_dict() for metric in data] if data else [], + ) else: d[field_name] = field_curr_value - return d From e3adba5fd16dac7c70c76aaa4ef9f9858bf0c142 Mon Sep 17 00:00:00 2001 From: samiksha <20553087+spal1@users.noreply.github.com> Date: Thu, 24 Oct 2024 22:53:20 +0000 Subject: [PATCH 4/4] add /metrics to control server --- pyproject.toml | 2 +- truss/templates/control/control/application.py | 5 +++++ truss/templates/control/requirements.txt | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 0a3e679c2..650be82ea 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "truss" -version = "0.9.45rc010" +version = "0.9.45rc013" description = "A seamless bridge from model development to model delivery" license = "MIT" readme = "README.md" diff --git a/truss/templates/control/control/application.py b/truss/templates/control/control/application.py index 4b121538e..07e76384f 100644 --- a/truss/templates/control/control/application.py +++ b/truss/templates/control/control/application.py @@ -13,6 +13,7 @@ from helpers.inference_server_process_controller import InferenceServerProcessController from helpers.inference_server_starter import async_inference_server_startup_flow from helpers.truss_patch.model_container_patch_applier import ModelContainerPatchApplier +from prometheus_client import make_asgi_app from shared.logging import setup_logging from starlette.datastructures import State @@ -103,6 +104,10 @@ async def start_background_inference_startup(): app.state = app_state app.include_router(control_app) + # Add prometheus asgi middleware to route /metrics requests + metrics_app = make_asgi_app() + app.mount("/metrics", metrics_app) + @app.on_event("shutdown") def on_shutdown(): # FastApi handles the term signal to start the shutdown flow. Here we diff --git a/truss/templates/control/requirements.txt b/truss/templates/control/requirements.txt index 16aa11117..2c4a6dff9 100644 --- a/truss/templates/control/requirements.txt +++ b/truss/templates/control/requirements.txt @@ -7,3 +7,4 @@ tenacity==8.1.0 httpx==0.27.0 python-json-logger==2.0.2 loguru==0.7.2 +prometheus_client==0.15.0