Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Construct proper S3 URIs on register_model #39

Merged
merged 3 commits into from
Mar 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 28 additions & 5 deletions clients/python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@ registry = ModelRegistry(server_address="server-address", port=9090, author="aut

model = registry.register_model(
"my-model", # model name
"s3://path/to/model", # model URI
"https://storage-place.my-company.com", # model URI
version="2.0.0",
description="lorem ipsum",
model_format_name="onnx",
model_format_version="1",
storage_key="aws-connection-path",
storage_key="my-data-connection",
storage_path="path/to/model",
metadata={
# can be one of the following types
Expand All @@ -37,10 +37,33 @@ version = registry.get_model_version("my-model", "v2.0")
experiment = registry.get_model_artifact("my-model", "v2.0")
```

### Default values for metadata
### Importing from S3

If not supplied, `metadata` values defaults to a predefined set of conventional values.
Reference the technical documentation in the pydoc of the client.
When registering models stored on S3-compatible object storage, you should use `utils.s3_uri_from` to build an
unambiguous URI for your artifact.

```py
from model_registry import ModelRegistry, utils

registry = ModelRegistry(server_address="server-address", port=9090, author="author")

model = registry.register_model(
"my-model", # model name
uri=utils.s3_uri_from("path/to/model", "my-bucket"),
version="2.0.0",
description="lorem ipsum",
model_format_name="onnx",
model_format_version="1",
storage_key="my-data-connection",
metadata={
# can be one of the following types
"int_key": 1,
"bool_key": False,
"float_key": 3.14,
"str_key": "str_value",
}
)
```

### Importing from Hugging Face Hub

Expand Down
30 changes: 11 additions & 19 deletions clients/python/src/model_registry/_client.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Standard client for the model registry."""

from __future__ import annotations

import os
from typing import get_args
from warnings import warn

Expand Down Expand Up @@ -75,16 +75,22 @@ def register_model(
model_format_name: str,
model_format_version: str,
version: str,
author: str | None = None,
description: str | None = None,
storage_key: str | None = None,
storage_path: str | None = None,
service_account_name: str | None = None,
author: str | None = None,
description: str | None = None,
metadata: dict[str, ScalarType] | None = None,
) -> RegisteredModel:
"""Register a model.

Either `storage_key` and `storage_path`, or `service_account_name` must be provided.
This registers a model in the model registry. The model is not downloaded, and has to be stored prior to
registration.

Most models can be registered using their URI, along with optional connection-specific parameters, `storage_key`
and `storage_path` or, simply a `service_account_name`.
URI builder utilities are recommended when referring to specialized storage; for example `utils.s3_uri_from`
helper when using S3 object storage data connections.

Args:
name: Name of the model.
Expand All @@ -110,7 +116,7 @@ def register_model(
version,
author or self._author,
description=description,
metadata=metadata or self.default_metadata(),
metadata=metadata or {},
)
self._register_model_artifact(
mv,
Expand All @@ -124,19 +130,6 @@ def register_model(

return rm

def default_metadata(self) -> dict[str, ScalarType]:
"""Default metadata valorisations.

When not explicitly supplied by the end users, these valorisations will be used
by default.

Returns:
default metadata valorisations.
"""
return {
key: os.environ[key] for key in ["AWS_S3_ENDPOINT", "AWS_S3_BUCKET", "AWS_DEFAULT_REGION"] if key in os.environ
}

def register_hf_model(
self,
repo: str,
Expand Down Expand Up @@ -202,7 +195,6 @@ def register_hf_model(
model_author = author
source_uri = hf_hub_url(repo, path, revision=git_ref)
metadata = {
**self.default_metadata(),
"repo": repo,
"source_uri": source_uri,
"model_origin": "huggingface_hub",
Expand Down
109 changes: 109 additions & 0 deletions clients/python/src/model_registry/_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
from __future__ import annotations

import functools
import inspect
from collections.abc import Sequence
from typing import Any, Callable, TypeVar

CallableT = TypeVar("CallableT", bound=Callable[..., Any])


# copied from https://github.com/Rapptz/RoboDanny
def human_join(seq: Sequence[str], *, delim: str = ", ", final: str = "or") -> str:
size = len(seq)
if size == 0:
return ""

if size == 1:
return seq[0]

if size == 2:
return f"{seq[0]} {final} {seq[1]}"

return delim.join(seq[:-1]) + f" {final} {seq[-1]}"


def quote(string: str) -> str:
"""Add single quotation marks around the given string. Does *not* do any escaping."""
return f"'{string}'"


# copied from https://github.com/openai/openai-python
def required_args(*variants: Sequence[str]) -> Callable[[CallableT], CallableT]: # noqa: C901
"""Decorator to enforce a given set of arguments or variants of arguments are passed to the decorated function.

Useful for enforcing runtime validation of overloaded functions.

Example usage:
```py
@overload
def foo(*, a: str) -> str:
...


@overload
def foo(*, b: bool) -> str:
...


# This enforces the same constraints that a static type checker would
# i.e. that either a or b must be passed to the function
@required_args(["a"], ["b"])
def foo(*, a: str | None = None, b: bool | None = None) -> str:
...
```
"""

def inner(func: CallableT) -> CallableT: # noqa: C901
params = inspect.signature(func).parameters
positional = [
name
for name, param in params.items()
if param.kind
in {
param.POSITIONAL_ONLY,
param.POSITIONAL_OR_KEYWORD,
}
]

@functools.wraps(func)
def wrapper(*args: object, **kwargs: object) -> object:
given_params: set[str] = set()
for i, _ in enumerate(args):
try:
given_params.add(positional[i])
except IndexError:
msg = f"{func.__name__}() takes {len(positional)} argument(s) but {len(args)} were given"
raise TypeError(msg) from None

for key in kwargs:
given_params.add(key)

for variant in variants:
matches = all(param in given_params for param in variant)
if matches:
break
else: # no break
if len(variants) > 1:
variations = human_join(
[
"("
+ human_join([quote(arg) for arg in variant], final="and")
+ ")"
for variant in variants
]
)
msg = f"Missing required arguments; Expected either {variations} arguments to be given"
else:
# TODO: this error message is not deterministic
missing = list(set(variants[0]) - given_params)
if len(missing) > 1:
msg = f"Missing required arguments: {human_join([quote(arg) for arg in missing])}"
else:
msg = f"Missing required argument: {quote(missing[0])}"
raise TypeError(msg)
return func(*args, **kwargs)

return wrapper # type: ignore

return inner
4 changes: 4 additions & 0 deletions clients/python/src/model_registry/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ class StoreException(Exception):
"""Storage related error."""


class MissingMetadata(Exception):
"""Not enough metadata to complete operation."""


class UnsupportedTypeException(StoreException):
"""Raised when an unsupported type is encountered."""

Expand Down
92 changes: 92 additions & 0 deletions clients/python/src/model_registry/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
"""Utilities for the model registry."""

from __future__ import annotations

import os

from typing_extensions import overload

from ._utils import required_args
from .exceptions import MissingMetadata


@overload
def s3_uri_from(
path: str,
) -> str: ...


@overload
def s3_uri_from(
path: str,
bucket: str,
) -> str: ...


@overload
def s3_uri_from(
path: str,
bucket: str,
*,
endpoint: str,
region: str,
) -> str: ...


@required_args(
(),
( # pre-configured env
"bucket",
),
( # custom env or non-default bucket
"bucket",
"endpoint",
"region",
),
)
def s3_uri_from(
path: str,
bucket: str | None = None,
*,
endpoint: str | None = None,
region: str | None = None,
) -> str:
"""Build an S3 URI.

This helper function builds an S3 URI from a path and a bucket name, assuming you have a configured environment
with a default bucket, endpoint, and region set.
If you don't, you must provide all three optional arguments.
That is also the case for custom environments, where the default bucket is not set, or if you want to use a
different bucket.

Args:
path: Storage path.
bucket: Name of the S3 bucket. Defaults to AWS_S3_BUCKET.
endpoint: Endpoint of the S3 bucket. Defaults to AWS_S3_ENDPOINT.
region: Region of the S3 bucket. Defaults to AWS_DEFAULT_REGION.

Returns:
S3 URI.
"""
default_bucket = os.environ.get("AWS_S3_BUCKET")
if not bucket:
if not default_bucket:
msg = "Custom environment requires all arguments"
raise MissingMetadata(msg)
bucket = default_bucket
elif (not default_bucket or default_bucket != bucket) and not endpoint:
msg = (
"bucket_endpoint and bucket_region must be provided for non-default bucket"
)
raise MissingMetadata(msg)

endpoint = endpoint or os.getenv("AWS_S3_ENDPOINT")
region = region or os.getenv("AWS_DEFAULT_REGION")

if not (endpoint and region):
msg = "Missing environment variables: bucket_endpoint and bucket_region are required"
raise MissingMetadata(msg)

# https://alexwlchan.net/2020/s3-keys-are-not-file-paths/ nor do they resolve to valid URls
# FIXME: is this safe?
return f"s3://{bucket}/{path}?endpoint={endpoint}&defaultRegion={region}"
Comment on lines +91 to +92
Copy link
Contributor Author

@isinyaaa isinyaaa Mar 21, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

bumping this note here now that the PR is "ready", can you share your thoughts?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks better for uri building. Did we also want to add the URI components as individual properties to the model metadata? Maybe we can pass a metadata object to the uri builders so that it can be reused in the register call.
@tarilabs didn't UI say they would like the uri components individually, or are they happy to have the constructed uri?

Loading
Loading