-
Notifications
You must be signed in to change notification settings - Fork 443
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
* refactor(sdk): added option for custom metric collector for tune in… #2406
base: master
Are you sure you want to change the base?
Changes from 5 commits
6c85385
b746da0
516cabe
67c9b78
a0a7e53
80a718c
9245bcd
83ecb32
2d8ecd0
2222b99
3f4ea55
a51b863
1aa9c48
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -186,7 +186,10 @@ def tune( | |
retain_trials: bool = False, | ||
packages_to_install: List[str] = None, | ||
pip_index_url: str = "https://pypi.org/simple", | ||
metrics_collector_config: Dict[str, Any] = {"kind": "StdOut"}, | ||
metrics_collector_config: Dict[str, Any] = { | ||
"kind": "StdOut", | ||
"custom_collector": None | ||
}, | ||
): | ||
"""Create HyperParameter Tuning Katib Experiment from the objective function. | ||
|
||
|
@@ -251,8 +254,8 @@ def tune( | |
pip_index_url: The PyPI url from which to install Python packages. | ||
metrics_collector_config: Specify the config of metrics collector, | ||
for example, `metrics_collector_config = {"kind": "Push"}`. | ||
Currently, we only support `StdOut` and `Push` metrics collector. | ||
|
||
for using custom metric collectors use "custom_collector" key and provide instance of custom V1Container as value, | ||
for example, `metrics_collector_config = {"kind" : "Custom", "custom_collector": <Instance of V1Container>}`. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe we can reorganize these comments and explain each field in
|
||
Raises: | ||
ValueError: Function arguments have incorrect type or value. | ||
TimeoutError: Timeout to create Katib Experiment. | ||
|
@@ -387,7 +390,10 @@ def tune( | |
# Add metrics collector to the Katib Experiment. | ||
# Up to now, We only support parameter `kind`, of which default value is `StdOut`, to specify the kind of metrics collector. | ||
experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec( | ||
collector=models.V1beta1CollectorSpec(kind=metrics_collector_config["kind"]) | ||
collector=models.V1beta1CollectorSpec( | ||
kind=metrics_collector_config["kind"], | ||
custom_collector=metrics_collector_config["custom_collector"], | ||
) | ||
) | ||
|
||
# Create Trial specification. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
FROM python:3.8-slim | ||
|
||
WORKDIR /app | ||
|
||
COPY dummy-collector.py . | ||
|
||
RUN pip install kubernetes | ||
|
||
CMD ["python", "dummy-collector.py"] |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
import argparse | ||
import logging | ||
import time | ||
|
||
from kubernetes import client | ||
from kubernetes import config | ||
|
||
# The default logging config. | ||
logging.basicConfig(level=logging.INFO) | ||
|
||
def collect_metrics(metric_name : str): | ||
config.load_incluster_config() | ||
v1 = client.CoreV1Api() | ||
|
||
while True: | ||
dummy_metric_value = 42 | ||
logging.info(f"Collected dummy metric: {metric_name}={dummy_metric_value}") | ||
|
||
time.sleep(10) # Collect metrics every 10 seconds | ||
|
||
if __name__ == "__main__": | ||
parser = argparse.ArgumentParser() | ||
parser.add_argument("--metric-name", type=str, required=True, help="Name of the metric to collect") | ||
args = parser.parse_args() | ||
|
||
collect_metrics(args.metric_name) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,7 @@ | |
from kubeflow.katib import KatibClient | ||
from kubeflow.katib import search | ||
from kubernetes import client | ||
from kubernetes.client import V1Container | ||
from verify import verify_experiment_results | ||
|
||
# Experiment timeout is 40 min. | ||
|
@@ -36,7 +37,19 @@ def objective(parameters): | |
"b": search.double(min=0.1, max=0.2) | ||
} | ||
|
||
# [3] Create Katib Experiment with 4 Trials and 2 CPUs per Trial. | ||
# [3] Create a dummy metric collector (DOES NOT HAVE A IMAGE) | ||
metric_collector = V1Container( | ||
name="dummy-collector", | ||
image="dummy-collector:latest", | ||
command=["python", "/app/dummy-collector.py"], | ||
args=["--metric-name=result"], | ||
env=[ | ||
client.V1EnvVar(name="EXPERIMENT_NAME", value=exp_name), | ||
client.V1EnvVar(name="EXPERIMENT_NAMESPACE", value=exp_namespace) | ||
] | ||
) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I guess we can create another function There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure I think its a good idea .Will make this change in a few days There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have made corresponding changes to run both e2e tests (one with custom metrics collector and another with default metric collector [StdOut]) |
||
|
||
# [4] Create Katib Experiment with 4 Trials and 2 CPUs per Trial. | ||
# And Wait until Experiment reaches Succeeded condition. | ||
katib_client.tune( | ||
name=exp_name, | ||
|
@@ -46,6 +59,16 @@ def objective(parameters): | |
objective_metric_name="result", | ||
max_trial_count=4, | ||
resources_per_trial={"cpu": "2"}, | ||
metrics_collector_config={ | ||
"collector": { | ||
"kind": "Custom", | ||
"customCollector": { | ||
"image": "dummy-collector:latest", | ||
"command": ["python", "/app/dummy-collector.py"], | ||
"args": ["--metric-name=result"] | ||
} | ||
} | ||
}, | ||
) | ||
experiment = katib_client.wait_for_experiment_condition( | ||
exp_name, exp_namespace, timeout=EXPERIMENT_TIMEOUT | ||
|
@@ -94,4 +117,4 @@ def objective(parameters): | |
# Delete the Experiment. | ||
logging.info("---------------------------------------------------------------") | ||
logging.info("---------------------------------------------------------------") | ||
katib_client.delete_experiment(exp_name, exp_namespace) | ||
katib_client.delete_experiment(exp_name, exp_namespace) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we need to delete the former experiment before we run another experiment. Otherwise, we may run into |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we may need to tell users about the supported types of MC. So can you re-add this line?