Skip to content

Commit

Permalink
API: refactor and fix get_result(wait=True)
Browse files Browse the repository at this point in the history
The previous `GET /dispatches/{dispatch_id}` endpoint was trying to do
too much. Its responsibilities are now separated into two endpoints:

* `GET /dispatches`: bulk query dispatch summaries (including status)
with options to filter by `dispatch_id`, sort chronologically, and
also limit the output to status only.

* `GET /dispatches/{dispatch_id}`: download manifest

To achieve the desired behavior of `get_result(id, wait=True)`, the
client

1. Polls the dispatch status by querying the first endpoint.

2. Downloads the manifest after the dispatch has reached a final
status.

The server no longer returns 503 errors when the dispatch is not yet
"ready". A 503 status code is not entirely accurate here because it is
intended to convey temporary service unavailability resulting from
server overload or rate limiting. However, the fact that the workflow
is still running does not indicate any fault of the server.

These changes will allow `get_result(dispatch_id, wait=True)` to wait
as long as required instead of erroring out after some time.

Supporting improvements:

DAL: Add sorting and pagination to Controller

DAL: improve bulk get when retrieving only some columns

Directly select the specified columns instead of retrieving the whole
ORM entities and deferring column loading using load_only
  • Loading branch information
cjao committed Jun 14, 2024
1 parent 215d8d3 commit dd00785
Show file tree
Hide file tree
Showing 14 changed files with 321 additions and 409 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- Improved handling of Covalent version mismatches between client and
executor environments
- `get_result(wait=True)` will wait as long as needed

### Removed

Expand Down
66 changes: 1 addition & 65 deletions covalent/_dispatcher_plugins/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,8 +129,6 @@ def dispatch(
Wrapper function which takes the inputs of the workflow as arguments
"""

multistage = get_config("sdk.multistage_dispatch") == "true"

# Extract triggers here
if "triggers" in orig_lattice.metadata:
triggers_data = orig_lattice.metadata.pop("triggers")
Expand All @@ -155,14 +153,7 @@ def wrapper(*args, **kwargs) -> str:
The dispatch id of the workflow.
"""

if multistage:
dispatch_id = LocalDispatcher.register(orig_lattice, dispatcher_addr)(
*args, **kwargs
)
else:
dispatch_id = LocalDispatcher.submit(orig_lattice, dispatcher_addr)(
*args, **kwargs
)
dispatch_id = LocalDispatcher.register(orig_lattice, dispatcher_addr)(*args, **kwargs)

if triggers_data:
LocalDispatcher.register_triggers(triggers_data, dispatch_id)
Expand Down Expand Up @@ -237,61 +228,6 @@ def wrapper(*args, **kwargs) -> str:

return wrapper

@staticmethod
def submit(
    orig_lattice: Lattice,
    dispatcher_addr: str = None,
) -> Callable:
    """
    Build a callable that submits a workflow to the dispatcher server.

    The returned wrapper accepts the workflow inputs, builds the transport
    graph on a copy of the lattice, posts the serialized lattice to the
    server and returns the dispatch id found in the response body.

    Args:
        orig_lattice: The lattice/workflow to send to the dispatcher server.
        dispatcher_addr: The address of the dispatcher server. If None, the
            value returned by `format_server_url()` is used.

    Returns:
        Wrapper function which takes the inputs of the workflow as arguments.
    """

    # The server address is resolved once, when the wrapper is created.
    server_addr = format_server_url() if dispatcher_addr is None else dispatcher_addr

    @wraps(orig_lattice)
    def wrapper(*args, **kwargs) -> str:
        """
        Submit the lattice with the given inputs and return its dispatch id.

        Args:
            *args: The inputs of the workflow.
            **kwargs: The keyword arguments of the workflow.

        Returns:
            The dispatch id of the workflow.

        Raises:
            TypeError: If `orig_lattice` is not a Lattice.
        """

        if not isinstance(orig_lattice, Lattice):
            message = f"Dispatcher expected a Lattice, received {type(orig_lattice)} instead."
            app_log.error(message)
            raise TypeError(message)

        # Build the graph on a copy so the caller's lattice is left untouched.
        workflow = deepcopy(orig_lattice)
        workflow.build_graph(*args, **kwargs)

        # Serialize the transport graph to JSON and post it to the server.
        payload = workflow.serialize_to_json()
        response = APIClient(server_addr).post("/api/v2/dispatches/submit", data=payload)
        response.raise_for_status()

        # The body is the dispatch id; drop surrounding whitespace and double quotes.
        raw_id = response.content.decode("utf-8")
        return raw_id.strip().replace('"', "")

    return wrapper

@staticmethod
def start(
dispatch_id: str,
Expand Down
98 changes: 46 additions & 52 deletions covalent/_results_manager/results_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,11 @@

import contextlib
import os
import time
from pathlib import Path
from typing import Dict, List, Optional
from typing import List, Optional

from furl import furl
from requests.adapters import HTTPAdapter
from urllib3.util import Retry

from .._api.apiclient import CovalentAPIClient
from .._serialize.common import load_asset
Expand All @@ -40,9 +39,9 @@
from .._shared_files.exceptions import MissingLatticeRecordError
from .._shared_files.schemas.asset import AssetSchema
from .._shared_files.schemas.result import ResultSchema
from .._shared_files.util_classes import RESULT_STATUS, Status
from .._shared_files.utils import copy_file_locally, format_server_url
from .result import Result
from .wait import EXTREME

app_log = logger.app_log
log_stack_info = logger.log_stack_info
Expand Down Expand Up @@ -139,12 +138,20 @@ def cancel(dispatch_id: str, task_ids: List[int] = None, dispatcher_addr: str =
# Multi-part


def _query_dispatch_status(dispatch_id: str, api_client: CovalentAPIClient):
    """
    Query the server for the status of a single dispatch.

    Sends a GET request to the bulk `/api/v2/dispatches` endpoint, filtered
    by `dispatch_id` and with `status_only` set.

    Args:
        dispatch_id: The dispatch id to look up.
        api_client: The client used to send the request.

    Returns:
        The "status" field of the first dispatch entry in the response.

    Raises:
        MissingLatticeRecordError: If the response lists no dispatches.
    """

    query = {"dispatch_id": dispatch_id, "status_only": True}
    response = api_client.get("/api/v2/dispatches", params=query)
    response.raise_for_status()

    records = response.json()["dispatches"]
    if len(records) > 0:
        return records[0]["status"]

    # An empty list means the server has no record of this dispatch id.
    raise MissingLatticeRecordError


def _get_result_export_from_dispatcher(
dispatch_id: str,
wait: bool = False,
status_only: bool = False,
dispatcher_addr: str = None,
) -> Dict:
dispatch_id: str, api_client: CovalentAPIClient
) -> ResultSchema:
"""
Internal function to get the results of a dispatch from the server without checking if it is ready to read.
Expand All @@ -161,24 +168,13 @@ def _get_result_export_from_dispatcher(
MissingLatticeRecordError: If the result is not found.
"""

if dispatcher_addr is None:
dispatcher_addr = format_server_url()

retries = int(EXTREME) if wait else 5

adapter = HTTPAdapter(max_retries=Retry(total=retries, backoff_factor=1))
api_client = CovalentAPIClient(dispatcher_addr, adapter=adapter, auto_raise=False)

endpoint = f"/api/v2/dispatches/{dispatch_id}"
response = api_client.get(
endpoint,
params={"wait": wait, "status_only": status_only},
)
response = api_client.get(endpoint)
if response.status_code == 404:
raise MissingLatticeRecordError
response.raise_for_status()
export = response.json()
return export
return ResultSchema.model_validate(export)


# Function to download default assets
Expand Down Expand Up @@ -346,11 +342,17 @@ def from_dispatch_id(
wait: bool = False,
dispatcher_addr: str = None,
) -> "ResultManager":
export = _get_result_export_from_dispatcher(
dispatch_id, wait, status_only=False, dispatcher_addr=dispatcher_addr
)
if dispatcher_addr is None:
dispatcher_addr = format_server_url()

manifest = ResultSchema.model_validate(export["result_export"])
api_client = CovalentAPIClient(dispatcher_addr)
if wait:
status = Status(_query_dispatch_status(dispatch_id, api_client))
while not RESULT_STATUS.is_terminal(status):
time.sleep(1)
status = Status(_query_dispatch_status(dispatch_id, api_client))

manifest = _get_result_export_from_dispatcher(dispatch_id, api_client)

# sort the nodes
manifest.lattice.transport_graph.nodes.sort(key=lambda x: x.id)
Expand Down Expand Up @@ -408,14 +410,15 @@ def _get_result_multistage(
"""

if dispatcher_addr is None:
dispatcher_addr = format_server_url()

api_client = CovalentAPIClient(dispatcher_addr)
try:
if status_only:
return _get_result_export_from_dispatcher(
dispatch_id=dispatch_id,
wait=wait,
status_only=status_only,
dispatcher_addr=dispatcher_addr,
)
status = _query_dispatch_status(dispatch_id, api_client)
return {"id": dispatch_id, "status": status}

rm = get_result_manager(dispatch_id, results_dir, wait, dispatcher_addr)
_get_default_assets(rm)

Expand Down Expand Up @@ -496,23 +499,14 @@ def get_result(
The Result object from the Covalent server
"""
max_attempts = int(os.getenv("COVALENT_GET_RESULT_RETRIES", 10))
num_attempts = 0
while num_attempts < max_attempts:
try:
return _get_result_multistage(
dispatch_id=dispatch_id,
wait=wait,
dispatcher_addr=dispatcher_addr,
status_only=status_only,
results_dir=results_dir,
workflow_output=workflow_output,
intermediate_outputs=intermediate_outputs,
sublattice_results=sublattice_results,
qelectron_db=qelectron_db,
)

except RecursionError as re:
app_log.error(re)
num_attempts += 1
raise RuntimeError("Timed out waiting for result. Please retry or check dispatch.")
return _get_result_multistage(
dispatch_id=dispatch_id,
wait=wait,
dispatcher_addr=dispatcher_addr,
status_only=status_only,
results_dir=results_dir,
workflow_output=workflow_output,
intermediate_outputs=intermediate_outputs,
sublattice_results=sublattice_results,
qelectron_db=qelectron_db,
)
3 changes: 0 additions & 3 deletions covalent/_shared_files/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,6 @@ def get_default_sdk_config():
+ "/covalent/dispatches"
),
"task_packing": "true" if os.environ.get("COVALENT_ENABLE_TASK_PACKING") else "false",
"multistage_dispatch": (
"false" if os.environ.get("COVALENT_DISABLE_MULTISTAGE_DISPATCH") == "1" else "true"
),
"results_dir": os.environ.get(
"COVALENT_RESULTS_DIR"
) # COVALENT_RESULTS_DIR is where the client downloads workflow artifacts during get_result() which is different from COVALENT_DATA_DIR
Expand Down
17 changes: 5 additions & 12 deletions covalent/triggers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@
# limitations under the License.


import asyncio
import json
from abc import abstractmethod

import requests
Expand Down Expand Up @@ -108,17 +106,12 @@ def _get_status(self) -> Status:
"""

if self.use_internal_funcs:
from covalent_dispatcher._service.app import export_result
from covalent_dispatcher._service.app import get_dispatches_bulk

response = asyncio.run_coroutine_threadsafe(
export_result(self.lattice_dispatch_id, status_only=True),
self.event_loop,
).result()

if isinstance(response, dict):
return response["status"]

return json.loads(response.body.decode()).get("status")
response = get_dispatches_bulk(
dispatch_id=[self.lattice_dispatch_id], status_only=True
)
return response.dispatches[0].status

from .. import get_result

Expand Down
46 changes: 37 additions & 9 deletions covalent_dispatcher/_dal/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,12 @@

from __future__ import annotations

from typing import Generic, Type, TypeVar
from typing import Generic, List, Optional, Sequence, Type, TypeVar, Union

from sqlalchemy import select, update
from sqlalchemy.orm import Session, load_only
from sqlalchemy.engine import Row
from sqlalchemy.orm import Session
from sqlalchemy.sql.expression import Select, desc

from .._db import models

Expand Down Expand Up @@ -50,11 +52,16 @@ def get(
cls,
session: Session,
*,
stmt: Optional[Select] = None,
fields: list,
equality_filters: dict,
membership_filters: dict,
for_update: bool = False,
):
sort_fields: List[str] = [],
reverse: bool = True,
offset: int = 0,
max_items: Optional[int] = None,
) -> Union[Sequence[Row], Sequence[T]]:
"""Bulk ORM-enabled SELECT.
Args:
Expand All @@ -64,19 +71,40 @@ def get(
membership_filters: Dict{field_name: value_list}
for_update: Whether to lock the selected rows
Returns:
A list of SQLAlchemy Rows or whole ORM entities depending
on whether only a subset of fields is specified.
"""
stmt = select(cls.model)
if stmt is None:
if len(fields) > 0:
entities = [getattr(cls.model, attr) for attr in fields]
stmt = select(*entities)
else:
stmt = select(cls.model)

for attr, val in equality_filters.items():
stmt = stmt.where(getattr(cls.model, attr) == val)
for attr, vals in membership_filters.items():
stmt = stmt.where(getattr(cls.model, attr).in_(vals))
if len(fields) > 0:
attrs = [getattr(cls.model, f) for f in fields]
stmt = stmt.options(load_only(*attrs))
if for_update:
stmt = stmt.with_for_update()

return session.scalars(stmt).all()
for attr in sort_fields:
if reverse:
stmt = stmt.order_by(desc(getattr(cls.model, attr)))
else:
stmt = stmt.order_by(getattr(cls.model, attr))

stmt = stmt.offset(offset)
if max_items:
stmt = stmt.limit(max_items)

if len(fields) == 0:
# Return whole ORM entities
return session.scalars(stmt).all()
else:
# Return a named tuple containing the selected cols
return session.execute(stmt).all()

@classmethod
def get_by_primary_key(
Expand Down
Loading

0 comments on commit dd00785

Please sign in to comment.