API: redesign implementation of get_result(wait=True)

The previous `GET /dispatches/{dispatch_id}` endpoint was trying to do
too much. Its responsibilities are now separated into two endpoints
(both sketched below):

* `GET /dispatches`: bulk-query dispatch summaries (including status),
with options to filter by `dispatch_id`, sort chronologically, or
limit the output to status only.

* `GET /dispatches/{dispatch_id}`: download manifest
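
A minimal sketch of the two calls with `requests`; the paths and the
`dispatch_id`/`status_only` query parameters come from the diff below,
while the server address and dispatch id are placeholders:

```python
import requests

# Assumed local dispatcher address; replace with your server URL.
BASE = "http://localhost:48008/api/v2/dispatches"

# Endpoint 1: bulk status query, filtered to a single dispatch.
resp = requests.get(BASE, params={"dispatch_id": "my-dispatch-id", "status_only": True})
resp.raise_for_status()
print(resp.json()["dispatches"])  # e.g. [{"dispatch_id": "...", "status": "RUNNING"}]

# Endpoint 2: download the full manifest for one dispatch.
manifest = requests.get(f"{BASE}/my-dispatch-id").json()
```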

To achieve the desired behavior of `get_result(id, wait=True)`, the
client

1. Polls the dispatch status by querying the first endpoint.

2. Downloads the manifest after the dispatch has reached a final
status (sketched below).
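
A rough end-to-end sketch of this wait-then-download flow, assuming a
local dispatcher at a default address; the terminal-status set is a
stand-in for the `RESULT_STATUS.is_terminal` check used in the new
client code:

```python
import time

import requests

BASE = "http://localhost:48008/api/v2/dispatches"  # assumed local server address
DISPATCH_ID = "my-dispatch-id"                     # placeholder id
TERMINAL = {"COMPLETED", "FAILED", "CANCELLED"}    # stand-in for RESULT_STATUS.is_terminal

# 1. Poll the bulk endpoint until the dispatch reaches a final status.
while True:
    resp = requests.get(BASE, params={"dispatch_id": DISPATCH_ID, "status_only": True})
    resp.raise_for_status()
    status = resp.json()["dispatches"][0]["status"]
    if status in TERMINAL:
        break
    time.sleep(1)  # same 1 s poll interval as the new client code

# 2. Download the manifest only after the workflow has finished.
manifest = requests.get(f"{BASE}/{DISPATCH_ID}").json()
```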

The server no longer returns 503 errors when the dispatch is not yet
"ready". A 503 status code is not accurate here because it is meant to
convey temporary service unavailability caused by server overload or
rate limiting, whereas a workflow that is still running does not
indicate any fault on the server's side.

These changes allow `get_result(dispatch_id, wait=True)` to wait as
long as required instead of erroring out after a fixed number of
retries.

Supporting improvements:

* DAL: add sorting and pagination to `Controller`.

* DAL: improve bulk gets that retrieve only a subset of columns by
selecting the specified columns directly instead of loading whole ORM
entities and deferring column loading with `load_only` (sketched
below).
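
As a rough illustration of that second change, here is a self-contained
SQLAlchemy sketch using a stand-in model rather than the project's real
`models.Lattice`:

```python
from sqlalchemy import Column, Integer, String, create_engine, select
from sqlalchemy.orm import Session, declarative_base, load_only

Base = declarative_base()


class Lattice(Base):  # stand-in for models.Lattice
    __tablename__ = "lattices"
    id = Column(Integer, primary_key=True)
    dispatch_id = Column(String)
    status = Column(String)


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)

with Session(engine) as session:
    # Before: load whole ORM entities, deferring the unselected columns.
    entities = session.scalars(
        select(Lattice).options(load_only(Lattice.dispatch_id, Lattice.status))
    ).all()

    # After: select only the requested columns; each result is a lightweight Row.
    rows = session.execute(select(Lattice.dispatch_id, Lattice.status)).all()
```

Selecting columns directly avoids materializing full ORM objects, which
is why `Controller.get` now returns `Row` tuples when a field subset is
requested.
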
cjao committed May 21, 2024
1 parent 5486a14 commit 5f4adfb
Showing 11 changed files with 350 additions and 204 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [UNRELEASED]

### Fixed

- `get_result(wait=True)` will wait as long as needed

## [0.234.1-rc.0] - 2024-05-10

### Authors
100 changes: 51 additions & 49 deletions covalent/_results_manager/results_manager.py
@@ -19,12 +19,11 @@

import contextlib
import os
import time
from pathlib import Path
from typing import Dict, List, Optional
from typing import List, Optional

from furl import furl
from requests.adapters import HTTPAdapter
from urllib3.util import Retry

from .._api.apiclient import CovalentAPIClient
from .._serialize.common import load_asset
@@ -40,9 +39,9 @@
from .._shared_files.exceptions import MissingLatticeRecordError
from .._shared_files.schemas.asset import AssetSchema
from .._shared_files.schemas.result import ResultSchema
from .._shared_files.util_classes import RESULT_STATUS, Status
from .._shared_files.utils import copy_file_locally, format_server_url
from .result import Result
from .wait import EXTREME

app_log = logger.app_log
log_stack_info = logger.log_stack_info
@@ -139,12 +138,20 @@ def cancel(dispatch_id: str, task_ids: List[int] = None, dispatcher_addr: str =
# Multi-part


def _query_dispatch_status(dispatch_id: str, api_client: CovalentAPIClient):
endpoint = "/api/v2/dispatches"
resp = api_client.get(endpoint, params={"dispatch_id": dispatch_id, "status_only": True})
resp.raise_for_status()
dispatches = resp.json()["dispatches"]
if len(dispatches) == 0:
raise MissingLatticeRecordError

return dispatches[0]["status"]


def _get_result_export_from_dispatcher(
dispatch_id: str,
wait: bool = False,
status_only: bool = False,
dispatcher_addr: str = None,
) -> Dict:
dispatch_id: str, api_client: CovalentAPIClient
) -> ResultSchema:
"""
Internal function to get the results of a dispatch from the server without checking if it is ready to read.
@@ -161,24 +168,21 @@ def _get_result_export_from_dispatcher(
MissingLatticeRecordError: If the result is not found.
"""

if dispatcher_addr is None:
dispatcher_addr = format_server_url()
# if dispatcher_addr is None:
# dispatcher_addr = format_server_url()

retries = int(EXTREME) if wait else 5
# retries = int(EXTREME) if wait else 5

adapter = HTTPAdapter(max_retries=Retry(total=retries, backoff_factor=1))
api_client = CovalentAPIClient(dispatcher_addr, adapter=adapter, auto_raise=False)
# adapter = HTTPAdapter(max_retries=Retry(total=retries, backoff_factor=1))
# api_client = CovalentAPIClient(dispatcher_addr, adapter=adapter, auto_raise=False)

endpoint = f"/api/v2/dispatches/{dispatch_id}"
response = api_client.get(
endpoint,
params={"wait": wait, "status_only": status_only},
)
response = api_client.get(endpoint)
if response.status_code == 404:
raise MissingLatticeRecordError
response.raise_for_status()
export = response.json()
return export
return ResultSchema.model_validate(export)


# Function to download default assets
@@ -346,11 +350,17 @@ def from_dispatch_id(
wait: bool = False,
dispatcher_addr: str = None,
) -> "ResultManager":
export = _get_result_export_from_dispatcher(
dispatch_id, wait, status_only=False, dispatcher_addr=dispatcher_addr
)
if dispatcher_addr is None:
dispatcher_addr = format_server_url()

manifest = ResultSchema.model_validate(export["result_export"])
api_client = CovalentAPIClient(dispatcher_addr)
if wait:
status = Status(_query_dispatch_status(dispatch_id, api_client))
while not RESULT_STATUS.is_terminal(status):
time.sleep(1)
status = Status(_query_dispatch_status(dispatch_id, api_client))

manifest = _get_result_export_from_dispatcher(dispatch_id, api_client)

# sort the nodes
manifest.lattice.transport_graph.nodes.sort(key=lambda x: x.id)
@@ -408,14 +418,15 @@ def _get_result_multistage(
"""

if dispatcher_addr is None:
dispatcher_addr = format_server_url()

api_client = CovalentAPIClient(dispatcher_addr)
try:
if status_only:
return _get_result_export_from_dispatcher(
dispatch_id=dispatch_id,
wait=wait,
status_only=status_only,
dispatcher_addr=dispatcher_addr,
)
status = _query_dispatch_status(dispatch_id, api_client)
return {"id": dispatch_id, "status": status}

rm = get_result_manager(dispatch_id, results_dir, wait, dispatcher_addr)
_get_default_assets(rm)

@@ -496,23 +507,14 @@ def get_result(
The Result object from the Covalent server
"""
max_attempts = int(os.getenv("COVALENT_GET_RESULT_RETRIES", 10))
num_attempts = 0
while num_attempts < max_attempts:
try:
return _get_result_multistage(
dispatch_id=dispatch_id,
wait=wait,
dispatcher_addr=dispatcher_addr,
status_only=status_only,
results_dir=results_dir,
workflow_output=workflow_output,
intermediate_outputs=intermediate_outputs,
sublattice_results=sublattice_results,
qelectron_db=qelectron_db,
)

except RecursionError as re:
app_log.error(re)
num_attempts += 1
raise RuntimeError("Timed out waiting for result. Please retry or check dispatch.")
return _get_result_multistage(
dispatch_id=dispatch_id,
wait=wait,
dispatcher_addr=dispatcher_addr,
status_only=status_only,
results_dir=results_dir,
workflow_output=workflow_output,
intermediate_outputs=intermediate_outputs,
sublattice_results=sublattice_results,
qelectron_db=qelectron_db,
)
17 changes: 5 additions & 12 deletions covalent/triggers/base.py
@@ -15,8 +15,6 @@
# limitations under the License.


import asyncio
import json
from abc import abstractmethod

import requests
@@ -108,17 +106,12 @@ def _get_status(self) -> Status:
"""

if self.use_internal_funcs:
from covalent_dispatcher._service.app import export_result
from covalent_dispatcher._service.app import get_dispatches_bulk

response = asyncio.run_coroutine_threadsafe(
export_result(self.lattice_dispatch_id, status_only=True),
self.event_loop,
).result()

if isinstance(response, dict):
return response["status"]

return json.loads(response.body.decode()).get("status")
response = get_dispatches_bulk(
dispatch_id=[self.lattice_dispatch_id], status_only=True
)
return response.dispatches[0].status

from .. import get_result

46 changes: 37 additions & 9 deletions covalent_dispatcher/_dal/controller.py
@@ -17,10 +17,12 @@

from __future__ import annotations

from typing import Generic, Type, TypeVar
from typing import Generic, List, Optional, Sequence, Type, TypeVar, Union

from sqlalchemy import select, update
from sqlalchemy.orm import Session, load_only
from sqlalchemy.engine import Row
from sqlalchemy.orm import Session
from sqlalchemy.sql.expression import Select, desc

from .._db import models

@@ -50,11 +52,16 @@ def get(
cls,
session: Session,
*,
stmt: Optional[Select] = None,
fields: list,
equality_filters: dict,
membership_filters: dict,
for_update: bool = False,
):
sort_fields: List[str] = [],
reverse: bool = True,
offset: int = 0,
max_items: Optional[int] = None,
) -> Union[Sequence[Row], Sequence[T]]:
"""Bulk ORM-enabled SELECT.
Args:
@@ -64,19 +71,40 @@
membership_filters: Dict{field_name: value_list}
for_update: Whether to lock the selected rows
Returns:
A list of SQLAlchemy Rows or whole ORM entities depending
on whether only a subset of fields is specified.
"""
stmt = select(cls.model)
if stmt is None:
if len(fields) > 0:
entities = [getattr(cls.model, attr) for attr in fields]
stmt = select(*entities)
else:
stmt = select(cls.model)

for attr, val in equality_filters.items():
stmt = stmt.where(getattr(cls.model, attr) == val)
for attr, vals in membership_filters.items():
stmt = stmt.where(getattr(cls.model, attr).in_(vals))
if len(fields) > 0:
attrs = [getattr(cls.model, f) for f in fields]
stmt = stmt.options(load_only(*attrs))
if for_update:
stmt = stmt.with_for_update()

return session.scalars(stmt).all()
for attr in sort_fields:
if reverse:
stmt = stmt.order_by(desc(getattr(cls.model, attr)))
else:
stmt = stmt.order_by(getattr(cls.model, attr))

stmt = stmt.offset(offset)
if max_items:
stmt = stmt.limit(max_items)

if len(fields) == 0:
# Return whole ORM entities
return session.scalars(stmt).all()
else:
# Return a named tuple containing the selected cols
return session.execute(stmt).all()

@classmethod
def get_by_primary_key(
40 changes: 38 additions & 2 deletions covalent_dispatcher/_dal/result.py
@@ -21,6 +21,7 @@
from datetime import datetime
from typing import Any, Dict, List

from sqlalchemy import select
from sqlalchemy.orm import Session

from covalent._shared_files import logger
@@ -45,6 +46,41 @@
class ResultMeta(Record[models.Lattice]):
model = models.Lattice

@classmethod
def get_toplevel_dispatches(
cls,
session: Session,
*,
fields: list,
equality_filters: dict,
membership_filters: dict,
for_update: bool = False,
sort_fields: List[str] = [],
reverse: bool = True,
offset: int = 0,
max_items: int = 10,
):
if len(fields) > 0:
entities = [getattr(cls.model, attr) for attr in fields]
stmt = select(*entities)
else:
stmt = select(cls.model)

stmt = stmt.where(models.Lattice.root_dispatch_id == models.Lattice.dispatch_id)

return cls.get(
session=session,
stmt=stmt,
fields=fields,
equality_filters=equality_filters,
membership_filters=membership_filters,
for_update=for_update,
sort_fields=sort_fields,
reverse=reverse,
offset=offset,
max_items=max_items,
)


class ResultAsset(Record[models.LatticeAsset]):
model = models.LatticeAsset
@@ -175,7 +211,7 @@ def _update_dispatch(
with self.session() as session:
electron_rec = Electron.get_db_records(
session,
keys={"id", "parent_lattice_id"},
keys=ELECTRON_KEYS,
equality_filters={"id": self._electron_id},
membership_filters={},
)[0]
@@ -343,7 +379,7 @@ def _get_incomplete_nodes(self):
A dictionary {"failed": [node_ids], "cancelled": [node_ids]}
"""
with self.session() as session:
query_keys = {"parent_lattice_id", "node_id", "name", "status"}
query_keys = {"id", "parent_lattice_id", "node_id", "name", "status"}
records = Electron.get_db_records(
session,
keys=query_keys,