Make GCS client more robust (#74)
epwalsh authored Oct 31, 2024
1 parent 3fe59b6 commit 55d261e
Showing 2 changed files with 25 additions and 3 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
```diff
@@ -15,6 +15,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Removed `flash-attn` from the Beaker images since `flash-attn` currently can't be built for torch 2.5.1. We are waiting on updates from the `flash-attn` maintainers. See https://github.com/Dao-AILab/flash-attention/issues/1302.
 
+### Fixed
+
+- Made GCS client more robust by automatically retrying timeout errors for most operations.
+
 ## [v1.5.0](https://github.com/allenai/OLMo-core/releases/tag/v1.5.0) - 2024-10-23
 
 ### Added
```
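A note on what the retry parameters in this commit mean in practice: `google-api-core` backs off exponentially between attempts, doubling the delay each time up to `maximum` and giving up once `timeout` seconds have elapsed. A minimal sketch of the resulting schedule under the parameters from the io.py diff below, ignoring the random jitter the library applies:

```python
# Sketch (not from the commit): the approximate backoff envelope of
# Retry(initial=1.0, maximum=10.0, multiplier=2.0, timeout=600.0).
delay, elapsed, schedule = 1.0, 0.0, []
while elapsed < 600.0 and len(schedule) < 8:  # show the first 8 waits
    schedule.append(delay)
    elapsed += delay
    delay = min(delay * 2.0, 10.0)  # double the delay, capped at `maximum`

print(schedule)  # [1.0, 2.0, 4.0, 8.0, 10.0, 10.0, 10.0, 10.0]
```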
24 changes: 21 additions & 3 deletions src/olmo_core/io.py
```diff
@@ -13,6 +13,7 @@
 except ImportError:
     from functools import lru_cache as cache
 
+import requests
 import torch
 from cached_path import cached_path
 from cached_path.schemes import S3Client, SchemeClient, add_scheme_client
@@ -409,14 +410,28 @@ def _get_gcs_client():
     return gcs.Client()
 
 
+def _gcs_is_retriable(exc: Exception) -> bool:
+    from google.api_core.retry import if_transient_error
+
+    return if_transient_error(exc) or isinstance(exc, requests.exceptions.Timeout)
+
+
+def _get_gcs_retry():
+    from google.api_core.retry import Retry
+
+    return Retry(
+        predicate=_gcs_is_retriable, initial=1.0, maximum=10.0, multiplier=2.0, timeout=600.0
+    )
+
+
 def _gcs_file_size(bucket_name: str, key: str) -> int:
     from google.api_core.exceptions import NotFound
 
     storage_client = _get_gcs_client()
     bucket = storage_client.bucket(bucket_name)
     blob = bucket.blob(key)
     try:
-        blob.reload()
+        blob.reload(retry=_get_gcs_retry())
     except NotFound:
         raise FileNotFoundError(f"gs://{bucket_name}/{key}")
     assert blob.size is not None
@@ -433,7 +448,9 @@ def _gcs_get_bytes_range(bucket_name: str, key: str, bytes_start: int, num_bytes
         blob.reload()
     except NotFound:
         raise FileNotFoundError(f"gs://{bucket_name}/{key}")
-    return blob.download_as_bytes(start=bytes_start, end=bytes_start + num_bytes - 1)
+    return blob.download_as_bytes(
+        start=bytes_start, end=bytes_start + num_bytes - 1, retry=_get_gcs_retry()
+    )
 
 
 def _gcs_upload(source: Path, bucket_name: str, key: str, save_overwrite: bool = False):
@@ -466,6 +483,7 @@ def _gcs_list_directory(bucket_name: str, prefix: str) -> Generator[str, None, N
             prefix=prefix,
             delimiter="/",
             match_glob=match_glob,
+            retry=_get_gcs_retry(),
         )
     except NotFound:
         raise FileNotFoundError(f"gs://{bucket_name}/{prefix}")
@@ -488,7 +506,7 @@ def _gcs_clear_directory(bucket_name: str, prefix: str):
 
     try:
         bucket = storage_client.bucket(bucket_name)
-        blobs = bucket.list_blobs(prefix=prefix)
+        blobs = bucket.list_blobs(prefix=prefix, retry=_get_gcs_retry())
         for blob in blobs:
             bucket.delete_blob(blob.name)
     except NotFound:
```
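For illustration only, not part of the commit: a `Retry` object like the one `_get_gcs_retry()` returns can also be used as a decorator, giving any callable the same retry-on-timeout behavior the diff wires into the blob operations. `flaky_blob_op` below is a hypothetical stand-in for a GCS call.

```python
import requests
from google.api_core.retry import Retry, if_transient_error


def _gcs_is_retriable(exc: Exception) -> bool:
    # Same predicate as the diff: Google's transient API errors
    # (429, 500, 503, ...) plus plain HTTP timeouts from `requests`.
    return if_transient_error(exc) or isinstance(exc, requests.exceptions.Timeout)


retry = Retry(
    predicate=_gcs_is_retriable, initial=1.0, maximum=10.0, multiplier=2.0, timeout=600.0
)


@retry
def flaky_blob_op():
    # Stand-in for any GCS operation; raising a retriable error here
    # triggers backoff-and-retry instead of failing immediately.
    raise requests.exceptions.Timeout("simulated timeout")

# Calling flaky_blob_op() would now retry with exponential backoff until
# the 600 s deadline, then raise google.api_core.exceptions.RetryError.
```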
