Search S3 if granules are not Explicitly Provided #50

Merged Oct 16, 2024 · 46 commits

Commits
78314e3
search s3 bucket&prefix for granules if none provided
AndrewPlayer3 Oct 2, 2024
981d7a9
updated changelog
AndrewPlayer3 Oct 2, 2024
795b71c
filter to only s1[ab]_iw_raw when searching s3
AndrewPlayer3 Oct 2, 2024
ef0d7d1
raise error if both gslc-bucket and granules are provided
AndrewPlayer3 Oct 2, 2024
f17be66
moved s3 functions to utils
AndrewPlayer3 Oct 2, 2024
1ef9f09
added test for getting gslc uris from s3
AndrewPlayer3 Oct 2, 2024
0905b01
moved tests for s3 functions
AndrewPlayer3 Oct 2, 2024
00a0303
removed unused import
AndrewPlayer3 Oct 2, 2024
723873a
corrected changelog
AndrewPlayer3 Oct 2, 2024
762f240
better changelog
AndrewPlayer3 Oct 2, 2024
d880b47
simplify gslc s3 search interface, fix pycharm warnings
jtherrmann Oct 4, 2024
5c04c22
temp hard-code granules sub-prefix, fix a typo
jtherrmann Oct 4, 2024
c7464cc
add --use-granules-from-s3 option
jtherrmann Oct 5, 2024
8c84b23
remove unexpected kwarg
jtherrmann Oct 5, 2024
09e1531
update time_series --bounds interface to match back_projection
jtherrmann Oct 7, 2024
f06485c
finish implementing gslc prefix option
jtherrmann Oct 9, 2024
0081c55
check if bucket and bucket_prefix given in back_projection workflow
jtherrmann Oct 9, 2024
e50a7e2
newlines
jtherrmann Oct 9, 2024
6d580b4
Merge pull request #51 from ASFHyP3/jth-search-s3
jtherrmann Oct 10, 2024
a781acf
rename dockerfile for test purposes
AndrewPlayer3 Oct 11, 2024
b8dc431
pin python to <3.13 due to hyp3lib bug
jtherrmann Oct 11, 2024
36a52de
Merge branch 'search-s3' of github.com:ASFHyP3/hyp3-srg into search-s3
jtherrmann Oct 11, 2024
512de27
testing multiple docker build
AndrewPlayer3 Oct 14, 2024
a00f8bb
Merge branch 'search-s3' of https://github.com/asfhyp3/hyp3-srg into …
AndrewPlayer3 Oct 14, 2024
f3d9a3e
fixed grammar in comment
AndrewPlayer3 Oct 14, 2024
9cf7e11
fail on particular input granule
jtherrmann Oct 14, 2024
4a8d440
add todo
jtherrmann Oct 14, 2024
183bd48
pin to actions develop
jtherrmann Oct 14, 2024
6f7866d
remove failure for particular granule
jtherrmann Oct 14, 2024
ad1adc3
renamed docker files
AndrewPlayer3 Oct 15, 2024
de18cff
Merge branch 'search-s3' of https://github.com/asfhyp3/hyp3-srg into …
AndrewPlayer3 Oct 15, 2024
36a0f05
pin docker action to latest release
jtherrmann Oct 15, 2024
604dea6
update changelog, revert python pin
jtherrmann Oct 15, 2024
5ab1d46
changelog version
jtherrmann Oct 15, 2024
54f8e6b
remove old todo
jtherrmann Oct 15, 2024
180f1b5
changelog tweak
jtherrmann Oct 15, 2024
f4bb3b8
remove references to manually pushing gpu image
AndrewPlayer3 Oct 15, 2024
441b41a
Merge branch 'search-s3' of https://github.com/asfhyp3/hyp3-srg into …
AndrewPlayer3 Oct 15, 2024
aa2a427
removed unnecessary checks
AndrewPlayer3 Oct 15, 2024
4a656d5
move s3 to global scope
AndrewPlayer3 Oct 15, 2024
27af46a
simplified search s3 function and removed test
AndrewPlayer3 Oct 15, 2024
62a4f82
Update src/hyp3_srg/utils.py
AndrewPlayer3 Oct 15, 2024
76dd5e0
Merge pull request #56 from ASFHyP3/jth-search-s3-updates
jtherrmann Oct 15, 2024
c2eed41
imports
jtherrmann Oct 15, 2024
46dd802
fix actions tags
jtherrmann Oct 15, 2024
7e82cc7
python 3.13
jtherrmann Oct 15, 2024
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -6,6 +6,11 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [PEP 440](https://www.python.org/dev/peps/pep-0440/)
and uses [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

+## [0.8.1]
+
+### Changed
+* `time_series` can now search an S3 bucket for GSLCs (using `--gslc-bucket` and `--gslc-bucket-prefix`) if no GSLC granules are explicitly provided
+
## [0.8.0]

### Added
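
For context, a minimal usage sketch of what this changelog entry describes, based on the `time_series` signature further down in this diff. The bucket name, prefix, and bounds values are hypothetical placeholders, not values from the PR:

```python
from hyp3_srg.time_series import time_series

# With no granules given, the workflow searches the bucket/prefix for GSLCs.
time_series(
    granules=[],
    bounds=[-119.0, 37.0, -118.0, 38.0],  # hypothetical bounding box
    gslc_bucket='my-gslc-bucket',         # hypothetical bucket
    gslc_bucket_prefix='my-gslcs',        # hypothetical prefix
)
```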
58 changes: 29 additions & 29 deletions src/hyp3_srg/time_series.py
@@ -4,56 +4,42 @@

import argparse
import logging
+import re
import shutil
from os import mkdir
from pathlib import Path
from secrets import token_hex
from shutil import copyfile
from typing import Iterable, Optional

-from boto3 import client
from hyp3lib.aws import upload_file_to_s3
from hyp3lib.fetch import download_file as download_from_http

from hyp3_srg import dem, utils


-S3 = client('s3')
log = logging.getLogger(__name__)


-def get_s3_args(uri: str, dest_dir: Optional[Path] = None) -> None:
-    """Retrieve the arguments for downloading from an S3 bucket
+def get_gslc_uris_from_s3(bucket: str, prefix: str = '') -> list[str]:
+    """Retrieve granule (zip files) uris from the given s3 bucket and prefix.

    Args:
-        uri: URI of the file to download
-        dest_dir: the directory to place the downloaded file in
+        bucket: the s3 bucket name
+        prefix: the path after the bucket and before the file

    Returns:
-        bucket: the s3 bucket to download from
-        key: the path to the file following the s3 bucket
-        out_path: the destination path of the file to download
+        uris: a list of uris to the zip files
    """
-    if dest_dir is None:
-        dest_dir = Path.cwd()
-
-    simple_s3_uri = Path(uri.replace('s3://', ''))
-    bucket = simple_s3_uri.parts[0]
-    key = '/'.join(simple_s3_uri.parts[1:])
-    out_path = dest_dir / simple_s3_uri.parts[-1]
-    return bucket, key, out_path
-
-
-def download_from_s3(uri: str, dest_dir: Optional[Path] = None) -> None:
-    """Download a file from an S3 bucket
-
-    Args:
-        uri: URI of the file to download
-        dest_dir: the directory to place the downloaded file in
-    """
-    bucket, key, out_path = get_s3_args(uri, dest_dir)
-    S3.download_file(bucket, key, out_path)
-    return out_path
+    bucket = bucket.replace('s3:', '').replace('/', '')
+    res = utils.s3_list_objects(bucket, prefix)
+
+    def is_valid_key(key):
+        return (key.endswith('.zip') or key.endswith('.geo')) and re.search('S1[AB]_IW_RAW', key.split('/')[-1])
+
+    keys = [item['Key'] for item in res['Contents'] if is_valid_key(item['Key'])]
+    uris = ['/'.join(['s3://' + bucket, key]) for key in keys]
+    return uris


@@ -74,7 +60,7 @@ def load_products(uris: Iterable[str], overwrite: bool = False):
if product_exists and not overwrite:
pass
elif uri.startswith('s3'):
-        download_from_s3(uri, dest_dir=work_dir)
+        utils.download_from_s3(uri, dest_dir=work_dir)
elif uri.startswith('http'):
download_from_http(uri, directory=work_dir)
elif len(Path(uri).parts) > 1:
@@ -299,6 +285,8 @@ def time_series(
bounds: list[float],
bucket: str = None,
bucket_prefix: str = '',
+    gslc_bucket: str = None,
+    gslc_bucket_prefix: str = '',
work_dir: Optional[Path] = None,
) -> None:
"""Create and package a time series stack from a set of Sentinel-1 GSLCs.
@@ -308,6 +296,8 @@
bounds: bounding box that was used to generate the GSLCs
bucket: AWS S3 bucket for uploading the final product(s)
bucket_prefix: Add a bucket prefix to the product(s)
+        gslc_bucket: AWS S3 bucket containing GSLCs for time-series processing
+        gslc_bucket_prefix: Path to GSLCs within gslc_bucket.
work_dir: Working directory for processing
"""
if work_dir is None:
@@ -316,6 +306,14 @@
if not sbas_dir.exists():
mkdir(sbas_dir)

+    if granules and gslc_bucket:
+        raise ValueError('One of a list of granules or a s3 bucket must be provided, but got both.')
+
+    if granules == []:
+        if gslc_bucket is None:
+            raise ValueError('Either a list of granules or a s3 bucket must be provided, but got neither.')
+        granules = get_gslc_uris_from_s3(gslc_bucket, gslc_bucket_prefix)

granule_names = load_products(granules)
dem_path = dem.download_dem_for_srg(bounds, work_dir)

@@ -345,7 +343,9 @@ def main():
)
parser.add_argument('--bucket', help='AWS S3 bucket HyP3 for upload the final product(s)')
parser.add_argument('--bucket-prefix', default='', help='Add a bucket prefix to product(s)')
-    parser.add_argument('granules', type=str.split, nargs='+', help='GSLC granules.')
+    parser.add_argument('--gslc-bucket', help='AWS S3 bucket containing GSLCs to process')
+    parser.add_argument('--gslc-bucket-prefix', default='', help='Path to GSLCs within gslc-bucket.')
+    parser.add_argument('granules', type=str.split, nargs='*', default='', help='GSLC granules.')
args = parser.parse_args()
args.granules = [item for sublist in args.granules for item in sublist]
time_series(**args.__dict__)
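
A hedged sketch of the new search path, using only `get_gslc_uris_from_s3` as shown above; the bucket and prefix names are hypothetical:

```python
from hyp3_srg.time_series import get_gslc_uris_from_s3

# Lists the bucket at the given prefix and keeps only .zip/.geo keys
# whose filename matches S1[AB]_IW_RAW.
uris = get_gslc_uris_from_s3('s3://my-gslc-bucket/', prefix='my-prefix')

# The function strips 's3:' and every '/' from the bucket argument, so a
# plain bucket name and the full 's3://...' form resolve identically:
assert uris == get_gslc_uris_from_s3('my-gslc-bucket', prefix='my-prefix')
```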
52 changes: 52 additions & 0 deletions src/hyp3_srg/utils.py
@@ -8,6 +8,7 @@
from zipfile import ZipFile

import asf_search
+from boto3 import client
from s1_orbits import fetch_for_scene
from shapely.geometry import Polygon, shape

@@ -227,3 +228,54 @@ def how_many_gpus():
(param, err) = proc.communicate()
ngpus = int(str(param, 'UTF-8').split()[0])
return ngpus


+def get_s3_args(uri: str, dest_dir: Optional[Path] = None) -> None:
+    """Retrieve the arguments for downloading from an S3 bucket
+
+    Args:
+        uri: URI of the file to download
+        dest_dir: the directory to place the downloaded file in
+
+    Returns:
+        bucket: the s3 bucket to download from
+        key: the path to the file following the s3 bucket
+        out_path: the destination path of the file to download
+    """
+    if dest_dir is None:
+        dest_dir = Path.cwd()
+
+    simple_s3_uri = Path(uri.replace('s3://', ''))
+    bucket = simple_s3_uri.parts[0]
+    key = '/'.join(simple_s3_uri.parts[1:])
+    out_path = dest_dir / simple_s3_uri.parts[-1]
+    return bucket, key, out_path
+
+
+def s3_list_objects(bucket: str, prefix: str = '') -> dict:
+    """List objects in bucket at prefix
+
+    Args:
+        bucket: the simple s3 bucket name
+        prefix: the path within the bucket to search
+
+    Returns:
+        res: dictionary containing the response
+    """
+    S3 = client('s3')
+    bucket = bucket.replace('s3:', '').replace('/', '')
+    res = S3.list_objects(Bucket=bucket, Prefix=prefix)
+    return res
+
+
+def download_from_s3(uri: str, dest_dir: Optional[Path] = None) -> None:
+    """Download a file from an S3 bucket
+
+    Args:
+        uri: URI of the file to download
+        dest_dir: the directory to place the downloaded file in
+    """
+    S3 = client('s3')
+    bucket, key, out_path = get_s3_args(uri, dest_dir)
+    S3.download_file(bucket, key, out_path)
+    return out_path
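
A short usage sketch of the helpers relocated into `utils.py` above; the URI and destination directory are hypothetical:

```python
from pathlib import Path

from hyp3_srg import utils

# get_s3_args only parses the URI; no network call happens here.
bucket, key, out_path = utils.get_s3_args('s3://foo/bing/bong/bar.zip', Path('output'))
# -> ('foo', 'bing/bong/bar.zip', Path('output/bar.zip'))

# download_from_s3 then feeds these to boto3's download_file. Worth noting:
# s3_list_objects wraps boto3's list_objects, which returns at most 1000
# keys per response, so a very large prefix would need pagination
# (a boto3 detail, not something this PR addresses).
```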
44 changes: 36 additions & 8 deletions tests/test_time_series.py
@@ -1,6 +1,6 @@
from pathlib import Path
from unittest import mock

-from hyp3_srg import time_series
+from hyp3_srg import time_series, utils


def test_create_time_series_product_name():
@@ -43,9 +43,37 @@ def test_get_size_from_dem(tmp_path):
assert dem_width, dem_height == (1235, 873)


-def test_get_s3_args():
-    s3_uri_1 = 's3://foo/bar.zip'
-    s3_uri_2 = 's3://foo/bing/bong/bar.zip'
-    dest_dir = Path('output')
-    assert time_series.get_s3_args(s3_uri_1) == ('foo', 'bar.zip', Path.cwd() / "bar.zip")
-    assert time_series.get_s3_args(s3_uri_2, dest_dir) == ('foo', 'bing/bong/bar.zip', dest_dir / 'bar.zip')
+def test_get_gslc_uris_from_s3(monkeypatch):
+    bucket = 'bucket'
+    prefix = 'prefix'
+
+    mock_response = {
+        'Contents': [
+            {'Key': f'{prefix}/S1A_IW_RAW_foo.zip'},
+            {'Key': f'{prefix}/prefibad_key.zip'},
+            {'Key': f'{prefix}/S1A_IW_RAW_foo.bad_extension'},
+            {'Key': f'{prefix}/S1B_IW_RAW_bar.geo'},
+        ]
+    }
+
+    correct_uris = [
+        f's3://{bucket}/{prefix}/S1A_IW_RAW_foo.zip',
+        f's3://{bucket}/{prefix}/S1B_IW_RAW_bar.geo',
+    ]
+
+    with monkeypatch.context() as m:
+        mock_s3_list_objects = mock.Mock(return_value=mock_response)
+        m.setattr(utils, 's3_list_objects', mock_s3_list_objects)
+
+        uris = time_series.get_gslc_uris_from_s3(bucket, prefix)
+        assert uris == correct_uris
+        uris = time_series.get_gslc_uris_from_s3(f's3://{bucket}/', prefix)
+        assert uris == correct_uris
8 changes: 8 additions & 0 deletions tests/test_utils.py
@@ -85,3 +85,11 @@ def test_call_stanford_module(monkeypatch):
m.setenv('PROC_HOME', '.')
utils.call_stanford_module('foo/bar.py', ['arg1', 'arg2'])
mock_run.assert_called_once_with([Path('foo/bar.py'), 'arg1', 'arg2'], cwd=Path.cwd(), check=True)


+def test_get_s3_args():
+    s3_uri_1 = 's3://foo/bar.zip'
+    s3_uri_2 = 's3://foo/bing/bong/bar.zip'
+    dest_dir = Path('output')
+    assert utils.get_s3_args(s3_uri_1) == ('foo', 'bar.zip', Path.cwd() / "bar.zip")
+    assert utils.get_s3_args(s3_uri_2, dest_dir) == ('foo', 'bing/bong/bar.zip', dest_dir / 'bar.zip')