diff --git a/.github/workflows/docker-build-stage.yaml b/.github/workflows/docker-build-stage.yaml index 2c2d404..d4cf9f5 100644 --- a/.github/workflows/docker-build-stage.yaml +++ b/.github/workflows/docker-build-stage.yaml @@ -9,8 +9,7 @@ on: jobs: docker-build-push-dev: - # runs-on: ubuntu-latest - runs-on: spatialdays-self-hosted-runner-1 + runs-on: ubuntu-latest steps: - name: Checkout repository uses: actions/checkout@v3 @@ -32,7 +31,7 @@ jobs: with: context: . file: Dockerfile - cache-from: type=registry,ref=${{ secrets.EO_PROJ_STAGING_DOCKER_REGISTRY_URL }}/stac-accesability-scanner:cache - cache-to: type=registry,ref=${{ secrets.EO_PROJ_STAGING_DOCKER_REGISTRY_URL }}/stac-accesability-scanner:cache,mode=max + cache-from: type=registry,ref=${{ secrets.EO_PROJ_STAGING_DOCKER_REGISTRY_URL }}/stac-accessibility-scanner:cache + cache-to: type=registry,ref=${{ secrets.EO_PROJ_STAGING_DOCKER_REGISTRY_URL }}/stac-accessibility-scanner:cache,mode=max push: true - tags: ${{ secrets.EO_PROJ_STAGING_DOCKER_REGISTRY_URL }}/stac-accesability-scanner:${{ env.SHORT_SHA }} \ No newline at end of file + tags: ${{ secrets.EO_PROJ_STAGING_DOCKER_REGISTRY_URL }}/stac-accessibility-scanner:${{ env.SHORT_SHA }} \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 3210b9d..3e8effa 100644 --- a/Dockerfile +++ b/Dockerfile @@ -17,4 +17,4 @@ COPY . . EXPOSE 5000 # Run the application -CMD ["python", "server.py"] +CMD ["gunicorn", "server:app", "-b", "0.0.0.0:8000", "-w", "4"] diff --git a/README.md b/README.md new file mode 100644 index 0000000..0071d9e --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# STAC Accessibility Scanner + +Loops through publicly available STAC Catalogs to find the collections which are openly accessible. Otherwise most of the STAC Portal Frontend collections are not accessible. 
diff --git a/database.py b/database.py index 6e85124..06fffa9 100644 --- a/database.py +++ b/database.py @@ -78,7 +78,7 @@ def store_collection_in_database( .first() ) if collection_db_entry is None: - logging.debug(f"Adding {_catalog_url} {_collection_id} to the database") + logger.debug(f"Adding {_catalog_url} {_collection_id} to the database") collection_db_entry = Collection() collection_db_entry.catalog_url = _catalog_url collection_db_entry.collection_id = _collection_id @@ -91,19 +91,24 @@ def store_collection_in_database( collection_db_entry.mpc_token_obtaining_url = _mpc_token_obtaining_url session.add(collection_db_entry) session.commit() - logging.info(f"Added {_catalog_url} {_collection_id} to the database") + logger.info(f"Added {_catalog_url} {_collection_id} to the database") else: - logging.debug(f"Updating {_catalog_url} {_collection_id} in the database") + logger.debug(f"Updating {_catalog_url} {_collection_id} in the database") collection_db_entry.http_downloadable = _http_downloadable collection_db_entry.requires_token = _requires_token collection_db_entry.is_from_mpc = _is_from_mpc collection_db_entry.mpc_token_obtaining_url = _mpc_token_obtaining_url session.commit() - logging.info(f"Updated {_catalog_url} {_collection_id} in the database") + logger.info(f"Updated {_catalog_url} {_collection_id} in the database") if __name__ == '__main__': plugin_enable_statement = sa.text("CREATE EXTENSION IF NOT EXISTS postgis;") - with engine.connect() as conn: + with engine.begin() as conn: conn.execute(plugin_enable_statement) + logger.info("Enabled postgis extension") + print("Enabled postgis extension") + # commit the changes + + base.metadata.create_all(engine) diff --git a/docker-compose.yaml b/docker-compose.yaml index 69ac3db..b0e3fc6 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -1,23 +1,25 @@ version: "3" services: - # web: - # build: . 
- # ports: - # - "5005:5005" - # depends_on: - # - db - # environment: - # DATABASE_HOST: "db" - # DATABASE_PORT: "5432" - # DATABASE_NAME: "stacaccessibility_db" - # DATABASE_USER: "postgres" - # DATABASE_PASSWORD: "postgres" - # APP_HOST: "0.0.0.0" - # APP_PORT: "5005" - # APP_DEBUG: "True" - # volumes: - # - .:/app + web: + build: . + ports: + - "8000:8000" + depends_on: + - db + environment: + DATABASE_HOST: "db" + DATABASE_PORT: "5432" + DATABASE_NAME: "stacaccessibility_db" + DATABASE_USER: "postgres" + DATABASE_PASSWORD: "postgres" + APP_HOST: "0.0.0.0" + APP_PORT: "8000" + APP_DEBUG: "True" + volumes: + - .:/app + command: ["python", "server.py"] + # scrape: # build: . @@ -30,7 +32,7 @@ services: # DATABASE_USER: "postgres" # DATABASE_PASSWORD: "postgres" # APP_HOST: "0.0.0.0" - # APP_PORT: "5000" + # APP_PORT: "8000" # APP_DEBUG: "True" # command: ["python", "scrape.py"] @@ -45,7 +47,7 @@ services: DATABASE_USER: "postgres" DATABASE_PASSWORD: "postgres" APP_HOST: "0.0.0.0" - APP_PORT: "5000" + APP_PORT: "8000" APP_DEBUG: "True" command: ["python", "database.py"] @@ -58,12 +60,4 @@ services: ports: - "15432:5432" - # db: - # image: postgres:13 - # environment: - # POSTGRES_USER: "postgres" - # POSTGRES_PASSWORD: "postgres" - # POSTGRES_DB: "stacaccessibility_db" - # ports: - # - "15432:5432" diff --git a/requirements.txt b/requirements.txt index 5dbc3f9..8ded6cc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,11 +8,12 @@ click-plugins==1.1.1 cligj==0.7.2 fiona==1.9.5 Flask==3.0.0 -Flask-Cors==3.0.10 +Flask-Cors==4.0.0 Flask-SQLAlchemy==3.1.1 GeoAlchemy2==0.14.2 geopandas==0.14.0 greenlet==3.0.1 +gunicorn==21.2.0 idna==3.4 itsdangerous==2.1.2 Jinja2==3.1.2 diff --git a/scrape.py b/scrape.py index e4b56a7..1166e18 100644 --- a/scrape.py +++ b/scrape.py @@ -25,9 +25,9 @@ def find_first_downloadable_asset_key(_assets: dict) -> str: for asset_key, asset_info in _assets.items(): asset_key_href = asset_info["href"].lower() if ( - 
asset_key_href.endswith(".tif") - or asset_key_href.endswith(".tiff") - or asset_key_href.endswith(".nc") + asset_key_href.endswith(".tif") + or asset_key_href.endswith(".tiff") + or asset_key_href.endswith(".nc") ): return asset_key # If no asset with specific extensions is found, return the first asset key @@ -56,7 +56,9 @@ def check_if_stac_item_is_http_downloadable(_stac_item: dict) -> bool: return False -def check_if_stac_item_is_http_directly_downloadable_without_token(_stac_item: dict) -> bool: +def check_if_stac_item_is_http_directly_downloadable_without_token( + _stac_item: dict, +) -> bool: """ Check if a STAC item is downloadable using http without a token or some signing mechanism. @@ -98,14 +100,12 @@ def check_if_sas_token_is_present_for_collection_on_mpc(_collection_id: str) -> Returns: Tuple of (True/False, URL to obtain the SAS token) """ - logger.info( - f"Checking if collection {_collection_id} has available token" + logger.info(f"Checking if collection {_collection_id} has available token") + token_check_url = ( + f"https://planetarycomputer.microsoft.com/api/sas/v1/token/{_collection_id}" ) - token_check_url = f"https://planetarycomputer.microsoft.com/api/sas/v1/token/{_collection_id}" try: - token_check_response = safe_request( - "GET", token_check_url - ) + token_check_response = safe_request("GET", token_check_url) token_check_response.raise_for_status() if token_check_response.status_code == 200: return True, token_check_url @@ -188,19 +188,37 @@ def check_if_sas_token_is_present_for_collection_on_mpc(_collection_id: str) -> if "planetarycomputer" in results_catalog_url: is_from_mpc = True - if check_if_stac_item_is_http_downloadable(response_json["features"][0]): + if check_if_stac_item_is_http_downloadable( + response_json["features"][0] + ): http_downloadable = True if check_if_stac_item_is_http_directly_downloadable_without_token( - response_json["features"][0]): + response_json["features"][0] + ): http_downloadable = True 
requires_token = False else: if "planetarycomputer" in results_catalog_url: - token_present, token_url = check_if_sas_token_is_present_for_collection_on_mpc( - results_collection_id) + ( + token_present, + token_url, + ) = check_if_sas_token_is_present_for_collection_on_mpc( + results_collection_id + ) if token_present: mpc_token_obtaining_url = token_url + # convert shapely_multipolygon_envelope to MultiPolygon if it is not multipolygon + if not isinstance( + shapely_multipolygon_envelope, + shapely.geometry.multipolygon.MultiPolygon, + ): + shapely_multipolygon_envelope = ( + shapely.geometry.multipolygon.MultiPolygon( + [shapely_multipolygon_envelope] + ) + ) + store_collection_in_database( results_catalog_url, results_collection_id, diff --git a/server.py b/server.py index a44932f..ff4c1d5 100644 --- a/server.py +++ b/server.py @@ -6,18 +6,24 @@ import flask from flask_cors import CORS from dotenv import load_dotenv -import shapely + import geoalchemy2 as ga +from sqlalchemy import or_, and_ from database import session, Collection from urllib.parse import urljoin +from shapely import to_geojson +from shapely.geometry import shape +from flask import request +from urllib.parse import urljoin + load_dotenv() APP_HOST = os.getenv("APP_HOST", "0.0.0.0") APP_PORT = os.getenv("APP_PORT", "5000") APP_DEBUG = os.getenv("APP_DEBUG", "True") == "True" app = flask.Flask(__name__) -app = CORS(app) +CORS(app) # Create /healthz endpoint @@ -29,43 +35,61 @@ def healthz(): # Make a POST endpoint which will take catalog_url and aoi # in geojson format and filter the database for available collections @app.route("/get_collections", methods=["POST"]) +@app.route("/get_collections/", methods=["POST"]) def get_collections(): - aoi = flask.request.json.get("aoi", None) - if not aoi: - # send 400 bad request with message that aoi is required - return {"error": "aoi is required"}, 400 - - catalog_url = flask.request.json.get("catalog_url", None) - collection_id = 
flask.request.json.get("collection_id", None) - aoi_shapely = shapely.geometry.shape(aoi) - collections = session.query(Collection).filter( - ga.functions.ST_Intersects( - Collection.spatial_extent, ga.shape.from_shape(aoi_shapely, srid=4326) - ), - ) + data = request.get_json() + aoi = data.get("aoi") + public = data.get("public") + mpc_with_token = data.get("mpc_with_token") - if catalog_url: - collections = collections.filter(Collection.catalog_url == catalog_url) + aoi_shapely = shape(aoi) + collections = ( + session.query(Collection) + .filter( + ga.functions.ST_Intersects( + Collection.spatial_extent, ga.shape.from_shape(aoi_shapely, srid=4326) + ) + ) + .distinct() + ) - if collection_id: - collections = collections.filter(Collection.collection_id == collection_id) + conditions = [] + if public or mpc_with_token: + if public: + conditions.append( + and_( + Collection.http_downloadable == True, + Collection.requires_token == False, + ) + ) + if mpc_with_token: + conditions.append( + and_( + Collection.requires_token == True, + Collection.is_from_mpc == True, + bool(Collection.mpc_token_obtaining_url != ""), + ) + ) - collections = collections.all() + collections = collections.filter(or_(*conditions)) + collection_results = collections.all() - results = {} - for i in collections: - aoi_as_shapely = shapely.geometry.shape(aoi) - aoi_as_geojson = json.loads(shapely.to_geojson(aoi_as_shapely)) - results[i.collection_id] = { - "catalog_url": i.catalog_url, - "http_downloadable": i.http_downloadable, - "requires_token": i.requires_token, - "is_from_mpc": i.is_from_mpc, - "mpc_token_obtaining_url": i.mpc_token_obtaining_url, - "collection_stac_url": urljoin(i.catalog_url, f"collections/{i.collection_id}"), - "aoi": aoi_as_geojson, - } - return flask.jsonify(results), 200 + response_data = [] + for i in collection_results: + response_data.append( + { + "collection_id": i.collection_id, + "catalog_url": i.catalog_url, + "http_downloadable": i.http_downloadable, + 
"requires_token": i.requires_token, + "is_from_mpc": i.is_from_mpc, + "mpc_token_obtaining_url": i.mpc_token_obtaining_url, + "collection_stac_url": urljoin( + i.catalog_url, f"collections/{i.collection_id}" + ), + } + ) + return flask.jsonify(response_data), 200 if __name__ == "__main__":