Development #430

Open
wants to merge 12 commits into master
14 changes: 7 additions & 7 deletions .github/workflows/build_and_test.yaml
@@ -11,23 +11,23 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
- python-version: [3.9]
+ python-version: ["3.10"]
steps:
- name: Checkout
- uses: actions/checkout@v3
+ uses: actions/checkout@v4

- name: Start containers
run: |
mv .env.sample .env
- docker-compose -f "docker-compose.production.yaml" up -d --build --quiet-pull
+ docker compose -f "docker-compose.production.yaml" up -d --build --quiet-pull

- name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v2
+ uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}

- - name: Install ERDDAP harvester and db loader
-   run: pip install -q ./harvester ./db-loader
+ - name: Install Python environment
+   run: pip install .

- name: Run ERDDAP harvester on sample dataset
run: python -m cde_harvester --urls https://data.cioospacific.ca/erddap/ --dataset_ids DFO_MEDS_BUOYS --folder erddap_harvest
@@ -54,4 +54,4 @@ jobs:

- name: Stop containers
if: always()
- run: docker-compose -f "docker-compose.production.yaml" down
+ run: docker compose -f "docker-compose.production.yaml" down
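
A note on the `docker-compose` to `docker compose` change (my reading of the intent, not stated in the PR): Compose v2 ships as a plugin of the Docker CLI, so the hyphenated standalone binary is replaced by a `docker` subcommand. A quick sanity check on a runner or workstation:

```bash
# Fails if only the legacy v1 binary (docker-compose) is installed
docker compose version
```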
6 changes: 3 additions & 3 deletions README.md
@@ -7,9 +7,9 @@
If you just want to see how a dataset is harvested by CDE:

1. Start your Python environment, `conda create -n cde python=3.10;conda activate cde`
- 1. `pip install -e ./harvester`
- 1. `python -m cde_harvester --urls https://data.cioospacific.ca/erddap --dataset_ids ECCC_MSC_BUOYS`
- 1. See files in `harvest` folder
+ 2. `pip install -e .`
+ 3. `python -m cde_harvester --urls https://data.cioospacific.ca/erddap --dataset_ids ECCC_MSC_BUOYS`
+ 4. See files in `harvest` folder

## Starting using docker

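For reference, the updated quick-start collapses to the following shell session (a sketch assuming conda and the sample dataset id from the README):

```bash
# Create and activate an environment with the Python version the README pins
conda create -n cde python=3.10
conda activate cde

# One editable install from the repo root now replaces `pip install -e ./harvester`
pip install -e .

# Harvest a single sample dataset from the CIOOS Pacific ERDDAP
python -m cde_harvester --urls https://data.cioospacific.ca/erddap --dataset_ids ECCC_MSC_BUOYS

# Output files appear in the harvest folder
ls harvest
```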
57 changes: 40 additions & 17 deletions db-loader/cde_db_loader/__main__.py
@@ -6,13 +6,31 @@

import numpy as np
import pandas as pd
+ import sentry_sdk
from cde_harvester.utils import df_cde_eov_to_standard_name
from dotenv import load_dotenv
+ from sentry_sdk.integrations.logging import LoggingIntegration
from sqlalchemy import create_engine
from sqlalchemy.dialects.postgresql import ARRAY, INTEGER, TEXT

logging.getLogger("urllib3").setLevel(logging.WARNING)

logging.basicConfig(
level=logging.DEBUG, format="%(asctime)s - %(levelname)-8s - %(name)s : %(message)s"
)
logger = logging.getLogger()

sentry_sdk.init(
dsn=os.environ.get("SENTRY_DSN"),
integrations=[
LoggingIntegration(
level=logging.INFO, # Capture info and above as breadcrumbs
event_level=logging.WARNING, # Send records as events
),
],
environment=os.environ.get("ENVIRONMENT", "development"),
)


def main(folder):
# setup database connection
@@ -26,13 +44,15 @@ def main(folder):
engine = create_engine(database_link)
# test connection
engine.connect()
print("Connected to ", envs["DB_HOST_EXTERNAL"])
logger.info("Connected to %s", envs["DB_HOST_EXTERNAL"])

datasets_file = f"{folder}/datasets.csv"
profiles_file = f"{folder}/profiles.csv"
skipped_datasets_file = f"{folder}/skipped.csv"

print("Reading", datasets_file, profiles_file, skipped_datasets_file)
logger.info(
"Reading %s,%s, %s", datasets_file, profiles_file, skipped_datasets_file
)

# ckan_file = f"ckan_{uuid_suffix}.csv"

@@ -47,22 +67,22 @@ def main(folder):
)

if datasets.empty:
print("No datasets found")
logger.info("No datasets found")
sys.exit(1)

# this gets a list of all the standard names

schema = "cde"
with engine.begin() as transaction:
print("Writing to DB:")
logger.info("Writing to DB:")

print("Dropping constraints")
logger.info("Dropping constraints")
transaction.execute("SELECT drop_constraints();")

print("Clearing tables")
logger.info("Clearing tables")
transaction.execute("SELECT remove_all_data();")

print("Writing datasets")
logger.info("Writing datasets")

datasets.to_sql(
"datasets",
@@ -80,7 +100,7 @@

profiles = profiles.replace("", np.NaN)

print("Writing profiles")
logger.info("Writing profiles")

# profiles has some columns to fix up first
profiles.to_sql(
@@ -92,7 +112,7 @@
# method="multi",
)

print("Writing skipped_datasets")
logger.info("Writing skipped_datasets")
skipped_datasets.to_sql(
"skipped_datasets",
con=transaction,
@@ -101,20 +121,20 @@
index=False,
)

print("Processing new records")
logger.info("Processing new records")
transaction.execute("SELECT profile_process();")
transaction.execute("SELECT ckan_process();")

print("Creating hexes")
logger.info("Creating hexes")
transaction.execute("SELECT create_hexes();")

# This ensures that all fields were set successfully
print("Setting constraints")
logger.info("Setting constraints")
transaction.execute("SELECT set_constraints();")

print("Wrote to db:", f"{schema}.datasets")
print("Wrote to db:", f"{schema}.profiles")
print("Wrote to db:", f"{schema}.skipped_datasets")
logger.info("Wrote to db: %s", f"{schema}.datasets")
logger.info("Wrote to db: %s", f"{schema}.profiles")
logger.info("Wrote to db: %s", f"{schema}.skipped_datasets")


if __name__ == "__main__":
@@ -127,5 +147,8 @@ def main(folder):
)

args = parser.parse_args()

- main(args.folder)
+ try:
+     main(args.folder)
+ except Exception:
+     logger.error("Failed to write to db", exc_info=True)
+     sys.exit(1)
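
The net effect of the new error handling: records at INFO and above become Sentry breadcrumbs, records at WARNING and above are sent to Sentry as events, and a crash in `main()` is both logged with its traceback and reported before the non-zero exit. A minimal standalone sketch of the same pattern (assuming `SENTRY_DSN` is set in the environment, as the diff does):

```python
import logging
import os
import sys

import sentry_sdk
from sentry_sdk.integrations.logging import LoggingIntegration

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger()

sentry_sdk.init(
    dsn=os.environ.get("SENTRY_DSN"),  # unset DSN leaves the SDK disabled
    integrations=[
        LoggingIntegration(
            level=logging.INFO,  # INFO and above recorded as breadcrumbs
            event_level=logging.WARNING,  # WARNING and above sent as events
        ),
    ],
)

try:
    logger.info("starting load")  # breadcrumb only, no Sentry event
    raise RuntimeError("simulated loader failure")
except Exception:
    # Captured by Sentry as an event (with traceback) and logged locally
    logger.error("Failed to write to db", exc_info=True)
    sys.exit(1)
```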
1 change: 1 addition & 0 deletions db-loader/setup.py
@@ -14,5 +14,6 @@
"psycopg2-binary",
"python-dotenv",
"numpy",
"sentry_sdk",
],
)
5 changes: 3 additions & 2 deletions download_scheduler/Dockerfile
@@ -1,4 +1,4 @@
- FROM python:3.9-slim-buster
+ FROM python:3.10.2

# for shapely, pdf creation
RUN apt-get -y -qq update
@@ -14,7 +14,8 @@ WORKDIR /usr/src/app

COPY . .

- RUN pip install -q ./downloader ./download_scheduler ./harvester
+ RUN pip install --upgrade pip
+ RUN pip install -q .

# -u seems to be needed to get it to print to stdout
CMD [ "python", "-u","-m","download_scheduler" ]
13 changes: 9 additions & 4 deletions harvester/Dockerfile
@@ -2,9 +2,14 @@ FROM python:3.10.2

WORKDIR /usr/src/app

- COPY ./harvester ./harvester
- COPY ./db-loader ./db-loader
- RUN pip install -q ./db-loader ./harvester
+ COPY harvester /usr/src/app/harvester
+ COPY downloader /usr/src/app/downloader
+ COPY db-loader /usr/src/app/db-loader
+ COPY pyproject.toml /usr/src/app/pyproject.toml
+ COPY poetry.lock /usr/src/app/poetry.lock
+ COPY README.md /usr/src/app/README.md
+
+ RUN pip install --upgrade pip
+ RUN pip install -q .

CMD [ "sh","./harvester/run.sh" ]
7 changes: 4 additions & 3 deletions harvester/cde_harvester/platform_ioos_to_l06.py
@@ -26,14 +26,15 @@ def get_l06_codes_and_labels():

for platform in platforms:
# first entry describes the vocabulary, skip it
if not "identifier" in platform:
if not "dce:identifier" in platform:
continue

label = platform["prefLabel"]["@value"]
broader = platform.get("broader", [])
label = platform["skos:prefLabel"]["@value"]
broader = platform.get("skos:broader", [])
id = platform["@id"]
found_parent_platform = False
for url in broader:
url = url["@id"]
if "L06" in url:
platforms_parsed[id] = {"broader_L06_url": url, "l06_label": label}
found_parent_platform = True
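The underlying change, as I read it: the vocabulary server's JSON-LD now serializes keys with namespace prefixes, so bare `identifier`, `prefLabel`, and `broader` lookups stopped matching and need their `dce:`/`skos:` forms. A self-contained sketch of the same lookup pattern; the sample record below is fabricated for illustration, not taken from the real L06 response:

```python
# Sketch of the fixed lookup pattern against prefixed JSON-LD keys.
platform = {
    "@id": "http://vocab.nerc.ac.uk/collection/C17/current/EXAMPLE/",
    "dce:identifier": "SDN:C17::EXAMPLE",
    "skos:prefLabel": {"@language": "en", "@value": "Example research vessel"},
    "skos:broader": [{"@id": "http://vocab.nerc.ac.uk/collection/L06/current/31/"}],
}

platforms_parsed = {}
# The first entry in the real payload describes the vocabulary itself and
# lacks an identifier, so it is skipped.
if "dce:identifier" in platform:
    label = platform["skos:prefLabel"]["@value"]
    for broader in platform.get("skos:broader", []):
        url = broader["@id"]
        if "L06" in url:
            platforms_parsed[platform["@id"]] = {
                "broader_L06_url": url,
                "l06_label": label,
            }

print(platforms_parsed)
```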
2 changes: 1 addition & 1 deletion harvester/setup.py
@@ -15,7 +15,7 @@
"pandas<2.0.0",
"erddapy",
"shapely",
"sqlalchemy",
"sqlalchemy==1.4.16",
"psycopg2-binary",
"python-dotenv",
"diskcache",
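A likely motivation for the exact `sqlalchemy==1.4.16` pin (my inference; the PR doesn't say): the db loader passes raw SQL strings directly to `transaction.execute(...)`, which SQLAlchemy 1.4 accepts but 2.0 refuses unless the statement is wrapped in `text()`. A sketch of the difference, with a placeholder connection string:

```python
from sqlalchemy import create_engine, text

# Placeholder DSN for illustration only
engine = create_engine("postgresql://user:password@localhost:5432/cde")

with engine.begin() as transaction:
    # Accepted by SQLAlchemy 1.4, raises on 2.0:
    transaction.execute("SELECT profile_process();")
    # Portable across 1.4 and 2.0:
    transaction.execute(text("SELECT profile_process();"))
```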