From 0c68e92a9ab75a67901dce636682c2f36f8a2658 Mon Sep 17 00:00:00 2001 From: Tomo Oga Date: Thu, 27 Jun 2024 10:44:09 -0400 Subject: [PATCH] refactored for ./data/ --- src/trial_synth/who/__main__.py | 4 +++- src/trial_synth/who/config.py | 18 +++++++++++------- src/trial_synth/who/process.py | 6 ++---- src/trial_synth/who/util.py | 27 +++++++++++++++++++++++++-- 4 files changed, 41 insertions(+), 14 deletions(-) diff --git a/src/trial_synth/who/__main__.py b/src/trial_synth/who/__main__.py index 522fde3..ca76567 100644 --- a/src/trial_synth/who/__main__.py +++ b/src/trial_synth/who/__main__.py @@ -14,7 +14,7 @@ from .process import Processor from .store import store_dataframe_as_flat_file from .transform import transform_csv_data - +from .util import ensure_output_directory_exists CONFIG = Config() logger = logging.getLogger(__name__) @@ -31,6 +31,8 @@ def ensure_df() -> pd.DataFrame: @click.command() def main(): + click.secho("Processing WHO ICTRP data", fg="green", bold=True) + ensure_output_directory_exists() df = ensure_df() store_dataframe_as_flat_file(df, CONFIG.sample_path, "\t", False) processor = Processor(df, CONFIG) diff --git a/src/trial_synth/who/config.py b/src/trial_synth/who/config.py index 7a09a9a..d8c9620 100644 --- a/src/trial_synth/who/config.py +++ b/src/trial_synth/who/config.py @@ -9,14 +9,18 @@ CONDITION_CURIE = os.environ.get("CONDITION_CURIE", "debio:0000036") HERE = Path(__file__).parent.resolve() +HOME_DIR = os.environ.get("HOME_DIRECTORY", Path.home()) +PARENT_DIR_STR = os.environ.get("BASE_DIRECTORY", ".data") +DATA_DIR_STR = os.environ.get("DATA_DIRECTORY", "who-ictrp") +DATA_DIR = Path(HOME_DIR, PARENT_DIR_STR, DATA_DIR_STR) + CSV_PATH = HERE.joinpath("ICTRP-Results.csv") CSV_COLUMN_PATH = HERE.joinpath("ictrp_headers.csv") -XML_PATH = HERE.joinpath("ICTRP-Results.xml.gz") -PARSED_PICKLE_PATH = HERE.joinpath("processed.pkl.gz") -SAMPLE_PATH = HERE.joinpath("sample.tsv") -NODES_PATH = HERE.joinpath("nodes.tsv") -EDGES_PATH = HERE.joinpath("edges.tsv.gz") -MAPPINGS_PATH = HERE.joinpath("mappings.tsv") +PARSED_PICKLE_PATH = DATA_DIR.joinpath("processed.pkl.gz") +SAMPLE_PATH = DATA_DIR.joinpath("sample.tsv") +NODES_PATH = DATA_DIR.joinpath("nodes.tsv") +EDGES_PATH = DATA_DIR.joinpath("edges.tsv.gz") +MAPPINGS_PATH = DATA_DIR.joinpath("mappings.tsv") SOURCE_KEY = "who" @@ -46,7 +50,7 @@ class Config: condition_curie = CONDITION_CURIE current_path = HERE - xml_path = XML_PATH + data_dir_path = DATA_DIR csv_path = CSV_PATH csv_column_path = CSV_COLUMN_PATH parsed_pickle_path = PARSED_PICKLE_PATH diff --git a/src/trial_synth/who/process.py b/src/trial_synth/who/process.py index 45a2484..94f92fc 100644 --- a/src/trial_synth/who/process.py +++ b/src/trial_synth/who/process.py @@ -129,10 +129,8 @@ def process_matches(self) -> None: logger.info("warming up grounder") gilda.annotate("stuff") logger.info("done warming up grounder") - - for column, namespaces, rtype, rcurie, annotate_fn, skip in tqdm( - config, leave=False, desc="Processing columns" - ): + logger.info("Processing columns") + for column, namespaces, rtype, rcurie, annotate_fn, skip in config: rows = [] for curie, cells in tqdm( self.df[["curie", column]].values, diff --git a/src/trial_synth/who/util.py b/src/trial_synth/who/util.py index c6e85a6..4e5472d 100644 --- a/src/trial_synth/who/util.py +++ b/src/trial_synth/who/util.py @@ -1,12 +1,16 @@ import re +import logging from lxml import etree import pandas as pd from tqdm import tqdm from typing import Optional - +from pathlib import Path +from .config import DATA_DIR import bioregistry +logger = logging.getLogger(__name__) + PREFIXES = { "ISRCTN": "isrctn", "ACTRN": "anzctr", @@ -25,7 +29,7 @@ "EU Clinical Trials Register": "euclinicaltrials", ### "JPRN-jRCT": "jrct", "JPRN-UMIN": "uminctr", # University hospital Medical Information Network - "JPRN-C": "uminctr", # new ID format starting with C + "JPRN-C": "uminctr", # new ID format starting with C "Clinical Trials Information System": "ctis", "CTIS": "ctis", # site broken "LBCTR": "lctr", # Lebanon Clinical Trials Registry @@ -105,6 +109,7 @@ def makelist(s: Optional[str], delimeter: str = '.') -> list: return sorted(x for x in {x.strip() for x in s.split(delimeter)} if x) return [] + def make_str(s: str): """Return a stripped string if it is not empty @@ -122,6 +127,8 @@ def make_str(s: str): return s.strip() return '' + + def matches_pattern(s: str) -> Optional[str]: """Matches a string to a pattern and returns the prefix if it matches. @@ -168,3 +175,19 @@ def transform_mappings(s: str) -> Optional[list[str]]: if not curies: return None return curies + + +def ensure_output_directory_exists(path: Path = DATA_DIR) -> None: + """Ensures that the output directory exists + + Parameters + ---------- + path : Path + Path to the output directory + """ + try: + logger.debug(f"Ensuring directory exists: {path}") + path.mkdir(parents=True, exist_ok=True) + except Exception: + logger.exception(f"An error occurred trying to create {path}") + raise