Skip to content

Commit

Permalink
refactored for ./data/
Browse files Browse the repository at this point in the history
  • Loading branch information
tomo-oga committed Jun 27, 2024
1 parent 94f390b commit 0c68e92
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 14 deletions.
4 changes: 3 additions & 1 deletion src/trial_synth/who/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from .process import Processor
from .store import store_dataframe_as_flat_file
from .transform import transform_csv_data

from .util import ensure_output_directory_exists

CONFIG = Config()
logger = logging.getLogger(__name__)
Expand All @@ -31,6 +31,8 @@ def ensure_df() -> pd.DataFrame:

@click.command()
def main():
click.secho("Processing WHO ICTRP data", fg="green", bold=True)
ensure_output_directory_exists()
df = ensure_df()
store_dataframe_as_flat_file(df, CONFIG.sample_path, "\t", False)
processor = Processor(df, CONFIG)
Expand Down
18 changes: 11 additions & 7 deletions src/trial_synth/who/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,18 @@
CONDITION_CURIE = os.environ.get("CONDITION_CURIE", "debio:0000036")

HERE = Path(__file__).parent.resolve()
HOME_DIR = os.environ.get("HOME_DIRECTORY", Path.home())
PARENT_DIR_STR = os.environ.get("BASE_DIRECTORY", ".data")
DATA_DIR_STR = os.environ.get("DATA_DIRECTORY", "who-ictrp")
DATA_DIR = Path(HOME_DIR, PARENT_DIR_STR, DATA_DIR_STR)

CSV_PATH = HERE.joinpath("ICTRP-Results.csv")
CSV_COLUMN_PATH = HERE.joinpath("ictrp_headers.csv")
XML_PATH = HERE.joinpath("ICTRP-Results.xml.gz")
PARSED_PICKLE_PATH = HERE.joinpath("processed.pkl.gz")
SAMPLE_PATH = HERE.joinpath("sample.tsv")
NODES_PATH = HERE.joinpath("nodes.tsv")
EDGES_PATH = HERE.joinpath("edges.tsv.gz")
MAPPINGS_PATH = HERE.joinpath("mappings.tsv")
PARSED_PICKLE_PATH = DATA_DIR.joinpath("processed.pkl.gz")
SAMPLE_PATH = DATA_DIR.joinpath("sample.tsv")
NODES_PATH = DATA_DIR.joinpath("nodes.tsv")
EDGES_PATH = DATA_DIR.joinpath("edges.tsv.gz")
MAPPINGS_PATH = DATA_DIR.joinpath("mappings.tsv")

SOURCE_KEY = "who"

Expand Down Expand Up @@ -46,7 +50,7 @@ class Config:
condition_curie = CONDITION_CURIE

current_path = HERE
xml_path = XML_PATH
data_dir_path = DATA_DIR
csv_path = CSV_PATH
csv_column_path = CSV_COLUMN_PATH
parsed_pickle_path = PARSED_PICKLE_PATH
Expand Down
6 changes: 2 additions & 4 deletions src/trial_synth/who/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,10 +129,8 @@ def process_matches(self) -> None:
logger.info("warming up grounder")
gilda.annotate("stuff")
logger.info("done warming up grounder")

for column, namespaces, rtype, rcurie, annotate_fn, skip in tqdm(
config, leave=False, desc="Processing columns"
):
logger.info("Processing columns")
for column, namespaces, rtype, rcurie, annotate_fn, skip in config:
rows = []
for curie, cells in tqdm(
self.df[["curie", column]].values,
Expand Down
27 changes: 25 additions & 2 deletions src/trial_synth/who/util.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
import re
import logging

from lxml import etree
import pandas as pd
from tqdm import tqdm
from typing import Optional

from pathlib import Path
from .config import DATA_DIR
import bioregistry

logger = logging.getLogger(__name__)

PREFIXES = {
"ISRCTN": "isrctn",
"ACTRN": "anzctr",
Expand All @@ -25,7 +29,7 @@
"EU Clinical Trials Register": "euclinicaltrials", ###
"JPRN-jRCT": "jrct",
"JPRN-UMIN": "uminctr", # University hospital Medical Information Network
"JPRN-C": "uminctr", # new ID format starting with C
"JPRN-C": "uminctr", # new ID format starting with C
"Clinical Trials Information System": "ctis",
"CTIS": "ctis", # site broken
"LBCTR": "lctr", # Lebanon Clinical Trials Registry
Expand Down Expand Up @@ -105,6 +109,7 @@ def makelist(s: Optional[str], delimeter: str = '.') -> list:
return sorted(x for x in {x.strip() for x in s.split(delimeter)} if x)
return []


def make_str(s: str):
"""Return a stripped string if it is not empty
Expand All @@ -122,6 +127,8 @@ def make_str(s: str):
return s.strip()

return ''


def matches_pattern(s: str) -> Optional[str]:
"""Matches a string to a pattern and returns the prefix if it matches.
Expand Down Expand Up @@ -168,3 +175,19 @@ def transform_mappings(s: str) -> Optional[list[str]]:
if not curies:
return None
return curies


def ensure_output_directory_exists(path: Path = DATA_DIR) -> None:
    """Ensure that the output directory exists, creating it if necessary.

    Missing parent directories are created as well, and a directory that
    already exists is accepted without error.

    Parameters
    ----------
    path : Path
        Path to the output directory. Defaults to ``DATA_DIR``.

    Raises
    ------
    Exception
        Any error raised while creating the directory is logged together
        with its traceback and then re-raised unchanged.
    """
    # Lazy %-style arguments avoid formatting the message when DEBUG is off.
    logger.debug("Ensuring directory exists: %s", path)
    try:
        # Only the call that can actually fail lives inside the ``try``.
        path.mkdir(parents=True, exist_ok=True)
    except Exception:
        # Record which path failed, then let the caller decide how to react.
        logger.exception("An error occurred trying to create %s", path)
        raise

0 comments on commit 0c68e92

Please sign in to comment.