From 41144a230eb009d99b682590fe3b88b2326047ca Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya
Date: Tue, 27 Jun 2023 14:56:00 -0400
Subject: [PATCH 01/40] Adds a `shortest_name_length` value to the synonyms file.

---
 src/babel_utils.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/babel_utils.py b/src/babel_utils.py
index 45710ecc..a254b09a 100644
--- a/src/babel_utils.py
+++ b/src/babel_utils.py
@@ -1,3 +1,4 @@
+import logging
 from ftplib import FTP
 from io import BytesIO
 import gzip
@@ -255,6 +256,19 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[]):
                         "types": [ t[8:] for t in node_factory.get_ancestors(node["type"])]} #remove biolink:
             if "label" in node["identifiers"][0]:
                 document["preferred_name"] = node["identifiers"][0]["label"]
+
+            # We previously used the shortest length of a name as a proxy for how good a match it is, i.e. given
+            # two concepts that both have the word "acetaminophen" in them, we assume that the shorter one is the
+            # more interesting one for users. I'm not sure if there's a better way to do that -- for instance,
+            # could we consider the information content values? -- but in the interests of getting something
+            # working quickly, this code restores that previous method.
+
+            # Since synonyms_list is sorted with the shortest name first, its first entry gives the shortest length.
+            if len(synonyms_list) == 0:
+                logging.warning(f"Synonym list for {node} is empty: no valid name.")
+            else:
+                document["shortest_name_length"] = len(synonyms_list[0])
+
             sfile.write( document )
     except Exception as ex:
         print(f"Exception thrown while write_compendium() was generating {ofname}: {ex}")

From 75bebc1c805adb9214ef4f9a9409f77219e271fe Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya
Date: Tue, 16 May 2023 15:40:51 -0400
Subject: [PATCH 02/40] Removed some asserts with brackets, because bracketed
 asserts are tuples, which means they're always true.

---
 src/createcompendia/chemicals.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/createcompendia/chemicals.py b/src/createcompendia/chemicals.py
index 175d9bb2..4abe9135 100644
--- a/src/createcompendia/chemicals.py
+++ b/src/createcompendia/chemicals.py
@@ -162,7 +162,7 @@ def write_drugbank_ids(infile,outfile):
     written = set()
     with open(infile,'r') as inf, open(outfile,'w') as outf:
         header_line = inf.readline()
-        assert(header_line == "UCI\tSRC_ID\tSRC_COMPOUND_ID\tASSIGNMENT\n", f"Incorrect header line in {infile}: {header_line}")
+        assert header_line == "UCI\tSRC_ID\tSRC_COMPOUND_ID\tASSIGNMENT\n", f"Incorrect header line in {infile}: {header_line}"
         for line in inf:
             x = line.rstrip().split('\t')
             if x[1] == drugbank_id:
@@ -241,11 +241,11 @@ def write_unichem_concords(structfile,reffile,outdir):
         concfiles[num] = open(concname,'w')
     with open(reffile,'rt') as inf:
         header_line = inf.readline()
-        assert(header_line == "UCI\tSRC_ID\tSRC_COMPOUND_ID\tASSIGNMENT\n", f"Incorrect header line in {reffile}: {header_line}")
+        assert header_line == "UCI\tSRC_ID\tSRC_COMPOUND_ID\tASSIGNMENT\n", f"Incorrect header line in {reffile}: {header_line}"
         for line in inf:
             x = line.rstrip().split('\t')
             outf = concfiles[x[1]]
-            assert(x[3] == '1') # Only '1' (current) assignments should be in this file
+            assert x[3] == '1' # Only '1' (current) assignments should be in this file
                                # (see https://chembl.gitbook.io/unichem/definitions/what-is-an-assignment).
outf.write(f'{unichem_data_sources[x[1]]}:{x[2]}\toio:equivalent\t{inchikeys[x[0]]}\n') for outf in concfiles.values(): @@ -256,7 +256,7 @@ def read_inchikeys(struct_file): inchikeys = {} with gzip.open(struct_file, 'rt') as inf: header_line = inf.readline() - assert(header_line == "UCI\tSTANDARDINCHI\tSTANDARDINCHIKEY\n", f"Unexpected header line in {struct_file}: {header_line}") + assert header_line == "UCI\tSTANDARDINCHI\tSTANDARDINCHIKEY\n", f"Unexpected header line in {struct_file}: {header_line}" for sline in inf: line = sline.rstrip().split('\t') if len(line) == 0: From e37dd99c40f61c7964cfd644c727993b80226fcb Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 16 May 2023 15:41:16 -0400 Subject: [PATCH 03/40] Replaced pull_via_urllib() with requests.get(). --- src/datahandlers/unii.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/datahandlers/unii.py b/src/datahandlers/unii.py index 62895a05..f185963c 100644 --- a/src/datahandlers/unii.py +++ b/src/datahandlers/unii.py @@ -1,14 +1,25 @@ from zipfile import ZipFile from os import path,listdir,rename + +import requests + from src.prefixes import UNII -from src.babel_utils import pull_via_urllib +from src.babel_utils import pull_via_urllib, get_config + def pull_unii(): for (pullfile,originalprefix,finalname) in [('UNIIs.zip','UNII_Names','Latest_UNII_Names.txt'), ('UNII_Data.zip','UNII_Records','Latest_UNII_Records.txt')]: # Downloads also available from https://precision.fda.gov/uniisearch/archive - dname = pull_via_urllib('https://precision.fda.gov/uniisearch/archive/latest/',pullfile,decompress=False,subpath='UNII') - ddir = path.dirname(dname) + url = f"https://precision.fda.gov/uniisearch/archive/latest/{pullfile}" + response = requests.get(url, stream=True) + if not response.ok: + raise RuntimeError(f"Could not download {url}: {response}") + local_filename = path.join(get_config()['download_directory'], 'UNII', pullfile) + with open(local_filename, 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + f.write(chunk) + ddir = path.dirname(local_filename) with ZipFile(dname, 'r') as zipObj: zipObj.extractall(ddir) #this zip file unzips into a readme and a file named something like "UNII_Names_.txt" and we need to rename it for make From f6c28a33b52e24f33c9550daa7f000aa48d51195 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 16 May 2023 15:42:04 -0400 Subject: [PATCH 04/40] Fixed reference to local_filename. --- README.md | 2 -- src/datahandlers/unii.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/README.md b/README.md index 238b34f4..7ee5596a 100644 --- a/README.md +++ b/README.md @@ -22,8 +22,6 @@ strong dependencies against the Babel code. ## Configuration -Babel requires Python 3.11 or later. - Before running, edit `config.json` and set the `babel_downloads` and `babel_output` directories. Do not edit the remaining items, which are used to control the build process. 
diff --git a/src/datahandlers/unii.py b/src/datahandlers/unii.py index f185963c..9b82c140 100644 --- a/src/datahandlers/unii.py +++ b/src/datahandlers/unii.py @@ -20,7 +20,7 @@ def pull_unii(): for chunk in response.iter_content(chunk_size=8192): f.write(chunk) ddir = path.dirname(local_filename) - with ZipFile(dname, 'r') as zipObj: + with ZipFile(local_filename, 'r') as zipObj: zipObj.extractall(ddir) #this zip file unzips into a readme and a file named something like "UNII_Names_.txt" and we need to rename it for make files = listdir(ddir) From ad91a1af4498ea4b1c09f259882480314baf3f34 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Fri, 5 May 2023 19:08:15 -0400 Subject: [PATCH 05/40] Modified icRDF.tsv to be downloaded directly from UberGraph. Closes #123, hopefully. --- src/babel_utils.py | 1 + src/datahandlers/obo.py | 10 +++++++++- src/node.py | 4 ++-- src/snakefiles/datacollect.snakefile | 2 ++ src/ubergraph.py | 17 +++++++++++++++++ 5 files changed, 31 insertions(+), 3 deletions(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index 45710ecc..95bb9a20 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -219,6 +219,7 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[]): synonym_factory = SynonymFactory(make_local_name('')) description_factory = DescriptionFactory(make_local_name('')) ic_factory = InformationContentFactory(f'{get_config()["input_directory"]}/icRDF.tsv') + ic_factory = InformationContentFactory(f'{get_config()["download_directory"]}/icRDF.tsv') node_test = node_factory.create_node(input_identifiers=[],node_type=node_type,labels={},extra_prefixes = extra_prefixes) with jsonlines.open(os.path.join(cdir,'compendia',ofname),'w') as outf, jsonlines.open(os.path.join(cdir,'synonyms',ofname),'w') as sfile: for slist in synonym_list: diff --git a/src/datahandlers/obo.py b/src/datahandlers/obo.py index 9c54c379..ce9f6660 100644 --- a/src/datahandlers/obo.py +++ b/src/datahandlers/obo.py @@ -1,11 +1,18 @@ from src.ubergraph import UberGraph -from src.babel_utils import make_local_name, pull_via_ftp +from src.babel_utils import make_local_name, pull_via_ftp, get_config from collections import defaultdict import os, gzip from json import loads,dumps from src.util import Text +def pull_uber_icRDF(): + """ + Download the icRDF.tsv file that contains normalizedInformationContent for all the entities in UberGraph. 
+    """
+    uber = UberGraph()
+    config = get_config()
+    _ = uber.write_normalized_information_content(os.path.join(config['download_directory'], 'icRDF.tsv'))

 def pull_uber_labels(expected):
     uber = UberGraph()
@@ -56,6 +63,7 @@ def pull_uber_synonyms(expected):
                     outf.write(f'{unit[0]}\t{unit[1]}\t{unit[2]}\n')

 def pull_uber(expected_ontologies):
+    pull_uber_icRDF()
     pull_uber_labels(expected_ontologies)
     pull_uber_descriptions(expected_ontologies)
     pull_uber_synonyms(expected_ontologies)

diff --git a/src/node.py b/src/node.py
index 4483a1ca..3859f678 100644
--- a/src/node.py
+++ b/src/node.py
@@ -85,8 +85,8 @@ def __init__(self,ic_file):
         with open(ic_file, 'r') as inf:
             for line in inf:
                 x = line.strip().split('\t')
-                node_id = Text.obo_to_curie(x[0][:-1]) # -1 takes off the >
-                ic = x[2]
+                node_id = Text.obo_to_curie(x[0])
+                ic = x[1]
                 self.ic[node_id] = ic
         print(f"Loaded {len(self.ic)} InformationContent values")

diff --git a/src/snakefiles/datacollect.snakefile b/src/snakefiles/datacollect.snakefile
index 3e42643a..29c29ebb 100644
--- a/src/snakefiles/datacollect.snakefile
+++ b/src/snakefiles/datacollect.snakefile
@@ -148,6 +148,8 @@ rule get_ontology_labels_descriptions_and_synonyms:
     output:
         expand("{download_directory}/{onto}/labels", download_directory = config['download_directory'], onto = config['ubergraph_ontologies']),
         expand("{download_directory}/{onto}/synonyms", download_directory = config['download_directory'], onto = config['ubergraph_ontologies']),
+        config['download_directory']+'/icRDF.tsv',
+        expand("{download_directory}/{onto}/synonyms", download_directory = config['download_directory'], onto = config['ubergraph_ontologies']),
         # This would make sense if we had descriptions for every ontology, but since we don't, we can't make these outputs explicit.
         # expand("{download_directory}/{onto}/descriptions", download_directory = config['download_directory'], onto = config['ubergraph_ontologies']),
     run:

diff --git a/src/ubergraph.py b/src/ubergraph.py
index 207690cc..1f2dcb1c 100644
--- a/src/ubergraph.py
+++ b/src/ubergraph.py
@@ -406,6 +406,22 @@ def get_subclasses_and_close(self,iri):
             results[k] = list(filter(lambda x: ':' in x, v))
         return results

+    def write_normalized_information_content(self, filename):
+        """
+        Download the normalized information content and write it to the specified filename.
+
+        :param filename: The filename to write the normalized information content to -- we write them as `IRI\tNIC`.
+        :return: The number of normalized information content entries downloaded.
+        """
+        query = "SELECT * WHERE { ?iri <http://reasoner.renci.org/vocab/normalizedInformationContent> ?nic }"
+        resultmap = self.triplestore.query(query, ['iri', 'nic'])
+
+        with open(filename, "w") as ftsv:
+            for row in resultmap:
+                ftsv.write(f"{row['iri']}\t{row['nic']}\n")
+
+        print(f"Wrote {len(resultmap)} information content values into {filename}.")
+        return len(resultmap)

 def build_sets(iri, concordfiles, set_type, ignore_list = [], other_prefixes={}, hop_ontologies=False ):
     """Given an IRI create a list of sets. Each set is a set of equivalent LabeledIDs, and there
@@ -433,6 +449,7 @@ def build_sets(iri, concordfiles, set_type, ignore_list = [], other_prefixes={},
             if p in concordfiles:
                 concordfiles[p].write(f'{k}\t{types2relations[set_type]}\t{x}\n')

+
 if __name__ == '__main__':
     ug = UberGraph()
     ug.get_all_labels()

From 38f3e238da0d5cc5c9001ff3dc2dd8962304b9da Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya
Date: Tue, 16 May 2023 12:15:07 -0400
Subject: [PATCH 06/40] Moved icrdf filename into Snakemake file.
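A note on the Snakemake pattern this commit is moving towards. The sketch below is
hypothetical (rule name invented), and -- as PATCH 11 later fixes -- a named output
must be referenced as `output.icrdf_filename` inside the `run:` block, not as a
bare name:

```python
# Sketch: declare icRDF.tsv as a *named* output so downstream rules can depend
# on it, then hand the path to plain Python code instead of hard-coding it there.
rule pull_icrdf:
    output:
        icrdf_filename = config['download_directory'] + '/icRDF.tsv'
    run:
        # Named outputs are attributes of the "output" object inside run:.
        obo.pull_uber(config['ubergraph_ontologies'], output.icrdf_filename)
```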
---
 src/datahandlers/obo.py              | 9 ++++-----
 src/snakefiles/datacollect.snakefile | 3 ++-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/datahandlers/obo.py b/src/datahandlers/obo.py
index ce9f6660..328abd85 100644
--- a/src/datahandlers/obo.py
+++ b/src/datahandlers/obo.py
@@ -6,13 +6,12 @@
 from src.util import Text

-def pull_uber_icRDF():
+def pull_uber_icRDF(icrdf_filename):
     """
     Download the icRDF.tsv file that contains normalizedInformationContent for all the entities in UberGraph.
     """
     uber = UberGraph()
-    config = get_config()
-    _ = uber.write_normalized_information_content(os.path.join(config['download_directory'], 'icRDF.tsv'))
+    _ = uber.write_normalized_information_content(icrdf_filename)

 def pull_uber_labels(expected):
     uber = UberGraph()
@@ -62,8 +61,8 @@ def pull_uber_synonyms(expected):
         for unit in ldict[p]:
             outf.write(f'{unit[0]}\t{unit[1]}\t{unit[2]}\n')

-def pull_uber(expected_ontologies):
-    pull_uber_icRDF()
+def pull_uber(expected_ontologies, icrdf_filename):
+    pull_uber_icRDF(icrdf_filename)
     pull_uber_labels(expected_ontologies)
     pull_uber_descriptions(expected_ontologies)
     pull_uber_synonyms(expected_ontologies)

diff --git a/src/snakefiles/datacollect.snakefile b/src/snakefiles/datacollect.snakefile
index 29c29ebb..6bd46624 100644
--- a/src/snakefiles/datacollect.snakefile
+++ b/src/snakefiles/datacollect.snakefile
@@ -148,12 +148,13 @@ rule get_ontology_labels_descriptions_and_synonyms:
     output:
         expand("{download_directory}/{onto}/labels", download_directory = config['download_directory'], onto = config['ubergraph_ontologies']),
         expand("{download_directory}/{onto}/synonyms", download_directory = config['download_directory'], onto = config['ubergraph_ontologies']),
+        icrdf_filename = config['download_directory']+'/icRDF.tsv',
         config['download_directory']+'/icRDF.tsv',
         expand("{download_directory}/{onto}/synonyms", download_directory = config['download_directory'], onto = config['ubergraph_ontologies']),
         # This would make sense if we had descriptions for every ontology, but since we don't, we can't make these outputs explicit.
         # expand("{download_directory}/{onto}/descriptions", download_directory = config['download_directory'], onto = config['ubergraph_ontologies']),
     run:
-        obo.pull_uber(config['ubergraph_ontologies'])
+        obo.pull_uber(config['ubergraph_ontologies'], icrdf_filename)

From d33cc6c0f543c759c7f070932605622a9535d43f Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya
Date: Tue, 16 May 2023 12:23:03 -0400
Subject: [PATCH 07/40] Added icrdf_filename as a required parameter to write_compendium().

---
 src/babel_utils.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/src/babel_utils.py b/src/babel_utils.py
index 95bb9a20..0d6d5d4f 100644
--- a/src/babel_utils.py
+++ b/src/babel_utils.py
@@ -1,3 +1,4 @@
+import logging
 from ftplib import FTP
 from io import BytesIO
 import gzip
@@ -202,7 +203,7 @@ def pull_via_urllib(url: str, in_file_name: str, decompress = True, subpath=None
     # return the filename to the caller
     return out_file_name

-def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[]):
+def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],icrdf_filename=None):
     """
     :param synonym_list:
     :param ofname:
     :param node_type:
     :param labels:
     :param extra_prefixes: We default to only allowing the prefixes allowed for a particular type in Biolink.
        If you want to allow additional prefixes, list them here.
+    :param icrdf_filename: (REQUIRED) The file to read the information content from (icRDF.tsv). Although this is a
+        named parameter to make it easier to specify this when calling write_compendium(), it is REQUIRED, and
+        write_compendium() will throw a RuntimeError if it is not specified. This is to ensure that it has been
+        properly specified as a prerequisite in a Snakemake file, so that write_compendium() is not run until after
+        icRDF.tsv has been generated.
     :return:
     """
     config = get_config()
@@ -220,6 +226,13 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[]):
     description_factory = DescriptionFactory(make_local_name(''))
     ic_factory = InformationContentFactory(f'{get_config()["input_directory"]}/icRDF.tsv')
     ic_factory = InformationContentFactory(f'{get_config()["download_directory"]}/icRDF.tsv')
+
+    # Create an InformationContentFactory based on the specified icRDF.tsv file. Default to the one in the download
+    # directory.
+    if not icrdf_filename:
+        raise RuntimeError("No icrdf_filename parameter provided to write_compendium() -- this is required!")
+    ic_factory = InformationContentFactory(icrdf_filename)
+
     node_test = node_factory.create_node(input_identifiers=[],node_type=node_type,labels={},extra_prefixes = extra_prefixes)
     with jsonlines.open(os.path.join(cdir,'compendia',ofname),'w') as outf, jsonlines.open(os.path.join(cdir,'synonyms',ofname),'w') as sfile:
         for slist in synonym_list:

From 79b6dbee33de504d6714f125af407a921f39cfe2 Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya
Date: Tue, 16 May 2023 12:24:35 -0400
Subject: [PATCH 08/40] Added icrdf_filename to anatomy.

---
 src/createcompendia/anatomy.py   | 4 ++--
 src/snakefiles/anatomy.snakefile | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/createcompendia/anatomy.py b/src/createcompendia/anatomy.py
index 1bee6ed4..0b9a7f36 100644
--- a/src/createcompendia/anatomy.py
+++ b/src/createcompendia/anatomy.py
@@ -99,7 +99,7 @@ def build_anatomy_obo_relationships(outdir):
 def build_anatomy_umls_relationships(idfile,outfile):
     umls.build_sets(idfile, outfile, {'SNOMEDCT_US':SNOMEDCT,'MSH': MESH, 'NCI': NCIT})

-def build_compendia(concordances, identifiers):
+def build_compendia(concordances, identifiers, icrdf_filename):
     """:concordances: a list of files from which to read relationships
        :identifiers: a list of files from which to read identifiers and optional categories"""
     dicts = {}
@@ -122,7 +122,7 @@ def build_compendia(concordances, identifiers):
     typed_sets = create_typed_sets(set([frozenset(x) for x in dicts.values()]),types)
     for biotype,sets in typed_sets.items():
         baretype = biotype.split(':')[-1]
-        write_compendium(sets,f'{baretype}.txt',biotype,{})
+        write_compendium(sets,f'{baretype}.txt',biotype,{}, icrdf_filename=icrdf_filename)

 def create_typed_sets(eqsets,types):
     """Given a set of sets of equivalent identifiers, we want to type each one into

diff --git a/src/snakefiles/anatomy.snakefile b/src/snakefiles/anatomy.snakefile
index f944e0f0..63747cd1 100644
--- a/src/snakefiles/anatomy.snakefile
+++ b/src/snakefiles/anatomy.snakefile
@@ -64,11 +64,12 @@ rule anatomy_compendia:
         synonyms=expand("{dd}/{ap}/synonyms",dd=config['download_directory'],ap=config['anatomy_prefixes']),
         concords=expand("{dd}/anatomy/concords/{ap}",dd=config['intermediate_directory'],ap=config['anatomy_concords']),
        idlists=expand("{dd}/anatomy/ids/{ap}",dd=config['intermediate_directory'],ap=config['anatomy_ids']),
+        icrdf_filename=config['download_directory']+'/icRDF.tsv',
     output:
         expand("{od}/compendia/{ap}", od = config['output_directory'], ap = config['anatomy_outputs']),
         expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['anatomy_outputs'])
     run:
-        anatomy.build_compendia(input.concords,input.idlists)
+        anatomy.build_compendia(input.concords, input.idlists, input.icrdf_filename)

 rule check_anatomy_completeness:
     input:

From 4173978b4c04724d61f421ea01fde3af94b6b3f9 Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya
Date: Tue, 16 May 2023 12:28:09 -0400
Subject: [PATCH 09/40] Added icrdf_filename to chemicals and diseasephenotype.

---
 src/createcompendia/chemicals.py          |  4 ++--
 src/createcompendia/diseasephenotype.py   | 10 +++++-----
 src/snakefiles/chemical.snakefile         |  3 ++-
 src/snakefiles/diseasephenotype.snakefile |  3 ++-
 4 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/src/createcompendia/chemicals.py b/src/createcompendia/chemicals.py
index 4abe9135..742f3a50 100644
--- a/src/createcompendia/chemicals.py
+++ b/src/createcompendia/chemicals.py
@@ -511,7 +511,7 @@ def build_untyped_compendia(concordances, identifiers,unichem_partial, untyped_c
         for s in untyped_sets:
             outf.write(f'{set(s)}\n')

-def build_compendia(type_file,untyped_compendia_file):
+def build_compendia(type_file, untyped_compendia_file, icrdf_filename):
     types = {}
     with open(type_file,'r') as inf:
         for line in inf:
@@ -525,7 +525,7 @@ def build_compendia(type_file,untyped_compendia_file):
     typed_sets = create_typed_sets(untyped_sets, types)
     for biotype, sets in typed_sets.items():
         baretype = biotype.split(':')[-1]
-        write_compendium(sets, f'{baretype}.txt', biotype, {})
+        write_compendium(sets, f'{baretype}.txt', biotype, {}, icrdf_filename=icrdf_filename)

 def create_typed_sets(eqsets, types):
     """

diff --git a/src/createcompendia/diseasephenotype.py b/src/createcompendia/diseasephenotype.py
index bf647144..7fa9c189 100644
--- a/src/createcompendia/diseasephenotype.py
+++ b/src/createcompendia/diseasephenotype.py
@@ -123,7 +123,7 @@ def build_disease_doid_relationships(idfile,outfile):
                               'SNOMEDCT_US_2020_03_01': SNOMEDCT, 'SNOMEDCT_US_2020_09_01': SNOMEDCT,
                               'UMLS_CUI': UMLS, 'KEGG': KEGGDISEASE})

-def build_compendium(concordances, identifiers, mondoclose, badxrefs):
+def build_compendium(concordances, identifiers, mondoclose, badxrefs, icrdf_filename):
     """:concordances: a list of files from which to read relationships
        :identifiers: a list of files from which to read identifiers and optional categories"""
     dicts = {}
@@ -171,7 +171,7 @@ def build_compendium(concordances, identifiers, mondoclose, badxrefs):
     typed_sets = create_typed_sets(set([frozenset(x) for x in dicts.values()]),types)
     for biotype,sets in typed_sets.items():
         baretype = biotype.split(':')[-1]
-        write_compendium(sets,f'{baretype}.txt',biotype,{})
+        write_compendium(sets,f'{baretype}.txt',biotype,{}, icrdf_filename=icrdf_filename)

 def create_typed_sets(eqsets,types):
     """Given a set of sets of equivalent identifiers, we want to type each one into
@@ -228,7 +228,7 @@ def read_badxrefs(fn):
             morebad.add( (x[0],x[1]) )
     return morebad

-def load_diseases_and_phenotypes(concords,idlists,badhpos,badhpoxrefs):
+def load_diseases_and_phenotypes(concords,idlists,badhpos,badhpoxrefs, icrdf_filename):
     #print('disease/phenotype')
     #print('get and write hp sets')
     #bad_mappings = read_bad_hp_mappings(badhpos)
@@ -299,8 +299,8 @@ def load_diseases_and_phenotypes(concords,idlists,badhpos,badhpoxrefs):
     print('dump it')
     fs = set([frozenset(x) for x in dicts.values()])
diseases,phenotypes = create_typed_sets(fs) - write_compendium(diseases,'disease.txt','biolink:Disease',labels) - write_compendium(phenotypes,'phenotypes.txt','biolink:PhenotypicFeature',labels) + write_compendium(diseases,'disease.txt','biolink:Disease',labels, icrdf_filename=icrdf_filename) + write_compendium(phenotypes,'phenotypes.txt','biolink:PhenotypicFeature',labels, icrdf_filename=icrdf_filename) if __name__ == '__main__': with open('crapfile','w') as crapfile: diff --git a/src/snakefiles/chemical.snakefile b/src/snakefiles/chemical.snakefile index eb4a5488..6da34cc3 100644 --- a/src/snakefiles/chemical.snakefile +++ b/src/snakefiles/chemical.snakefile @@ -196,11 +196,12 @@ rule chemical_compendia: input: typesfile = config['intermediate_directory'] + '/chemicals/partials/types', untyped_file = config['intermediate_directory'] + '/chemicals/partials/untyped_compendium', + icrdf_filename = config['download_directory'] + '/icRDF.tsv', output: expand("{od}/compendia/{ap}", od = config['output_directory'], ap = config['chemical_outputs']), expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['chemical_outputs']) run: - chemicals.build_compendia(input.typesfile,input.untyped_file) + chemicals.build_compendia(input.typesfile,input.untyped_file, input.icrdf_filename) rule check_chemical_completeness: input: diff --git a/src/snakefiles/diseasephenotype.snakefile b/src/snakefiles/diseasephenotype.snakefile index 467dd5a0..2d7d8cb5 100644 --- a/src/snakefiles/diseasephenotype.snakefile +++ b/src/snakefiles/diseasephenotype.snakefile @@ -122,13 +122,14 @@ rule disease_compendia: synonyms=expand("{dd}/{ap}/synonyms",dd=config['download_directory'],ap=config['disease_labelsandsynonyms']), concords=expand("{dd}/disease/concords/{ap}",dd=config['intermediate_directory'],ap=config['disease_concords']), idlists=expand("{dd}/disease/ids/{ap}",dd=config['intermediate_directory'],ap=config['disease_ids']), + icrdf_filename = config['download_directory'] + '/icRDF.tsv', output: expand("{od}/compendia/{ap}", od = config['output_directory'], ap = config['disease_outputs']), expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['disease_outputs']) run: diseasephenotype.build_compendium(input.concords,input.idlists,input.close_matches,{'HP':input.bad_hpo_xrefs, 'MONDO':input.bad_mondo_xrefs, - 'UMLS':input.bad_umls_xrefs} ) + 'UMLS':input.bad_umls_xrefs}, input.icrdf_filename ) rule check_disease_completeness: input: From 29ae048cccd66c91eee6be69bc6305e65e65e4f4 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 16 May 2023 12:32:06 -0400 Subject: [PATCH 10/40] Added icrdf_filename to remaining targets. 
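Given write_compendium()'s signature from PATCH 07, one detail worth flagging for
reviewers: the new argument only behaves correctly when passed by keyword. A sketch
of the pitfall (illustrative calls, not lines from this diff):

```python
# def write_compendium(synonym_list, ofname, node_type, labels={},
#                      extra_prefixes=[], icrdf_filename=None): ...

write_compendium(sets, 'Gene.txt', GENE, {}, icrdf_filename)
# ^ positional: the path silently binds to extra_prefixes, and the
#   icrdf_filename parameter stays None, so the RuntimeError guard fires.

write_compendium(sets, 'Gene.txt', GENE, {}, icrdf_filename=icrdf_filename)
# ^ keyword: the path reaches icrdf_filename as intended.
```

This is why the call sites below all use the `icrdf_filename=` keyword form.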
---
 src/createcompendia/gene.py                     | 4 ++--
 src/createcompendia/genefamily.py               | 4 ++--
 src/createcompendia/macromolecular_complex.py   | 4 ++--
 src/createcompendia/processactivitypathway.py   | 4 ++--
 src/createcompendia/protein.py                  | 4 ++--
 src/createcompendia/taxon.py                    | 4 ++--
 src/snakefiles/gene.snakefile                   | 3 ++-
 src/snakefiles/genefamily.snakefile             | 3 ++-
 src/snakefiles/macromolecular_complex.snakefile | 3 ++-
 src/snakefiles/process.snakefile                | 3 ++-
 src/snakefiles/protein.snakefile                | 3 ++-
 src/snakefiles/taxon.snakefile                  | 3 ++-
 12 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/src/createcompendia/gene.py b/src/createcompendia/gene.py
index 29b41be7..eeff20a6 100644
--- a/src/createcompendia/gene.py
+++ b/src/createcompendia/gene.py
@@ -252,7 +252,7 @@ def build_gene_umls_hgnc_relationships(umls_idfile,outfile):
     #Could also add MESH, if that were a valid gene prefix
     umls.build_sets(umls_idfile, outfile, {'HGNC':HGNC})

-def build_gene_compendia(concordances, identifiers):
+def build_gene_compendia(concordances, identifiers, icrdf_filename):
     """:concordances: a list of files from which to read relationships
        :identifiers: a list of files from which to read identifiers and optional categories"""
     dicts = {}
@@ -274,5 +274,5 @@ def build_gene_compendia(concordances, identifiers):
     glom(dicts, pairs, unique_prefixes=uniques)
     gene_sets = set([frozenset(x) for x in dicts.values()])
     baretype = GENE.split(':')[-1]
-    write_compendium(gene_sets, f'{baretype}.txt', GENE, {})
+    write_compendium(gene_sets, f'{baretype}.txt', GENE, {}, icrdf_filename=icrdf_filename)

diff --git a/src/createcompendia/genefamily.py b/src/createcompendia/genefamily.py
index 3c9e8f8d..fb26f9f4 100644
--- a/src/createcompendia/genefamily.py
+++ b/src/createcompendia/genefamily.py
@@ -2,7 +2,7 @@
 from src.babel_utils import read_identifier_file,glom,write_compendium

-def build_compendia(identifiers):
+def build_compendia(identifiers, icrdf_filename):
     """:concordances: a list of files from which to read relationships
        :identifiers: a list of files from which to read identifiers and optional categories"""
     dicts = {}
@@ -15,5 +15,5 @@ def build_compendia(identifiers):
         types.update(new_types)
     genefam_sets = set([frozenset(x) for x in dicts.values()])
     baretype = GENE_FAMILY.split(':')[-1]
-    write_compendium(genefam_sets, f'{baretype}.txt', GENE_FAMILY, {})
+    write_compendium(genefam_sets, f'{baretype}.txt', GENE_FAMILY, {}, icrdf_filename=icrdf_filename)

diff --git a/src/createcompendia/macromolecular_complex.py b/src/createcompendia/macromolecular_complex.py
index 0bf89d1b..17c333d1 100644
--- a/src/createcompendia/macromolecular_complex.py
+++ b/src/createcompendia/macromolecular_complex.py
@@ -4,7 +4,7 @@
 import src.datahandlers.complexportal as complexportal
 from src.babel_utils import read_identifier_file, glom, write_compendium

-def build_compendia(identifiers):
+def build_compendia(identifiers, icrdf_filename):
     """:concordances: a list of files from which to read relationships
        :identifiers: a list of files from which to read identifiers and optional categories"""
     dicts = {}
@@ -17,4 +17,4 @@ def build_compendia(identifiers):
         types.update(new_types)
     sets = set([frozenset(x) for x in dicts.values()])
     type = MACROMOLECULAR_COMPLEX.split(':')[-1]
-    write_compendium(sets, f'{type}.txt', MACROMOLECULAR_COMPLEX, {}, extra_prefixes=[COMPLEXPORTAL])
+    write_compendium(sets, f'{type}.txt', MACROMOLECULAR_COMPLEX, {}, extra_prefixes=[COMPLEXPORTAL], icrdf_filename=icrdf_filename)

diff --git a/src/createcompendia/processactivitypathway.py b/src/createcompendia/processactivitypathway.py
index 4eabf4b2..3dbec522 100644
--- a/src/createcompendia/processactivitypathway.py
+++ b/src/createcompendia/processactivitypathway.py
@@ -43,7 +43,7 @@ def build_process_rhea_relationships(outfile):
     rhea.make_concord(outfile)

-def build_compendia(concordances, identifiers):
+def build_compendia(concordances, identifiers, icrdf_filename):
     """:concordances: a list of files from which to read relationships
        :identifiers: a list of files from which to read identifiers and optional categories"""
     #These are concords that cause problems and are being special cased out. In disease/process we put these in some
@@ -77,7 +77,7 @@ def build_compendia(concordances, identifiers):
     typed_sets = create_typed_sets(set([frozenset(x) for x in dicts.values()]),types)
     for biotype,sets in typed_sets.items():
         baretype = biotype.split(':')[-1]
-        write_compendium(sets,f'{baretype}.txt',biotype,{})
+        write_compendium(sets,f'{baretype}.txt',biotype,{}, icrdf_filename=icrdf_filename)

 def create_typed_sets(eqsets,types):
     """Given a set of sets of equivalent identifiers, we want to type each one into

diff --git a/src/createcompendia/protein.py b/src/createcompendia/protein.py
index ce83b989..d4cd2a20 100644
--- a/src/createcompendia/protein.py
+++ b/src/createcompendia/protein.py
@@ -133,7 +133,7 @@ def build_ncit_uniprot_relationships(infile,outfile):
 def build_umls_ncit_relationships(idfile,outfile):
     umls.build_sets(idfile, outfile, {'NCI': NCIT})

-def build_protein_compendia(concordances, identifiers):
+def build_protein_compendia(concordances, identifiers, icrdf_filename):
     """:concordances: a list of files from which to read relationships
        :identifiers: a list of files from which to read identifiers and optional categories"""
     dicts = {}
@@ -168,5 +168,5 @@ def build_protein_compendia(concordances, identifiers):
     # only then generate the compendium from those input files.
     baretype = PROTEIN.split(':')[-1]
-    write_compendium(gene_sets, f'{baretype}.txt', PROTEIN, {})
+    write_compendium(gene_sets, f'{baretype}.txt', PROTEIN, {}, icrdf_filename=icrdf_filename)

diff --git a/src/createcompendia/taxon.py b/src/createcompendia/taxon.py
index e23f5225..ac62d179 100644
--- a/src/createcompendia/taxon.py
+++ b/src/createcompendia/taxon.py
@@ -82,7 +82,7 @@ def build_relationships(outfile,mesh_ids):

-def build_compendia(concordances, identifiers):
+def build_compendia(concordances, identifiers, icrdf_filename):
     """:concordances: a list of files from which to read relationships
        :identifiers: a list of files from which to read identifiers and optional categories"""
     dicts = {}
@@ -106,5 +106,5 @@ def build_compendia(concordances, identifiers):
     baretype = ORGANISM_TAXON.split(':')[-1]
     # We need to use extra_prefixes since UMLS is not listed as an identifier prefix at
     # https://biolink.github.io/biolink-model/docs/OrganismTaxon.html
-    write_compendium(gene_sets, f'{baretype}.txt', ORGANISM_TAXON, {})
+    write_compendium(gene_sets, f'{baretype}.txt', ORGANISM_TAXON, {}, icrdf_filename=icrdf_filename)

diff --git a/src/snakefiles/gene.snakefile b/src/snakefiles/gene.snakefile
index 56568c03..aeb47221 100644
--- a/src/snakefiles/gene.snakefile
+++ b/src/snakefiles/gene.snakefile
@@ -97,11 +97,12 @@ rule gene_compendia:
         synonyms=expand("{dd}/{ap}/synonyms",dd=config['download_directory'],ap=config['gene_labels']),
         concords=expand("{dd}/gene/concords/{ap}",dd=config['intermediate_directory'],ap=config['gene_concords']),
         idlists=expand("{dd}/gene/ids/{ap}",dd=config['intermediate_directory'],ap=config['gene_ids']),
+        icrdf_filename=config['download_directory']+'/icRDF.tsv',
     output:
         expand("{od}/compendia/{ap}", od = config['output_directory'], ap = config['gene_outputs']),
         expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['gene_outputs'])
     run:
-        gene.build_gene_compendia(input.concords,input.idlists)
+        gene.build_gene_compendia(input.concords,input.idlists, input.icrdf_filename)

 rule check_gene_completeness:
     input:

diff --git a/src/snakefiles/genefamily.snakefile b/src/snakefiles/genefamily.snakefile
index 9d83d635..30f4957c 100644
--- a/src/snakefiles/genefamily.snakefile
+++ b/src/snakefiles/genefamily.snakefile
@@ -23,11 +23,12 @@ rule genefamily_compendia:
     input:
         labels=expand("{dd}/{ap}/labels",dd=config['download_directory'],ap=config['genefamily_labels']),
         idlists=expand("{dd}/genefamily/ids/{ap}",dd=config['intermediate_directory'],ap=config['genefamily_ids']),
+        icrdf_filename=config['download_directory'] + '/icRDF.tsv',
     output:
         expand("{od}/compendia/{ap}", od = config['output_directory'], ap = config['genefamily_outputs']),
         expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['genefamily_outputs'])
     run:
-        genefamily.build_compendia(input.idlists)
+        genefamily.build_compendia(input.idlists, input.icrdf_filename)

 rule check_genefamily_completeness:
     input:

diff --git a/src/snakefiles/macromolecular_complex.snakefile b/src/snakefiles/macromolecular_complex.snakefile
index df3f00f6..835c03cf 100644
--- a/src/snakefiles/macromolecular_complex.snakefile
+++ b/src/snakefiles/macromolecular_complex.snakefile
@@ -14,11 +14,12 @@ rule macromolecular_complex_compendia:
         labels = config['download_directory']+'/ComplexPortal/559292_labels.tsv',
         synonyms = config['download_directory']+'/ComplexPortal/559292_synonyms.tsv',
         idlists = config['intermediate_directory']+'/macromolecular_complex/ids/ComplexPortal',
+        icrdf_filename = config['download_directory'] + '/icRDF.tsv',
     output:
         config['output_directory']+'/compendia/MacromolecularComplex.txt',
         config['output_directory']+'/synonyms/MacromolecularComplex.txt'
     run:
-        macromolecular_complex.build_compendia([input.idlists])
+        macromolecular_complex.build_compendia([input.idlists], icrdf_filename=input.icrdf_filename)

 rule check_macromolecular_complex_completeness:
     input:

diff --git a/src/snakefiles/process.snakefile b/src/snakefiles/process.snakefile
index 7adb662a..bcd1be25 100644
--- a/src/snakefiles/process.snakefile
+++ b/src/snakefiles/process.snakefile
@@ -72,11 +72,12 @@ rule process_compendia:
         #synonyms=expand("{dd}/{ap}/synonyms",dd=config['download_directory'],ap=config['process_labelsandsynonyms']),
         concords=expand("{dd}/process/concords/{ap}",dd=config['intermediate_directory'],ap=config['process_concords']),
         idlists=expand("{dd}/process/ids/{ap}",dd=config['intermediate_directory'],ap=config['process_ids']),
+        icrdf_filename=config['download_directory']+'/icRDF.tsv',
     output:
         expand("{od}/compendia/{ap}", od = config['output_directory'], ap = config['process_outputs']),
         expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['process_outputs'])
     run:
-        pap.build_compendia(input.concords,input.idlists)
+        pap.build_compendia(input.concords,input.idlists,input.icrdf_filename)

 rule check_process_completeness:
     input:

diff --git a/src/snakefiles/protein.snakefile b/src/snakefiles/protein.snakefile
index 221fb6f9..bc86f5c6 100644
--- a/src/snakefiles/protein.snakefile
+++ b/src/snakefiles/protein.snakefile
@@ -69,11 +69,12 @@ rule protein_compendia:
         synonyms=expand("{dd}/{ap}/synonyms",dd=config['download_directory'],ap=config['protein_synonyms']),
         concords=expand("{dd}/protein/concords/{ap}",dd=config['intermediate_directory'],ap=config['protein_concords']),
         idlists=expand("{dd}/protein/ids/{ap}",dd=config['intermediate_directory'],ap=config['protein_ids']),
+        icrdf_filename=config['download_directory'] + '/icRDF.tsv',
     output:
         expand("{od}/compendia/{ap}", od = config['output_directory'], ap = config['protein_outputs']),
         expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['protein_outputs'])
     run:
-        protein.build_protein_compendia(input.concords,input.idlists)
+        protein.build_protein_compendia(input.concords,input.idlists, input.icrdf_filename)

 rule check_protein_completeness:
     input:

diff --git a/src/snakefiles/taxon.snakefile b/src/snakefiles/taxon.snakefile
index 0778d43a..2faf5d43 100644
--- a/src/snakefiles/taxon.snakefile
+++ b/src/snakefiles/taxon.snakefile
@@ -47,11 +47,12 @@ rule taxon_compendia:
         synonyms=expand("{dd}/{ap}/synonyms",dd=config['download_directory'],ap=config['taxon_synonyms']),
         concords=expand("{dd}/taxon/concords/{ap}",dd=config['intermediate_directory'],ap=config['taxon_concords']),
         idlists=expand("{dd}/taxon/ids/{ap}",dd=config['intermediate_directory'],ap=config['taxon_ids']),
+        icrdf_filename=config['download_directory'] + '/icRDF.tsv',
     output:
         expand("{od}/compendia/{ap}", od = config['output_directory'], ap = config['taxon_outputs']),
         expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['taxon_outputs'])
     run:
-        taxon.build_compendia(input.concords,input.idlists)
+        taxon.build_compendia(input.concords,input.idlists, input.icrdf_filename)

 rule check_taxon_completeness:
     input:

From 9ad765a0afb00eb87e15866b9bdeacb9cec6036e Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya
Date: Tue, 16 May 2023 12:33:40 -0400
Subject: [PATCH 11/40] Fixed reference to icrdf_filename.
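For context: inside a Snakemake `run:` block, a rule's files are only reachable
through the `input`/`output` objects, never as bare names. A minimal sketch
(hypothetical rule, standard Snakemake behaviour):

```python
rule copy_example:
    input:
        src = 'data/in.tsv'
    output:
        dst = 'data/out.tsv'
    run:
        import shutil
        # input.src / output.dst resolve to the declared paths;
        # a bare "src", "dst", or "icrdf_filename" here raises NameError.
        shutil.copy(input.src, output.dst)
```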
---
 src/snakefiles/datacollect.snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/snakefiles/datacollect.snakefile b/src/snakefiles/datacollect.snakefile
index 6bd46624..e5f28466 100644
--- a/src/snakefiles/datacollect.snakefile
+++ b/src/snakefiles/datacollect.snakefile
@@ -154,7 +154,7 @@ rule get_ontology_labels_descriptions_and_synonyms:
         # This would make sense if we had descriptions for every ontology, but since we don't, we can't make these outputs explicit.
         # expand("{download_directory}/{onto}/descriptions", download_directory = config['download_directory'], onto = config['ubergraph_ontologies']),
     run:
-        obo.pull_uber(config['ubergraph_ontologies'], icrdf_filename)
+        obo.pull_uber(config['ubergraph_ontologies'], output.icrdf_filename)

 ### NCBIGene

From 9f8006a85836a18632bdd6b550ff6d4e2ebadc22 Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya
Date: Tue, 16 May 2023 12:52:46 -0400
Subject: [PATCH 12/40] Fixed SPARQL queries.

---
 src/ubergraph.py | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/src/ubergraph.py b/src/ubergraph.py
index 1f2dcb1c..ed06b553 100644
--- a/src/ubergraph.py
+++ b/src/ubergraph.py
@@ -413,15 +413,28 @@ def write_normalized_information_content(self, filename):
         :param filename: The filename to write the normalized information content to -- we write them as `IRI\tNIC`.
         :return: The number of normalized information content entries downloaded.
         """
-        query = "SELECT * WHERE { ?iri <http://reasoner.renci.org/vocab/normalizedInformationContent> ?nic }"
-        resultmap = self.triplestore.query(query, ['iri', 'nic'])
+        count_query = "SELECT (COUNT(*) AS ?count) WHERE { ?iri <http://reasoner.renci.org/vocab/normalizedInformationContent> ?nic }"
+        count_result = self.triplestore.query(count_query, ['count'])
+        total_count = int(count_result[0]['count'])
+        assert total_count > 0
+
+        write_count = 0
         with open(filename, "w") as ftsv:
-            for row in resultmap:
-                ftsv.write(f"{row['iri']}\t{row['nic']}\n")
+            for start in range(0, total_count, UberGraph.QUERY_BATCH_SIZE):
+                print(f"Querying write_normalized_information_content() offset {start} limit {UberGraph.QUERY_BATCH_SIZE} (total count: {total_count})")
+
+                query = "SELECT ?iri ?nic WHERE " \
+                    "{ ?iri <http://reasoner.renci.org/vocab/normalizedInformationContent> ?nic }" \
+                    f" ORDER BY ASC(?iri) OFFSET {start} LIMIT {UberGraph.QUERY_BATCH_SIZE}"
+                results = self.triplestore.query(query, ['iri', 'nic'])
+
+                for row in results:
+                    ftsv.write(f"{row['iri']}\t{row['nic']}\n")
+                    write_count += 1
-        print(f"Wrote {len(resultmap)} information content values into {filename}.")
-        return len(resultmap)
+        print(f"Wrote {write_count} information content values into {filename}.")
+        return write_count

From cc01243227689b90432c905f1f35108d5908fcf0 Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya
Date: Sun, 14 May 2023 16:51:14 -0400
Subject: [PATCH 13/40] Added UMLS version to config.json.
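The new key is consumed by the download logic in PATCH 14 below to assemble the
UMLS release URL. Roughly (a sketch mirroring that code):

```python
# config.json supplies, e.g., "umls_version": "2023AA"
umls_version = config['umls_version']
zip_name = f"umls-{umls_version}-metathesaurus-full.zip"
release_url = f"https://download.nlm.nih.gov/umls/kss/{umls_version}/{zip_name}"
# The actual request goes through https://uts-ws.nlm.nih.gov/download,
# passing release_url as the "url" parameter along with a UMLS API key.
```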
---
 config.json | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/config.json b/config.json
index 1b9d79f8..baaa92ca 100644
--- a/config.json
+++ b/config.json
@@ -3,7 +3,9 @@
   "download_directory": "babel_downloads",
   "intermediate_directory": "babel_outputs/intermediate",
   "output_directory": "babel_outputs",
+
   "biolink_version": "3.3.3",
+  "umls_version": "2023AA",

   "ncbi_files": ["gene2ensembl.gz", "gene_info.gz", "gene_orthologs.gz", "gene_refseq_uniprotkb_collab.gz", "mim2gene_medgen"],
   "ubergraph_ontologies": ["UBERON", "CL", "GO", "NCIT", "ECO", "ECTO", "ENVO", "HP", "UPHENO","BFO","BSPO","CARO","CHEBI","CP","GOREL","IAO","MAXO","MONDO","PATO","PR","RO","UBPROP"],

From 918a7297667fdce520ee25b280ad40a79f2e101b Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya
Date: Sun, 14 May 2023 16:54:50 -0400
Subject: [PATCH 14/40] Added UMLS download instructions to datacollect Snakemake.

---
 src/datahandlers/umls.py             | 56 ++++++++++++++++++++++++++++-
 src/snakefiles/datacollect.snakefile |  7 ++++
 2 files changed, 61 insertions(+), 2 deletions(-)

diff --git a/src/datahandlers/umls.py b/src/datahandlers/umls.py
index 1a278dbc..867c1961 100644
--- a/src/datahandlers/umls.py
+++ b/src/datahandlers/umls.py
@@ -1,4 +1,10 @@
-from src.babel_utils import make_local_name, pull_via_ftp
+import logging
+import shutil
+from zipfile import ZipFile
+
+import requests
+
+from src.babel_utils import make_local_name, pull_via_ftp, pull_via_urllib
 from src.prefixes import UMLS
 from collections import defaultdict
 import os
@@ -123,7 +129,53 @@ def read_umls_priority():
     prid = { x:i for i,x in enumerate(pris) }
     return prid

-def pull_umls():
+
+def download_umls(umls_version, download_dir):
+    """
+    Download the latest UMLS into the specified download directory. In addition to downloading
+    and unzipping UMLS, this will move the files we use into the main directory.
+
+    :param umls_version: The version of UMLS to download (e.g. `2023AA`).
+    :param download_dir: The directory to download UMLS to (e.g. `babel_downloads/UMLS`)
+    """
+    umls_api_key = os.environ.get('UMLS_API_KEY')
+    if not umls_api_key:
+        print("The environmental variable UMLS_API_KEY needs to be set to a valid UMLS API key.")
+        print("See instructions at https://documentation.uts.nlm.nih.gov/rest/authentication.html")
+        exit(1)
+
+    # Download umls-{umls_version}-metathesaurus-full.zip
+    # As described at https://documentation.uts.nlm.nih.gov/automating-downloads.html
+    umls_url = f"https://uts-ws.nlm.nih.gov/download"
+    req = requests.get(umls_url, {
+        "url": f"https://download.nlm.nih.gov/umls/kss/{umls_version}/umls-{umls_version}-metathesaurus-full.zip",
+        "apiKey": umls_api_key
+    }, stream=True)
+    if not req.ok:
+        print(f"Unable to download UMLS from {umls_url}: {req}")
+        exit(1)
+
+    # Write file to {download_dir}/umls-{umls_version}-metathesaurus-full.zip
+    logging.info(f"Downloading umls-{umls_version}-metathesaurus-full.zip to {download_dir}")
+    os.makedirs(download_dir, exist_ok=True)
+    umls_download_zip = os.path.join(download_dir, f"umls-{umls_version}-metathesaurus-full.zip")
+    with open(umls_download_zip, 'wb') as fd:
+        for chunk in req.iter_content(chunk_size=128):
+            fd.write(chunk)
+
+    # Unzip file.
+    logging.info(f"Uncompressing {umls_download_zip}")
+    with ZipFile(umls_download_zip, 'r') as zipObj:
+        zipObj.extractall(download_dir)
+
+    # Move files we use to the main download directory.
+    # - MRCONSO.RRF
+    shutil.copy2(os.path.join(download_dir, umls_version, 'MRCONSO.RRF'), download_dir)
+    # - MRSTY.RRF
+    shutil.copy2(os.path.join(download_dir, umls_version, 'MRSTY.RRF'), download_dir)
+
+
+def pull_umls(mrconso):
     """Run through MRCONSO.RRF creating label and synonym files for UMLS and SNOMEDCT"""
     mrcon = os.path.join('input_data', 'private', 'MRCONSO.RRF')
     rows = defaultdict(list)

diff --git a/src/snakefiles/datacollect.snakefile b/src/snakefiles/datacollect.snakefile
index e5f28466..2379210d 100644
--- a/src/snakefiles/datacollect.snakefile
+++ b/src/snakefiles/datacollect.snakefile
@@ -133,6 +133,13 @@ rule get_mesh_synonyms:

 ### UMLS / SNOMEDCT

+rule download_umls:
+    output:
+        config['download_directory']+'/UMLS/MRCONSO.RRF',
+        config['download_directory']+'/UMLS/MRSTY.RRF',
+    run:
+        umls.download_umls(config['umls_version'], config['download_directory'] + '/UMLS')
+
 rule get_umls_labels_and_synonyms:
     output:
         config['download_directory']+'/UMLS/labels',

From 68fc61aa7b219352603026d8a2f55672b16210ec Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya
Date: Sun, 14 May 2023 16:57:22 -0400
Subject: [PATCH 15/40] Explicitly added MRSTY.RRF to write_umls_ids().

---
 src/createcompendia/anatomy.py            | 4 ++--
 src/createcompendia/chemicals.py          | 4 ++--
 src/createcompendia/diseasephenotype.py   | 4 ++--
 src/createcompendia/gene.py               | 5 ++---
 src/createcompendia/protein.py            | 4 ++--
 src/createcompendia/taxon.py              | 4 ++--
 src/datahandlers/umls.py                  | 4 +---
 src/snakefiles/anatomy.snakefile          | 5 +++--
 src/snakefiles/chemical.snakefile         | 4 +++-
 src/snakefiles/datacollect.snakefile      | 4 +++-
 src/snakefiles/diseasephenotype.snakefile | 6 +++---
 src/snakefiles/gene.snakefile             | 5 ++++-
 src/snakefiles/leftover_umls.snakefile    | 4 ++--
 src/snakefiles/protein.snakefile          | 4 +++-
 src/snakefiles/taxon.snakefile            | 4 +++-
 15 files changed, 37 insertions(+), 28 deletions(-)

diff --git a/src/createcompendia/anatomy.py b/src/createcompendia/anatomy.py
index 0b9a7f36..8f63c78c 100644
--- a/src/createcompendia/anatomy.py
+++ b/src/createcompendia/anatomy.py
@@ -62,7 +62,7 @@ def write_mesh_ids(outfile):
     meshmap['A11.284'] = CELLULAR_COMPONENT
     mesh.write_ids(meshmap,outfile)

-def write_umls_ids(outfile):
+def write_umls_ids(mrsty, outfile):
     #UMLS categories:
     #A1.2 Anatomical Structure
     #A1.2.1 Embryonic Structure
@@ -77,7 +77,7 @@ def write_umls_ids(outfile):
     umlsmap = {x: ANATOMICAL_ENTITY for x in ['A1.2', 'A1.2.1', 'A1.2.3.1', 'A1.2.3.2', 'A2.1.4.1', 'A2.1.5.1', 'A2.1.5.2']}
     umlsmap['A1.2.3.3'] = CELL
     umlsmap['A1.2.3.4'] = CELLULAR_COMPONENT
-    umls.write_umls_ids(umlsmap,outfile)
+    umls.write_umls_ids(mrsty, umlsmap, outfile)

 #Ignore list notes:
 #The BTO and BAMs and HTTP (braininfo) identifiers promote over-glommed nodes

diff --git a/src/createcompendia/chemicals.py b/src/createcompendia/chemicals.py
index 742f3a50..33cc15e3 100644
--- a/src/createcompendia/chemicals.py
+++ b/src/createcompendia/chemicals.py
@@ -23,7 +23,7 @@ def get_type_from_smiles(smiles):
     else:
         return SMALL_MOLECULE

-def write_umls_ids(outfile):
+def write_umls_ids(mrsty, outfile):
     groups = ['A1.4.1.1.1.1', #antibiotic
               'A1.4.1.1.3.2', # Hormone
               'A1.4.1.1.3.3',# Enzyme
@@ -40,7 +40,7 @@ def write_umls_ids(outfile):
     #'A1.4.1.1.3.6',# Receptor
     #'A1.4.1.2.1.7 Amino Acid, Peptide, or Protein
     umlsmap = {a:CHEMICAL_ENTITY for a in groups}
-    umls.write_umls_ids(umlsmap, outfile)
+    umls.write_umls_ids(mrsty, umlsmap, outfile)

 def build_chemical_umls_relationships(idfile,outfile):

diff --git a/src/createcompendia/diseasephenotype.py b/src/createcompendia/diseasephenotype.py
index 7fa9c189..084cd227 100644
--- a/src/createcompendia/diseasephenotype.py
+++ b/src/createcompendia/diseasephenotype.py
@@ -54,7 +54,7 @@ def write_mesh_ids(outfile):
     meshmap['C23'] = PHENOTYPIC_FEATURE
     mesh.write_ids(meshmap,outfile,order=[DISEASE,PHENOTYPIC_FEATURE])

-def write_umls_ids(outfile,badumlsfile):
+def write_umls_ids(mrsty, outfile,badumlsfile):
     badumls=set()
     with open(badumlsfile,'r') as inf:
         for line in inf:
@@ -81,7 +81,7 @@ def write_umls_ids(outfile,badumlsfile):
     #A2.2.2 Sign or Symptom
     umlsmap['A2.2.1'] = PHENOTYPIC_FEATURE
     umlsmap['A2.2.2'] = PHENOTYPIC_FEATURE
-    umls.write_umls_ids(umlsmap,outfile,blacklist=badumls)
+    umls.write_umls_ids(mrsty, umlsmap, outfile, blacklist=badumls)

 def build_disease_obo_relationships(outdir):

diff --git a/src/createcompendia/gene.py b/src/createcompendia/gene.py
index eeff20a6..5f4a2fff 100644
--- a/src/createcompendia/gene.py
+++ b/src/createcompendia/gene.py
@@ -1,5 +1,6 @@
 import re

+from src import babel_utils
 from src.prefixes import OMIM,ENSEMBL,NCBIGENE,WORMBASE, MGI, ZFIN, DICTYBASE, FLYBASE, RGD, SGD, HGNC, UMLS
 from src.categories import GENE

@@ -97,7 +98,7 @@ def write_omim_ids(infile,outfile):
             if chunks[1] == 'gene':
                 outf.write(f'{OMIM}:{chunks[0]}\n')

-def write_umls_ids(outfile):
+def write_umls_ids(mrconso, mrsty, outfile):
     """Find the UMLS entities that are genes.  This is complicated by the fact that UMLS semantic type doesn't
     have a corresponding GENE class.  It has something (A1.2.3.5) which includes genes, but also includes genomes
     and variants and gene properties and gene families.  We can do some filtering by looking around in the MRCONSO as well
@@ -111,7 +112,6 @@ def write_umls_ids(outfile):
     blacklist=set(['C0017361', #recessive genes
                    'C0017346', #Gag viral gene family
                    ])
-    mrsty = os.path.join('input_data', 'private', 'MRSTY.RRF')
     umls_keepers = set()
     with open(mrsty, 'r') as inf:
         for line in inf:
@@ -121,7 +121,6 @@ def write_umls_ids(outfile):
                 umls_keepers.add(x[0])
     umls_keepers.difference_update(blacklist)
     #Now filter out OMIM variants
-    mrconso = os.path.join('input_data', 'private', 'MRCONSO.RRF')
     with open(mrconso,'r') as inf:
         for line in inf:
             x = line.strip().split('|')

diff --git a/src/createcompendia/protein.py b/src/createcompendia/protein.py
index d4cd2a20..978d3561 100644
--- a/src/createcompendia/protein.py
+++ b/src/createcompendia/protein.py
@@ -46,10 +46,10 @@ def write_ensembl_ids(ensembl_dir, outfile):
                 wrote.add(gid)
                 outf.write(f'{gid}\n')

-def write_umls_ids(outfile):
+def write_umls_ids(mrsty, outfile):
     umlsmap = {}
     umlsmap['A1.4.1.2.1.7'] = PROTEIN
-    umls.write_umls_ids(umlsmap, outfile)
+    umls.write_umls_ids(mrsty, umlsmap, outfile)

 def write_pr_ids(outfile):
     protein_id = f'{PR}:000000001'

diff --git a/src/createcompendia/taxon.py b/src/createcompendia/taxon.py
index ac62d179..ec28b196 100644
--- a/src/createcompendia/taxon.py
+++ b/src/createcompendia/taxon.py
@@ -22,7 +22,7 @@ def write_mesh_ids(outfile):
     #Also add anything from SCR_Chemical, if it doesn't have a tree map
     mesh.write_ids(meshmap,outfile,order=[ORGANISM_TAXON],extra_vocab={'SCR_Organism':ORGANISM_TAXON})

-def write_umls_ids(outfile):
+def write_umls_ids(mrsty, outfile):
     # UMLS categories that should be classified as taxa:
     # - A1.1.3: Eukaryote (https://uts.nlm.nih.gov/uts/umls/semantic-network/T204)
     # - A1.1.2: Bacterium (https://uts.nlm.nih.gov/uts/umls/semantic-network/T007)
@@ -59,7 +59,7 @@ def write_umls_ids(outfile):
                                           'A1.1', 'A1.1.3.1.1'
     ]}
-    umls.write_umls_ids(umlsmap,outfile)
+    umls.write_umls_ids(mrsty, umlsmap,outfile)

 def build_taxon_umls_relationships(idfile,outfile):
     umls.build_sets(idfile, outfile, {'MSH': MESH, 'NCBITaxon': NCBITAXON})

diff --git a/src/datahandlers/umls.py b/src/datahandlers/umls.py
index 867c1961..eaff3b96 100644
--- a/src/datahandlers/umls.py
+++ b/src/datahandlers/umls.py
@@ -41,9 +41,8 @@ def check_mrconso_line(line):
     return True

-def write_umls_ids(category_map,umls_output,blacklist=set()):
+def write_umls_ids(mrsty, category_map,umls_output,blacklist=set()):
     categories = set(category_map.keys())
-    mrsty = os.path.join('input_data', 'private', 'MRSTY.RRF')
     umls_keepers = set()
     with open(mrsty,'r') as inf, open(umls_output,'w') as outf:
         for line in inf:
@@ -176,7 +175,6 @@ def download_umls(umls_version, download_dir):

 def pull_umls(mrconso):
     """Run through MRCONSO.RRF creating label and synonym files for UMLS and SNOMEDCT"""
-    mrcon = os.path.join('input_data', 'private', 'MRCONSO.RRF')
     rows = defaultdict(list)

diff --git a/src/snakefiles/anatomy.snakefile b/src/snakefiles/anatomy.snakefile
index 63747cd1..30b5d8f8 100644
--- a/src/snakefiles/anatomy.snakefile
+++ b/src/snakefiles/anatomy.snakefile
@@ -36,11 +36,12 @@ rule anatomy_mesh_ids:
         anatomy.write_mesh_ids(output.outfile)

 rule anatomy_umls_ids:
-    #The location of the RRFs is known to the guts, but should probably come out here.
+    input:
+        mrsty=config['download_directory'] + "/UMLS/MRSTY.RRF"
     output:
         outfile=config['intermediate_directory']+"/anatomy/ids/UMLS"
     run:
-        anatomy.write_umls_ids(output.outfile)
+        anatomy.write_umls_ids(input.mrsty, output.outfile)

 rule get_anatomy_obo_relationships:
     output:

diff --git a/src/snakefiles/chemical.snakefile b/src/snakefiles/chemical.snakefile
index 6da34cc3..26f90d35 100644
--- a/src/snakefiles/chemical.snakefile
+++ b/src/snakefiles/chemical.snakefile
@@ -2,10 +2,12 @@ import src.createcompendia.chemicals as chemicals
 import src.assess_compendia as assessments

 rule chemical_umls_ids:
+    input:
+        mrsty=config['download_directory'] + "/UMLS/MRSTY.RRF"
     output:
         outfile=config['intermediate_directory']+"/chemicals/ids/UMLS"
     run:
-        chemicals.write_umls_ids(output.outfile)
+        chemicals.write_umls_ids(input.mrsty, output.outfile)

 rule chemical_mesh_ids:
     input:

diff --git a/src/snakefiles/datacollect.snakefile b/src/snakefiles/datacollect.snakefile
index 2379210d..8fe2e149 100644
--- a/src/snakefiles/datacollect.snakefile
+++ b/src/snakefiles/datacollect.snakefile
@@ -141,13 +141,15 @@ rule download_umls:
         umls.download_umls(config['umls_version'], config['download_directory'] + '/UMLS')

 rule get_umls_labels_and_synonyms:
+    input:
+        mrconso=config['download_directory']+'/UMLS/MRCONSO.RRF'
     output:
         config['download_directory']+'/UMLS/labels',
         config['download_directory']+'/UMLS/synonyms',
         config['download_directory']+'/SNOMEDCT/labels',
         config['download_directory']+'/SNOMEDCT/synonyms'
     run:
-        umls.pull_umls()
+        umls.pull_umls(input.mrconso)

 ### OBO Ontologies

diff --git a/src/snakefiles/diseasephenotype.snakefile b/src/snakefiles/diseasephenotype.snakefile
index 2d7d8cb5..dc882ec4 100644
--- a/src/snakefiles/diseasephenotype.snakefile
+++ b/src/snakefiles/diseasephenotype.snakefile
@@ -53,13 +53,13 @@ rule disease_mesh_ids:
         diseasephenotype.write_mesh_ids(output.outfile)

 rule disease_umls_ids:
-    #The location of the RRFs is known to the guts, but should probably come out here.
input: - badumls = config['input_directory']+"/badumls" + badumls = config['input_directory']+"/badumls", + mrsty = config['download_directory'] + "/UMLS/MRSTY.RRF" output: outfile=config['intermediate_directory']+"/disease/ids/UMLS" run: - diseasephenotype.write_umls_ids(output.outfile,input.badumls) + diseasephenotype.write_umls_ids(input.mrsty, output.outfile, input.badumls) rule disease_hp_ids: #The location of the RRFs is known to the guts, but should probably come out here. diff --git a/src/snakefiles/gene.snakefile b/src/snakefiles/gene.snakefile index aeb47221..48244fc0 100644 --- a/src/snakefiles/gene.snakefile +++ b/src/snakefiles/gene.snakefile @@ -43,10 +43,13 @@ rule gene_hgnc_ids: gene.write_hgnc_ids(input.infile,output.outfile) rule gene_umls_ids: + input: + mrconso=config['download_directory']+"/UMLS/MRCONSO.RRF", + mrsty=config['download_directory']+"/UMLS/MRSTY.RRF" output: outfile=config['intermediate_directory']+"/gene/ids/UMLS" run: - gene.write_umls_ids(output.outfile) + gene.write_umls_ids(input.mrconso, input.mrsty, output.outfile) rule get_gene_ncbigene_ensembl_relationships: input: diff --git a/src/snakefiles/leftover_umls.snakefile b/src/snakefiles/leftover_umls.snakefile index 50592279..eb5efbcc 100644 --- a/src/snakefiles/leftover_umls.snakefile +++ b/src/snakefiles/leftover_umls.snakefile @@ -29,8 +29,8 @@ rule leftover_umls: config['chemical_outputs'] + config['genefamily_outputs'] + config['taxon_outputs']), - mrconso = config['input_directory'] + '/private/MRCONSO.RRF', - mrsty = config['input_directory'] + '/private/MRSTY.RRF', + mrconso = config['download_directory'] + '/UMLS/MRCONSO.RRF', + mrsty = config['download_directory'] + '/UMLS/MRSTY.RRF', synonyms = config['download_directory'] + '/UMLS/synonyms' output: umls_compendium = config['output_directory'] + "/compendia/umls.txt", diff --git a/src/snakefiles/protein.snakefile b/src/snakefiles/protein.snakefile index bc86f5c6..9f984a4e 100644 --- a/src/snakefiles/protein.snakefile +++ b/src/snakefiles/protein.snakefile @@ -20,10 +20,12 @@ rule protein_uniprotkb_ids: "awk '{{print $1}}' {input.infile} > {output.outfile}" rule protein_umls_ids: + input: + mrsty=config['download_directory']+"/UMLS/MRSTY.RRF" output: outfile=config['intermediate_directory']+"/protein/ids/UMLS" run: - protein.write_umls_ids(output.outfile) + protein.write_umls_ids(input.mrsty, output.outfile) rule protein_ensembl_ids: input: diff --git a/src/snakefiles/taxon.snakefile b/src/snakefiles/taxon.snakefile index 2faf5d43..84f5caa6 100644 --- a/src/snakefiles/taxon.snakefile +++ b/src/snakefiles/taxon.snakefile @@ -19,10 +19,12 @@ rule taxon_mesh_ids: taxon.write_mesh_ids(output.outfile) rule taxon_umls_ids: + input: + mrsty=config['download_directory'] + "/UMLS/MRSTY.RRF" output: outfile=config['intermediate_directory']+"/taxon/ids/UMLS" run: - taxon.write_umls_ids(output.outfile) + taxon.write_umls_ids(input.mrsty, output.outfile) rule get_taxon_umls_relationships: input: From 3cc146dda615b37e7b41828e744af3e6572d8455 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sun, 14 May 2023 17:37:37 -0400 Subject: [PATCH 16/40] Removed UMLS files from README. --- README.md | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 7ee5596a..5b024230 100644 --- a/README.md +++ b/README.md @@ -119,14 +119,10 @@ You can also run Babel with [Docker](https://www.docker.com/). 
-There are two directories you need to bind or mount from outside the container: +There is one directory you need to bind or mount from outside the container: ``` -$ docker run -it --rm --mount type=bind,source=...,target=/home/runner/babel/input_data/private --mount type=bind,source=...,target=/home/runner/babel/babel_downloads --entrypoint /bin/bash ggvaidya/babel +$ docker run -it --rm --mount type=bind,source=...,target=/home/runner/babel/babel_downloads --entrypoint /bin/bash ggvaidya/babel ``` -These two directories should be set up as following: -* `babel/input_data/private` is used to store some input files - that you will need to download yourself: - * `MRCONSO.RRF` and `MRSTY.RRF`: parts of the UMLS release, need to be downloaded from [the UMLS download website](https://www.nlm.nih.gov/research/umls/licensedcontent/umlsknowledgesources.html). -* `babel/babel_downloads` is used to store data files downloaded during Babel assembly. +The download directory (`babel/babel_downloads`) is used to store data files downloaded during Babel assembly. The script `scripts/build-babel.sh` can be used to run `snakemake` with a few useful settings (although just running `snakemake --cores 5` should work just fine.) From f34677a3dfbd4e6edfaebc5d9aada81b433020e7 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sun, 14 May 2023 17:44:09 -0400 Subject: [PATCH 17/40] Added MRCONSO as explicit input to umls.build_sets(). --- src/createcompendia/anatomy.py | 4 ++-- src/createcompendia/chemicals.py | 4 ++-- src/createcompendia/diseasephenotype.py | 4 ++-- src/createcompendia/gene.py | 4 ++-- src/createcompendia/protein.py | 4 ++-- src/createcompendia/taxon.py | 4 ++-- src/datahandlers/umls.py | 3 +-- src/snakefiles/anatomy.snakefile | 3 ++- src/snakefiles/chemical.snakefile | 3 ++- src/snakefiles/diseasephenotype.snakefile | 3 ++- src/snakefiles/gene.snakefile | 3 ++- src/snakefiles/protein.snakefile | 3 ++- src/snakefiles/taxon.snakefile | 3 ++- 13 files changed, 25 insertions(+), 20 deletions(-) diff --git a/src/createcompendia/anatomy.py b/src/createcompendia/anatomy.py index 8f63c78c..d776620a 100644 --- a/src/createcompendia/anatomy.py +++ b/src/createcompendia/anatomy.py @@ -96,8 +96,8 @@ def build_anatomy_obo_relationships(outdir): build_sets(f'{UBERON}:0001062', {UBERON:uberon, GO:go, CL:cl},'xref', ignore_list=ignore_list) build_sets(f'{GO}:0005575', {UBERON:uberon, GO:go, CL:cl},'xref', ignore_list=ignore_list) -def build_anatomy_umls_relationships(idfile,outfile): - umls.build_sets(idfile, outfile, {'SNOMEDCT_US':SNOMEDCT,'MSH': MESH, 'NCI': NCIT}) +def build_anatomy_umls_relationships(mrconso, idfile,outfile): + umls.build_sets(mrconso, idfile, outfile, {'SNOMEDCT_US':SNOMEDCT,'MSH': MESH, 'NCI': NCIT}) def build_compendia(concordances, identifiers, icrdf_filename): """:concordances: a list of files from which to read relationships diff --git a/src/createcompendia/chemicals.py b/src/createcompendia/chemicals.py index 33cc15e3..6346ac01 100644 --- a/src/createcompendia/chemicals.py +++ b/src/createcompendia/chemicals.py @@ -43,8 +43,8 @@ def write_umls_ids(mrsty, outfile): umls.write_umls_ids(mrsty, umlsmap, outfile) -def build_chemical_umls_relationships(idfile,outfile): - umls.build_sets(idfile, outfile, {'MSH': MESH, 'DRUGBANK': DRUGBANK}) +def build_chemical_umls_relationships(mrconso, idfile, outfile): + umls.build_sets(mrconso, idfile, outfile, {'MSH': MESH, 'DRUGBANK': DRUGBANK}) def write_pubchem_ids(labelfile,smilesfile,outfile): diff --git a/src/createcompendia/diseasephenotype.py b/src/createcompendia/diseasephenotype.py index b2c3fe50..084cd227 100644 ---
a/src/createcompendia/diseasephenotype.py +++ b/src/createcompendia/diseasephenotype.py @@ -105,7 +105,7 @@ def build_disease_efo_relationships(idfile,outfile): efo.make_concords(idfile, outfile) -def build_disease_umls_relationships(idfile,outfile,omimfile,ncitfile): +def build_disease_umls_relationships(mrconso, idfile, outfile, omimfile, ncitfile): #UMLS contains xrefs between a disease UMLS and a gene OMIM. So here we are saying: if you are going to link to # an omim identifier, make sure it's a disease omim, not some other thing. good_ids = {} @@ -115,7 +115,7 @@ def build_disease_umls_relationships(idfile,outfile,omimfile,ncitfile): for line in inf: x = line.split()[0] good_ids[prefix].add(x) - umls.build_sets(idfile, outfile, {'SNOMEDCT_US':SNOMEDCT,'MSH': MESH, 'NCI': NCIT, 'HPO': HP, 'MDR':MEDDRA, 'OMIM': OMIM},acceptable_identifiers=good_ids) + umls.build_sets(mrconso, idfile, outfile, {'SNOMEDCT_US':SNOMEDCT,'MSH': MESH, 'NCI': NCIT, 'HPO': HP, 'MDR':MEDDRA, 'OMIM': OMIM},acceptable_identifiers=good_ids) def build_disease_doid_relationships(idfile,outfile): doid.build_xrefs(idfile, outfile, other_prefixes={'ICD10CM':ICD10, 'ICD9CM':ICD9, 'ICDO': ICD0, 'NCI': NCIT, diff --git a/src/createcompendia/gene.py b/src/createcompendia/gene.py index 5f4a2fff..1c89da02 100644 --- a/src/createcompendia/gene.py +++ b/src/createcompendia/gene.py @@ -247,9 +247,9 @@ def write_ensembl_ids(ensembl_dir, outfile): outf.write(f'{gid}\n') -def build_gene_umls_hgnc_relationships(umls_idfile,outfile): +def build_gene_umls_hgnc_relationships(mrconso, umls_idfile, outfile): #Could also add MESH, if that were a valid gene prefix - umls.build_sets(umls_idfile, outfile, {'HGNC':HGNC}) + umls.build_sets(mrconso, umls_idfile, outfile, {'HGNC':HGNC}) def build_gene_compendia(concordances, identifiers, icrdf_filename): """:concordances: a list of files from which to read relationships diff --git a/src/createcompendia/protein.py b/src/createcompendia/protein.py index 978d3561..976e7e91 100644 --- a/src/createcompendia/protein.py +++ b/src/createcompendia/protein.py @@ -130,8 +130,8 @@ def build_ncit_uniprot_relationships(infile,outfile): uniprot_id = f'{UNIPROTKB}:{x[1]}' outf.write(f'{ncit_id}\teq\t{uniprot_id}\n') -def build_umls_ncit_relationships(idfile,outfile): - umls.build_sets(idfile, outfile, {'NCI': NCIT}) +def build_umls_ncit_relationships(mrconso, idfile, outfile): + umls.build_sets(mrconso, idfile, outfile, {'NCI': NCIT}) def build_protein_compendia(concordances, identifiers, icrdf_filename): """:concordances: a list of files from which to read relationships diff --git a/src/createcompendia/taxon.py b/src/createcompendia/taxon.py index ec28b196..f9a71661 100644 --- a/src/createcompendia/taxon.py +++ b/src/createcompendia/taxon.py @@ -61,8 +61,8 @@ def write_umls_ids(mrsty, outfile): ]} umls.write_umls_ids(mrsty, umlsmap,outfile) -def build_taxon_umls_relationships(idfile,outfile): - umls.build_sets(idfile, outfile, {'MSH': MESH, 'NCBITaxon': NCBITAXON}) +def build_taxon_umls_relationships(mrconso, idfile, outfile): + umls.build_sets(mrconso, idfile, outfile, {'MSH': MESH, 'NCBITaxon': NCBITAXON}) def build_relationships(outfile,mesh_ids): regis = mesh.pull_mesh_registry() diff --git a/src/datahandlers/umls.py b/src/datahandlers/umls.py index eaff3b96..7922bac9 100644 --- a/src/datahandlers/umls.py +++ b/src/datahandlers/umls.py @@ -57,7 +57,7 @@ def write_umls_ids(mrsty, category_map,umls_output,blacklist=set()): # One is to keep from having to pass through the umls file more than once, but that's 
a bad reason # The second is because I want to use the UMLS as a source for some terminologies (SNOMED) even if there's another # way. I'm going to modify this to do one thing at a time, and if it takes a little longer, then so be it. -def build_sets(umls_input, umls_output , other_prefixes, bad_mappings=defaultdict(set), acceptable_identifiers={}): +def build_sets(mrconso, umls_input, umls_output , other_prefixes, bad_mappings=defaultdict(set), acceptable_identifiers={}): """Given a list of umls identifiers we want to generate all the concordances between UMLS and that other entity""" # On UMLS / MESH: we have been getting all UMLS / MESH relationships. This has led to some clear mistakes @@ -72,7 +72,6 @@ def build_sets(umls_input, umls_output , other_prefixes, bad_mappings=defaultdic umls_ids.add(u) lookfor = set(other_prefixes.keys()) acceptable_mesh_tty = set(["MH","NM","HT","QAB"]) - mrconso = os.path.join('input_data', 'private', 'MRCONSO.RRF') pairs = set() #test_cui = 'C0026827' with open(mrconso,'r') as inf, open(umls_output,'w') as concordfile: diff --git a/src/snakefiles/anatomy.snakefile b/src/snakefiles/anatomy.snakefile index 30b5d8f8..0231a5b0 100644 --- a/src/snakefiles/anatomy.snakefile +++ b/src/snakefiles/anatomy.snakefile @@ -53,11 +53,12 @@ rule get_anatomy_obo_relationships: rule get_anatomy_umls_relationships: input: + mrconso=config['download_directory']+"/UMLS/MRCONSO.RRF", infile=config['intermediate_directory']+"/anatomy/ids/UMLS" output: outfile=config['intermediate_directory']+'/anatomy/concords/UMLS', run: - anatomy.build_anatomy_umls_relationships(input.infile,output.outfile) + anatomy.build_anatomy_umls_relationships(input.mrconso, input.infile, output.outfile) rule anatomy_compendia: input: diff --git a/src/snakefiles/chemical.snakefile b/src/snakefiles/chemical.snakefile index 26f90d35..273d2c9c 100644 --- a/src/snakefiles/chemical.snakefile +++ b/src/snakefiles/chemical.snakefile @@ -106,11 +106,12 @@ rule get_chemical_drugcentral_relationships: rule get_chemical_umls_relationships: input: + mrconso=config['download_directory']+"/UMLS/MRCONSO.RRF", infile=config['intermediate_directory']+"/chemicals/ids/UMLS", output: outfile=config['intermediate_directory']+'/chemicals/concords/UMLS', run: - chemicals.build_chemical_umls_relationships(input.infile,output.outfile) + chemicals.build_chemical_umls_relationships(input.mrconso, input.infile, output.outfile) rule get_chemical_wikipedia_relationships: output: diff --git a/src/snakefiles/diseasephenotype.snakefile b/src/snakefiles/diseasephenotype.snakefile index dc882ec4..a3d80f6e 100644 --- a/src/snakefiles/diseasephenotype.snakefile +++ b/src/snakefiles/diseasephenotype.snakefile @@ -96,13 +96,14 @@ rule get_disease_efo_relationships: rule get_disease_umls_relationships: input: + mrconso=config['download_directory']+"/UMLS/MRCONSO.RRF", infile=config['intermediate_directory']+"/disease/ids/UMLS", omim=config['intermediate_directory']+'/disease/ids/OMIM', ncit=config['intermediate_directory'] + '/disease/ids/NCIT' output: outfile=config['intermediate_directory']+'/disease/concords/UMLS', run: - diseasephenotype.build_disease_umls_relationships(input.infile,output.outfile,input.omim,input.ncit) + diseasephenotype.build_disease_umls_relationships(input.mrconso, input.infile,output.outfile,input.omim,input.ncit) rule get_disease_doid_relationships: input: diff --git a/src/snakefiles/gene.snakefile b/src/snakefiles/gene.snakefile index 48244fc0..4ca59e7a 100644 --- a/src/snakefiles/gene.snakefile +++ 
b/src/snakefiles/gene.snakefile @@ -88,11 +88,12 @@ rule get_gene_medgen_relationships: rule get_gene_umls_relationships: input: + mrconso=config['download_directory']+"/UMLS/MRCONSO.RRF", infile=config['intermediate_directory']+'/gene/ids/UMLS' output: outfile=config['intermediate_directory']+'/gene/concords/UMLS' run: - gene.build_gene_umls_hgnc_relationships(input.infile, output.outfile) + gene.build_gene_umls_hgnc_relationships(input.mrconso, input.infile, output.outfile) rule gene_compendia: input: diff --git a/src/snakefiles/protein.snakefile b/src/snakefiles/protein.snakefile index 9f984a4e..8a1c7f4b 100644 --- a/src/snakefiles/protein.snakefile +++ b/src/snakefiles/protein.snakefile @@ -59,11 +59,12 @@ rule get_protein_ncit_uniprotkb_relationships: rule get_protein_ncit_umls_relationships: input: + mrconso=config['download_directory']+"/UMLS/MRCONSO.RRF", infile=config['intermediate_directory']+"/protein/ids/UMLS" output: outfile=config['intermediate_directory']+'/protein/concords/NCIT_UMLS', run: - protein.build_umls_ncit_relationships(input.infile,output.outfile) + protein.build_umls_ncit_relationships(input.mrconso, input.infile, output.outfile) rule protein_compendia: input: diff --git a/src/snakefiles/taxon.snakefile b/src/snakefiles/taxon.snakefile index 84f5caa6..32ea73e8 100644 --- a/src/snakefiles/taxon.snakefile +++ b/src/snakefiles/taxon.snakefile @@ -28,11 +28,12 @@ rule taxon_umls_ids: rule get_taxon_umls_relationships: input: + mrconso=config['download_directory']+"/UMLS/MRCONSO.RRF", infile=config['intermediate_directory']+"/taxon/ids/UMLS" output: outfile=config['intermediate_directory']+'/taxon/concords/UMLS', run: - taxon.build_taxon_umls_relationships(input.infile,output.outfile) + taxon.build_taxon_umls_relationships(input.mrconso, input.infile, output.outfile) rule get_taxon_relationships: input: From e17d2f4ccb1e6a2016d1a2e06a709d8b17b398e1 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sun, 26 Mar 2023 18:02:56 -0400 Subject: [PATCH 18/40] First stab at downloading all the descriptions from UberGraph. --- src/snakefiles/datacollect.snakefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/snakefiles/datacollect.snakefile b/src/snakefiles/datacollect.snakefile index 8fe2e149..008b42fc 100644 --- a/src/snakefiles/datacollect.snakefile +++ b/src/snakefiles/datacollect.snakefile @@ -160,8 +160,7 @@ rule get_ontology_labels_descriptions_and_synonyms: icrdf_filename = config['download_directory']+'/icRDF.tsv' config['download_directory']+'/icRDF.tsv', expand("{download_directory}/{onto}/synonyms", download_directory = config['download_directory'], onto = config['ubergraph_ontologies']), - # This would make sense if we had descriptions for every ontology, but since we don't, we can't make these outputs explicit. - # expand("{download_directory}/{onto}/descriptions", download_directory = config['download_directory'], onto = config['ubergraph_ontologies']), + expand("{download_directory}/{onto}/descriptions", download_directory = config['download_directory'], onto = config['ubergraph_ontologies']), run: obo.pull_uber(config['ubergraph_ontologies'], output.icrdf_filename) From 907d5a3031f63b2246beb5e3833612ae3b17cad2 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 27 Mar 2023 23:00:20 -0400 Subject: [PATCH 19/40] First stab at writing descriptions. 
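
The DescriptionFactory added to node.py below lazily loads a per-prefix
"descriptions" file from the download tree the first time it sees a CURIE
with that prefix; each file is expected to hold tab-separated
"<CURIE>\t<description>" rows, with everything after the first tab treated
as the description. A rough usage sketch (import paths assumed to follow
the usual src.* layout; the UBERON CURIE is only an illustration):

    from src.node import DescriptionFactory
    from src.babel_utils import make_local_name

    factory = DescriptionFactory(make_local_name(''))
    node = {'identifiers': [{'identifier': 'UBERON:0001062'}]}
    # Gathers description strings across all of the clique's identifiers;
    # a prefix without a descriptions file simply contributes nothing.
    descs = factory.get_descriptions(node)
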
--- src/babel_utils.py | 7 +++++++ src/node.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/src/babel_utils.py b/src/babel_utils.py index 0d6d5d4f..3d87c7be 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -226,6 +226,7 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i description_factory = DescriptionFactory(make_local_name('')) ic_factory = InformationContentFactory(f'{get_config()["input_directory"]}/icRDF.tsv') ic_factory = InformationContentFactory(f'{get_config()["download_directory"]}/icRDF.tsv') + description_factory = DescriptionFactory(make_local_name('')) # Create an InformationContentFactory based on the specified icRDF.tsv file. Default to the one in the download # directory. @@ -257,6 +258,12 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i nw['identifiers'].append(id_info) + nw['identifiers'] = [ {k[0]:v for k,v in nids.items()} for nids in node['identifiers']] + + descs = description_factory.get_descriptions(node) + if len(descs) > 0: + nw['descriptions'] = descs + outf.write( nw ) # get_synonyms() returns tuples in the form ('http://www.geneontology.org/formats/oboInOwl#hasExactSynonym', 'Caudal articular process of eighteenth thoracic vertebra') diff --git a/src/node.py b/src/node.py index 3859f678..9c5c86de 100644 --- a/src/node.py +++ b/src/node.py @@ -47,6 +47,37 @@ def get_synonyms(self,node): return node_synonyms +class DescriptionFactory: + """ A factory for loading descriptions where available. + """ + + def __init__(self,rootdir): + self.root_dir = rootdir + self.descriptions = {} + + def load_descriptions(self,prefix): + print(f'Loading descriptions for {prefix}') + descs = defaultdict(set) + descfname = os.path.join(self.root_dir, prefix, 'descriptions') + if os.path.exists(descfname): + with open(descfname, 'r') as inf: + for line in inf: + x = line.strip().split('\t') + descs[x[0]].add("\t".join(x[1:])) + self.descriptions[prefix] = descs + print(f'Loaded') + + def get_descriptions(self,node): + node_descriptions = set() + for ident in node['identifiers']: + thisid = ident['identifier'] + pref = Text.get_curie(thisid) + if not pref in self.descriptions: + self.load_descriptions(pref) + node_descriptions.update( self.descriptions[pref][thisid] ) + return node_descriptions + + class DescriptionFactory: """ A factory for loading descriptions where available. """ From 88cbad8d497443ba43c1a50cb93bbfe33aa5d9bc Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 27 Mar 2023 23:19:47 -0400 Subject: [PATCH 20/40] Make description downloads from Ubergraph optional. --- src/snakefiles/datacollect.snakefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/snakefiles/datacollect.snakefile b/src/snakefiles/datacollect.snakefile index 008b42fc..8fe2e149 100644 --- a/src/snakefiles/datacollect.snakefile +++ b/src/snakefiles/datacollect.snakefile @@ -160,7 +160,8 @@ rule get_ontology_labels_descriptions_and_synonyms: icrdf_filename = config['download_directory']+'/icRDF.tsv' config['download_directory']+'/icRDF.tsv', expand("{download_directory}/{onto}/synonyms", download_directory = config['download_directory'], onto = config['ubergraph_ontologies']), - expand("{download_directory}/{onto}/descriptions", download_directory = config['download_directory'], onto = config['ubergraph_ontologies']), + # This would make sense if we had descriptions for every ontology, but since we don't, we can't make these outputs explicit. 
+ # expand("{download_directory}/{onto}/descriptions", download_directory = config['download_directory'], onto = config['ubergraph_ontologies']), run: obo.pull_uber(config['ubergraph_ontologies'], output.icrdf_filename) From 4a28cb5d429321defca9b33e80260727c02a14fe Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 18 Apr 2023 19:01:46 -0400 Subject: [PATCH 21/40] Make synonym files inputs to the final tasks. This causes the synonym files to be regenerated if they are deleted. --- src/snakefiles/macromolecular_complex.snakefile | 1 + 1 file changed, 1 insertion(+) diff --git a/src/snakefiles/macromolecular_complex.snakefile b/src/snakefiles/macromolecular_complex.snakefile index 835c03cf..ad127436 100644 --- a/src/snakefiles/macromolecular_complex.snakefile +++ b/src/snakefiles/macromolecular_complex.snakefile @@ -41,6 +41,7 @@ rule macromolecular_complex: input: config['output_directory']+'/synonyms/MacromolecularComplex.txt', config['output_directory']+'/reports/macromolecular_complex_completeness.txt', + config['output_directory']+'/synonyms/MacromolecularComplex.txt', reports = config['output_directory']+'/reports/MacromolecularComplex.txt' output: x = config['output_directory']+'/reports/macromolecular_complex_done' From 5ab247c3778f9a67f8ae37cb0994f4dba2790cae Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 23 Mar 2023 17:12:16 -0400 Subject: [PATCH 22/40] Fixed some additional references to MacromolecularComplexMixin. --- src/snakefiles/macromolecular_complex.snakefile | 1 - 1 file changed, 1 deletion(-) diff --git a/src/snakefiles/macromolecular_complex.snakefile b/src/snakefiles/macromolecular_complex.snakefile index ad127436..835c03cf 100644 --- a/src/snakefiles/macromolecular_complex.snakefile +++ b/src/snakefiles/macromolecular_complex.snakefile @@ -41,7 +41,6 @@ rule macromolecular_complex: input: config['output_directory']+'/synonyms/MacromolecularComplex.txt', config['output_directory']+'/reports/macromolecular_complex_completeness.txt', - config['output_directory']+'/synonyms/MacromolecularComplex.txt', reports = config['output_directory']+'/reports/MacromolecularComplex.txt' output: x = config['output_directory']+'/reports/macromolecular_complex_done' From 16fb6c9436edbfe2ec9e1e75244f40dc6ed1ddb0 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sat, 11 Feb 2023 22:40:56 -0500 Subject: [PATCH 23/40] Added `push` trigger for testing. --- .github/workflows/release.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index bccc815a..766743de 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -7,6 +7,7 @@ name: 'Release a new version to Github Packages' on: + push release: types: [published] From 23a87f9e3965b2327e0352d013af1f556eeeccd4 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 16 May 2023 01:30:13 -0400 Subject: [PATCH 24/40] Fixed UMLS path. --- src/datahandlers/umls.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/datahandlers/umls.py b/src/datahandlers/umls.py index 7922bac9..4c309c8d 100644 --- a/src/datahandlers/umls.py +++ b/src/datahandlers/umls.py @@ -167,9 +167,9 @@ def download_umls(umls_version, download_dir): # Move files we use to the main download directory. 
# - MRCONSO.RRF - shutil.copy2(os.path.join(download_dir, umls_version, 'MRCONSO.RRF'), download_dir) + shutil.copy2(os.path.join(download_dir, umls_version, 'META', 'MRCONSO.RRF'), download_dir) # - MRSTY.RRF - shutil.copy2(os.path.join(download_dir, umls_version, 'MRSTY.RRF'), download_dir) + shutil.copy2(os.path.join(download_dir, umls_version, 'META', 'MRSTY.RRF'), download_dir) def pull_umls(mrconso): From 57196d0476b9f860dc0a7be82ba463cc28eb5779 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 16 May 2023 15:27:58 -0400 Subject: [PATCH 25/40] Fixed incorrect merge. --- src/snakefiles/datacollect.snakefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/snakefiles/datacollect.snakefile b/src/snakefiles/datacollect.snakefile index 8fe2e149..046ec29e 100644 --- a/src/snakefiles/datacollect.snakefile +++ b/src/snakefiles/datacollect.snakefile @@ -157,8 +157,7 @@ rule get_ontology_labels_descriptions_and_synonyms: output: expand("{download_directory}/{onto}/labels", download_directory = config['download_directory'], onto = config['ubergraph_ontologies']), expand("{download_directory}/{onto}/synonyms", download_directory = config['download_directory'], onto = config['ubergraph_ontologies']), - icrdf_filename = config['download_directory']+'/icRDF.tsv' - config['download_directory']+'/icRDF.tsv', + icrdf_filename = config['download_directory']+'/icRDF.tsv', expand("{download_directory}/{onto}/synonyms", download_directory = config['download_directory'], onto = config['ubergraph_ontologies']), # This would make sense if we had descriptions for every ontology, but since we don't, we can't make these outputs explicit. # expand("{download_directory}/{onto}/descriptions", download_directory = config['download_directory'], onto = config['ubergraph_ontologies']), From a46fa559583a80ac118cbd293d67c80c9f2df81b Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 16 May 2023 16:02:57 -0400 Subject: [PATCH 26/40] Fixed input parameter name in Snakemake rule. --- src/snakefiles/macromolecular_complex.snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/snakefiles/macromolecular_complex.snakefile b/src/snakefiles/macromolecular_complex.snakefile index 835c03cf..4a43f0eb 100644 --- a/src/snakefiles/macromolecular_complex.snakefile +++ b/src/snakefiles/macromolecular_complex.snakefile @@ -19,7 +19,7 @@ rule macromolecular_complex_compendia: config['output_directory']+'/compendia/MacromolecularComplex.txt', config['output_directory']+'/synonyms/MacromolecularComplex.txt' run: - macromolecular_complex.build_compendia([input.idlists], icrdf_filename=icrdf_filename) + macromolecular_complex.build_compendia([input.idlists], icrdf_filename=input.icrdf_filename) rule check_macromolecular_complex_completeness: input: From cf14247b65c0aff37b38e23c0722d752d5815f66 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 16 May 2023 16:12:51 -0400 Subject: [PATCH 27/40] Included descriptions in the identifiers structure. 
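
Rather than writing bare identifier dictionaries, each entry in the
clique's identifier list now carries its own descriptions. Per identifier
the record is built roughly like this (field names as in the diff below;
'i' is the CURIE, 'l' the label, 'd' the description list):

    id_info = {'i': nids['identifier'], 'l': nids['label']}
    if id_info['i'] in descs:
        # shortest description first
        id_info['d'] = sorted(descs[id_info['i']], key=len)
    nw['identifiers'].append(id_info)

Note that this relies on get_descriptions() returning a mapping from
identifier to descriptions; the flat set built in PATCH 19/40 would not
support the descs[id_info['i']] lookup.
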
--- src/babel_utils.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/babel_utils.py b/src/babel_utils.py index 3d87c7be..c548da2e 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -261,6 +261,16 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i nw['identifiers'] = [ {k[0]:v for k,v in nids.items()} for nids in node['identifiers']] descs = description_factory.get_descriptions(node) + nw['identifiers'] = [] + for nids in node['identifiers']: + print(f"FOUND NIDS: {nids}") + id_info = {} + id_info['i'] = nids['identifier'] + id_info['l'] = nids['label'] + if id_info['i'] in descs: + # Sort from the shortest description to the longest. + id_info['d'] = sorted(list(descs[id_info['i']]), key=lambda x: len(x)) + nw['identifiers'].append(id_info) if len(descs) > 0: nw['descriptions'] = descs From 199442cf23dc7de06b4235aea06f6ea43f58b4b8 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 16 May 2023 16:56:59 -0400 Subject: [PATCH 28/40] Fixed typo in pull_umls(). --- src/datahandlers/umls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datahandlers/umls.py b/src/datahandlers/umls.py index 4c309c8d..b8a5b22a 100644 --- a/src/datahandlers/umls.py +++ b/src/datahandlers/umls.py @@ -178,7 +178,7 @@ def pull_umls(mrconso): priority = read_umls_priority() snomed_label_name = make_local_name('labels', subpath='SNOMEDCT') snomed_syn_name = make_local_name('synonyms', subpath='SNOMEDCT') - with open(mrcon, 'r') as inf, open(snomed_label_name,'w') as snolabels, open(snomed_syn_name,'w') as snosyns: + with open(mrconso, 'r') as inf, open(snomed_label_name,'w') as snolabels, open(snomed_syn_name,'w') as snosyns: for line in inf: if not check_mrconso_line(line): continue From 7fe5f971d255a02a8d2ce3c151ce67c4424fa9cb Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Wed, 17 May 2023 01:39:42 -0400 Subject: [PATCH 29/40] Replace ftp:// with identical https:// URL. We're having some difficulties downloading this file with FTP; maybe HTTPS will work? --- src/datahandlers/uniprotkb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datahandlers/uniprotkb.py b/src/datahandlers/uniprotkb.py index 97fefad5..57e0876d 100644 --- a/src/datahandlers/uniprotkb.py +++ b/src/datahandlers/uniprotkb.py @@ -18,7 +18,7 @@ def readlabels(which): return swissprot_labels def pull_uniprotkb(): - pull_via_urllib('ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/',f'idmapping.dat.gz',subpath='UniProtKB') + pull_via_urllib('https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/',f'idmapping.dat.gz',subpath='UniProtKB') for which in ['sprot','trembl']: pull_via_urllib('ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/',f'uniprot_{which}.fasta.gz',subpath='UniProtKB') From 0ffa6a83eab610217929b5b3023c86a38c239b8c Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Wed, 17 May 2023 09:36:28 -0400 Subject: [PATCH 30/40] Replaced other UniProtKB FTP URL with HTTPS URL. 
--- src/datahandlers/uniprotkb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datahandlers/uniprotkb.py b/src/datahandlers/uniprotkb.py index 57e0876d..461356e1 100644 --- a/src/datahandlers/uniprotkb.py +++ b/src/datahandlers/uniprotkb.py @@ -20,7 +20,7 @@ def readlabels(which): def pull_uniprotkb(): pull_via_urllib('https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/',f'idmapping.dat.gz',subpath='UniProtKB') for which in ['sprot','trembl']: - pull_via_urllib('ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/',f'uniprot_{which}.fasta.gz',subpath='UniProtKB') + pull_via_urllib('https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/',f'uniprot_{which}.fasta.gz',subpath='UniProtKB') def pull_uniprot_labels(sprotfile,tremblfile,fname): slabels = readlabels('sprot') From 9ea63dbc0eb19ed298d77f4cea4f4a47a0927307 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 18 May 2023 11:53:10 -0400 Subject: [PATCH 31/40] Fixed icrdf_filename call to write_compendium() in build_protein_compendia(). --- src/createcompendia/protein.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/createcompendia/protein.py b/src/createcompendia/protein.py index 976e7e91..05fc705d 100644 --- a/src/createcompendia/protein.py +++ b/src/createcompendia/protein.py @@ -168,5 +168,5 @@ def build_protein_compendia(concordances, identifiers, icrdf_filename): # only then generate the compendium from those input files. baretype = PROTEIN.split(':')[-1] - write_compendium(gene_sets, f'{baretype}.txt', PROTEIN, {}, icrdf_filename) + write_compendium(gene_sets, f'{baretype}.txt', PROTEIN, {}, icrdf_filename=icrdf_filename) From 43e95290ce544a23f2f6852474fc5497b80fdf5c Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 18 May 2023 12:20:43 -0400 Subject: [PATCH 32/40] Fixed incorrect GitHub action (incorrect commit?). --- .github/workflows/release.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 766743de..bccc815a 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -7,7 +7,6 @@ name: 'Release a new version to Github Packages' on: - push release: types: [published] From c02a5da5ebfd668746d7492f0bec8ce00ebec12a Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 27 Jun 2023 17:05:09 -0400 Subject: [PATCH 33/40] Fixed incorrect order and duplication in datacollect.snakefile. This is in target get_ontology_labels_descriptions_and_synonyms. 
--- src/snakefiles/datacollect.snakefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/snakefiles/datacollect.snakefile b/src/snakefiles/datacollect.snakefile index 046ec29e..d8c5ab72 100644 --- a/src/snakefiles/datacollect.snakefile +++ b/src/snakefiles/datacollect.snakefile @@ -157,10 +157,9 @@ rule get_ontology_labels_descriptions_and_synonyms: output: expand("{download_directory}/{onto}/labels", download_directory = config['download_directory'], onto = config['ubergraph_ontologies']), expand("{download_directory}/{onto}/synonyms", download_directory = config['download_directory'], onto = config['ubergraph_ontologies']), - icrdf_filename = config['download_directory']+'/icRDF.tsv', - expand("{download_directory}/{onto}/synonyms", download_directory = config['download_directory'], onto = config['ubergraph_ontologies']), # This would make sense if we had descriptions for every ontology, but since we don't, we can't make these outputs explicit. # expand("{download_directory}/{onto}/descriptions", download_directory = config['download_directory'], onto = config['ubergraph_ontologies']), + icrdf_filename = config['download_directory']+'/icRDF.tsv', run: obo.pull_uber(config['ubergraph_ontologies'], output.icrdf_filename) From c9fa420938aa337575b3545e72abef703b730cc9 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 27 Jun 2023 17:14:13 -0400 Subject: [PATCH 34/40] Protected label in case of unlabeled cliques. --- src/babel_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index fe37d6c0..688afdf2 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -266,7 +266,8 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i print(f"FOUND NIDS: {nids}") id_info = {} id_info['i'] = nids['identifier'] - id_info['l'] = nids['label'] + if 'label' in nids: + id_info['l'] = nids['label'] if id_info['i'] in descs: # Sort from the shortest description to the longest. id_info['d'] = sorted(list(descs[id_info['i']]), key=lambda x: len(x)) From e7e9751e6ba02dfa0a3a71a3e75d0513fbc79aa1 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 27 Jun 2023 17:17:21 -0400 Subject: [PATCH 35/40] Fixed duplications in babel_utils.py. --- src/babel_utils.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index 688afdf2..79345474 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -224,9 +224,6 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i node_factory = NodeFactory(make_local_name(''),biolink_version) synonym_factory = SynonymFactory(make_local_name('')) description_factory = DescriptionFactory(make_local_name('')) - ic_factory = InformationContentFactory(f'{get_config()["input_directory"]}/icRDF.tsv') - ic_factory = InformationContentFactory(f'{get_config()["download_directory"]}/icRDF.tsv') - description_factory = DescriptionFactory(make_local_name('')) # Create an InformationContentFactory based on the specified icRDF.tsv file. Default to the one in the download # directory. From 2175b1ae6dc34932e7c45408b2845f5e207c4736 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 27 Jun 2023 17:22:00 -0400 Subject: [PATCH 36/40] Removed incorrect merge. 
--- src/babel_utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index 79345474..653648ad 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -269,8 +269,6 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i # Sort from the shortest description to the longest. id_info['d'] = sorted(list(descs[id_info['i']]), key=lambda x: len(x)) nw['identifiers'].append(id_info) - if len(descs) > 0: - nw['descriptions'] = descs outf.write( nw ) From bd5a420820e72b53737e8da9404714b517849131 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 27 Jun 2023 17:25:27 -0400 Subject: [PATCH 37/40] Commented out unnecessary print statement. --- src/babel_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index 653648ad..a9a156e2 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -260,7 +260,7 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i descs = description_factory.get_descriptions(node) nw['identifiers'] = [] for nids in node['identifiers']: - print(f"FOUND NIDS: {nids}") + # print(f"FOUND NIDS: {nids}") id_info = {} id_info['i'] = nids['identifier'] if 'label' in nids: From 966b9714a08bd3c2ea022f2aeabbbbdd7687b3ed Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Wed, 28 Jun 2023 02:01:05 -0400 Subject: [PATCH 38/40] Removed incorrectly duplicated code. --- src/babel_utils.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index a9a156e2..e24fa5b9 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -255,21 +255,6 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i nw['identifiers'].append(id_info) - nw['identifiers'] = [ {k[0]:v for k,v in nids.items()} for nids in node['identifiers']] - - descs = description_factory.get_descriptions(node) - nw['identifiers'] = [] - for nids in node['identifiers']: - # print(f"FOUND NIDS: {nids}") - id_info = {} - id_info['i'] = nids['identifier'] - if 'label' in nids: - id_info['l'] = nids['label'] - if id_info['i'] in descs: - # Sort from the shortest description to the longest. - id_info['d'] = sorted(list(descs[id_info['i']]), key=lambda x: len(x)) - nw['identifiers'].append(id_info) - outf.write( nw ) # get_synonyms() returns tuples in the form ('http://www.geneontology.org/formats/oboInOwl#hasExactSynonym', 'Caudal articular process of eighteenth thoracic vertebra') @@ -280,6 +265,7 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i document = {"curie": node["identifiers"][0]["identifier"], "names": synonyms_list, "types": [ t[8:] for t in node_factory.get_ancestors(node["type"])]} #remove biolink: + if "label" in node["identifiers"][0]: document["preferred_name"] = node["identifiers"][0]["label"] From 85ea21002403d148cc74a478138bcf46d0511722 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 29 Jun 2023 02:09:16 -0400 Subject: [PATCH 39/40] Increased storage on Babel. 
--- kubernetes/babel-downloads.k8s.yaml | 2 +- kubernetes/babel-outputs.k8s.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kubernetes/babel-downloads.k8s.yaml b/kubernetes/babel-downloads.k8s.yaml index dbec8228..7bc07929 100644 --- a/kubernetes/babel-downloads.k8s.yaml +++ b/kubernetes/babel-downloads.k8s.yaml @@ -13,5 +13,5 @@ spec: - ReadWriteOnce resources: requests: - storage: 500Gi + storage: 600Gi storageClassName: basic diff --git a/kubernetes/babel-outputs.k8s.yaml b/kubernetes/babel-outputs.k8s.yaml index 2013ddfc..8ef07be0 100644 --- a/kubernetes/babel-outputs.k8s.yaml +++ b/kubernetes/babel-outputs.k8s.yaml @@ -15,5 +15,5 @@ spec: - ReadWriteOnce resources: requests: - storage: 300Gi + storage: 400Gi storageClassName: basic From 8d364e155078402dc148c57d22d8619ae46c863f Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 10 Jul 2023 13:47:34 -0400 Subject: [PATCH 40/40] Added counts for next step. --- src/datahandlers/ncbitaxon.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/datahandlers/ncbitaxon.py b/src/datahandlers/ncbitaxon.py index 81efd126..f183187c 100644 --- a/src/datahandlers/ncbitaxon.py +++ b/src/datahandlers/ncbitaxon.py @@ -14,6 +14,23 @@ def make_labels_and_synonyms(infile,labelfile,synfile): for line in l: sline = line.decode('utf-8').strip().split('|') parts = [x.strip() for x in sline] + + name_class = parts[3] + # name_class can be one of the following values (counts from May 1 release of NCBITaxon, + # possibly -- from https://github.com/TranslatorSRI/NameResolution/issues/71#issuecomment-1618909473): + # 25 genbank acronym + # 230 blast name + # 667 in-part + # 2086 acronym + # 14641 common name + # 30328 genbank common name + # 56575 equivalent name + # 75081 includes + # 220185 type material + # 245827 synonym + # 670412 authority + # 2503930 scientific name + if 'scientific name' == parts[3]: outf.write(f'NCBITaxon:{parts[0]}\t{parts[1]}\n') elif 'synonym' == parts[3]: