Raise ValueError for identifiers that cannot become CURIEs, and handle it at the call sites.

What this patch does:

* src/util.py: Text.get_prefix() and Text.opt_to_curie() raise ValueError when the
  value they would return contains no colon (previously get_prefix() returned None
  and opt_to_curie() returned the text unchanged). opt_to_curie() also recognises
  IRIs under http://purl.obolibrary.org/obo/mondo/sources/icd11foundation/ and
  prefixes them with ICD11FOUNDATION before the existing recurie() step.
* src/prefixes.py: adds the ICD11FOUNDATION constant.
* src/ubergraph.py, src/datahandlers/efo.py: every opt_to_curie() call is wrapped in
  try/except ValueError. Subject identifiers fall back to the raw value with a
  printed warning; xref/match values that cannot be converted are skipped.
* src/node.py: load_extra_labels() warns and returns early when label_dir or prefix
  is None, registering an empty label set so the lookup in apply_labels() still
  works; apply_labels() prints context and re-raises when no prefix can be found.

Notes for whoever applies this:

* Indentation and blank context lines were inferred from Python syntax and the hunk
  line counts; check them against the target tree before applying.
* The blob ids on the "index" lines are not recomputed.

diff --git a/src/datahandlers/efo.py b/src/datahandlers/efo.py
index 0b778429..813fa6e3 100644
--- a/src/datahandlers/efo.py
+++ b/src/datahandlers/efo.py
@@ -31,6 +31,7 @@ def __init__(self):
         end = dt.now()
         print('loading complete')
         print(f'took {end-start}')
+
     def pull_EFO_labels_and_synonyms(self,lname,sname):
         with open(lname, 'w') as labelfile, open(sname,'w') as synfile:
             #for labeltype in ['skos:prefLabel','skos:altLabel','rdfs:label']:
@@ -59,6 +60,7 @@ def pull_EFO_labels_and_synonyms(self,lname,sname):
                         synfile.write(f'{EFO}:{efo_id}\t{labeltype}\t{label}\n')
                         if not labeltype == 'skos:altLabel':
                             labelfile.write(f'{EFO}:{efo_id}\t{label}\n')
+
     def pull_EFO_ids(self,roots,idfname):
         with open(idfname, 'w') as idfile:
             for root,rtype in roots:
@@ -75,6 +77,7 @@ def pull_EFO_ids(self,roots,idfname):
                     if efoid.startswith("EFO_"):
                         efo_id = efoid.split("_")[-1]
                         idfile.write(f'{EFO}:{efo_id}\t{rtype}\n')
+
     def get_exacts(self, iri, outfile):
         query = f"""
         prefix rdfs:
@@ -98,7 +101,12 @@ def get_exacts(self, iri, outfile):
         qres = self.m.query(query)
         for row in list(qres):
             other = str(row["match"])
-            otherid = Text.opt_to_curie(other[1:-1])
+            try:
+                otherid = Text.opt_to_curie(other[1:-1])
+            except ValueError as verr:
+                print(f"Could not translate {other[1:-1]} into a CURIE, will be used as-is: {verr}")
+                otherid = other[1:-1]
+
             if otherid.startswith("ORPHANET"):
                 print(row["match"])
                 print(other)
diff --git a/src/node.py b/src/node.py
index 110338ed..7a740ccd 100644
--- a/src/node.py
+++ b/src/node.py
@@ -354,6 +354,15 @@ def clean_list(self,input_identifiers):
         return cleaned
 
     def load_extra_labels(self,prefix):
+        if self.label_dir is None:
+            print (f"WARNING: no label_dir specified in load_extra_labels({self}, {prefix}), can't load extra labels for {prefix}. Skipping.")
+            # apply_labels() indexes self.extra_labels[prefix] right after calling us, so register an empty set of labels.
+            self.extra_labels[prefix] = {}
+            return
+        if prefix is None:
+            print (f"WARNING: no prefix specified in load_extra_labels({self}, {prefix}), can't load extra labels. Skipping.")
+            self.extra_labels[prefix] = {}
+            return
         labelfname = os.path.join(self.label_dir,prefix,'labels')
         lbs = {}
         if os.path.exists(labelfname):
@@ -375,7 +384,11 @@ def apply_labels(self, input_identifiers, labels):
             if iid in labels:
                 labeled_list.append( LabeledID(identifier=iid, label = labels[iid]))
             else:
-                prefix = Text.get_prefix(iid)
+                try:
+                    prefix = Text.get_prefix(iid)
+                except ValueError:
+                    print(f"ERROR: Unable to apply_labels({self}, {input_identifiers}, {labels}): could not obtain prefix for identifier {iid}")
+                    raise
                 if prefix not in self.extra_labels:
                     self.load_extra_labels(prefix)
                 if iid in self.extra_labels[prefix]:
diff --git a/src/prefixes.py b/src/prefixes.py
index 42571cec..88bbbe75 100644
--- a/src/prefixes.py
+++ b/src/prefixes.py
@@ -73,6 +73,7 @@
 HGNCFAMILY='HGNC.FAMILY'
 PANTHERFAMILY='PANTHER.FAMILY'
 COMPLEXPORTAL='ComplexPortal'
+ICD11FOUNDATION='icd11.foundation'
 
 PMID = 'PMID'
 DOI = 'doi'
diff --git a/src/ubergraph.py b/src/ubergraph.py
index ed06b553..be158ac2 100644
--- a/src/ubergraph.py
+++ b/src/ubergraph.py
@@ -61,7 +61,11 @@ def get_all_labels(self):
         )
         for x in rr:
             y = {}
-            y['iri'] = Text.opt_to_curie(x['thing'])
+            try:
+                y['iri'] = Text.opt_to_curie(x['thing'])
+            except ValueError as verr:
+                print(f"WARNING: Unable to translate {x['thing']} to a CURIE; it will be used as-is: {verr}")
+                y['iri'] = x['thing']
             y['label'] = x['label']
             results.append(y)
 
@@ -113,7 +117,11 @@ def get_all_descriptions(self):
         )
         for x in rr:
             y = {}
-            y['iri'] = Text.opt_to_curie(x['thing'])
+            try:
+                y['iri'] = Text.opt_to_curie(x['thing'])
+            except ValueError as verr:
+                print(f"WARNING: Unable to translate {x['thing']} to a CURIE; it will be used as-is: {verr}")
+                y['iri'] = x['thing']
             y['description'] = x['description']
             results.append(y)
 
@@ -185,7 +193,12 @@ def get_all_synonyms(self):
             template_text=text \
         )
         for x in rr:
-            y = ( Text.opt_to_curie(x['cls']), x['pred'], x['val'])
+            try:
+                cls_curie = Text.opt_to_curie(x['cls'])
+            except ValueError as verr:
+                print(f"Unable to convert {x['cls']} to a CURIE; it will be used as-is: {verr}")
+                cls_curie = x['cls']
+            y = ( cls_curie, x['pred'], x['val'])
             results.append(y)
         return results
 
@@ -221,7 +234,11 @@ def get_subclasses_of(self,iri):
         results = []
         for x in rr:
             y = {}
-            y['descendent'] = Text.opt_to_curie(x['descendent'])
+            try:
+                y['descendent'] = Text.opt_to_curie(x['descendent'])
+            except ValueError as verr:
+                print(f"Descendent {x['descendent']} could not be converted to a CURIE, will be used as-is: {verr}")
+                y['descendent'] = x['descendent']
             y['descendentLabel'] = x['descendentLabel']
             results.append(y)
         return results
@@ -258,7 +275,11 @@ def get_subclasses_and_smiles(self,iri):
         results = []
         for x in rr:
             y = {}
-            y['descendent'] = Text.opt_to_curie(x['descendent'])
+            try:
+                y['descendent'] = Text.opt_to_curie(x['descendent'])
+            except ValueError as verr:
+                print(f"Descendent {x['descendent']} could not be converted to a CURIE, will be used as-is: {verr}")
+                y['descendent'] = x['descendent']
             if x['descendentSmiles'] is not None:
                 y['SMILES'] = x['descendentSmiles']
             results.append(y)
@@ -295,12 +316,15 @@ def get_subclasses_and_xrefs(self,iri):
         )
         results = defaultdict(set)
         for row in resultmap:
-            dcurie = Text.opt_to_curie(row['descendent'])
-            #Sometimes we're getting back just strings that aren't curies, skip those (but complain)
-            if ':' not in row['xref']:
-                print(f'Bad XREF from {row["descendent"]} to {row["xref"]}')
+            # Sometimes we're getting back just strings that aren't curies, skip those (but complain)
+            try:
+                dcurie = Text.opt_to_curie(row['descendent'])
+                xref_curie = Text.opt_to_curie(row['xref'])
+            except ValueError as verr:
+                print(f'Bad XREF from {row["descendent"]} to {row["xref"]}: {verr}')
                 continue
-            results[ dcurie ].add( (Text.opt_to_curie(row['xref']) ))
+            # Only index the defaultdict once both conversions succeeded, so a skipped row leaves no empty entry behind.
+            results[dcurie].add(xref_curie)
+
         return results
 
     def get_subclasses_and_exacts(self,iri):
@@ -346,15 +370,23 @@ def get_subclasses_and_exacts(self,iri):
             }, outputs=[ 'descendent', 'match'] )
         results = defaultdict(list)
         for row in resultmap:
-            desc=Text.opt_to_curie(row['descendent'])
+            try:
+                desc = Text.opt_to_curie(row['descendent'])
+            except ValueError as verr:
+                print(f"Descendant {row['descendent']} could not be converted to a CURIE, will be used as-is: {verr}")
+                desc = row['descendent']
+
             if row['match'] is None:
                 results[desc] += []
             else:
-                results[ desc ].append( (Text.opt_to_curie(row['match']) ))
-        #Sometimes, if there are no exact_matches, we'll get some kind of blank node id
-        # like 't19830198'. Want to filter those out.
-        for k,v in results.items():
-            results[k] = list(filter(lambda x: ':' in x, v))
+                # Sometimes, if there are no exact_matches, we'll get some kind of blank node id
+                # like 't19830198'. Want to filter those out.
+                try:
+                    results[ desc ].append(Text.opt_to_curie(row['match']))
+                except ValueError as verr:
+                    print(f'Row {row} could not be converted to a CURIE: {verr}')
+                    continue
+
         return results
 
     def get_subclasses_and_close(self,iri):
@@ -395,15 +427,23 @@ def get_subclasses_and_close(self,iri):
             }, outputs=[ 'descendent', 'match' ] )
         results = defaultdict(list)
         for row in resultmap:
-            desc = Text.opt_to_curie(row['descendent'])
+            try:
+                desc = Text.opt_to_curie(row['descendent'])
+            except ValueError as verr:
+                print(f"Descendant {row['descendent']} could not be converted to a CURIE, will be used as-is: {verr}")
+                desc = row['descendent']
+
             if row['match'] is None:
                 results[desc] += []
             else:
-                results[ desc].append( (Text.opt_to_curie(row['match']) ))
-        #Sometimes, if there are no exact_matches, we'll get some kind of blank node id
-        # like 't19830198'. Want to filter those out.
-        for k,v in results.items():
-            results[k] = list(filter(lambda x: ':' in x, v))
+                try:
+                    results[ desc].append( (Text.opt_to_curie(row['match']) ))
+                except ValueError as verr:
+                    # Sometimes, if there are no exact_matches, we'll get some kind of blank node id
+                    # like 't19830198'. Want to filter those out.
+                    print(f"Value {row['match']} in row {row} could not be converted to a CURIE: {verr}")
+                    continue
+
         return results
 
     def write_normalized_information_content(self, filename):
diff --git a/src/util.py b/src/util.py
index b04ca15a..99c1d605 100644
--- a/src/util.py
+++ b/src/util.py
@@ -6,7 +6,7 @@
 import copy
 from logging.handlers import RotatingFileHandler
 from src.LabeledID import LabeledID
-from src.prefixes import OMIM, OMIMPS, UMLS, SNOMEDCT, KEGGPATHWAY, KEGGREACTION, NCIT, ICD10, ICD10CM
+from src.prefixes import OMIM, OMIMPS, UMLS, SNOMEDCT, KEGGPATHWAY, KEGGREACTION, NCIT, ICD10, ICD10CM, ICD11FOUNDATION
 import src.prefixes as prefixes
 
 #loggers = {}
@@ -79,10 +79,13 @@ def get_curie (text):
         return text.upper().split(':', 1)[0] if ':' in text else None
 
     @staticmethod
     def get_prefix (text):
-        if isinstance(text,LabeledID):
-            text = text.identifier
-        return text.split(':', 1)[0] if ':' in text else None
+        # Keep the caller's argument untouched so the error message can show both the input and the extracted identifier.
+        identifier = text.identifier if isinstance(text,LabeledID) else text
+        if ':' in identifier:
+            return identifier.split(':', 1)[0]
+        raise ValueError(f"Unable to get_prefix({text}) with text '{identifier}': no colons found in identifier.")
+
 
     @classmethod
     def recurie(cls,text,new_prefix=None):
@@ -124,7 +127,10 @@ def opt_to_curie (text):
         if text is None:
             return None
         #grumble, I should be better about handling prefixes
-        if text.startswith('http://purl.obolibrary.org') or text.startswith('http://www.orpha.net') or text.startswith('http://www.ebi.ac.uk/efo'):
+        if text.startswith('http://purl.obolibrary.org/obo/mondo/sources/icd11foundation/'):
+            # This has to go on top because it's a 'purl.obolibrary.org' which doesn't follow the same pattern as the others.
+            r = f'{ICD11FOUNDATION}:{text[61:]}'
+        elif text.startswith('http://purl.obolibrary.org') or text.startswith('http://www.orpha.net') or text.startswith('http://www.ebi.ac.uk/efo'):
             p = text.split('/')[-1].split('_')
             r = ':'.join( p )
         elif text.startswith('https://omim.org/'):
@@ -153,8 +159,11 @@ def opt_to_curie (text):
             r = Text.recurie(text,KEGGREACTION)
         else:
             r = text
+
         if ':' in r:
             return Text.recurie(r)
-        return r
+
+        # No colon means this cannot be a CURIE; callers catch ValueError and decide whether to skip or fall back.
+        raise ValueError(f"Unable to opt_to_curie({text}): output calculated as {r}, which has no colon.")
 
     @staticmethod