diff --git a/graph_specs/default-graph-spec.yml b/graph_specs/default-graph-spec.yml index a8d1d6fa..1228ec87 100644 --- a/graph_specs/default-graph-spec.yml +++ b/graph_specs/default-graph-spec.yml @@ -53,7 +53,6 @@ graphs: subgraphs: - graph_id: Baseline sources: - # using an older version of GWASCatalog because EFO terms are not normalizing right now - source_id: GWASCatalog - source_id: GTEx diff --git a/parsers/CTD/src/loadCTD.py b/parsers/CTD/src/loadCTD.py index 3acf5dd6..814891b2 100644 --- a/parsers/CTD/src/loadCTD.py +++ b/parsers/CTD/src/loadCTD.py @@ -30,11 +30,12 @@ class CTDLoader(SourceDataLoader): source_data_url = "http://ctdbase.org/reports/" license = "http://ctdbase.org/about/publications/#citing" attribution = "http://ctdbase.org/about/" - parsing_version: str = '1.2' + parsing_version: str = '1.3' predicate_conversion_map = { 'CTD:decreases_molecular_interaction_with': 'CTD:decreases_molecular_interaction', - 'CTD:increases_molecular_interaction_with': 'CTD:increases_molecular_interaction' + 'CTD:increases_molecular_interaction_with': 'CTD:increases_molecular_interaction', + 'CTD:ameliorates': 'biolink:treats_or_applied_or_studied_to_treat' } def __init__(self, test_mode: bool = False, source_data_dir: str = None): @@ -201,7 +202,7 @@ def chemical_to_gene_exp(self, archive_path: str, chemical_to_gene_file: str) -> continue # get the edge predicate - predicate = self.normalize_predicate(f"{CTD}:{predicate_label}") + predicate = self.convert_predicates(f"{CTD}:{predicate_label}") # capitalize the node IDs chemical_id: str = r['chemicalID'].upper() @@ -274,7 +275,7 @@ def disease_to_exposure(self, file_path: str) -> (list, list, int, int): skipped_record_counter += 1 continue else: - predicate: str = self.normalize_predicate(f"{CTD}:{predicate_label}") + predicate: str = self.convert_predicates(f"{CTD}:{predicate_label}") # save the disease node disease_id = f'{MESH}:' + r['diseaseid'] @@ -445,6 +446,7 @@ def disease_to_chemical(self, file_path: str): self.output_file_writer.write_kgx_node(chemical_node) # add the edge + predicate = self.convert_predicates(predicate) new_edge = kgxedge(chemical_id, cur_disease_id.upper(), predicate=predicate, @@ -518,9 +520,10 @@ def check_expanded_gene_chemical_row(r): return good_row, predicate_label, props @staticmethod - def normalize_predicate(predicate): + def convert_predicates(predicate): """ Removes ^ / and ` ` from the predicate id + If applicable converts predicates to preferred one according to predicate_conversion_map :param predicate: :return: diff --git a/tests/test_normalization.py b/tests/test_normalization.py index 3fb35598..ddf74632 100644 --- a/tests/test_normalization.py +++ b/tests/test_normalization.py @@ -85,52 +85,52 @@ def test_node_norm_lenient(test_nodes): def test_variant_node_norm(): variant_nodes = [ + # should split into CA771890008 and CA14401342 {"id": "DBSNP:rs12602172", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]}, + # should split into CA290493185, CA625954562, CA983647756, CA625954561 {"id": "DBSNP:rs34762051", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]}, - {"id": "DBSNP:rs146890554", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]}, - {"id": "HGVS:NC_000011.10:g.68032291C>G", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]}, - {"id": "HGVS:NC_000023.9:g.32317682G>A", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]}, - {"id": "HGVS:NC_000017.10:g.43009127delG", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]}, - {"id": "HGVS:NC_000001.40:fakehgvs.1231234A>C", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]}, - {"id": "CLINVARVARIANT:18390", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]}, - {"id": "BOGUS:rs999999999999", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]}, + {"id": "DBSNP:rs146890554", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]}, # CA290466079 + {"id": "HGVS:NC_000011.10:g.68032291C>G", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]}, # CA6146346 / rs369602258 + {"id": "HGVS:NC_000023.9:g.32317682G>A", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]}, # CA267021 / rs398123953 + {"id": "HGVS:NC_000017.10:g.43009127delG", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]}, # CA8609461 / rs775219016 + {"id": "HGVS:NC_000001.40:fakehgvs.1231234A>C", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]}, # nothing + {"id": "CLINVARVARIANT:18390", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]}, # CA128085 / rs671 + {"id": "BOGUS:rs999999999999", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]}, # none ] variant_nodes_2 = variant_nodes.copy() node_normalizer = NodeNormalizer(strict_normalization=True) node_normalizer.normalize_sequence_variants(variant_nodes) - assert len(variant_nodes) >= 10 - assert len(node_normalizer.variant_node_splits) == 1 + assert len(variant_nodes) == 11 + assert len(node_normalizer.variant_node_splits) == 2 + + # these should be removed from the list + assert not get_node_from_list('BOGUS:rs999999999999', variant_nodes) + assert not get_node_from_list('HGVS:NC_000001.40:fakehgvs.1231234A>C', variant_nodes) + + # the lookup and failed list should reflect failure to normalize assert not node_normalizer.node_normalization_lookup['BOGUS:rs999999999999'] assert 'BOGUS:rs999999999999' in node_normalizer.failed_to_normalize_variant_ids - assert node_normalizer.failed_to_normalize_variant_ids['BOGUS:rs999999999999'] + assert 'HGVS:NC_000001.40:fakehgvs.1231234A>C' in node_normalizer.failed_to_normalize_variant_ids + # check some lookup mappings assert node_normalizer.node_normalization_lookup['HGVS:NC_000011.10:g.68032291C>G'] == ['CAID:CA6146346'] + assert node_normalizer.node_normalization_lookup['DBSNP:rs12602172'] == ['CAID:CA771890008', 'CAID:CA14401342'] + assert len(node_normalizer.node_normalization_lookup['DBSNP:rs34762051']) == 4 - it_worked = False - for node in variant_nodes: - if node['id'] == 'CAID:CA6146346': - if node['name'] == 'rs369602258': - if ROOT_ENTITY in node[NODE_TYPES]: - if SEQUENCE_VARIANT in node[NODE_TYPES]: - it_worked = True - assert it_worked + # check name uses dbSNP + node = get_node_from_list('CAID:CA6146346', variant_nodes) + assert node['name'] == 'rs369602258' + # make sure nodes aren't thrown out with strict normalization off node_normalizer = NodeNormalizer(strict_normalization=False) node_normalizer.normalize_sequence_variants(variant_nodes_2) - - assert len(variant_nodes_2) >= 12 - - it_worked = False - for node in variant_nodes_2: - print(node) - if node['id'] == 'BOGUS:rs999999999999': - if node['name'] == 'BOGUS:rs999999999999': - if ROOT_ENTITY in node[NODE_TYPES]: - if SEQUENCE_VARIANT in node[NODE_TYPES]: - it_worked = True - assert it_worked + assert len(variant_nodes_2) == 13 + bogus_node_after_normalization = get_node_from_list('BOGUS:rs999999999999', variant_nodes_2) + assert bogus_node_after_normalization['name'] == 'BOGUS:rs999999999999' + assert ROOT_ENTITY in bogus_node_after_normalization[NODE_TYPES] + assert SEQUENCE_VARIANT in bogus_node_after_normalization[NODE_TYPES] def test_edge_normalization():