Merge pull request #204 from RobokopU24/treats_refactor

Treats refactor
RobokopU24 · Mar 5, 2024 · d0ca276 · d0ca276
2 parents 0daa63f + dda70ae
commit d0ca276
Show file tree

Hide file tree

Showing 3 changed files with 38 additions and 36 deletions.
diff --git a/graph_specs/default-graph-spec.yml b/graph_specs/default-graph-spec.yml
@@ -53,7 +53,6 @@ graphs:
     subgraphs:
       - graph_id: Baseline
     sources:
-        # using an older version of GWASCatalog because EFO terms are not normalizing right now
       - source_id: GWASCatalog
       - source_id: GTEx
 

diff --git a/parsers/CTD/src/loadCTD.py b/parsers/CTD/src/loadCTD.py
@@ -30,11 +30,12 @@ class CTDLoader(SourceDataLoader):
     source_data_url = "http://ctdbase.org/reports/"
     license = "http://ctdbase.org/about/publications/#citing"
     attribution = "http://ctdbase.org/about/"
-    parsing_version: str = '1.2'
+    parsing_version: str = '1.3'
 
     predicate_conversion_map = {
         'CTD:decreases_molecular_interaction_with': 'CTD:decreases_molecular_interaction',
-        'CTD:increases_molecular_interaction_with': 'CTD:increases_molecular_interaction'
+        'CTD:increases_molecular_interaction_with': 'CTD:increases_molecular_interaction',
+        'CTD:ameliorates': 'biolink:treats_or_applied_or_studied_to_treat'
     }
 
     def __init__(self, test_mode: bool = False, source_data_dir: str = None):
@@ -201,7 +202,7 @@ def chemical_to_gene_exp(self, archive_path: str, chemical_to_gene_file: str) ->
                     continue
 
                 # get the edge predicate
-                predicate = self.normalize_predicate(f"{CTD}:{predicate_label}")
+                predicate = self.convert_predicates(f"{CTD}:{predicate_label}")
 
                 # capitalize the node IDs
                 chemical_id: str = r['chemicalID'].upper()
@@ -274,7 +275,7 @@ def disease_to_exposure(self, file_path: str) -> (list, list, int, int):
                     skipped_record_counter += 1
                     continue
                 else:
-                    predicate: str = self.normalize_predicate(f"{CTD}:{predicate_label}")
+                    predicate: str = self.convert_predicates(f"{CTD}:{predicate_label}")
 
                 # save the disease node
                 disease_id = f'{MESH}:' + r['diseaseid']
@@ -445,6 +446,7 @@ def disease_to_chemical(self, file_path: str):
                     self.output_file_writer.write_kgx_node(chemical_node)
 
                     # add the edge
+                    predicate = self.convert_predicates(predicate)
                     new_edge = kgxedge(chemical_id,
                                        cur_disease_id.upper(),
                                        predicate=predicate,
@@ -518,9 +520,10 @@ def check_expanded_gene_chemical_row(r):
         return good_row, predicate_label, props
 
     @staticmethod
-    def normalize_predicate(predicate):
+    def convert_predicates(predicate):
         """
         Removes ^ / and ` ` from the predicate id
+        If applicable, converts the predicate to the preferred one according to predicate_conversion_map
 
         :param predicate:
         :return:

diff --git a/tests/test_normalization.py b/tests/test_normalization.py
@@ -85,52 +85,52 @@ def test_node_norm_lenient(test_nodes):
 def test_variant_node_norm():
 
     variant_nodes = [
+        # should split into CA771890008 and CA14401342
         {"id": "DBSNP:rs12602172", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]},
+        # should split into CA290493185, CA625954562, CA983647756, CA625954561
         {"id": "DBSNP:rs34762051", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]},
-        {"id": "DBSNP:rs146890554", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]},
-        {"id": "HGVS:NC_000011.10:g.68032291C>G", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]},
-        {"id": "HGVS:NC_000023.9:g.32317682G>A", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]},
-        {"id": "HGVS:NC_000017.10:g.43009127delG", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]},
-        {"id": "HGVS:NC_000001.40:fakehgvs.1231234A>C", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]},
-        {"id": "CLINVARVARIANT:18390", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]},
-        {"id": "BOGUS:rs999999999999", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]},
+        {"id": "DBSNP:rs146890554", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]},  # CA290466079
+        {"id": "HGVS:NC_000011.10:g.68032291C>G", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]},  # CA6146346 / rs369602258
+        {"id": "HGVS:NC_000023.9:g.32317682G>A", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]},  # CA267021 / rs398123953
+        {"id": "HGVS:NC_000017.10:g.43009127delG", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]},  # CA8609461 / rs775219016
+        {"id": "HGVS:NC_000001.40:fakehgvs.1231234A>C", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]},  # none
+        {"id": "CLINVARVARIANT:18390", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]},  # CA128085 / rs671
+        {"id": "BOGUS:rs999999999999", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]},  # none
     ]
     variant_nodes_2 = variant_nodes.copy()
 
     node_normalizer = NodeNormalizer(strict_normalization=True)
     node_normalizer.normalize_sequence_variants(variant_nodes)
-    assert len(variant_nodes) >= 10
-    assert len(node_normalizer.variant_node_splits) == 1
 
+    assert len(variant_nodes) == 11
+    assert len(node_normalizer.variant_node_splits) == 2
+
+    # these should be removed from the list
+    assert not get_node_from_list('BOGUS:rs999999999999', variant_nodes)
+    assert not get_node_from_list('HGVS:NC_000001.40:fakehgvs.1231234A>C', variant_nodes)
+
+    # the lookup and failed list should reflect failure to normalize
     assert not node_normalizer.node_normalization_lookup['BOGUS:rs999999999999']
     assert 'BOGUS:rs999999999999' in node_normalizer.failed_to_normalize_variant_ids
-    assert node_normalizer.failed_to_normalize_variant_ids['BOGUS:rs999999999999']
+    assert 'HGVS:NC_000001.40:fakehgvs.1231234A>C' in node_normalizer.failed_to_normalize_variant_ids
 
+    # check some lookup mappings
     assert node_normalizer.node_normalization_lookup['HGVS:NC_000011.10:g.68032291C>G'] == ['CAID:CA6146346']
+    assert node_normalizer.node_normalization_lookup['DBSNP:rs12602172'] == ['CAID:CA771890008', 'CAID:CA14401342']
+    assert len(node_normalizer.node_normalization_lookup['DBSNP:rs34762051']) == 4
 
-    it_worked = False
-    for node in variant_nodes:
-        if node['id'] == 'CAID:CA6146346':
-            if node['name'] == 'rs369602258':
-                if ROOT_ENTITY in node[NODE_TYPES]:
-                    if SEQUENCE_VARIANT in node[NODE_TYPES]:
-                        it_worked = True
-    assert it_worked
+    # check name uses dbSNP
+    node = get_node_from_list('CAID:CA6146346', variant_nodes)
+    assert node['name'] == 'rs369602258'
 
+    # make sure nodes aren't thrown out with strict normalization off
     node_normalizer = NodeNormalizer(strict_normalization=False)
     node_normalizer.normalize_sequence_variants(variant_nodes_2)
-
-    assert len(variant_nodes_2) >= 12
-
-    it_worked = False
-    for node in variant_nodes_2:
-        print(node)
-        if node['id'] == 'BOGUS:rs999999999999':
-            if node['name'] == 'BOGUS:rs999999999999':
-                if ROOT_ENTITY in node[NODE_TYPES]:
-                    if SEQUENCE_VARIANT in node[NODE_TYPES]:
-                        it_worked = True
-    assert it_worked
+    assert len(variant_nodes_2) == 13
+    bogus_node_after_normalization = get_node_from_list('BOGUS:rs999999999999', variant_nodes_2)
+    assert bogus_node_after_normalization['name'] == 'BOGUS:rs999999999999'
+    assert ROOT_ENTITY in bogus_node_after_normalization[NODE_TYPES]
+    assert SEQUENCE_VARIANT in bogus_node_after_normalization[NODE_TYPES]
 
 
 def test_edge_normalization():