Skip to content

Commit

Permalink
Merge pull request #204 from RobokopU24/treats_refactor
Browse files (browse the repository at this point in the history)
Treats refactor
  • Loading branch information
EvanDietzMorris authored Mar 5, 2024
2 parents 0daa63f + dda70ae commit d0ca276
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 36 deletions.
1 change: 0 additions & 1 deletion graph_specs/default-graph-spec.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ graphs:
subgraphs:
- graph_id: Baseline
sources:
# using an older version of GWASCatalog because EFO terms are not normalizing right now
- source_id: GWASCatalog
- source_id: GTEx

Expand Down
13 changes: 8 additions & 5 deletions parsers/CTD/src/loadCTD.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,12 @@ class CTDLoader(SourceDataLoader):
source_data_url = "http://ctdbase.org/reports/"
license = "http://ctdbase.org/about/publications/#citing"
attribution = "http://ctdbase.org/about/"
parsing_version: str = '1.2'
parsing_version: str = '1.3'

predicate_conversion_map = {
'CTD:decreases_molecular_interaction_with': 'CTD:decreases_molecular_interaction',
'CTD:increases_molecular_interaction_with': 'CTD:increases_molecular_interaction'
'CTD:increases_molecular_interaction_with': 'CTD:increases_molecular_interaction',
'CTD:ameliorates': 'biolink:treats_or_applied_or_studied_to_treat'
}

def __init__(self, test_mode: bool = False, source_data_dir: str = None):
Expand Down Expand Up @@ -201,7 +202,7 @@ def chemical_to_gene_exp(self, archive_path: str, chemical_to_gene_file: str) ->
continue

# get the edge predicate
predicate = self.normalize_predicate(f"{CTD}:{predicate_label}")
predicate = self.convert_predicates(f"{CTD}:{predicate_label}")

# capitalize the node IDs
chemical_id: str = r['chemicalID'].upper()
Expand Down Expand Up @@ -274,7 +275,7 @@ def disease_to_exposure(self, file_path: str) -> (list, list, int, int):
skipped_record_counter += 1
continue
else:
predicate: str = self.normalize_predicate(f"{CTD}:{predicate_label}")
predicate: str = self.convert_predicates(f"{CTD}:{predicate_label}")

# save the disease node
disease_id = f'{MESH}:' + r['diseaseid']
Expand Down Expand Up @@ -445,6 +446,7 @@ def disease_to_chemical(self, file_path: str):
self.output_file_writer.write_kgx_node(chemical_node)

# add the edge
predicate = self.convert_predicates(predicate)
new_edge = kgxedge(chemical_id,
cur_disease_id.upper(),
predicate=predicate,
Expand Down Expand Up @@ -518,9 +520,10 @@ def check_expanded_gene_chemical_row(r):
return good_row, predicate_label, props

@staticmethod
def normalize_predicate(predicate):
def convert_predicates(predicate):
"""
Removes ^, / and ` ` (space) characters from the predicate id.
If applicable, converts the predicate to the preferred one according to predicate_conversion_map.
:param predicate:
:return:
Expand Down
60 changes: 30 additions & 30 deletions tests/test_normalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,52 +85,52 @@ def test_node_norm_lenient(test_nodes):
def test_variant_node_norm():

variant_nodes = [
# should split into CA771890008 and CA14401342
{"id": "DBSNP:rs12602172", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]},
# should split into CA290493185, CA625954562, CA983647756, CA625954561
{"id": "DBSNP:rs34762051", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]},
{"id": "DBSNP:rs146890554", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]},
{"id": "HGVS:NC_000011.10:g.68032291C>G", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]},
{"id": "HGVS:NC_000023.9:g.32317682G>A", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]},
{"id": "HGVS:NC_000017.10:g.43009127delG", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]},
{"id": "HGVS:NC_000001.40:fakehgvs.1231234A>C", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]},
{"id": "CLINVARVARIANT:18390", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]},
{"id": "BOGUS:rs999999999999", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]},
{"id": "DBSNP:rs146890554", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]}, # CA290466079
{"id": "HGVS:NC_000011.10:g.68032291C>G", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]}, # CA6146346 / rs369602258
{"id": "HGVS:NC_000023.9:g.32317682G>A", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]}, # CA267021 / rs398123953
{"id": "HGVS:NC_000017.10:g.43009127delG", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]}, # CA8609461 / rs775219016
{"id": "HGVS:NC_000001.40:fakehgvs.1231234A>C", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]}, # nothing
{"id": "CLINVARVARIANT:18390", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]}, # CA128085 / rs671
{"id": "BOGUS:rs999999999999", "name": "", NODE_TYPES: ["biolink:SequenceVariant"]}, # none
]
variant_nodes_2 = variant_nodes.copy()

node_normalizer = NodeNormalizer(strict_normalization=True)
node_normalizer.normalize_sequence_variants(variant_nodes)
assert len(variant_nodes) >= 10
assert len(node_normalizer.variant_node_splits) == 1

assert len(variant_nodes) == 11
assert len(node_normalizer.variant_node_splits) == 2

# these should be removed from the list
assert not get_node_from_list('BOGUS:rs999999999999', variant_nodes)
assert not get_node_from_list('HGVS:NC_000001.40:fakehgvs.1231234A>C', variant_nodes)

# the lookup and failed list should reflect failure to normalize
assert not node_normalizer.node_normalization_lookup['BOGUS:rs999999999999']
assert 'BOGUS:rs999999999999' in node_normalizer.failed_to_normalize_variant_ids
assert node_normalizer.failed_to_normalize_variant_ids['BOGUS:rs999999999999']
assert 'HGVS:NC_000001.40:fakehgvs.1231234A>C' in node_normalizer.failed_to_normalize_variant_ids

# check some lookup mappings
assert node_normalizer.node_normalization_lookup['HGVS:NC_000011.10:g.68032291C>G'] == ['CAID:CA6146346']
assert node_normalizer.node_normalization_lookup['DBSNP:rs12602172'] == ['CAID:CA771890008', 'CAID:CA14401342']
assert len(node_normalizer.node_normalization_lookup['DBSNP:rs34762051']) == 4

it_worked = False
for node in variant_nodes:
if node['id'] == 'CAID:CA6146346':
if node['name'] == 'rs369602258':
if ROOT_ENTITY in node[NODE_TYPES]:
if SEQUENCE_VARIANT in node[NODE_TYPES]:
it_worked = True
assert it_worked
# check name uses dbSNP
node = get_node_from_list('CAID:CA6146346', variant_nodes)
assert node['name'] == 'rs369602258'

# make sure nodes aren't thrown out with strict normalization off
node_normalizer = NodeNormalizer(strict_normalization=False)
node_normalizer.normalize_sequence_variants(variant_nodes_2)

assert len(variant_nodes_2) >= 12

it_worked = False
for node in variant_nodes_2:
print(node)
if node['id'] == 'BOGUS:rs999999999999':
if node['name'] == 'BOGUS:rs999999999999':
if ROOT_ENTITY in node[NODE_TYPES]:
if SEQUENCE_VARIANT in node[NODE_TYPES]:
it_worked = True
assert it_worked
assert len(variant_nodes_2) == 13
bogus_node_after_normalization = get_node_from_list('BOGUS:rs999999999999', variant_nodes_2)
assert bogus_node_after_normalization['name'] == 'BOGUS:rs999999999999'
assert ROOT_ENTITY in bogus_node_after_normalization[NODE_TYPES]
assert SEQUENCE_VARIANT in bogus_node_after_normalization[NODE_TYPES]


def test_edge_normalization():
Expand Down

0 comments on commit d0ca276

Please sign in to comment.