From 67a46c9ab8daace14dabbd166535fb721f30679b Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 7 Nov 2024 10:48:46 -0500 Subject: [PATCH] Get rid of trying to sync preferred label algorithm. --- node_normalizer/normalizer.py | 48 +++++++++++++++-------------------- 1 file changed, 21 insertions(+), 27 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index a00a0b5..ff70603 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -708,33 +708,27 @@ async def create_node(canonical_id, equivalent_ids, types, info_contents, includ # identifier _except_ where one of the types is in preferred_name_boost_prefixes, in which case # we prefer the prefixes listed there. # - # Note that types[canonical_id] goes from most specific to least specific, so we - # need to reverse it in order to apply preferred_name_boost_prefixes for the most - # specific type. - possible_labels = [] - for typ in types[canonical_id][::-1]: - if typ in config['preferred_name_boost_prefixes']: - # This is the most specific matching type, so we use this and then break. - possible_labels = list(map(lambda identifier: identifier.get('l', ''), - sort_identifiers_with_boosted_prefixes( - eids, - config['preferred_name_boost_prefixes'][typ] - ))) - - # Add in all the other labels -- we'd still like to consider them, but at a lower priority. - for eid in eids: - label = eid.get('l', '') - if label not in possible_labels: - possible_labels.append(label) - - # Since this is the most specific matching type, we shouldn't do other (presumably higher-level) - # categories: so let's break here. - break - - # Step 1.2. If we didn't have a preferred_name_boost_prefixes, just use the identifiers in their - # Biolink prefix order. - if not possible_labels: - possible_labels = map(lambda eid: eid.get('l', ''), eids) + # HOWEVER, there are three reasons not to do that here: + # 1. 
For NameRes, it makes sense that we're trying to come up with the best label for a clique + # so we can autocomplete to it. But for NodeNorm, users would be expecting the label that + # goes with the identifier we've normalized to, so we should probably go with that label + # unless that would be annoying (e.g. if it's very long). + # 2. It will be impossible to keep this in sync with NameRes for conflated names, since NameRes + # conflation in Babel doesn't pick the preferred label across all possible labels within the + # conflated clique, but instead picks the preferred label for each subclique, and then chooses + # the first preferred label in order of conflation. Which is what we should be doing, but by + # this point we've lost track of each subclique that went into this conflated clique. + # 3. Even in a best case scenario, we'd just be trying to replicate some pretty complicated code + # in Babel -- the ideal solution here would be to use the preferred_name being generated by + # Babel, but that will require some large changes to NodeNorm. + # + # For these reasons, I'm going to try to replace this with a simplified algorithm: + # - Order labels in clique identifier order. + # - Filter out blank or suspicious labels (e.g. `CHEMBL...`). + # - Filter out labels longer than demote_labels_longer_than unless there are no labels under that size. + # + # Step 1. Get all possible labels. + possible_labels = map(lambda eid: eid.get('l', ''), eids) # Step 2. Filter out any suspicious labels. filtered_possible_labels = [l for l in possible_labels if