Merge pull request #256 from ranking-agent/transitive_rules

Transitive rules
ranking-agent · Jul 18, 2024 · 378aeda · 378aeda
2 parents 58e677c + 5cb757e
commit 378aeda
Show file tree

Hide file tree

Showing 4 changed files with 36 additions and 271 deletions.
diff --git a/openapi-config.yaml b/openapi-config.yaml
@@ -11,7 +11,7 @@ servers:
 #  url: http://127.0.0.1:5000
 termsOfService: http://robokop.renci.org:7055/tos?service_long=ARAGORN&provider_long=RENCI
 title: ARAGORN
-version: 2.8.0
+version: 2.8.1
 tags:
 - name: translator
 - name: ARA

diff --git a/src/rules/kara_typed_rules/rules_with_types_cleaned_finalized.json b/src/rules/kara_typed_rules/rules_with_types_cleaned_finalized.json
@@ -662,136 +662,6 @@
                 }
             }
         },
-        {
-            "Rule": "?i  biolink:treats_or_applied_or_studied_to_treat  ?b  ?a  biolink:has_part  ?f  ?i  biolink:has_part  ?f   => ?a  biolink:treats  ?b",
-            "Head Coverage": "0.036715444",
-            "Std Confidence": "0.726559093",
-            "PCA Confidence": "0.919426049",
-            "Positive Examples": "1666",
-            "Body size": "2293",
-            "PCA Body size": "1812",
-            "Functional variable": "?a",
-            "template": {
-                "query_graph": {
-                    "nodes": {
-                        "$source": {
-                            "ids": [
-                                "$source_id"
-                            ],
-                            "categories": [
-                                "biolink:ChemicalEntity"
-                            ]
-                        },
-                        "$target": {
-                            "ids": [
-                                "$target_id"
-                            ],
-                            "categories": [
-                                "biolink:DiseaseOrPhenotypicFeature"
-                            ]
-                        },
-                        "i": {
-                            "categories": [
-                                "biolink:ChemicalEntity"
-                            ]
-                        },
-                        "f": {
-                            "categories": [
-                                "biolink:ChemicalEntity"
-                            ]
-                        }
-                    },
-                    "edges": {
-                        "edge_0": {
-                            "subject": "i",
-                            "object": "$target",
-                            "predicates": [
-                                "biolink:treats_or_applied_or_studied_to_treat"
-                            ]
-                        },
-                        "edge_1": {
-                            "subject": "$source",
-                            "object": "f",
-                            "predicates": [
-                                "biolink:has_part"
-                            ]
-                        },
-                        "edge_2": {
-                            "subject": "i",
-                            "object": "f",
-                            "predicates": [
-                                "biolink:has_part"
-                            ]
-                        }
-                    }
-                }
-            }
-        },
-        {
-            "Rule": "?e  biolink:treats_or_applied_or_studied_to_treat  ?b  ?i  biolink:has_part  ?a  ?i  biolink:has_part  ?e   => ?a  biolink:treats  ?b",
-            "Head Coverage": "0.090400212",
-            "Std Confidence": "0.623688612",
-            "PCA Confidence": "0.72345679",
-            "Positive Examples": "4102",
-            "Body size": "6577",
-            "PCA Body size": "5670",
-            "Functional variable": "?a",
-            "template": {
-                "query_graph": {
-                    "nodes": {
-                        "$source": {
-                            "ids": [
-                                "$source_id"
-                            ],
-                            "categories": [
-                                "biolink:ChemicalEntity"
-                            ]
-                        },
-                        "$target": {
-                            "ids": [
-                                "$target_id"
-                            ],
-                            "categories": [
-                                "biolink:DiseaseOrPhenotypicFeature"
-                            ]
-                        },
-                        "e": {
-                            "categories": [
-                                "biolink:ChemicalEntity"
-                            ]
-                        },
-                        "i": {
-                            "categories": [
-                                "biolink:NamedThing"
-                            ]
-                        }
-                    },
-                    "edges": {
-                        "edge_0": {
-                            "subject": "e",
-                            "object": "$target",
-                            "predicates": [
-                                "biolink:treats_or_applied_or_studied_to_treat"
-                            ]
-                        },
-                        "edge_1": {
-                            "subject": "i",
-                            "object": "$source",
-                            "predicates": [
-                                "biolink:has_part"
-                            ]
-                        },
-                        "edge_2": {
-                            "subject": "i",
-                            "object": "e",
-                            "predicates": [
-                                "biolink:has_part"
-                            ]
-                        }
-                    }
-                }
-            }
-        },
         {
             "Rule": "?a  biolink:treats_or_applied_or_studied_to_treat  ?f  ?f  biolink:has_phenotype  ?b  ?b  biolink:has_phenotype  ?f   => ?a  biolink:treats  ?b",
             "Head Coverage": "0.041233251",
@@ -2113,136 +1983,6 @@
                 }
             }
         },
-        {
-            "Rule": "?e  biolink:contraindicated_for  ?b  ?i  biolink:has_part  ?a  ?i  biolink:has_part  ?e   => ?a  biolink:contraindicated_for  ?b",
-            "Head Coverage": "0.22202098",
-            "Std Confidence": "0.746321664",
-            "PCA Confidence": "0.945220884",
-            "Positive Examples": "5884",
-            "Body size": "7884",
-            "PCA Body size": "6225",
-            "Functional variable": "?a",
-            "template": {
-                "query_graph": {
-                    "nodes": {
-                        "$source": {
-                            "ids": [
-                                "$source_id"
-                            ],
-                            "categories": [
-                                "biolink:ChemicalEntity"
-                            ]
-                        },
-                        "$target": {
-                            "ids": [
-                                "$target_id"
-                            ],
-                            "categories": [
-                                "biolink:DiseaseOrPhenotypicFeature"
-                            ]
-                        },
-                        "e": {
-                            "categories": [
-                                "biolink:ChemicalEntity"
-                            ]
-                        },
-                        "i": {
-                            "categories": [
-                                "biolink:ChemicalEntity"
-                            ]
-                        }
-                    },
-                    "edges": {
-                        "edge_0": {
-                            "subject": "e",
-                            "object": "$target",
-                            "predicates": [
-                                "biolink:contraindicated_for"
-                            ]
-                        },
-                        "edge_1": {
-                            "subject": "i",
-                            "object": "$source",
-                            "predicates": [
-                                "biolink:has_part"
-                            ]
-                        },
-                        "edge_2": {
-                            "subject": "i",
-                            "object": "e",
-                            "predicates": [
-                                "biolink:has_part"
-                            ]
-                        }
-                    }
-                }
-            }
-        },
-        {
-            "Rule": "?a  biolink:contraindicated_for  ?f  ?i  biolink:has_part  ?b  ?i  biolink:has_part  ?f   => ?a  biolink:contraindicated_for  ?b",
-            "Head Coverage": "0.029997736",
-            "Std Confidence": "0.722727273",
-            "PCA Confidence": "0.722727273",
-            "Positive Examples": "795",
-            "Body size": "1100",
-            "PCA Body size": "1100",
-            "Functional variable": "?a",
-            "template": {
-                "query_graph": {
-                    "nodes": {
-                        "$source": {
-                            "ids": [
-                                "$source_id"
-                            ],
-                            "categories": [
-                                "biolink:ChemicalEntity"
-                            ]
-                        },
-                        "$target": {
-                            "ids": [
-                                "$target_id"
-                            ],
-                            "categories": [
-                                "biolink:DiseaseOrPhenotypicFeature"
-                            ]
-                        },
-                        "f": {
-                            "categories": [
-                                "biolink:DiseaseOrPhenotypicFeature"
-                            ]
-                        },
-                        "i": {
-                            "categories": [
-                                "biolink:DiseaseOrPhenotypicFeature"
-                            ]
-                        }
-                    },
-                    "edges": {
-                        "edge_0": {
-                            "subject": "$source",
-                            "object": "f",
-                            "predicates": [
-                                "biolink:contraindicated_for"
-                            ]
-                        },
-                        "edge_1": {
-                            "subject": "i",
-                            "object": "$target",
-                            "predicates": [
-                                "biolink:has_part"
-                            ]
-                        },
-                        "edge_2": {
-                            "subject": "i",
-                            "object": "f",
-                            "predicates": [
-                                "biolink:has_part"
-                            ]
-                        }
-                    }
-                }
-            }
-        },
         {
             "Rule": "?a  biolink:contraindicated_for  ?f  ?i  biolink:has_part  ?b  ?i  biolink:related_to  ?f   => ?a  biolink:contraindicated_for  ?b",
             "Head Coverage": "0.029997736",

diff --git a/src/service_aggregator.py b/src/service_aggregator.py
@@ -408,21 +408,46 @@ async def filter_promiscuous_results(response,guid):
     MAX_C = 10
     if len(response["message"]["results"]) < MAX_C:
         return
-    prom_qnodes = get_promiscuous_qnodes(response)
+    prom_qnodes = await get_promiscuous_qnodes(response)
     #This is a dictionary from bound knodes to the index of their result
-    prom_counter = defaultdict(list)
     #There should only be one such node
     for qnode in prom_qnodes:
-        #How many distinct results have the same bozo in this spot?
+        # It's possible that there are multiple knodes that could be filtered.  But when we filter out the first one
+        # then the indices of the rest will change.  So we need to do this one at a time.
+        await remove_promiscuous_knode_results(MAX_C, qnode, response)
+
+
+async def remove_promiscuous_knode_results(MAX_C, qnode, response):
+    """Given a response and a qnode, look at all the results and count how many of the results have the
+    same knode bound to that qnode.   If that number is greater than MAX_C, remove those results."""
+    still_going = True
+    #This is written as a loop with the idea that once we've removed one promiscuous node, it might require
+    # recalculating everything since the results change.  In retrospect, that might not be true because we are
+    # specifying the qnode.  I still think it's possible (but perhaps unlikely) if there are multiple knodes
+    # bound to the same qnode.
+    while still_going:
+        still_going = False
+        # How many distinct results have the same bozo in this spot?
+        prom_counter = defaultdict(list)
         for result_i, result in enumerate(response["message"]["results"]):
             for binding in result["node_bindings"][qnode]:
                 knode = binding["id"]
                 prom_counter[knode].append(result_i)
-        # If there's too many results with the same knode in one of these spots,then they gotta go.
-        for knode, mapped_results in prom_counter.items():
-            if len(mapped_results) > MAX_C:
-                for index in reversed(mapped_results):
-                    del response["message"]["results"][index]
+        # now figure out the most common knode
+        max_knode = None
+        max_count = 0
+        for knode, mapped_result_indices in prom_counter.items():
+            if len(mapped_result_indices) > max_count:
+                max_knode = knode
+                max_count = len(mapped_result_indices)
+        # Now remove all the results with that knode (if it occurs in more than MAX_C results)
+        if max_count > MAX_C:
+            still_going = True
+            #These are the indices of the results that we want to remove
+            mapped_result_indices = prom_counter[max_knode]
+            #Remove them from right to left, otherwise the indices change on you
+            for index in reversed(mapped_result_indices):
+                del response["message"]["results"][index]
 
 
 async def get_promiscuous_qnodes(response):
@@ -763,7 +788,7 @@ async def aragorn_lookup(input_message, params, guid, infer, answer_qnode, bypas
             if "knowledge_graph" not in rmessage["message"] or "results" not in rmessage["message"]:
                 continue
             await filter_repeated_nodes(rmessage, guid)
-            #await filter_promiscuous_results(rmessage, guid)
+            await filter_promiscuous_results(rmessage, guid)
             result_messages.append(rmessage)
     logger.info(f"{guid}: strider complete")
     #Clean out the repeat node stuff

diff --git a/tests/test_expand.py b/tests/test_expand.py
@@ -31,7 +31,7 @@ def test_expand_query():
         }
     }
     m = expand_query(q,{},"abcd")
-    assert len(m) > 20 #This depends on how many rules we're allowing
+    assert len(m) > 15 #This depends on how many rules we're allowing
 
 def test_expand_qualified_query():
     q = {