Merge pull request #143 from TranslatorSRI/filter-by-taxon
This PR combines several improvements to search, results and filtering:
* It updates the search query so that the search term is no longer duplicated when doing an autocomplete query (see #142).
  * This breaks hyphenated search terms in the autocomplete query, and I can't figure out why. For now, we replace special characters with spaces in the autocomplete query (i.e. beta-secretase becomes `(beta secretase*)`), but we escape special characters in the non-autocomplete query (i.e. beta-secretase becomes `(beta\-secretase)`), since that still appears to work; a sketch of both strategies follows this list. I'll dig into this more deeply in #146.
* It adds the taxon and the clique identifier count to the values indexed during data loading.
* It incorporates the clique identifier count into the returned results as well as into their boosting and sorting. It also tweaks the boosting values used in the query fields and phrase fields.
* It adds an `only_taxa` input field that allows filtering results to a list of NCBITaxon identifiers. Note that this only works for terms that have taxon information, which at the moment means cliques containing NCBIGene identifiers.
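
As a minimal sketch of the two escaping strategies described above (condensed from the `lookup()` changes in `api/server.py` below; the helper name `build_query` is just for illustration):

```python
import re

# Lucene special characters, per the Solr standard query parser docs.
LUCENE_SPECIALS = r'([!(){}\[\]^"~*?:/+-])'

def build_query(string: str, autocomplete: bool) -> str:
    """Build the Solr query string for the two modes described above."""
    string_lc = string.lower()
    if autocomplete:
        # Replace special characters (and the two-character operators '&&'
        # and '||') with spaces; escaped characters appear to break the
        # trailing-wildcard autocomplete query (see #146).
        escaped = re.sub(LUCENE_SPECIALS, ' ', string_lc)
        escaped = escaped.replace('&&', ' ').replace('||', ' ')
        return f"({escaped}*)"
    # Backslash-escape special characters; this still works when no trailing
    # wildcard is added.
    escaped = re.sub(LUCENE_SPECIALS, r'\\\g<0>', string_lc)
    escaped = escaped.replace('&&', '\\&\\&').replace('||', '\\|\\|')
    return f"({escaped})"

print(build_query("beta-secretase", autocomplete=True))   # (beta secretase*)
print(build_query("beta-secretase", autocomplete=False))  # (beta\-secretase)
```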
gaurav authored Apr 25, 2024
2 parents 1544a1a + 2976821 commit a913bdb
Showing 4 changed files with 99 additions and 35 deletions.
75 changes: 61 additions & 14 deletions api/server.py
@@ -161,8 +161,10 @@ class LookupResult(BaseModel):
curie: str
label: str
synonyms: List[str]
taxa: List[str]
types: List[str]
score: float
clique_identifier_count: int


@app.get("/lookup",
@@ -203,12 +205,18 @@ async def lookup_curies_get(
description="Pipe-separated, case-sensitive list of prefixes to exclude, e.g. `UMLS|EFO`.",
# We can't use `example` here, because otherwise it gets pre-filled when filling in this form.
# example="UMLS|EFO"
)] = None,
only_taxa: Annotated[Union[str, None], Query(
description="Pipe-separated, case-sensitive list of taxa to filter, "
"e.g. `NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955`.",
# We can't use `example` here, because otherwise it gets pre-filled when filling in this form.
# example="NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955"
)] = None
) -> List[LookupResult]:
"""
Returns cliques with a name or synonym that contains a specified string.
"""
return await lookup(string, autocomplete, offset, limit, biolink_type, only_prefixes, exclude_prefixes)
return await lookup(string, autocomplete, offset, limit, biolink_type, only_prefixes, exclude_prefixes, only_taxa)


@app.post("/lookup",
@@ -249,12 +257,18 @@ async def lookup_curies_post(
description="Pipe-separated, case-sensitive list of prefixes to exclude, e.g. `UMLS|EFO`.",
# We can't use `example` here, because otherwise it gets pre-filled when filling in this form.
# example="UMLS|EFO"
)] = None,
only_taxa: Annotated[Union[str, None], Query(
description="Pipe-separated, case-sensitive list of taxa to filter, "
"e.g. `NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955`.",
# We can't use `example` here, because otherwise it gets pre-filled when filling in this form.
# example="NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955"
)] = None
) -> List[LookupResult]:
"""
Returns cliques with a name or synonym that contains a specified string.
"""
return await lookup(string, autocomplete, offset, limit, biolink_type, only_prefixes, exclude_prefixes)
return await lookup(string, autocomplete, offset, limit, biolink_type, only_prefixes, exclude_prefixes, only_taxa)


async def lookup(string: str,
@@ -263,7 +277,8 @@ async def lookup(string: str,
limit: conint(le=1000) = 10,
biolink_type: str = None,
only_prefixes: str = "",
exclude_prefixes: str = ""
exclude_prefixes: str = "",
only_taxa: str = ""
) -> List[LookupResult]:
"""
Returns cliques with a name or synonym that contains a specified string.
@@ -286,18 +301,29 @@ async def lookup(string: str,
# This version of the code replaces the previous facet-based multiple-query search NameRes used to have
# (see https://github.com/TranslatorSRI/NameResolution/blob/v1.2.0/api/server.py#L79-L165)

# First, we need forms of the query that are (1) lowercase, and (2) missing any Lucene special characters
# (as listed at https://solr.apache.org/guide/solr/latest/query-guide/standard-query-parser.html#escaping-special-characters)
# For reasons we don't fully understand, we can put escaped special characters into the non-autocomplete
# query but not the autocomplete query. So we handle those cases separately here. See
# https://github.com/TranslatorSRI/NameResolution/issues/146 for a deeper dive into what's going on.
string_lc = string.lower()
string_lc_escaped = re.sub(r'([!(){}\[\]^"~*?:/+-])', r'\\\g<0>', string_lc)
if autocomplete:
# Remove any Lucene special characters (as listed at
# https://solr.apache.org/guide/solr/latest/query-guide/standard-query-parser.html#escaping-special-characters)
string_lc_escaped = re.sub(r'([!(){}\[\]^"~*?:/+-])', ' ', string_lc)

# We need to escape '&&' and '||' specially, since they are double-character sequences.
string_lc_escaped = string_lc_escaped.replace('&&', '\\&\\&').replace('||', '\\|\\|')
# We need to remove '&&' and '||' specially, since they are double-character sequences.
string_lc_escaped = string_lc_escaped.replace('&&', ' ').replace('||', ' ')

# If in autocomplete mode, we combine it into a query that allows for incomplete words.
if autocomplete:
query = f"({string_lc_escaped}) OR ({string_lc_escaped}*)"
# Construct query with an asterisk at the end so we look for incomplete terms.
query = f"({string_lc_escaped}*)"
else:
# Escape any Lucene special characters (as listed at
# https://solr.apache.org/guide/solr/latest/query-guide/standard-query-parser.html#escaping-special-characters)
string_lc_escaped = re.sub(r'([!(){}\[\]^"~*?:/+-])', r'\\\g<0>', string_lc)

# We need to escape '&&' and '||' specially, since they are double-character sequences.
string_lc_escaped = string_lc_escaped.replace('&&', '\\&\\&').replace('||', '\\|\\|')

# Construct query.
query = f"({string_lc_escaped})"

# Apply filters as needed.
Expand All @@ -322,19 +348,38 @@ async def lookup(string: str,
prefix_exclude_filters.append(f"NOT curie:/{prefix}:.*/")
filters.append(" AND ".join(prefix_exclude_filters))

# Taxa filter.
# only_taxa is like: 'NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955'
if only_taxa:
taxa_filters = []
for taxon in re.split('\\s*\\|\\s*', only_taxa):
taxa_filters.append(f'taxa:"{taxon}"')
filters.append(" OR ".join(taxa_filters))

# Boost queries
boost_queries = 'clique_identifier_count:[10 TO *]^20 ' + \
'clique_identifier_count:[4 TO 9]^10 '
# 'clique_identifier_count:[2 TO 3]^1 '
# 'clique_identifier_count:1^0.1 ' + # - clique identifier count.
# 'shortest_name_length[1 TO 5]^10 ' + # - prioritize smaller names
# 'shortest_name_length[5 TO 10]^5 ' + # - prioritize smaller names
# ''

params = {
"query": {
"edismax": {
"query": query,
# qf = query fields, i.e. how much should we boost each of these fields if it contains the same terms as the input.
# https://solr.apache.org/guide/solr/latest/query-guide/dismax-query-parser.html#qf-query-fields-parameter
"qf": "preferred_name_exactish^10 preferred_name^1 names^1",
"qf": "preferred_name_exactish^30 preferred_name^20 names^10",
# pf = phrase fields, i.e. how should we boost these fields if they contain the entire search phrase.
# https://solr.apache.org/guide/solr/latest/query-guide/dismax-query-parser.html#pf-phrase-fields-parameter
"pf": "preferred_name_exactish^20 preferred_name^3 names^2"
"pf": "preferred_name_exactish^35 preferred_name^25 names^15",
# Boost by:
"bq": boost_queries,
},
},
"sort": "score DESC, curie_suffix ASC",
"sort": "score DESC, clique_identifier_count DESC, shortest_name_length ASC, curie_suffix ASC",
"limit": limit,
"offset": offset,
"filter": filters,
@@ -351,6 +396,8 @@ async def lookup(string: str,
response = response.json()
output = [ LookupResult(curie=doc.get("curie", ""), label=doc.get("preferred_name", ""), synonyms=doc.get("names", []),
score=doc.get("score", ""),
taxa=doc.get("taxa", []),
clique_identifier_count=doc.get("clique_identifier_count", 0),
types=[f"biolink:{d}" for d in doc.get("types", [])])
for doc in response["response"]["docs"]]

…
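
As a usage sketch for the new parameter: `only_taxa` is split on `|` and turned into an OR-ed Solr filter over the new `taxa` field (e.g. `taxa:"NCBITaxon:9606" OR taxa:"NCBITaxon:10090"`). The example below assumes a locally running NameRes instance; the base URL and the search term are illustrative, not part of this commit.

```python
import requests

# Hypothetical call against a locally running NameRes; the base URL is an
# assumption, not something defined in this commit.
response = requests.get(
    "http://localhost:8000/lookup",
    params={
        "string": "BRCA1",
        "only_taxa": "NCBITaxon:9606|NCBITaxon:10090",  # human or mouse only
        "limit": 10,
    },
)
for result in response.json():
    # Results now include the new taxa and clique_identifier_count fields.
    print(result["curie"], result["taxa"], result["clique_identifier_count"])
```

On the ranking side, the `bq` ranges above boost cliques with ten or more identifiers by 20 and those with 4–9 identifiers by 10, and ties in score are now broken by `clique_identifier_count`, then `shortest_name_length`, then `curie_suffix`.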
11 changes: 6 additions & 5 deletions data-loading/Makefile
@@ -5,7 +5,7 @@
#

# Configuration
SYNONYMS_URL=https://stars.renci.org/var/babel_outputs/2023nov5/synonyms/
SYNONYMS_URL=https://stars.renci.org/var/babel_outputs/2024mar24/synonyms/

# How much memory should Solr use.
SOLR_MEM=220G
@@ -19,8 +19,8 @@ SOLR_MEM=220G
.PHONY: all clean
all: data/setup.done
echo Solr has now been set up and loaded with the synonym data.
echo Run `make start-solr-backup` to start a backup. Run `make check-solr-backup` to check
echo if the backup has completed. Once that has completed, run `make data/backup.done` to
echo Run 'make start-solr-backup' to start a backup. Run 'make check-solr-backup' to check
echo if the backup has completed. Once that has completed, run 'make data/backup.done' to
echo generate a snapshot.backup.tar.gz file that can be used in NameRes.

clean:
@@ -36,9 +36,10 @@ data/synonyms/done:
gunzip data/synonyms/*.txt.gz
echo Downloaded synonyms from ${SYNONYMS_URL}
split -d -l 10000000 data/synonyms/Protein.txt data/synonyms/Protein.txt. && rm data/synonyms/Protein.txt
split -d -l 10000000 data/synonyms/SmallMolecule.txt data/synonyms/SmallMolecule.txt. && rm data/synonyms/SmallMolecule.txt
# split -d -l 10000000 data/synonyms/SmallMolecule.txt data/synonyms/SmallMolecule.txt. && rm data/synonyms/SmallMolecule.txt
split -d -l 10000000 data/synonyms/DrugChemicalConflated.txt data/synonyms/DrugChemicalConflated.txt. && rm data/synonyms/DrugChemicalConflated.txt
split -d -l 10000000 data/synonyms/Gene.txt data/synonyms/Gene.txt. && rm data/synonyms/Gene.txt
echo Split Protein.txt, SmallMolecule.txt and Gene.txt, and deleted the original files.
echo Split Protein.txt, DrugChemicalConflated.txt and Gene.txt, and deleted the original files.
touch $@

# Step 3. Start Solr server.
…
43 changes: 27 additions & 16 deletions data-loading/setup-and-load-solr.sh
@@ -78,13 +78,13 @@ curl -X POST -H 'Content-type:application/json' --data-binary '{
"type":"LowerTextField",
"stored":true
},
{
"name":"preferred_name_exactish",
"type":"exactish",
"indexed":true,
"stored":false,
"multiValued":false
},
{
"name":"preferred_name_exactish",
"type":"exactish",
"indexed":true,
"stored":false,
"multiValued":false
},
{
"name":"types",
"type":"string",
Expand All @@ -95,15 +95,26 @@ curl -X POST -H 'Content-type:application/json' --data-binary '{
"name":"shortest_name_length",
"type":"pint",
"stored":true
},
{
"name":"curie_suffix",
"type":"plong",
"docValues":true,
"stored":true,
"required":false,
"sortMissingLast":true
}
},
{
"name":"curie_suffix",
"type":"plong",
"docValues":true,
"stored":true,
"required":false,
"sortMissingLast":true
},
{
"name":"taxa",
"type":"string",
"stored":true,
"multiValued":true
},
{
"name":"clique_identifier_count",
"type":"pint",
"stored":true
}
] }' 'http://localhost:8983/solr/name_lookup/schema'

# Add a copy field to copy preferred_name into preferred_name_exactish.
…
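
To illustrate the two new schema fields, here is a minimal sketch of indexing a single document into the `name_lookup` collection via Solr's standard JSON update endpoint; the field values are illustrative assumptions, not taken from real Babel output.

```python
import requests

doc = {
    "curie": "NCBIGene:672",
    "preferred_name": "BRCA1",
    "names": ["BRCA1", "breast cancer 1"],
    "types": ["Gene"],
    "taxa": ["NCBITaxon:9606"],    # new multiValued string field
    "clique_identifier_count": 9,  # new pint field (illustrative value)
    "shortest_name_length": 5,
    "curie_suffix": 672,
}
# Commit immediately so the document is visible to subsequent queries.
requests.post(
    "http://localhost:8983/solr/name_lookup/update?commit=true",
    json=[doc],
)
```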
5 changes: 5 additions & 0 deletions tests/test_service.py
@@ -1,6 +1,11 @@
import logging

from api.server import app
from fastapi.testclient import TestClient

# Turn on debugging for tests.
logging.basicConfig(level=logging.DEBUG)

def test_simple_check():
client = TestClient(app)
params = {'string':'alzheimer'}
…
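
A sketch of how the new filter could be exercised in this module; the search term and taxon are illustrative, and since real hits depend on the data loaded into Solr, the test only checks that every returned clique is tagged with the requested taxon.

```python
def test_only_taxa_filter():
    # Illustrative sketch, not part of this commit: restrict lookups to
    # human results via the new only_taxa parameter.
    client = TestClient(app)
    params = {'string': 'BRCA1', 'only_taxa': 'NCBITaxon:9606'}
    response = client.get('/lookup', params=params)
    assert response.status_code == 200
    for result in response.json():
        assert 'NCBITaxon:9606' in result['taxa']
```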
