Merge pull request #143 from TranslatorSRI/filter-by-taxon
This PR combines several improvements to search, results and filtering:
* It updates the search query so that the search term is no longer duplicated when doing an autocomplete query (see #142).
  * This breaks hyphenated search terms in the autocomplete query, and I can't figure out why. For now, we replace special characters with spaces in the autocomplete query (i.e. beta-secretase becomes `(beta secretase*)`), but we escape special characters in the non-autocomplete query (i.e. beta-secretase becomes `(beta\-secretase)`), since that still appears to work; a sketch of both strategies follows this list. I'll dig into this more deeply in #146.
* It adds the taxon and the clique identifier count to the values indexed during data loading.
* It incorporates the clique identifier count into the returned results as well as into their boosting and sorting. It also tweaks the boosting values used in the query fields and phrase fields.
* It adds an `only_taxa` input field that allows filtering results to a list of NCBITaxon identifiers. Note that this only works for terms that have taxon information, which at the moment means cliques containing NCBIGene identifiers.
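
As a minimal sketch of the two escaping strategies described above (condensed from the `lookup()` changes in `api/server.py` below; the helper name `build_query` is just for illustration):

```python
import re

# Lucene special characters, per the Solr standard query parser docs.
LUCENE_SPECIALS = r'([!(){}\[\]^"~*?:/+-])'

def build_query(string: str, autocomplete: bool) -> str:
    """Build the Solr query string for the two modes described above."""
    string_lc = string.lower()
    if autocomplete:
        # Replace special characters (and the two-character operators '&&'
        # and '||') with spaces; escaped characters appear to break the
        # trailing-wildcard autocomplete query (see #146).
        escaped = re.sub(LUCENE_SPECIALS, ' ', string_lc)
        escaped = escaped.replace('&&', ' ').replace('||', ' ')
        return f"({escaped}*)"
    # Backslash-escape special characters; this still works when no trailing
    # wildcard is added.
    escaped = re.sub(LUCENE_SPECIALS, r'\\\g<0>', string_lc)
    escaped = escaped.replace('&&', '\\&\\&').replace('||', '\\|\\|')
    return f"({escaped})"

print(build_query("beta-secretase", autocomplete=True))   # (beta secretase*)
print(build_query("beta-secretase", autocomplete=False))  # (beta\-secretase)
```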
gaurav authored Apr 25, 2024
2 parents 1544a1a + 2976821 commit a913bdb
Showing 4 changed files with 99 additions and 35 deletions.
75 changes: 61 additions & 14 deletions api/server.py
@@ -161,8 +161,10 @@ class LookupResult(BaseModel):
curie: str
label: str
synonyms: List[str]
taxa: List[str]
types: List[str]
score: float
clique_identifier_count: int


@app.get("/lookup",
@@ -203,12 +205,18 @@ async def lookup_curies_get(
description="Pipe-separated, case-sensitive list of prefixes to exclude, e.g. `UMLS|EFO`.",
# We can't use `example` here, because otherwise it gets pre-filled when filling in this form.
# example="UMLS|EFO"
)] = None,
only_taxa: Annotated[Union[str, None], Query(
description="Pipe-separated, case-sensitive list of taxa to filter, "
"e.g. `NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955`.",
# We can't use `example` here, because otherwise it gets pre-filled when filling in this form.
# example="NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955"
)] = None
) -> List[LookupResult]:
"""
Returns cliques with a name or synonym that contains a specified string.
"""
return await lookup(string, autocomplete, offset, limit, biolink_type, only_prefixes, exclude_prefixes)
return await lookup(string, autocomplete, offset, limit, biolink_type, only_prefixes, exclude_prefixes, only_taxa)


@app.post("/lookup",
@@ -249,12 +257,18 @@ async def lookup_curies_post(
description="Pipe-separated, case-sensitive list of prefixes to exclude, e.g. `UMLS|EFO`.",
# We can't use `example` here, because otherwise it gets pre-filled when filling in this form.
# example="UMLS|EFO"
)] = None,
only_taxa: Annotated[Union[str, None], Query(
description="Pipe-separated, case-sensitive list of taxa to filter, "
"e.g. `NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955`.",
# We can't use `example` here, because otherwise it gets pre-filled when filling in this form.
# example="NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955"
)] = None
) -> List[LookupResult]:
"""
Returns cliques with a name or synonym that contains a specified string.
"""
return await lookup(string, autocomplete, offset, limit, biolink_type, only_prefixes, exclude_prefixes)
return await lookup(string, autocomplete, offset, limit, biolink_type, only_prefixes, exclude_prefixes, only_taxa)


async def lookup(string: str,
@@ -263,7 +277,8 @@ async def lookup(string: str,
limit: conint(le=1000) = 10,
biolink_type: str = None,
only_prefixes: str = "",
exclude_prefixes: str = ""
exclude_prefixes: str = "",
only_taxa: str = ""
) -> List[LookupResult]:
"""
Returns cliques with a name or synonym that contains a specified string.
@@ -286,18 +301,29 @@ async def lookup(string: str,
# This version of the code replaces the previous facet-based multiple-query search NameRes used to have
# (see https://github.com/TranslatorSRI/NameResolution/blob/v1.2.0/api/server.py#L79-L165)

# First, we need forms of the query that are (1) lowercase, and (2) missing any Lucene special characters
# (as listed at https://solr.apache.org/guide/solr/latest/query-guide/standard-query-parser.html#escaping-special-characters)
# For reasons we don't fully understand, we can put escaped special characters into the non-autocomplete
# query but not the autocomplete query. So we handle those cases separately here. See
# https://github.com/TranslatorSRI/NameResolution/issues/146 for a deeper dive into what's going on.
string_lc = string.lower()
string_lc_escaped = re.sub(r'([!(){}\[\]^"~*?:/+-])', r'\\\g<0>', string_lc)
if autocomplete:
# Remove any Lucene special characters (as listed at
# https://solr.apache.org/guide/solr/latest/query-guide/standard-query-parser.html#escaping-special-characters)
string_lc_escaped = re.sub(r'([!(){}\[\]^"~*?:/+-])', ' ', string_lc)

# We need to escape '&&' and '||' specially, since they are double-character sequences.
string_lc_escaped = string_lc_escaped.replace('&&', '\\&\\&').replace('||', '\\|\\|')
# We need to remove '&&' and '||' specially, since they are double-character sequences.
string_lc_escaped = string_lc_escaped.replace('&&', ' ').replace('||', ' ')

# If in autocomplete mode, we combine it into a query that allows for incomplete words.
if autocomplete:
query = f"({string_lc_escaped}) OR ({string_lc_escaped}*)"
# Construct query with an asterisk at the end so we look for incomplete terms.
query = f"({string_lc_escaped}*)"
else:
# Escape any Lucene special characters (as listed at
# https://solr.apache.org/guide/solr/latest/query-guide/standard-query-parser.html#escaping-special-characters)
string_lc_escaped = re.sub(r'([!(){}\[\]^"~*?:/+-])', r'\\\g<0>', string_lc)

# We need to escape '&&' and '||' specially, since they are double-character sequences.
string_lc_escaped = string_lc_escaped.replace('&&', '\\&\\&').replace('||', '\\|\\|')

# Construct query.
query = f"({string_lc_escaped})"

# Apply filters as needed.
Expand All @@ -322,19 +348,38 @@ async def lookup(string: str,
prefix_exclude_filters.append(f"NOT curie:/{prefix}:.*/")
filters.append(" AND ".join(prefix_exclude_filters))

# Taxa filter.
# only_taxa is like: 'NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955'
if only_taxa:
taxa_filters = []
for taxon in re.split('\\s*\\|\\s*', only_taxa):
taxa_filters.append(f'taxa:"{taxon}"')
filters.append(" OR ".join(taxa_filters))

# Boost queries
boost_queries = 'clique_identifier_count:[10 TO *]^20 ' + \
'clique_identifier_count:[4 TO 9]^10 '
# 'clique_identifier_count:[2 TO 3]^1 '
# 'clique_identifier_count:1^0.1 ' + # - clique identifier count.
# 'shortest_name_length[1 TO 5]^10 ' + # - prioritize smaller names
# 'shortest_name_length[5 TO 10]^5 ' + # - prioritize smaller names
# ''

params = {
"query": {
"edismax": {
"query": query,
# qf = query fields, i.e. how much should we boost each of these fields if it contains the same terms as the input.
# https://solr.apache.org/guide/solr/latest/query-guide/dismax-query-parser.html#qf-query-fields-parameter
"qf": "preferred_name_exactish^10 preferred_name^1 names^1",
"qf": "preferred_name_exactish^30 preferred_name^20 names^10",
# pf = phrase fields, i.e. how should we boost these fields if they contain the entire search phrase.
# https://solr.apache.org/guide/solr/latest/query-guide/dismax-query-parser.html#pf-phrase-fields-parameter
"pf": "preferred_name_exactish^20 preferred_name^3 names^2"
"pf": "preferred_name_exactish^35 preferred_name^25 names^15",
# Boost by:
"bq": boost_queries,
},
},
"sort": "score DESC, curie_suffix ASC",
"sort": "score DESC, clique_identifier_count DESC, shortest_name_length ASC, curie_suffix ASC",
"limit": limit,
"offset": offset,
"filter": filters,
@@ -351,6 +396,8 @@ async def lookup(string: str,
response = response.json()
output = [ LookupResult(curie=doc.get("curie", ""), label=doc.get("preferred_name", ""), synonyms=doc.get("names", []),
score=doc.get("score", ""),
taxa=doc.get("taxa", []),
clique_identifier_count=doc.get("clique_identifier_count", 0),
types=[f"biolink:{d}" for d in doc.get("types", [])])
for doc in response["response"]["docs"]]

…
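
As a usage sketch for the new parameter: `only_taxa` is split on `|` and turned into an OR-ed Solr filter over the new `taxa` field (e.g. `taxa:"NCBITaxon:9606" OR taxa:"NCBITaxon:10090"`). The example below assumes a locally running NameRes instance; the base URL and the search term are illustrative, not part of this commit.

```python
import requests

# Hypothetical call against a locally running NameRes; the base URL is an
# assumption, not something defined in this commit.
response = requests.get(
    "http://localhost:8000/lookup",
    params={
        "string": "BRCA1",
        "only_taxa": "NCBITaxon:9606|NCBITaxon:10090",  # human or mouse only
        "limit": 10,
    },
)
for result in response.json():
    # Results now include the new taxa and clique_identifier_count fields.
    print(result["curie"], result["taxa"], result["clique_identifier_count"])
```

On the ranking side, the `bq` ranges above boost cliques with ten or more identifiers by 20 and those with 4–9 identifiers by 10, and ties in score are now broken by `clique_identifier_count`, then `shortest_name_length`, then `curie_suffix`.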
11 changes: 6 additions & 5 deletions data-loading/Makefile
@@ -5,7 +5,7 @@
#

# Configuration
SYNONYMS_URL=https://stars.renci.org/var/babel_outputs/2023nov5/synonyms/
SYNONYMS_URL=https://stars.renci.org/var/babel_outputs/2024mar24/synonyms/

# How much memory should Solr use.
SOLR_MEM=220G
@@ -19,8 +19,8 @@ SOLR_MEM=220G
.PHONY: all clean
all: data/setup.done
echo Solr has now been set up and loaded with the synonym data.
echo Run `make start-solr-backup` to start a backup. Run `make check-solr-backup` to check
echo if the backup has completed. Once that has completed, run `make data/backup.done` to
echo Run 'make start-solr-backup' to start a backup. Run 'make check-solr-backup' to check
echo if the backup has completed. Once that has completed, run 'make data/backup.done' to
echo generate a snapshot.backup.tar.gz file that can be used in NameRes.

clean:
@@ -36,9 +36,10 @@ data/synonyms/done:
gunzip data/synonyms/*.txt.gz
echo Downloaded synonyms from ${SYNONYMS_URL}
split -d -l 10000000 data/synonyms/Protein.txt data/synonyms/Protein.txt. && rm data/synonyms/Protein.txt
split -d -l 10000000 data/synonyms/SmallMolecule.txt data/synonyms/SmallMolecule.txt. && rm data/synonyms/SmallMolecule.txt
# split -d -l 10000000 data/synonyms/SmallMolecule.txt data/synonyms/SmallMolecule.txt. && rm data/synonyms/SmallMolecule.txt
split -d -l 10000000 data/synonyms/DrugChemicalConflated.txt data/synonyms/DrugChemicalConflated.txt. && rm data/synonyms/DrugChemicalConflated.txt
split -d -l 10000000 data/synonyms/Gene.txt data/synonyms/Gene.txt. && rm data/synonyms/Gene.txt
echo Split Protein.txt, SmallMolecule.txt and Gene.txt, and deleted the original files.
echo Split Protein.txt, DrugChemicalConflated.txt and Gene.txt, and deleted the original files.
touch $@

# Step 3. Start Solr server.
…
43 changes: 27 additions & 16 deletions data-loading/setup-and-load-solr.sh
@@ -78,13 +78,13 @@ curl -X POST -H 'Content-type:application/json' --data-binary '{
"type":"LowerTextField",
"stored":true
},
{
"name":"preferred_name_exactish",
"type":"exactish",
"indexed":true,
"stored":false,
"multiValued":false
},
{
"name":"preferred_name_exactish",
"type":"exactish",
"indexed":true,
"stored":false,
"multiValued":false
},
{
"name":"types",
"type":"string",
Expand All @@ -95,15 +95,26 @@ curl -X POST -H 'Content-type:application/json' --data-binary '{
"name":"shortest_name_length",
"type":"pint",
"stored":true
},
{
"name":"curie_suffix",
"type":"plong",
"docValues":true,
"stored":true,
"required":false,
"sortMissingLast":true
}
},
{
"name":"curie_suffix",
"type":"plong",
"docValues":true,
"stored":true,
"required":false,
"sortMissingLast":true
},
{
"name":"taxa",
"type":"string",
"stored":true,
"multiValued":true
},
{
"name":"clique_identifier_count",
"type":"pint",
"stored":true
}
] }' 'http://localhost:8983/solr/name_lookup/schema'

# Add a copy field to copy preferred_name into preferred_name_exactish.
…
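
To illustrate the two new schema fields, here is a minimal sketch of indexing a single document into the `name_lookup` collection via Solr's standard JSON update endpoint; the field values are illustrative assumptions, not taken from real Babel output.

```python
import requests

doc = {
    "curie": "NCBIGene:672",
    "preferred_name": "BRCA1",
    "names": ["BRCA1", "breast cancer 1"],
    "types": ["Gene"],
    "taxa": ["NCBITaxon:9606"],    # new multiValued string field
    "clique_identifier_count": 9,  # new pint field (illustrative value)
    "shortest_name_length": 5,
    "curie_suffix": 672,
}
# Commit immediately so the document is visible to subsequent queries.
requests.post(
    "http://localhost:8983/solr/name_lookup/update?commit=true",
    json=[doc],
)
```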
5 changes: 5 additions & 0 deletions tests/test_service.py
@@ -1,6 +1,11 @@
import logging

from api.server import app
from fastapi.testclient import TestClient

# Turn on debugging for tests.
logging.basicConfig(level=logging.DEBUG)

def test_simple_check():
client = TestClient(app)
params = {'string':'alzheimer'}
…
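
A sketch of how the new filter could be exercised in this module; the search term and taxon are illustrative, and since real hits depend on the data loaded into Solr, the test only checks that every returned clique is tagged with the requested taxon.

```python
def test_only_taxa_filter():
    # Illustrative sketch, not part of this commit: restrict lookups to
    # human results via the new only_taxa parameter.
    client = TestClient(app)
    params = {'string': 'BRCA1', 'only_taxa': 'NCBITaxon:9606'}
    response = client.get('/lookup', params=params)
    assert response.status_code == 200
    for result in response.json():
        assert 'NCBITaxon:9606' in result['taxa']
```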
