Skip to content

Commit

Permalink
NN-394 Fix problems with alias detection
Browse files Browse the repository at this point in the history
  • Loading branch information
Maluuck committed Nov 16, 2023
1 parent 1fc37a8 commit 6c23400
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 9 deletions.
12 changes: 9 additions & 3 deletions backend/src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def proteins_subgraph_api():
selected_d = request.form.get("selected_d").split(",") if request.form.get("selected_d") else None
threshold = int(float(request.form.get("threshold")) * 1000)

proteins, protein_ids = queries.get_protein_ids_for_names(driver, protein_names, species_id)
proteins, protein_ids, symbol_alias_mapping = queries.get_protein_ids_for_names(driver, protein_names, species_id)

stopwatch.round("Setup")

Expand Down Expand Up @@ -164,18 +164,24 @@ def proteins_subgraph_api():
ensembl_id = node["id"]
df_node = ensembl_to_node.get(ensembl_id)
if df_node:
symbol_value = df_node.SYMBOL
if ensembl_id in node_mapping:
mapped_node_id = node_mapping[ensembl_id]
# Use node mapping to add corresponding values of betweenness and pagerank
node["attributes"]["Betweenness Centrality"] = str(betweenness[mapped_node_id])
node["attributes"]["PageRank"] = str(pagerank[mapped_node_id])
node["attributes"]["Description"] = df_node.annotation
node["attributes"]["Ensembl ID"] = df_node.external_id
node["attributes"]["Name"] = df_node.SYMBOL
node["attributes"]["Name"] = symbol_value
if not (request.files.get("file") is None):
if selected_d != None:
for column in selected_d:
node["attributes"][column] = panda_file.loc[panda_file["name"] == df_node.SYMBOL, column].item()
if symbol_value in symbol_alias_mapping:
# If a symbol was found through its alias we have
# to keep the alias name so the value can be taken
# from the input file correctly
symbol_value = symbol_alias_mapping[symbol_value]
node["attributes"][column] = panda_file.loc[panda_file["name"] == symbol_value, column].item()
node["label"] = df_node.SYMBOL
node["species"] = str(10090)

Expand Down
22 changes: 16 additions & 6 deletions backend/src/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,10 @@ def get_terms_connected_by_overlap(driver: neo4j.Driver, term_ids: list[str], sp
return _convert_to_connection_info_score(result=result, _int=False, protein=False)


def get_protein_ids_for_names(driver: neo4j.Driver, names: list[str], species_id: int) -> (list, list[str]):
def get_protein_ids_for_names(driver: neo4j.Driver, names: list[str], species_id: int) -> (list, list[str], dict):
"""
Returns: the proteins, their protein ids, and a dictionary of format (Symbol: Alias) containing every symbol that was found through one of its aliases
"""
# unsafe parameters because otherwise this query takes 10s with neo4j for unknown reasons
if species_id == 10090:
species = "Mus_Musculus"
Expand All @@ -44,7 +47,7 @@ def get_protein_ids_for_names(driver: neo4j.Driver, names: list[str], species_id
# Retrieve all the symbols that correspond to aliases found in names
with driver.session() as session:
result = session.run(query)
symbols_set, aliases_set = _convert_to_symbol_alias(result)
symbols_set, aliases_set, mapping = _convert_to_symbol_alias(result)
# To make less calls to the database, remove the aliases and add their corresponding symbol
genes_set = set(names)
result_names = list(genes_set - aliases_set) + list(symbols_set - genes_set)
Expand All @@ -56,7 +59,8 @@ def get_protein_ids_for_names(driver: neo4j.Driver, names: list[str], species_id
"""
with driver.session() as session:
result = session.run(query)
return _convert_to_protein_id(result)
protein, id = _convert_to_protein_id(result)
return protein, id, mapping


def get_protein_neighbours(
Expand Down Expand Up @@ -151,10 +155,16 @@ def _convert_to_protein_id(result: neo4j.Result) -> (list, list[str]):
def _convert_to_symbol_alias(result: neo4j.Result) -> (set[str], set[str]):
symbols = set()
aliases = set()
mapping = {}
for row in result:
symbols.add(row["symbol"])
aliases.add(row["found_alias"])
return symbols, aliases
symbol = row["symbol"]
alias = row["found_alias"]
symbols.add(symbol)
aliases.add(alias)
# Only add the (symbol: alias) if the symbol isnt there already
if row["symbol"] not in mapping:
mapping[symbol.title()] = alias.title()
return symbols, aliases, mapping


def _convert_to_connection_info_score(
Expand Down

0 comments on commit 6c23400

Please sign in to comment.