Skip to content

Commit

Permalink
Merge pull request #45 from BackofenLab/WIP_query_update
Browse files: browse the repository at this point in the history
Update of queries to use new database
  • Loading branch information
dyusuf authored Oct 26, 2023
2 parents 174d00f + b1baa44 commit fb57357
Show file tree
Hide file tree
Showing 5 changed files with 103 additions and 58 deletions.
5 changes: 3 additions & 2 deletions backend/src/enrichment.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,14 +76,14 @@ def functional_enrichment(driver: neo4j.Driver, in_proteins, species_id: Any):
stopwatch = Stopwatch()

# Get number of all proteins in the organism (from Cypher)
bg_proteins = queries.get_number_of_proteins(driver)
bg_proteins = queries.get_number_of_proteins(driver, species_id)
num_in_prot = len(in_proteins)
prots = set(in_proteins)
# pandas DataFrames for nodes and edges
csv.field_size_limit(sys.maxsize)

# Read Terms and put into Dataframe
df_terms = pd.DataFrame(queries.get_enrichment_terms(driver))
df_terms = pd.DataFrame(queries.get_enrichment_terms(driver, species_id))
tot_tests = len(df_terms)

stopwatch.round("setup_enrichment")
Expand All @@ -95,6 +95,7 @@ def functional_enrichment(driver: neo4j.Driver, in_proteins, species_id: Any):
new_prots = []
new_p = []
arguments = [(value, alpha, prots, bg_proteins, num_in_prot) for value in df_terms["proteins"]]

with multiprocessing.Pool() as pool:
# Apply the function to each input value in parallel and collect the results
for a, b in pool.starmap(calc_proteins_pval, arguments):
Expand Down
16 changes: 8 additions & 8 deletions backend/src/enrichment_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
_BACKEND_JAR_PATH = "../gephi/target/gephi.backend-1.0-SNAPSHOT.jar"


def get_functional_graph(list_enrichment):
def get_functional_graph(list_enrichment, species_id):
stopwatch = Stopwatch()

list_term = []
Expand All @@ -24,19 +24,19 @@ def get_functional_graph(list_enrichment):
driver = database.get_driver()

# Execute the query and retrieve the CSV data
terms, source, target, score = queries.get_terms_connected_by_overlap(driver, list_term)
terms, source, target, score = queries.get_terms_connected_by_overlap(driver, list_term, species_id)

stopwatch.round("Neo4j")

nodes = pd.DataFrame(terms).drop_duplicates(subset="external_id")
nodes = pd.DataFrame(terms).rename(columns={"Term": "external_id"}).drop_duplicates(subset="external_id")

nodesterm = pd.DataFrame(list_enrichment)

df2 = nodesterm.rename({"id": "external_id"}, axis=1)
df2 = nodesterm.rename(columns={"id": "external_id"})
merged = pd.merge(df2[["external_id", "fdr_rate", "p_value"]], nodes, on="external_id")

# Add the two columns to df2
nodes = merged
nodes = merged.drop_duplicates()

nodes["fdr_rate"] = nodes["fdr_rate"].fillna(0)
nodes["p_value"] = nodes["p_value"].fillna(0)
Expand Down Expand Up @@ -100,9 +100,9 @@ def get_functional_graph(list_enrichment):
node["attributes"]["Betweenness Centrality"] = str(betweenness[mapped_node_id])
node["attributes"]["PageRank"] = str(pagerank[mapped_node_id])
node["attributes"]["Ensembl ID"] = df_node.external_id
node["attributes"]["Name"] = df_node.name
node["label"] = df_node.name # Comment this out if you want no node labels displayed
node["attributes"]["Category"] = df_node.category
node["attributes"]["Name"] = df_node.Name
node["label"] = df_node.Name # Comment this out if you want no node labels displayed
node["attributes"]["Category"] = df_node.Category
node["attributes"]["FDR"] = df_node.fdr_rate
node["attributes"]["P Value"] = df_node.p_value

Expand Down
23 changes: 12 additions & 11 deletions backend/src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def files(path):
def proteins_enrichment():
driver = database.get_driver()
proteins = request.form.get("proteins").split(",")
species_id = request.form.get("species_id")
species_id = int(request.form.get("species_id"))

# in-house functional enrichment
list_enrichment = enrichment.functional_enrichment(driver, proteins, species_id)
Expand Down Expand Up @@ -95,18 +95,18 @@ def proteins_subgraph_api():
selected_d = request.form.get("selected_d").split(",") if request.form.get("selected_d") else None
threshold = int(float(request.form.get("threshold")) * 1000)

protein_ids = queries.get_protein_ids_for_names(driver, protein_names, species_id)
proteins, protein_ids = queries.get_protein_ids_for_names(driver, protein_names, species_id)

stopwatch.round("Setup")

if len(protein_ids) > 1:
proteins, source, target, score = queries.get_protein_associations(driver, protein_ids, threshold)
_, source, target, score = queries.get_protein_associations(driver, protein_ids, threshold, species_id)
else:
proteins, source, target, score = queries.get_protein_neighbours(driver, protein_ids, threshold)
_, source, target, score = queries.get_protein_neighbours(driver, protein_ids, threshold, species_id)

stopwatch.round("Neo4j")

nodes = pd.DataFrame(proteins).drop_duplicates(subset="external_id")
nodes = pd.DataFrame(proteins).rename(columns={"ENSEMBL": "external_id"}).drop_duplicates(subset="external_id")

edges = pd.DataFrame({"source": source, "target": target, "score": score})
edges = edges.drop_duplicates(subset=["source", "target"])
Expand All @@ -130,7 +130,7 @@ def proteins_subgraph_api():
# D-Value categorize via percentage
if not (request.files.get("file") is None):
panda_file.rename(columns={"SYMBOL": "name"}, inplace=True)
panda_file["name"] = panda_file["name"].str.upper()
panda_file["name"] = panda_file["name"].str.title()

stopwatch.round("Enrichment")

Expand Down Expand Up @@ -167,14 +167,14 @@ def proteins_subgraph_api():
# Use node mapping to add corresponding values of betweenness and pagerank
node["attributes"]["Betweenness Centrality"] = str(betweenness[mapped_node_id])
node["attributes"]["PageRank"] = str(pagerank[mapped_node_id])
node["attributes"]["Description"] = df_node.description
node["attributes"]["Description"] = df_node.annotation
node["attributes"]["Ensembl ID"] = df_node.external_id
node["attributes"]["Name"] = df_node.name
node["attributes"]["Name"] = df_node.SYMBOL
if not (request.files.get("file") is None):
if selected_d != None:
for column in selected_d:
node["attributes"][column] = panda_file.loc[panda_file["name"] == df_node.name, column].item()
node["label"] = df_node.name
node["attributes"][column] = panda_file.loc[panda_file["name"] == df_node.SYMBOL, column].item()
node["label"] = df_node.SYMBOL
node["species"] = str(10090)

# Identify subgraph nodes and update their attributes
Expand Down Expand Up @@ -213,8 +213,9 @@ def terms_subgraph_api():

# Functional terms
list_enrichment = ast.literal_eval(request.form.get("func-terms"))
species_id = int(request.form.get("species_id"))

json_str = enrichment_graph.get_functional_graph(list_enrichment=list_enrichment)
json_str = enrichment_graph.get_functional_graph(list_enrichment=list_enrichment, species_id=species_id)

stopwatch.total("terms_subgraph_api")

Expand Down
116 changes: 79 additions & 37 deletions backend/src/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,88 +7,116 @@
import neo4j


def get_terms_connected_by_overlap(driver: neo4j.Driver, term_ids: list[str], species_id: int):
    """Fetch the OVERLAP edges between the given functional terms of one species.

    Args:
        driver: Neo4j driver used to open the session.
        term_ids: Values matched against the ``Term`` property of both edge endpoints.
        species_id: Numeric species id; 10090 maps to the ``Mus_Musculus`` label and
            9606 to ``Homo_Sapiens``.

    :returns: terms, source, target, score

    Raises:
        ValueError: If ``species_id`` is neither 10090 nor 9606.
    """
    # Fail with a clear message instead of an UnboundLocalError when the
    # species id has no node label.
    species_labels = {10090: "Mus_Musculus", 9606: "Homo_Sapiens"}
    species = species_labels.get(species_id)
    if species is None:
        raise ValueError(f"Unsupported species_id: {species_id!r}")

    # NOTE(review): term_ids is interpolated through its Python list repr, not
    # passed as a Cypher parameter, so callers must only pass trusted values.
    query = f"""
        MATCH (source:FT:{species})-[association:OVERLAP]->(target:FT:{species})
        WHERE source.Term IN {term_ids}
            AND target.Term IN {term_ids}
        RETURN source, target, association.Score AS score;
    """
    with driver.session() as session:
        result = session.run(query)
        # custom conversion is needed because otherwise it takes 10s with neo4j (for unknown reasons)
        return _convert_to_connection_info_score(result=result, _int=False, protein=False)


def get_protein_ids_for_names(driver: neo4j.Driver, names: list[str], species_id: int) -> tuple[list, list[str]]:
    """Look up protein nodes of one species by symbol or ENSEMBL value.

    Args:
        driver: Neo4j driver used to open the session.
        names: Names to look up; each one is title-cased before it is compared
            with the ``SYMBOL`` and ``ENSEMBL`` properties.
        species_id: Numeric species id; 10090 maps to the ``Mus_Musculus`` label and
            9606 to ``Homo_Sapiens``.

    Returns:
        ``(proteins, ids)``: the matched protein nodes and their ``ENSEMBL`` values,
        in result order.

    Raises:
        ValueError: If ``species_id`` is neither 10090 nor 9606.
    """
    # Fail with a clear message instead of an UnboundLocalError when the
    # species id has no node label.
    species_labels = {10090: "Mus_Musculus", 9606: "Homo_Sapiens"}
    species = species_labels.get(species_id)
    if species is None:
        raise ValueError(f"Unsupported species_id: {species_id!r}")

    # Title-case once and reuse the literal for both property comparisons.
    # NOTE(review): title-casing also rewrites all-uppercase input (e.g. "TP53" ->
    # "Tp53"); confirm this matches how SYMBOL/ENSEMBL are stored for each species.
    wanted = str([name.title() for name in names])

    # unsafe parameters because otherwise this query takes 10s with neo4j for unknown reasons
    query = f"""
        MATCH (protein:Protein:{species})
        WHERE protein.SYMBOL IN {wanted}
            OR protein.ENSEMBL IN {wanted}
        RETURN protein, protein.ENSEMBL AS id
    """
    with driver.session() as session:
        result = session.run(query)
        return _convert_to_protein_id(result)


def get_protein_neighbours(
    driver: neo4j.Driver, protein_ids: list[str], threshold: int, species_id: int
) -> tuple[list, list[str], list[str], list[int]]:
    """Fetch STRING edges whose endpoints are both in ``protein_ids``.

    Args:
        driver: Neo4j driver used to open the session.
        protein_ids: Values matched against the ``ENSEMBL`` property of both endpoints.
        threshold: Minimum edge score (inclusive).
        species_id: Numeric species id; 10090 maps to the ``Mus_Musculus`` label and
            9606 to ``Homo_Sapiens``.

    :returns: proteins, source_ids, target_ids, scores

    Raises:
        ValueError: If ``species_id`` is neither 10090 nor 9606.
    """
    # Fail with a clear message instead of an UnboundLocalError when the
    # species id has no node label.
    species_labels = {10090: "Mus_Musculus", 9606: "Homo_Sapiens"}
    species = species_labels.get(species_id)
    if species is None:
        raise ValueError(f"Unsupported species_id: {species_id!r}")

    # unsafe parameters because otherwise this query takes 10s with neo4j for unknown reasons
    # The score is read from ``association.Score`` to agree with
    # get_protein_associations, which queries the same STRING relationship.
    # NOTE(review): confirm ``Score`` (not ``combined``) against the database schema.
    query = f"""
        MATCH (source:Protein:{species})-[association:STRING]->(target:Protein:{species})
        WHERE source.ENSEMBL IN {protein_ids}
            AND target.ENSEMBL IN {protein_ids}
            AND association.Score >= {threshold}
        RETURN source, target, association.Score AS score
    """

    with driver.session() as session:
        result = session.run(query)
        return _convert_to_connection_info_score(result=result, _int=True, protein=True)


def get_protein_associations(
    driver: neo4j.Driver, protein_ids: list[str], threshold: int, species_id: int
) -> tuple[list, list[str], list[str], list[int]]:
    """Fetch STRING edges between the given proteins at or above a score threshold.

    Args:
        driver: Neo4j driver used to open the session.
        protein_ids: Values matched against the ``ENSEMBL`` property of both endpoints.
        threshold: Minimum ``Score`` of an edge (inclusive).
        species_id: Numeric species id; 10090 maps to the ``Mus_Musculus`` label and
            9606 to ``Homo_Sapiens``.

    :returns: proteins (nodes), source_ids, target_ids, score

    Raises:
        ValueError: If ``species_id`` is neither 10090 nor 9606.
    """
    # Fail with a clear message instead of an UnboundLocalError when the
    # species id has no node label.
    species_labels = {10090: "Mus_Musculus", 9606: "Homo_Sapiens"}
    species = species_labels.get(species_id)
    if species is None:
        raise ValueError(f"Unsupported species_id: {species_id!r}")

    # unsafe parameters are needed because otherwise this query takes 10s with neo4j for unknown reasons
    query = f"""
        MATCH (source:Protein:{species})-[association:STRING]->(target:Protein:{species})
        WHERE source.ENSEMBL IN {protein_ids}
            AND target.ENSEMBL IN {protein_ids}
            AND association.Score >= {threshold}
        RETURN source, target, association.Score AS score
    """
    with driver.session() as session:
        result = session.run(query)
        return _convert_to_connection_info_score(result=result, _int=True, protein=True)


def get_enrichment_terms(driver: neo4j.Driver, species_id: int) -> list[dict[str, Any]]:
    """Return every functional term (``FT`` node) of one species.

    Args:
        driver: Neo4j driver used to open the session.
        species_id: Numeric species id; 10090 maps to the ``Mus_Musculus`` label and
            9606 to ``Homo_Sapiens``.

    Returns:
        One dict per term with the keys ``id``, ``name``, ``category`` and
        ``proteins``, as returned by ``result.data()``.

    Raises:
        ValueError: If ``species_id`` is neither 10090 nor 9606.
    """
    # Fail with a clear message instead of an UnboundLocalError when the
    # species id has no node label.
    species_labels = {10090: "Mus_Musculus", 9606: "Homo_Sapiens"}
    species = species_labels.get(species_id)
    if species is None:
        raise ValueError(f"Unsupported species_id: {species_id!r}")

    query = f"""
        MATCH (term:FT:{species})
        RETURN term.Term AS id, term.Name AS name, term.Category AS category, term.Proteins AS proteins
    """

    with driver.session() as session:
        result = session.run(query)
        return result.data()


def get_number_of_proteins(driver: neo4j.Driver) -> int:
query = """
MATCH (n:Protein)
def get_number_of_proteins(driver: neo4j.Driver, species_id: int) -> int:
if species_id == 10090:
species = "Mus_Musculus"
elif species_id == 9606:
species = "Homo_Sapiens"

query = f"""
MATCH (n:Protein:{species})
RETURN count(n) AS num_proteins
"""
with driver.session() as session:
Expand All @@ -97,14 +125,28 @@ def get_number_of_proteins(driver: neo4j.Driver) -> int:
return int(num_proteins)


def _convert_to_connection_info_score(result: neo4j.Result, _int: bool) -> (list[str], list[str], list[str], list[int]):
def _convert_to_protein_id(result: neo4j.Result) -> tuple[list, list[str]]:
    """Split query rows into parallel lists of protein nodes and their ids.

    Args:
        result: Iterable of rows that expose the ``protein`` and ``id`` keys.

    Returns:
        ``(proteins, ids)`` in row order; both lists are empty when there are no rows.
    """
    proteins, ids = [], []
    # Single pass: both lists are filled from one iteration over ``result``.
    for row in result:
        proteins.append(row["protein"])
        ids.append(row["id"])
    return proteins, ids


def _convert_to_connection_info_score(
result: neo4j.Result, _int: bool, protein: bool
) -> (list[str], list[str], list[str], list[int]):
nodes, source, target, score = list(), list(), list(), list()

for row in result:
nodes.append(row["source"])
nodes.append(row["target"])
source.append(row["source"].get("external_id"))
target.append(row["target"].get("external_id"))
if protein:
source.append(row["source"].get("ENSEMBL"))
target.append(row["target"].get("ENSEMBL"))
else:
source.append(row["source"].get("Term"))
target.append(row["target"].get("Term"))
if _int:
score.append(int(row["score"]))
else:
Expand Down
1 change: 1 addition & 0 deletions frontend/src/components/enrichment/EnrichmentTool.vue
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@
var formData = new FormData()
formData.append('func-terms', JSON.stringify(com.terms))
formData.append('species_id', com.gephi_data.nodes[0].species)
this.axios
.post("/api/subgraph/terms", formData)
Expand Down

0 comments on commit fb57357

Please sign in to comment.