Merge pull request #45 from BackofenLab/WIP_query_update

Update of queries to use new database
BackofenLab · Oct 26, 2023 · fb57357
2 parents 174d00f + b1baa44
commit fb57357
Show file tree

Hide file tree

Showing 5 changed files with 103 additions and 58 deletions.
diff --git a/backend/src/enrichment.py b/backend/src/enrichment.py
@@ -76,14 +76,14 @@ def functional_enrichment(driver: neo4j.Driver, in_proteins, species_id: Any):
     stopwatch = Stopwatch()
 
     # Get number of all proteins in the organism (from Cypher)
-    bg_proteins = queries.get_number_of_proteins(driver)
+    bg_proteins = queries.get_number_of_proteins(driver, species_id)
     num_in_prot = len(in_proteins)
     prots = set(in_proteins)
     # pandas DataFrames for nodes and edges
     csv.field_size_limit(sys.maxsize)
 
     # Read Terms and put into Dataframe
-    df_terms = pd.DataFrame(queries.get_enrichment_terms(driver))
+    df_terms = pd.DataFrame(queries.get_enrichment_terms(driver, species_id))
     tot_tests = len(df_terms)
 
     stopwatch.round("setup_enrichment")
@@ -95,6 +95,7 @@ def functional_enrichment(driver: neo4j.Driver, in_proteins, species_id: Any):
     new_prots = []
     new_p = []
     arguments = [(value, alpha, prots, bg_proteins, num_in_prot) for value in df_terms["proteins"]]
+
     with multiprocessing.Pool() as pool:
         # Apply the function to each input value in parallel and collect the results
         for a, b in pool.starmap(calc_proteins_pval, arguments):

diff --git a/backend/src/enrichment_graph.py b/backend/src/enrichment_graph.py
@@ -14,7 +14,7 @@
 _BACKEND_JAR_PATH = "../gephi/target/gephi.backend-1.0-SNAPSHOT.jar"
 
 
-def get_functional_graph(list_enrichment):
+def get_functional_graph(list_enrichment, species_id):
     stopwatch = Stopwatch()
 
     list_term = []
@@ -24,19 +24,19 @@ def get_functional_graph(list_enrichment):
     driver = database.get_driver()
 
     # Execute the query and retrieve the CSV data
-    terms, source, target, score = queries.get_terms_connected_by_overlap(driver, list_term)
+    terms, source, target, score = queries.get_terms_connected_by_overlap(driver, list_term, species_id)
 
     stopwatch.round("Neo4j")
 
-    nodes = pd.DataFrame(terms).drop_duplicates(subset="external_id")
+    nodes = pd.DataFrame(terms).rename(columns={"Term": "external_id"}).drop_duplicates(subset="external_id")
 
     nodesterm = pd.DataFrame(list_enrichment)
 
-    df2 = nodesterm.rename({"id": "external_id"}, axis=1)
+    df2 = nodesterm.rename(columns={"id": "external_id"})
     merged = pd.merge(df2[["external_id", "fdr_rate", "p_value"]], nodes, on="external_id")
 
     # Add the two columns to df2
-    nodes = merged
+    nodes = merged.drop_duplicates()
 
     nodes["fdr_rate"] = nodes["fdr_rate"].fillna(0)
     nodes["p_value"] = nodes["p_value"].fillna(0)
@@ -100,9 +100,9 @@ def get_functional_graph(list_enrichment):
                 node["attributes"]["Betweenness Centrality"] = str(betweenness[mapped_node_id])
                 node["attributes"]["PageRank"] = str(pagerank[mapped_node_id])
             node["attributes"]["Ensembl ID"] = df_node.external_id
-            node["attributes"]["Name"] = df_node.name
-            node["label"] = df_node.name  # Comment this out if you want no node labels displayed
-            node["attributes"]["Category"] = df_node.category
+            node["attributes"]["Name"] = df_node.Name
+            node["label"] = df_node.Name  # Comment this out if you want no node labels displayed
+            node["attributes"]["Category"] = df_node.Category
             node["attributes"]["FDR"] = df_node.fdr_rate
             node["attributes"]["P Value"] = df_node.p_value
 

diff --git a/backend/src/main.py b/backend/src/main.py
@@ -51,7 +51,7 @@ def files(path):
 def proteins_enrichment():
     driver = database.get_driver()
     proteins = request.form.get("proteins").split(",")
-    species_id = request.form.get("species_id")
+    species_id = int(request.form.get("species_id"))
 
     # in-house functional enrichment
     list_enrichment = enrichment.functional_enrichment(driver, proteins, species_id)
@@ -95,18 +95,18 @@ def proteins_subgraph_api():
     selected_d = request.form.get("selected_d").split(",") if request.form.get("selected_d") else None
     threshold = int(float(request.form.get("threshold")) * 1000)
 
-    protein_ids = queries.get_protein_ids_for_names(driver, protein_names, species_id)
+    proteins, protein_ids = queries.get_protein_ids_for_names(driver, protein_names, species_id)
 
     stopwatch.round("Setup")
 
     if len(protein_ids) > 1:
-        proteins, source, target, score = queries.get_protein_associations(driver, protein_ids, threshold)
+        _, source, target, score = queries.get_protein_associations(driver, protein_ids, threshold, species_id)
     else:
-        proteins, source, target, score = queries.get_protein_neighbours(driver, protein_ids, threshold)
+        _, source, target, score = queries.get_protein_neighbours(driver, protein_ids, threshold, species_id)
 
     stopwatch.round("Neo4j")
 
-    nodes = pd.DataFrame(proteins).drop_duplicates(subset="external_id")
+    nodes = pd.DataFrame(proteins).rename(columns={"ENSEMBL": "external_id"}).drop_duplicates(subset="external_id")
 
     edges = pd.DataFrame({"source": source, "target": target, "score": score})
     edges = edges.drop_duplicates(subset=["source", "target"])
@@ -130,7 +130,7 @@ def proteins_subgraph_api():
     # D-Value categorize via percentage
     if not (request.files.get("file") is None):
         panda_file.rename(columns={"SYMBOL": "name"}, inplace=True)
-        panda_file["name"] = panda_file["name"].str.upper()
+        panda_file["name"] = panda_file["name"].str.title()
 
     stopwatch.round("Enrichment")
 
@@ -167,14 +167,14 @@ def proteins_subgraph_api():
                 # Use node mapping to add corresponding values of betweenness and pagerank
                 node["attributes"]["Betweenness Centrality"] = str(betweenness[mapped_node_id])
                 node["attributes"]["PageRank"] = str(pagerank[mapped_node_id])
-            node["attributes"]["Description"] = df_node.description
+            node["attributes"]["Description"] = df_node.annotation
             node["attributes"]["Ensembl ID"] = df_node.external_id
-            node["attributes"]["Name"] = df_node.name
+            node["attributes"]["Name"] = df_node.SYMBOL
             if not (request.files.get("file") is None):
                 if selected_d != None:
                     for column in selected_d:
-                        node["attributes"][column] = panda_file.loc[panda_file["name"] == df_node.name, column].item()
-            node["label"] = df_node.name
+                        node["attributes"][column] = panda_file.loc[panda_file["name"] == df_node.SYMBOL, column].item()
+            node["label"] = df_node.SYMBOL
             node["species"] = str(10090)
 
     # Identify subgraph nodes and update their attributes
@@ -213,8 +213,9 @@ def terms_subgraph_api():
 
     # Functional terms
     list_enrichment = ast.literal_eval(request.form.get("func-terms"))
+    species_id = int(request.form.get("species_id"))
 
-    json_str = enrichment_graph.get_functional_graph(list_enrichment=list_enrichment)
+    json_str = enrichment_graph.get_functional_graph(list_enrichment=list_enrichment, species_id=species_id)
 
     stopwatch.total("terms_subgraph_api")
 

diff --git a/backend/src/queries.py b/backend/src/queries.py
@@ -7,88 +7,116 @@
 import neo4j
 
 
-def get_terms_connected_by_overlap(driver: neo4j.Driver, term_ids: list[str]):
+def get_terms_connected_by_overlap(driver: neo4j.Driver, term_ids: list[str], species_id: int):
     """:returns: terms, source, target, score"""
+    if species_id == 10090:
+        species = "Mus_Musculus"
+    elif species_id == 9606:
+        species = "Homo_Sapiens"
+
     query = f"""
-        MATCH (source:Terms)-[association:OVERLAP]->(target:Terms)
-        WHERE source.external_id IN {term_ids}
-            AND target.external_id IN {term_ids}
-            AND source.category IN ["KEGG", "Reactome Pathways"]
-            AND target.category IN ["KEGG", "Reactome Pathways"]
+        MATCH (source:FT:{species})-[association:OVERLAP]->(target:FT:{species})
+        WHERE source.Term IN {term_ids}
+            AND target.Term IN {term_ids}
         RETURN source, target, association.Score AS score;
         """
     with driver.session() as session:
         result = session.run(query)
         # custom conversion is needed because otherwise it takes 10s with neo4j (for unknown reasons)
-        return _convert_to_connection_info_score(result=result, _int=False)
+        return _convert_to_connection_info_score(result=result, _int=False, protein=False)
 
 
-def get_protein_ids_for_names(driver: neo4j.Driver, names: list[str], species_id: int) -> list[str]:
+def get_protein_ids_for_names(driver: neo4j.Driver, names: list[str], species_id: int) -> (list, list[str]):
     # unsafe parameters because otherwise this query takes 10s with neo4j for unknown reasons
+    if species_id == 10090:
+        species = "Mus_Musculus"
+    elif species_id == 9606:
+        species = "Homo_Sapiens"
+
     query = f"""
-        MATCH (protein:Protein)
-        WHERE protein.species_id = {species_id}
-            AND protein.name IN {str([n.upper() for n in names])} 
-        WITH collect(protein.external_id) AS ids
-        RETURN ids
+        MATCH (protein:Protein:{species})
+        WHERE protein.SYMBOL IN {str([n.title() for n in names])} 
+            OR protein.ENSEMBL IN {str([n.title() for n in names])} 
+        RETURN protein, protein.ENSEMBL AS id
     """
     with driver.session() as session:
-        return session.run(query).single(strict=True).value()
+        result = session.run(query)
+        return _convert_to_protein_id(result)
 
 
 def get_protein_neighbours(
-    driver: neo4j.Driver, protein_ids: list[str], threshold: int
+    driver: neo4j.Driver, protein_ids: list[str], threshold: int, species_id: int
 ) -> (list[str], list[str], list[str], list[int]):
     """
     :returns: proteins, source_ids, target_ids, scores
     """
+    if species_id == 10090:
+        species = "Mus_Musculus"
+    elif species_id == 9606:
+        species = "Homo_Sapiens"
+
     # unsafe parameters because otherwise this query takes 10s with neo4j for unknown reasons
     query = f"""
-        MATCH (source:Protein)-[association:ASSOCIATION]->(target:Protein)
-        WHERE source.external_id IN {protein_ids}
-            AND target.external_id IN {protein_ids}
+        MATCH (source:Protein:{species})-[association:STRING]->(target:Protein:{species})
+        WHERE source.ENSEMBL IN {protein_ids}
+            AND target.ENSEMBL IN {protein_ids}
             AND association.combined >= {threshold}
         RETURN source, target, association.combined AS score
     """
 
     with driver.session() as session:
-        result = session.run(query).single(strict=True).value()
-        return _convert_to_connection_info_score(result=result, _int=True)
+        result = session.run(query)
+        return _convert_to_connection_info_score(result=result, _int=True, protein=True)
 
 
 def get_protein_associations(
-    driver: neo4j.Driver, protein_ids: list[str], threshold: int
+    driver: neo4j.Driver, protein_ids: list[str], threshold: int, species_id: int
 ) -> (list[str], list[str], list[str], list[int]):
     """
     :returns: proteins (nodes), source_ids, target_ids, score
     """
+    if species_id == 10090:
+        species = "Mus_Musculus"
+    elif species_id == 9606:
+        species = "Homo_Sapiens"
+
     # unsafe parameters are needed because otherwise this query takes 10s with neo4j for unknown reasons
     query = f"""
-        MATCH (source:Protein)-[association:ASSOCIATION]->(target:Protein)
-        WHERE source.external_id IN {protein_ids}
-            AND target.external_id IN {protein_ids}
-            AND association.combined >= {threshold}
-        RETURN source, target, association.combined AS score
+        MATCH (source:Protein:{species})-[association:STRING]->(target:Protein:{species})
+        WHERE source.ENSEMBL IN {protein_ids}
+            AND target.ENSEMBL IN {protein_ids}
+            AND association.Score >= {threshold}
+        RETURN source, target, association.Score AS score
     """
     with driver.session() as session:
         result = session.run(query)
-        return _convert_to_connection_info_score(result=result, _int=True)
+        return _convert_to_connection_info_score(result=result, _int=True, protein=True)
+
 
+def get_enrichment_terms(driver: neo4j.Driver, species_id: int) -> list[dict[str, Any]]:
+    if species_id == 10090:
+        species = "Mus_Musculus"
+    elif species_id == 9606:
+        species = "Homo_Sapiens"
 
-def get_enrichment_terms(driver: neo4j.Driver) -> list[dict[str, Any]]:
-    query = """
-        MATCH (term:Terms)
-        RETURN term.external_id AS id, term.name AS name, term.category AS category, term.proteins AS proteins
+    query = f"""
+        MATCH (term:FT:{species})
+        RETURN term.Term AS id, term.Name AS name, term.Category AS category, term.Proteins AS proteins
     """
 
     with driver.session() as session:
         result = session.run(query)
         return result.data()
 
 
-def get_number_of_proteins(driver: neo4j.Driver) -> int:
-    query = """
-        MATCH (n:Protein)
+def get_number_of_proteins(driver: neo4j.Driver, species_id: int) -> int:
+    if species_id == 10090:
+        species = "Mus_Musculus"
+    elif species_id == 9606:
+        species = "Homo_Sapiens"
+
+    query = f"""
+        MATCH (n:Protein:{species})
         RETURN count(n) AS num_proteins
     """
     with driver.session() as session:
@@ -97,14 +125,28 @@ def get_number_of_proteins(driver: neo4j.Driver) -> int:
         return int(num_proteins)
 
 
-def _convert_to_connection_info_score(result: neo4j.Result, _int: bool) -> (list[str], list[str], list[str], list[int]):
+def _convert_to_protein_id(result: neo4j.Result) -> (list, list[str]):
+    proteins, ids = list(), list()
+    for row in result:
+        proteins.append(row["protein"])
+        ids.append(row["id"])
+    return proteins, ids
+
+
+def _convert_to_connection_info_score(
+    result: neo4j.Result, _int: bool, protein: bool
+) -> (list[str], list[str], list[str], list[int]):
     nodes, source, target, score = list(), list(), list(), list()
 
     for row in result:
         nodes.append(row["source"])
         nodes.append(row["target"])
-        source.append(row["source"].get("external_id"))
-        target.append(row["target"].get("external_id"))
+        if protein:
+            source.append(row["source"].get("ENSEMBL"))
+            target.append(row["target"].get("ENSEMBL"))
+        else:
+            source.append(row["source"].get("Term"))
+            target.append(row["target"].get("Term"))
         if _int:
             score.append(int(row["score"]))
         else:

diff --git a/frontend/src/components/enrichment/EnrichmentTool.vue b/frontend/src/components/enrichment/EnrichmentTool.vue
@@ -88,6 +88,7 @@
 
                 var formData = new FormData()
                 formData.append('func-terms', JSON.stringify(com.terms))
+                formData.append('species_id', com.gephi_data.nodes[0].species)
 
                 this.axios
                     .post("/api/subgraph/terms", formData)