Merge pull request #55 from BackofenLab/workMalek
Work malek
Maluuck authored Jan 7, 2024
2 parents 772dd56 + ed4301b commit 78a629e
Showing 8 changed files with 122 additions and 34 deletions.
8 changes: 4 additions & 4 deletions Requirements.mk
@@ -9,7 +9,7 @@ help:
 
 .NOTPARALLEL:
 
-all: prepare conda node java maven neo4j apoc
+all: prepare conda node java maven neo4j apoc dummydata
 
 prepare:
 	@echo -n "Are you sure? [y/N] " && read ans && [ $${ans:-N} = y ]
@@ -66,6 +66,6 @@ dummydata:
 	# explanation: https://stackoverflow.com/questions/25010369/wget-curl-large-file-from-google-drive
 	# docs: https://pypi.org/project/gdown/
 	pip install gdown
-	cd $$HOME/Downloads && gdown 1BfpXGdwcdmt8zh6K8MjrQf360ZaOpY_A
-	sudo neo4j-admin load --from=$$HOME/Downloads/newmouse2db.dump --database=neo4j --force
-	rm $$HOME/Downloads/newmouse2db.dump
+	cd $$HOME/Downloads && gdown 15yt-hNmCI1WODWvslrXwOdeSGUAT-e9F -O latest.dump
+	sudo neo4j-admin load --from=$$HOME/Downloads/latest.dump --database=neo4j --force
+	rm $$HOME/Downloads/latest.dump
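
The updated target fetches a single consolidated dump named latest.dump instead of the old newmouse2db.dump. For reference, the same download step via gdown's Python API rather than the CLI — a minimal sketch; the output path is illustrative, and the restore command is unchanged:

```python
# Sketch: download the Neo4j dump with gdown's Python API (pip install gdown).
# The file id is the one from the Makefile; the output path is an assumption.
import gdown

gdown.download(id="15yt-hNmCI1WODWvslrXwOdeSGUAT-e9F", output="latest.dump", quiet=False)
# Restore afterwards with:
#   sudo neo4j-admin load --from=latest.dump --database=neo4j --force
```
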
2 changes: 1 addition & 1 deletion backend/src/enrichment.py
@@ -135,4 +135,4 @@ def functional_enrichment(driver: neo4j.Driver, in_genes, species_id: Any):
 
     stopwatch.round("fdr_enrichment")
     stopwatch.total("functional_enrichment")
-    return df_terms
\ No newline at end of file
+    return df_terms
5 changes: 3 additions & 2 deletions backend/src/enrichment_graph.py
@@ -27,8 +27,9 @@ def get_functional_graph(list_enrichment, species_id):
     terms, source, target, score = queries.get_terms_connected_by_overlap(driver, list_term, species_id)
 
     stopwatch.round("Neo4j")
-
-    if len(terms) == 0: return
+
+    if len(terms) == 0:
+        return
 
     nodes = pd.DataFrame(terms).rename(columns={"Term": "external_id"}).drop_duplicates(subset="external_id")
 
2 changes: 1 addition & 1 deletion backend/src/main.py
@@ -252,4 +252,4 @@ def run_flask():
 if __name__ == "__main__":
     signal.signal(signal.SIGINT, signal_handler)
     flask_process = Process(target=run_flask)
-    flask_process.start()
\ No newline at end of file
+    flask_process.start()
3 changes: 2 additions & 1 deletion backend/src/pathway_data/kegg.py
@@ -182,7 +182,8 @@ def scrapping(path, species):
         if has_genes:
             pathway_gene_symbols = []
             for i in pathway_genes:
-                pathway_gene_symbols.append(i[1])
+                if i:
+                    pathway_gene_symbols.append(i[1])
             kegg2external_genes = symbols_to_ensemble(pathway_gene_symbols, species, "gene")
         # Diseases
         has_diseases = pathway_diseases is not None
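
The new `if i:` guard skips falsy entries (`None`, empty tuples) that previously crashed the `i[1]` lookup. A minimal sketch of the same filtering, with made-up KEGG entries:

```python
# Hypothetical (kegg_id, symbol) entries; some scraped entries can be falsy.
pathway_genes = [("mmu:11287", "Pzp"), None, ("mmu:11298", "Aanat"), ()]

# Equivalent to the loop with the new guard, as a comprehension.
pathway_gene_symbols = [entry[1] for entry in pathway_genes if entry]
assert pathway_gene_symbols == ["Pzp", "Aanat"]
```
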
122 changes: 102 additions & 20 deletions backend/src/pathway_data/pathway_data.py
@@ -30,18 +30,31 @@ def get_url(species):
     Arguments:
         species: the species which data we want to download
+        string: whether to download from string or not
     """
-    url = f"http://download.baderlab.org/EM_Genesets/current_release/{species.capitalize()}/symbol/"
-    response = requests.get(url)
-    if response.status_code == 404:
-        print(f"The URL {url} returned a 404 error")
+    if species == "mouse":
+        string_species = "mus+musculus"
+    else:
+        string_species = "homo+sapiens"
+    url_string = f"https://string-db.org/cgi/download?sessionId=bthCAcyLVvFS&species_text={string_species}"
+    url_bader = f"http://download.baderlab.org/EM_Genesets/current_release/{species.capitalize()}/symbol/"
+    response_string = requests.get(url_string)
+    response_bader = requests.get(url_bader)
+    if response_bader.status_code == 404:
+        print(f"The URL {url_bader} returned a 404 error")
         return 0
+    pattern_proteins = r"(\S+\.protein\.info\.[\w.]+)"
+    pattern_pathway = r"(\S+\.protein\.enrichment.terms\.[\w.]+)"
+    match_proteins = re.findall(pattern_proteins, response_string.text)
+    match_pathway = re.findall(pattern_pathway, response_string.text)
+    result_protein = match_proteins[0].split("href=")[1].replace('"', "")
+    result_pathway = match_pathway[0].split("href=")[1].replace('"', "")
     match = re.search(
         rf'{species.capitalize()}_GO_AllPathways_with_GO_iea_[A-Z][a-z]+_\d{{2}}_\d{{4}}_symbol\.gmt(?=">)',
-        response.text,
+        response_bader.text,
     )
     result = match.group()
-    return url + result
+    return [url_bader + result, result_protein, result_pathway]
 
 
 def download_data(species):
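
`get_url` now returns three URLs: the Bader .gmt file plus the STRING protein-info and enrichment-terms archives, both scraped out of the STRING download page with regexes. A toy fragment (the href is made up) showing how the pattern plus the `href=` split recovers a clean URL:

```python
import re

# Made-up HTML fragment in the shape the regex expects.
html = '<a href="https://stringdb-downloads.org/10090.protein.info.v12.0.txt.gz">protein info</a>'
pattern_proteins = r"(\S+\.protein\.info\.[\w.]+)"
match = re.findall(pattern_proteins, html)[0]   # 'href="https://...txt.gz'
url = match.split("href=")[1].replace('"', "")  # strip the attribute prefix and quotes
print(url)  # https://stringdb-downloads.org/10090.protein.info.v12.0.txt.gz
```
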
@@ -51,14 +64,22 @@ def download_data(species):
     Arguments:
         species: the species which data we want to download
     """
-    url = get_url(species)
-    if url == 0:
+    urls = get_url(species)
+    url_bader = urls[0]
+    string = [urls[1], urls[2]]
+    if url_bader == 0:
         return 0
 
-    response = requests.get(url)
+    # For string download
+    response_protein = requests.get(string[0])
+    response_pathway = requests.get(string[1])
+    with open(f"data/proteins_string_{species}.txt.gz", "wb") as file:
+        file.write(response_protein.content)
+    with open(f"data/pathways_string_{species}.txt.gz", "wb") as file:
+        file.write(response_pathway.content)
+    response = requests.get(url_bader)
     # Handle 404 error (Case: database updated for human but mouse not yet available)
     if response.status_code == 404:
-        print(f"The URL {url} returned a 404 error")
+        print(f"The URL {url_bader} returned a 404 error")
         return 0
     if len(response.text) == 0:
         print("Empty gmt file, come back later")
@@ -95,6 +116,58 @@ def genes_to_proteins(genes, species):
     return gene_mapping
 
 
+def format_string_data(species):
+    """
+    Format the data acquired from String to the correct format
+    Argument:
+        species: species of interest
+    """
+    df_proteins = pd.read_csv(f"data/proteins_string_{species}.txt.gz", compression="gzip", sep="\t")
+    df_pathways = pd.read_csv(f"data/pathways_string_{species}.txt.gz", compression="gzip", sep="\t")
+    df_pathways = df_pathways[df_pathways["category"] == "Reactome Pathways"]
+    df_pathways = (
+        df_pathways.groupby("term")
+        .agg(
+            {
+                "#string_protein_id": list,  # convert string_protein_id to list
+                "category": "first",  # keep the first category encountered
+                "term": "first",
+                "description": "first",  # keep the first description encountered
+            }
+        )
+        .reset_index(drop=True)
+    )
+    protein_dict = df_proteins.set_index("#string_protein_id")["preferred_name"].to_dict()
+    df_pathways["#string_protein_id"] = df_pathways["#string_protein_id"].apply(
+        lambda ids: [protein_dict.get(id, id) for id in ids]
+    )
+    all_symbols = set()
+    for symbol_list in df_pathways["#string_protein_id"]:
+        all_symbols.update(symbol_list)
+    unique_symbols_list = list(all_symbols)
+    gene_mapping, genes_to_map = symbols_to_ensembl(unique_symbols_list, f"{species}", "gene")
+    gene_lis = []
+    for i in df_pathways["#string_protein_id"]:
+        genes = []
+        if i:
+            for j in i:
+                if j in gene_mapping:
+                    g = gene_mapping[j]
+                    if isinstance(g, list):
+                        for k in g:
+                            genes.append(k)
+                    else:
+                        genes.append(gene_mapping[j])
+        gene_lis.append(genes)
+    df_pathways["genes"] = gene_lis
+    df_pathways = df_pathways.rename(columns={"term": "id", "#string_protein_id": "symbols", "description": "name"})
+    df_pathways = df_pathways[["id", "name", "category", "symbols", "genes"]]
+    df_pathways.to_csv(f"data/reactome_pathways_{species}.csv", index=False)
+    df_proteins.to_csv(f"data/string_proteins_{species}.csv", index=False)
+    return
+
+
 def read_data(species, file_name):
     """
     Reads the data from the specified file.
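
The heart of the new `format_string_data` is the groupby/agg that collapses STRING's one-row-per-(term, protein) table into one row per term with the protein ids gathered into a list. A self-contained toy version with made-up ids:

```python
import pandas as pd

df = pd.DataFrame({
    "term": ["R-MMU-1", "R-MMU-1", "R-MMU-2"],
    "#string_protein_id": ["10090.P1", "10090.P2", "10090.P3"],
    "description": ["Pathway one", "Pathway one", "Pathway two"],
})
grouped = df.groupby("term", as_index=False).agg(
    {"#string_protein_id": list, "description": "first"}
)
print(grouped["#string_protein_id"].tolist())  # [['10090.P1', '10090.P2'], ['10090.P3']]
```
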
@@ -110,13 +183,18 @@ def read_data(species, file_name):
         for line in f:
             fields = line.strip().split("\t")
             name = fields[0].split("%")
-            source = name[1]
-            ids = name[2]
-            descr = fields[1]
-            symbols = fields[2:]
-            data.append([ids, descr, source, symbols])
-            symbol.append(symbols)
-            unique_symbols.update(symbols)
+            if len(name) >= 2:
+                source = name[1]
+                ids = name[2]
+                descr = fields[1]
+                symbols = fields[2:]
+                # Exclude lines where source starts with "REACTOME"
+                if not source.startswith("REACTOME"):
+                    data.append([ids, descr, source, symbols])
+                    symbol.append(symbols)
+                    unique_symbols.update(symbols)
+            else:
+                pass
 
     unique_symbols = list(unique_symbols)
     gene_mapping, genes_to_map = symbols_to_ensembl(unique_symbols, f"{species}", "gene")
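
Bader .gmt lines are tab-separated: a `name%SOURCE%id` token, a description, then the member symbols. The new branch also drops REACTOME-sourced sets, since Reactome pathways now come from STRING instead. A toy line run through the same parsing (note that reading `name[2]` actually requires three `%`-separated fields):

```python
line = "APOPTOSIS%REACTOME%R-HSA-109581\tApoptosis\tCASP3\tCASP8"
fields = line.strip().split("\t")
name = fields[0].split("%")
if len(name) >= 2 and not name[1].startswith("REACTOME"):
    ids, descr, symbols = name[2], fields[1], fields[2:]
    print(ids, descr, symbols)
else:
    print("skipped")  # malformed header or a Reactome set
```
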
@@ -206,8 +284,8 @@ def data_formatting(species, folder):
     df = pd.read_csv(f"data/bader_{species}.csv.gz", compression="gzip")
     # Read the KEGG data
     kegg_df = read_kegg_data(species.lower())
-
-    merged_df = pd.concat([df, kegg_df], ignore_index=True)
+    reactome = pd.read_csv(f"data/reactome_pathways_{species}.csv")
+    merged_df = pd.concat([df, kegg_df, reactome], ignore_index=True)
     merged_df = merged_df.drop_duplicates(subset=["name", "category"])
     merged_df = merged_df.loc[merged_df["genes"].str.len() > 2]
     merged_df["id"] = merged_df.apply(lambda row: f"{row['id']}~{row['category']}", axis=1)
@@ -271,11 +349,15 @@ def main():
         if download_data("mouse") == 0:
             print("Mouse file not available on the server yet")
             return
+        download_data("mouse")
+        format_string_data("mouse")
         print("Pathway download succesfull for mouse")
         print("Downloading Pathway data for human")
         if download_data("human") == 0:
             print("Human file not available on the server yet")
             return
+        download_data("human")
+        format_string_data("human")
         print("Pathway download succesfull for human")
         util.update_line(filepath, gene_pattern, geneset_name)
     if kegg_update:
8 changes: 4 additions & 4 deletions backend/src/queries.py
@@ -51,8 +51,8 @@ def get_protein_ids_for_names(driver: neo4j.Driver, names: list[str], species_id
     result_names = list(genes_set - aliases_set) + list(symbols_set - genes_set)
     query = f"""
         MATCH (protein:Protein:{species})
-        WHERE protein.SYMBOL IN {str([n.title() for n in result_names])}
-        OR protein.ENSEMBL_PROTEIN IN {str([n.title() for n in result_names])}
+        WHERE protein.SYMBOL IN {str([n.capitalize() for n in result_names])}
+        OR protein.ENSEMBL_PROTEIN IN {str([n.capitalize() for n in result_names])}
         RETURN protein, protein.ENSEMBL_PROTEIN AS id
     """
     with driver.session() as session:
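
The switch from `str.title()` to `str.capitalize()` matters for gene symbols containing digits: `title()` restarts capitalization after every non-letter character, mangling the name, while `capitalize()` uppercases only the first character, which matches how SYMBOL values are cased in the graph. A quick demonstration:

```python
print("p2rx7".title())       # 'P2Rx7' -- title() re-uppercases after the digit
print("p2rx7".capitalize())  # 'P2rx7' -- only the first character is uppercased
```
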
@@ -161,7 +161,7 @@ def _convert_to_symbol_alias(result: neo4j.Result) -> (set[str], set[str]):
             aliases.add(alias)
             # Only add the (symbol: alias) if the symbol isnt there already
             if row["symbol"] not in mapping:
-                mapping[symbol.title()] = alias.title()
+                mapping[symbol.capitalize()] = alias.capitalize()
     return symbols, aliases, mapping


Expand All @@ -184,4 +184,4 @@ def _convert_to_connection_info_score(
else:
score.append(float(row["score"]))

return nodes, source, target, score
return nodes, source, target, score
6 changes: 5 additions & 1 deletion backend/src/util/data_util.py
@@ -31,7 +31,11 @@ def parse_drug_line(line):
     return drug_id, drug_name
 
 def parse_gene_line(line):
-    gene_id, gene_names = line.strip().split(" ")
+    parts = line.strip().split(" ")
+    if len(parts) >= 2:
+        gene_id, gene_names = parts
+    else:
+        return
     if ";" in gene_names:  # Mutliple names
         names = list(map(lambda string: string.strip(), gene_names.split(";")))
         short_name, long_name = names[0], "; ".join(names[1:])
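
`parse_gene_line` previously raised `ValueError` on any line without exactly one space; the guard now returns `None` for malformed lines instead. A quick check of that behavior with illustrative lines (note the tuple unpack still assumes exactly two fields, so `split(" ", 1)` would be the stricter fix):

```python
def parse(line):
    parts = line.strip().split(" ")
    if len(parts) >= 2:
        gene_id, gene_names = parts  # still raises if a line has >2 space-separated fields
        return gene_id, gene_names
    return None

print(parse("DB00001 Abc1"))  # ('DB00001', 'Abc1')
print(parse("malformed"))     # None
```
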
