Merge pull request #55 from BackofenLab/workMalek
Work malek
Maluuck authored Jan 7, 2024
2 parents 772dd56 + ed4301b commit 78a629e
Showing 8 changed files with 122 additions and 34 deletions.
8 changes: 4 additions & 4 deletions Requirements.mk
@@ -9,7 +9,7 @@ help:
 
 .NOTPARALLEL:
 
-all: prepare conda node java maven neo4j apoc
+all: prepare conda node java maven neo4j apoc dummydata
 
 prepare:
 	@echo -n "Are you sure? [y/N] " && read ans && [ $${ans:-N} = y ]
@@ -66,6 +66,6 @@ dummydata:
 	# explanation: https://stackoverflow.com/questions/25010369/wget-curl-large-file-from-google-drive
 	# docs: https://pypi.org/project/gdown/
 	pip install gdown
-	cd $$HOME/Downloads && gdown 1BfpXGdwcdmt8zh6K8MjrQf360ZaOpY_A
-	sudo neo4j-admin load --from=$$HOME/Downloads/newmouse2db.dump --database=neo4j --force
-	rm $$HOME/Downloads/newmouse2db.dump
+	cd $$HOME/Downloads && gdown 15yt-hNmCI1WODWvslrXwOdeSGUAT-e9F -O latest.dump
+	sudo neo4j-admin load --from=$$HOME/Downloads/latest.dump --database=neo4j --force
+	rm $$HOME/Downloads/latest.dump
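
The updated target fetches a single consolidated dump named latest.dump instead of the old newmouse2db.dump. For reference, the same download step via gdown's Python API rather than the CLI — a minimal sketch; the output path is illustrative, and the restore command is unchanged:

```python
# Sketch: download the Neo4j dump with gdown's Python API (pip install gdown).
# The file id is the one from the Makefile; the output path is an assumption.
import gdown

gdown.download(id="15yt-hNmCI1WODWvslrXwOdeSGUAT-e9F", output="latest.dump", quiet=False)
# Restore afterwards with:
#   sudo neo4j-admin load --from=latest.dump --database=neo4j --force
```
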
2 changes: 1 addition & 1 deletion backend/src/enrichment.py
@@ -135,4 +135,4 @@ def functional_enrichment(driver: neo4j.Driver, in_genes, species_id: Any):
 
     stopwatch.round("fdr_enrichment")
     stopwatch.total("functional_enrichment")
-    return df_terms
\ No newline at end of file
+    return df_terms
5 changes: 3 additions & 2 deletions backend/src/enrichment_graph.py
@@ -27,8 +27,9 @@ def get_functional_graph(list_enrichment, species_id):
     terms, source, target, score = queries.get_terms_connected_by_overlap(driver, list_term, species_id)
 
     stopwatch.round("Neo4j")
-
-    if len(terms) == 0: return
+
+    if len(terms) == 0:
+        return
 
     nodes = pd.DataFrame(terms).rename(columns={"Term": "external_id"}).drop_duplicates(subset="external_id")
 
2 changes: 1 addition & 1 deletion backend/src/main.py
@@ -252,4 +252,4 @@ def run_flask():
 if __name__ == "__main__":
     signal.signal(signal.SIGINT, signal_handler)
     flask_process = Process(target=run_flask)
-    flask_process.start()
\ No newline at end of file
+    flask_process.start()
3 changes: 2 additions & 1 deletion backend/src/pathway_data/kegg.py
@@ -182,7 +182,8 @@ def scrapping(path, species):
         if has_genes:
             pathway_gene_symbols = []
             for i in pathway_genes:
-                pathway_gene_symbols.append(i[1])
+                if i:
+                    pathway_gene_symbols.append(i[1])
             kegg2external_genes = symbols_to_ensemble(pathway_gene_symbols, species, "gene")
         # Diseases
         has_diseases = pathway_diseases is not None
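
The new `if i:` guard skips falsy entries (`None`, empty tuples) that previously crashed the `i[1]` lookup. A minimal sketch of the same filtering, with made-up KEGG entries:

```python
# Hypothetical (kegg_id, symbol) entries; some scraped entries can be falsy.
pathway_genes = [("mmu:11287", "Pzp"), None, ("mmu:11298", "Aanat"), ()]

# Equivalent to the loop with the new guard, as a comprehension.
pathway_gene_symbols = [entry[1] for entry in pathway_genes if entry]
assert pathway_gene_symbols == ["Pzp", "Aanat"]
```
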
122 changes: 102 additions & 20 deletions backend/src/pathway_data/pathway_data.py
@@ -30,18 +30,31 @@ def get_url(species):
     Arguments:
         species: the species which data we want to download
+        string: whether to download from string or not
     """
-    url = f"http://download.baderlab.org/EM_Genesets/current_release/{species.capitalize()}/symbol/"
-    response = requests.get(url)
-    if response.status_code == 404:
-        print(f"The URL {url} returned a 404 error")
+    if species == "mouse":
+        string_species = "mus+musculus"
+    else:
+        string_species = "homo+sapiens"
+    url_string = f"https://string-db.org/cgi/download?sessionId=bthCAcyLVvFS&species_text={string_species}"
+    url_bader = f"http://download.baderlab.org/EM_Genesets/current_release/{species.capitalize()}/symbol/"
+    response_string = requests.get(url_string)
+    response_bader = requests.get(url_bader)
+    if response_bader.status_code == 404:
+        print(f"The URL {url_bader} returned a 404 error")
         return 0
+    pattern_proteins = r"(\S+\.protein\.info\.[\w.]+)"
+    pattern_pathway = r"(\S+\.protein\.enrichment.terms\.[\w.]+)"
+    match_proteins = re.findall(pattern_proteins, response_string.text)
+    match_pathway = re.findall(pattern_pathway, response_string.text)
+    result_protein = match_proteins[0].split("href=")[1].replace('"', "")
+    result_pathway = match_pathway[0].split("href=")[1].replace('"', "")
     match = re.search(
         rf'{species.capitalize()}_GO_AllPathways_with_GO_iea_[A-Z][a-z]+_\d{{2}}_\d{{4}}_symbol\.gmt(?=">)',
-        response.text,
+        response_bader.text,
     )
     result = match.group()
-    return url + result
+    return [url_bader + result, result_protein, result_pathway]
 
 
 def download_data(species):
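
`get_url` now returns three URLs: the Bader .gmt file plus the STRING protein-info and enrichment-terms archives, both scraped out of the STRING download page with regexes. A toy fragment (the href is made up) showing how the pattern plus the `href=` split recovers a clean URL:

```python
import re

# Made-up HTML fragment in the shape the regex expects.
html = '<a href="https://stringdb-downloads.org/10090.protein.info.v12.0.txt.gz">protein info</a>'
pattern_proteins = r"(\S+\.protein\.info\.[\w.]+)"
match = re.findall(pattern_proteins, html)[0]   # 'href="https://...txt.gz'
url = match.split("href=")[1].replace('"', "")  # strip the attribute prefix and quotes
print(url)  # https://stringdb-downloads.org/10090.protein.info.v12.0.txt.gz
```
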
@@ -51,14 +64,22 @@ def download_data(species):
     Arguments:
         species: the species which data we want to download
     """
-    url = get_url(species)
-    if url == 0:
+    urls = get_url(species)
+    url_bader = urls[0]
+    string = [urls[1], urls[2]]
+    if url_bader == 0:
         return 0
 
-    response = requests.get(url)
+    # For string download
+    response_protein = requests.get(string[0])
+    response_pathway = requests.get(string[1])
+    with open(f"data/proteins_string_{species}.txt.gz", "wb") as file:
+        file.write(response_protein.content)
+    with open(f"data/pathways_string_{species}.txt.gz", "wb") as file:
+        file.write(response_pathway.content)
+    response = requests.get(url_bader)
     # Handle 404 error (Case: database updated for human but mouse not yet available)
     if response.status_code == 404:
-        print(f"The URL {url} returned a 404 error")
+        print(f"The URL {url_bader} returned a 404 error")
         return 0
     if len(response.text) == 0:
         print("Empty gmt file, come back later")
@@ -95,6 +116,58 @@ def genes_to_proteins(genes, species):
     return gene_mapping
 
 
+def format_string_data(species):
+    """
+    Format the data acquired from String to the correct format
+    Argument:
+        species: species of interest
+    """
+    df_proteins = pd.read_csv(f"data/proteins_string_{species}.txt.gz", compression="gzip", sep="\t")
+    df_pathways = pd.read_csv(f"data/pathways_string_{species}.txt.gz", compression="gzip", sep="\t")
+    df_pathways = df_pathways[df_pathways["category"] == "Reactome Pathways"]
+    df_pathways = (
+        df_pathways.groupby("term")
+        .agg(
+            {
+                "#string_protein_id": list,  # convert string_protein_id to list
+                "category": "first",  # keep the first category encountered
+                "term": "first",
+                "description": "first",  # keep the first description encountered
+            }
+        )
+        .reset_index(drop=True)
+    )
+    protein_dict = df_proteins.set_index("#string_protein_id")["preferred_name"].to_dict()
+    df_pathways["#string_protein_id"] = df_pathways["#string_protein_id"].apply(
+        lambda ids: [protein_dict.get(id, id) for id in ids]
+    )
+    all_symbols = set()
+    for symbol_list in df_pathways["#string_protein_id"]:
+        all_symbols.update(symbol_list)
+    unique_symbols_list = list(all_symbols)
+    gene_mapping, genes_to_map = symbols_to_ensembl(unique_symbols_list, f"{species}", "gene")
+    gene_lis = []
+    for i in df_pathways["#string_protein_id"]:
+        genes = []
+        if i:
+            for j in i:
+                if j in gene_mapping:
+                    g = gene_mapping[j]
+                    if isinstance(g, list):
+                        for k in g:
+                            genes.append(k)
+                    else:
+                        genes.append(gene_mapping[j])
+        gene_lis.append(genes)
+    df_pathways["genes"] = gene_lis
+    df_pathways = df_pathways.rename(columns={"term": "id", "#string_protein_id": "symbols", "description": "name"})
+    df_pathways = df_pathways[["id", "name", "category", "symbols", "genes"]]
+    df_pathways.to_csv(f"data/reactome_pathways_{species}.csv", index=False)
+    df_proteins.to_csv(f"data/string_proteins_{species}.csv", index=False)
+    return
+
+
 def read_data(species, file_name):
     """
     Reads the data from the specified file.
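
The heart of the new `format_string_data` is the groupby/agg that collapses STRING's one-row-per-(term, protein) table into one row per term with the protein ids gathered into a list. A self-contained toy version with made-up ids:

```python
import pandas as pd

df = pd.DataFrame({
    "term": ["R-MMU-1", "R-MMU-1", "R-MMU-2"],
    "#string_protein_id": ["10090.P1", "10090.P2", "10090.P3"],
    "description": ["Pathway one", "Pathway one", "Pathway two"],
})
grouped = df.groupby("term", as_index=False).agg(
    {"#string_protein_id": list, "description": "first"}
)
print(grouped["#string_protein_id"].tolist())  # [['10090.P1', '10090.P2'], ['10090.P3']]
```
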
@@ -110,13 +183,18 @@ def read_data(species, file_name):
         for line in f:
             fields = line.strip().split("\t")
             name = fields[0].split("%")
-            source = name[1]
-            ids = name[2]
-            descr = fields[1]
-            symbols = fields[2:]
-            data.append([ids, descr, source, symbols])
-            symbol.append(symbols)
-            unique_symbols.update(symbols)
+            if len(name) >= 2:
+                source = name[1]
+                ids = name[2]
+                descr = fields[1]
+                symbols = fields[2:]
+                # Exclude lines where source starts with "REACTOME"
+                if not source.startswith("REACTOME"):
+                    data.append([ids, descr, source, symbols])
+                    symbol.append(symbols)
+                    unique_symbols.update(symbols)
+            else:
+                pass
 
     unique_symbols = list(unique_symbols)
     gene_mapping, genes_to_map = symbols_to_ensembl(unique_symbols, f"{species}", "gene")
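
Bader .gmt lines are tab-separated: a `name%SOURCE%id` token, a description, then the member symbols. The new branch also drops REACTOME-sourced sets, since Reactome pathways now come from STRING instead. A toy line run through the same parsing (note that reading `name[2]` actually requires three `%`-separated fields):

```python
line = "APOPTOSIS%REACTOME%R-HSA-109581\tApoptosis\tCASP3\tCASP8"
fields = line.strip().split("\t")
name = fields[0].split("%")
if len(name) >= 2 and not name[1].startswith("REACTOME"):
    ids, descr, symbols = name[2], fields[1], fields[2:]
    print(ids, descr, symbols)
else:
    print("skipped")  # malformed header or a Reactome set
```
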
@@ -206,8 +284,8 @@ def data_formatting(species, folder):
     df = pd.read_csv(f"data/bader_{species}.csv.gz", compression="gzip")
     # Read the KEGG data
     kegg_df = read_kegg_data(species.lower())
-
-    merged_df = pd.concat([df, kegg_df], ignore_index=True)
+    reactome = pd.read_csv(f"data/reactome_pathways_{species}.csv")
+    merged_df = pd.concat([df, kegg_df, reactome], ignore_index=True)
     merged_df = merged_df.drop_duplicates(subset=["name", "category"])
     merged_df = merged_df.loc[merged_df["genes"].str.len() > 2]
     merged_df["id"] = merged_df.apply(lambda row: f"{row['id']}~{row['category']}", axis=1)
@@ -271,11 +349,15 @@ def main():
         if download_data("mouse") == 0:
             print("Mouse file not available on the server yet")
             return
+        download_data("mouse")
+        format_string_data("mouse")
         print("Pathway download succesfull for mouse")
         print("Downloading Pathway data for human")
         if download_data("human") == 0:
             print("Human file not available on the server yet")
             return
+        download_data("human")
+        format_string_data("human")
         print("Pathway download succesfull for human")
         util.update_line(filepath, gene_pattern, geneset_name)
     if kegg_update:
8 changes: 4 additions & 4 deletions backend/src/queries.py
@@ -51,8 +51,8 @@ def get_protein_ids_for_names(driver: neo4j.Driver, names: list[str], species_id
     result_names = list(genes_set - aliases_set) + list(symbols_set - genes_set)
     query = f"""
         MATCH (protein:Protein:{species})
-        WHERE protein.SYMBOL IN {str([n.title() for n in result_names])}
-        OR protein.ENSEMBL_PROTEIN IN {str([n.title() for n in result_names])}
+        WHERE protein.SYMBOL IN {str([n.capitalize() for n in result_names])}
+        OR protein.ENSEMBL_PROTEIN IN {str([n.capitalize() for n in result_names])}
         RETURN protein, protein.ENSEMBL_PROTEIN AS id
     """
     with driver.session() as session:
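
The switch from `str.title()` to `str.capitalize()` matters for gene symbols containing digits: `title()` restarts capitalization after every non-letter character, mangling the name, while `capitalize()` uppercases only the first character, which matches how SYMBOL values are cased in the graph. A quick demonstration:

```python
print("p2rx7".title())       # 'P2Rx7' -- title() re-uppercases after the digit
print("p2rx7".capitalize())  # 'P2rx7' -- only the first character is uppercased
```
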
@@ -161,7 +161,7 @@ def _convert_to_symbol_alias(result: neo4j.Result) -> (set[str], set[str]):
             aliases.add(alias)
             # Only add the (symbol: alias) if the symbol isnt there already
             if row["symbol"] not in mapping:
-                mapping[symbol.title()] = alias.title()
+                mapping[symbol.capitalize()] = alias.capitalize()
     return symbols, aliases, mapping


Expand All @@ -184,4 +184,4 @@ def _convert_to_connection_info_score(
else:
score.append(float(row["score"]))

return nodes, source, target, score
return nodes, source, target, score
6 changes: 5 additions & 1 deletion backend/src/util/data_util.py
@@ -31,7 +31,11 @@ def parse_drug_line(line):
     return drug_id, drug_name
 
 def parse_gene_line(line):
-    gene_id, gene_names = line.strip().split(" ")
+    parts = line.strip().split(" ")
+    if len(parts) >= 2:
+        gene_id, gene_names = parts
+    else:
+        return
     if ";" in gene_names:  # Mutliple names
         names = list(map(lambda string: string.strip(), gene_names.split(";")))
         short_name, long_name = names[0], "; ".join(names[1:])
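
`parse_gene_line` previously raised `ValueError` on any line without exactly one space; the guard now returns `None` for malformed lines instead. A quick check of that behavior with illustrative lines (note the tuple unpack still assumes exactly two fields, so `split(" ", 1)` would be the stricter fix):

```python
def parse(line):
    parts = line.strip().split(" ")
    if len(parts) >= 2:
        gene_id, gene_names = parts  # still raises if a line has >2 space-separated fields
        return gene_id, gene_names
    return None

print(parse("DB00001 Abc1"))  # ('DB00001', 'Abc1')
print(parse("malformed"))     # None
```
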
