Skip to content

Commit

Permalink
Added example queries, meancount-> context
Browse files Browse the repository at this point in the history
  • Loading branch information
kataikko committed Oct 1, 2023
1 parent e3d5345 commit 5c63213
Show file tree
Hide file tree
Showing 6 changed files with 165 additions and 52 deletions.
8 changes: 4 additions & 4 deletions db/scripts/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@

os.environ["_TIME_FUNCTIONS"] = str(True)
os.environ["_SILENT"] = str(False)
os.environ["_PRODUCTION"] = str(True)
os.environ["_UPDATE_NEO4J"] = str(True)
os.environ["_PRODUCTION"] = str(False)
os.environ["_ACCESS_NEO4J"] = str(True)


@time_function
Expand Down Expand Up @@ -176,5 +176,5 @@ def upload_workflow():


if __name__ == "__main__":
upload_workflow()
# run_queries()
# upload_workflow()
run_queries()
60 changes: 36 additions & 24 deletions db/scripts/querier.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,48 @@
import pandas as pd
from utils import start_driver, stop_driver
from query.query_functions import (
get_tg_ensembl_by_symbol,
get_or_by_da_under_contexts,
get_or_by_distance_to_tg,
get_or_by_motif_to_tf,
get_tg_by_correlation_tf,
get_tg_by_de_under_contexts,
get_tg_by_link_ft,
get_tf_correlated_tg,
query_1,
query_2,
query_3,
query_4,
query_5,
query_6,
query_7,
query_8
)


def run_queries():
    """Run the example queries against progressively larger input subsets.

    Identifier lists are loaded from the processed CSV files, and each
    query is called with the first ``i`` identifiers of its list. ``i`` is
    passed along as a keyword so that the timing decorator on the query
    functions can record it next to the measured runtime.

    The driver is always stopped, even if one of the queries raises.
    """
    driver = start_driver()

    try:
        open_regions = list(pd.read_csv("../source/processed/or_extended.csv")["id"])
        target_genes = list(pd.read_csv("../source/processed/tg.csv")["ENSEMBL"])
        transcription_factor = list(pd.read_csv("../source/processed/tf.csv")["ENSEMBL"])
        sources = list(pd.read_csv("../source/processed/sources.csv")["id"])
        celltypes = list(pd.read_csv("../source/processed/celltypes.csv")["name"])

        # Each loop must hand the *slice* to the query; passing the full
        # list would benchmark the same input on every iteration while
        # logging a different subset size.
        for i in range(1, len(transcription_factor), 100):
            tmp = transcription_factor[:i]
            query_1(i=i, list=tmp, threshold=0.5, driver=driver)
            query_2(i=i, list=tmp, threshold=0.5, driver=driver)

        for i in range(1, len(open_regions), 1000):
            tmp = open_regions[:i]
            query_3(i=i, list=tmp, threshold=0.5, driver=driver)

        for i in range(1, len(sources)):
            tmp = sources[:i]
            query_4(i=i, list=tmp, driver=driver)
            query_5(i=i, list=tmp, driver=driver)

        for i in range(1, len(celltypes)):
            tmp = celltypes[:i]
            query_6(i=i, list=tmp, driver=driver)

        for i in range(1, len(target_genes), 100):
            tmp = target_genes[:i]
            query_7(i=i, list=tmp, threshold=0.5, driver=driver)
            query_8(i=i, list=tmp, threshold=0.5, driver=driver)
    finally:
        stop_driver(driver=driver)
100 changes: 96 additions & 4 deletions db/scripts/query/query_functions.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import neo4j
from utils import execute_query
from utils import execute_query, time_function


def get_tg_ensembl_by_symbol(gene_list: list[str], driver: neo4j.Driver):
def get_tg_ensembl_by_symbol(list: list[str], type:str, driver: neo4j.Driver):
query = f"""
MATCH (n:TG)
WHERE n.SYMBOL IN {gene_list}
MATCH (n:{type})
WHERE n.SYMBOL IN {list}
RETURN n.ENSEMBL
"""
result = execute_query(query=query, read=True, driver=driver)
Expand Down Expand Up @@ -89,6 +89,98 @@ def get_or_by_da_under_contexts(
# TODO
return result

@time_function
def query_1(list: list[str], threshold: float, driver: neo4j.Driver):
    """Find TF/TG pairs linked through an open region.

    Matches TF -[MOTIF]-> OR -[CORRELATION]-> TG paths (Mus_Musculus) where
    the TF's ENSEMBL id is in ``list`` and the correlation is at least
    ``threshold``. Returns whatever ``execute_query`` yields for the read.
    """
    # The Python list repr is interpolated directly as the Cypher list literal.
    cypher = (
        "\n"
        "MATCH (n:TF:Mus_Musculus)-[:MOTIF]->(:OR:Mus_Musculus)-[c:CORRELATION]->(m:TG:Mus_Musculus)\n"
        f"WHERE n.ENSEMBL IN {list}\n"
        f"AND c.Correlation >= {threshold}\n"
        "RETURN n.ENSEMBL, m.ENSEMBL\n"
    )
    return execute_query(query=cypher, read=True, driver=driver)

@time_function
def query_2(list: list[str], threshold: float, driver: neo4j.Driver):
    """Find TGs directly correlated with the given TFs.

    Matches TF -[CORRELATION]-> TG edges (Mus_Musculus) where the TF's
    ENSEMBL id is in ``list`` and the correlation is at least ``threshold``.
    Returns whatever ``execute_query`` yields for the read.
    """
    # The Python list repr is interpolated directly as the Cypher list literal.
    cypher = (
        "\n"
        "MATCH (n:TF:Mus_Musculus)-[c:CORRELATION]->(m:TG:Mus_Musculus)\n"
        f"WHERE n.ENSEMBL IN {list}\n"
        f"AND c.Correlation >= {threshold}\n"
        "RETURN n.ENSEMBL, m.ENSEMBL\n"
    )
    return execute_query(query=cypher, read=True, driver=driver)

@time_function
def query_3(list: list[str], threshold: float, driver: neo4j.Driver):
    """Find TGs correlated with the given open regions.

    Matches OR -[CORRELATION]-> TG edges (Mus_Musculus) where the open
    region's ``id`` is in ``list`` and the correlation is at least
    ``threshold``. Returns whatever ``execute_query`` yields for the read.
    """
    # The Python list repr is interpolated directly as the Cypher list literal.
    cypher = (
        "\n"
        "MATCH (n:OR:Mus_Musculus)-[c:CORRELATION]->(m:TG:Mus_Musculus)\n"
        f"WHERE n.id IN {list}\n"
        f"AND c.Correlation >= {threshold}\n"
        "RETURN n.id, m.ENSEMBL\n"
    )
    return execute_query(query=cypher, read=True, driver=driver)

@time_function
def query_4(list: list[str], driver: neo4j.Driver):
    """Fetch TG values for every context of the given sources.

    Matches Source -[HAS]-> Context -[VALUE]-> TG paths (Mus_Musculus)
    where the source's ``id`` is in ``list`` and returns source id, context,
    value and TG ENSEMBL id. Returns whatever ``execute_query`` yields.
    """
    # The Python list repr is interpolated directly as the Cypher list literal.
    cypher = (
        "\n"
        "MATCH (s:Source:Mus_Musculus)-[:HAS]->(c:Context:Mus_Musculus)-[v:VALUE]->(m:TG:Mus_Musculus)\n"
        f"WHERE s.id IN {list}\n"
        "RETURN s.id, c.Context, v.Value, m.ENSEMBL\n"
    )
    return execute_query(query=cypher, read=True, driver=driver)

@time_function
def query_5(list: list[str], driver: neo4j.Driver):
    """Fetch open-region values for every context of the given sources.

    Matches Source -[HAS]-> Context -[VALUE]-> OR paths (Mus_Musculus)
    where the source's ``id`` is in ``list`` and returns source id, context,
    value and open-region id. Returns whatever ``execute_query`` yields.
    """
    # The Python list repr is interpolated directly as the Cypher list literal.
    cypher = (
        "\n"
        "MATCH (s:Source:Mus_Musculus)-[:HAS]->(c:Context:Mus_Musculus)-[v:VALUE]->(m:OR:Mus_Musculus)\n"
        f"WHERE s.id IN {list}\n"
        "RETURN s.id, c.Context, v.Value, m.id\n"
    )
    return execute_query(query=cypher, read=True, driver=driver)

@time_function
def query_6(list: list[str], driver: neo4j.Driver):
    """Fetch open-region values for the contexts reachable from cell types.

    Matches Celltype -[IS|HAS]-> Source -[HAS]-> Context -[VALUE]-> OR
    paths (Mus_Musculus) where the cell type's ``name`` is in ``list`` and
    returns context, value and open-region id. Returns whatever
    ``execute_query`` yields for the read.
    """
    # Two fixes relative to the previous pattern:
    #  * the hop count must follow the relationship types (`[:IS|HAS*1]`);
    #    `[:1*IS|HAS]` is rejected by the Cypher parser.
    #  * relationship types are case-sensitive, and the Source->Context edge
    #    is written as `HAS` elsewhere, so `[:Has]` could never match.
    # NOTE(review): `*1` keeps the literal single-hop reading; confirm whether
    # a longer range (e.g. via an intermediate Subtype node) was intended.
    query = f"""
    MATCH (s:Celltype:Mus_Musculus)-[:IS|HAS*1]->(:Source:Mus_Musculus)-[:HAS]->(t:Context:Mus_Musculus)-[v:VALUE]->(m:OR:Mus_Musculus)
        WHERE s.name IN {list}
    RETURN t.Context, v.Value, m.id
    """
    result = execute_query(query=query, read=True, driver=driver)
    return result

@time_function
def query_7(list: list[str], threshold: float, driver: neo4j.Driver):
    """Find TFs correlated with the given target genes.

    Matches TF -[CORRELATION]-> TG edges (Mus_Musculus) where the TG's
    ENSEMBL id is in ``list`` and the correlation is at least ``threshold``.
    Returns whatever ``execute_query`` yields for the read.
    """
    # The Python list repr is interpolated directly as the Cypher list literal.
    cypher = (
        "\n"
        "MATCH (n:TF:Mus_Musculus)-[c:CORRELATION]->(m:TG:Mus_Musculus)\n"
        f"WHERE m.ENSEMBL IN {list}\n"
        f"AND c.Correlation >= {threshold}\n"
        "RETURN n.ENSEMBL, m.ENSEMBL\n"
    )
    return execute_query(query=cypher, read=True, driver=driver)

@time_function
def query_8(list: list[str], threshold: float, driver: neo4j.Driver):
    """Find open regions correlated with the given target genes.

    Matches OR -[CORRELATION]-> TG edges (Mus_Musculus) where the TG's
    ENSEMBL id is in ``list`` and the correlation is at least ``threshold``.
    Returns whatever ``execute_query`` yields for the read.
    """
    # The Python list repr is interpolated directly as the Cypher list literal.
    cypher = (
        "\n"
        "MATCH (n:OR:Mus_Musculus)-[c:CORRELATION]->(m:TG:Mus_Musculus)\n"
        f"WHERE m.ENSEMBL IN {list}\n"
        f"AND c.Correlation >= {threshold}\n"
        "RETURN n.id, m.ENSEMBL\n"
    )
    return execute_query(query=cypher, read=True, driver=driver)


# ---------------------- NOT FOR PRODUCTION ----------------------
# Used by Christina to get TGs correlated with list of TFs
Expand Down
2 changes: 1 addition & 1 deletion db/scripts/upload/upload_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def create_study_cell_source_meancount(species: str, driver: Driver):
create_study_query = "CREATE (s:Study {})".format(study_info_str)
create_celltype_query = "MERGE (c:Celltype:{} {})".format(species, celltype_info_str)
create_source_query = f"MERGE (s)-[:HAS]->(o:Source:{species})<-[:HAS]-(c) SET o.id = id(o)"
create_meancount = f"MERGE (m:MeanCount:{species})"
create_meancount = f"MERGE (m:Context:MeanCount:{species}{{'Context': 'Meancount'}})"
create_source_meancount_edge = "MERGE (o)-[:HAS]->(m)"
return_id = "RETURN id(o) AS id"

Expand Down
21 changes: 14 additions & 7 deletions db/scripts/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,16 @@ def stop_driver(driver: neo4j.Driver):


def execute_query(query: str, read: bool, driver: neo4j.Driver) -> "neo4j.ResultSummary | list":
    """Run a Cypher statement, or print it when database access is disabled.

    Args:
        query: Cypher text to execute.
        read: If true, the result is consumed and only its summary is
            returned; otherwise the result rows are returned via
            ``Result.values()``.
        driver: Open Neo4j driver used to create the session.

    Returns:
        * ``result.consume()`` when ``read`` is true,
        * ``result.values()`` when ``read`` is false,
        * ``[[0]]`` when the ``_ACCESS_NEO4J`` environment variable is not
          ``"True"`` (the query is printed instead of executed).

    NOTE(review): for read queries the records themselves are discarded by
    ``consume()``; confirm this is intended beyond benchmarking.
    """
    if os.getenv("_ACCESS_NEO4J") != str(True):
        # Dry run: show the statement and return a placeholder row.
        print(query)
        return [[0]]

    with driver.session() as session:
        result = session.run(query)
        # Both calls must happen while the session is still open.
        if read:
            return result.consume()
        return result.values()


Expand All @@ -59,14 +63,17 @@ def save_df_to_csv(file_name: str, df: pd.DataFrame, override_prod: bool = False


def time_function(function):
    """Decorator that measures a function's wall-clock runtime.

    The wrapped function must be called with keyword arguments only. An
    optional ``i`` keyword is consumed by the wrapper (it is *not* forwarded
    to ``function``) and, when given, is written as the first column of the
    timing row.

    When the ``_TIME_FUNCTIONS`` environment variable equals ``"True"``, a
    tab-separated row ``[i,] <function name>, <seconds>`` is appended to the
    file named by ``_FUNCTION_TIME_PATH``.

    Returns:
        The wrapper, which returns whatever ``function`` returns.
    """
    import functools

    # wraps() keeps the decorated function's __name__/__doc__ instead of
    # exposing every decorated function as "timing".
    @functools.wraps(function)
    def timing(i=None, **variables):
        start_time = time()
        result = function(**variables)
        elapsed = time() - start_time

        if os.getenv("_TIME_FUNCTIONS") == str(True):
            row = [function.__name__, elapsed]
            if i is not None:
                row.insert(0, i)
            with open(os.getenv("_FUNCTION_TIME_PATH"), "a", newline="\n") as csvfile:
                csv.writer(csvfile, delimiter="\t").writerow(row)
        return result

    return timing
Expand Down
26 changes: 14 additions & 12 deletions docs/GraphModel.md
Original file line number Diff line number Diff line change
Expand Up @@ -130,43 +130,45 @@
### Nodes:
| Type | old (Mouse) | new (Mouse)| new (Human) |
| --- | --- | --- | --- |
| Terms / (in new DB: FT) | 24.170 | 28.742 | 29.773 |
| Terms / (in new DB: FT) | 24.170 | 28.715 | 29.794 |
| Proteins | 22.048 | 67.249 | 123.031 |
| Target Genes (TG) | 0 | 22.478 | 23.900 |
| Transcription Factors (TF, are also TGs) | 0 | 2.894 | 3.529 |
| Open Regions (OR) | 0 | 1.431.123 | - |
| Context | 0 | 23 | - |
| Context | 0 | 24 | - |
| Celltype | 0 | 16 | - |
| Subtype | 0 | 209 | - |
| MeanCount | 0 | 1 | - |
| Source | 0 | 216 | - |
| Study | 0 | 2 | - |
| Total | 46.218 | 1.550.059 | 176.704 |
| Total | 46.218 | 1.550.032 | 176.725 |

#### Total Nodes in new DB: 1.726.763
#### Total Nodes in new DB: 1.726.757

### Edges:
| Type | old (Mouse) | new (Mouse) | new (Human) |
| --- | --- | --- | --- |
| ASSOCIATION / (in new DB: STRING) | 7.248.179 | 6.342.177 | 6.857.702 |
| CORRELATION (TG, TF) | 0 | 1.739.921 | 0 |
| CORRELATION (TG, OR) | 0 | 4.066.833 | 0 |
| VALUE (OR) | 0 | 29.480.276 | 0 |
| VALUE (TG) | 0 | 50.135 | 0 |
| VALUE (Timeframe, OR) | 0 | 533.220 | 0 |
| VALUE (Location, OR) | 0 | 28.947.056 | 0 |
| VALUE (MeanCount, OR) | 0 | 106.644 | 0 |
| VALUE (Timeframe, TG) | 0 | 50.135 | 0 |
| VALUE (MeanCount, TG) | 0 | 10.027 | 0 |
| DISTANCE | 0 | 1.286.065 (with Dummies) | 0 |
| KAPPA | 81.676 | 0 (to be deprecated) | 0 |
| LINK (Protein, FT) | 0 | 7.274.921 | 14.174.256 |
| LINK (Gene, FT) | 0 | 2.100.868 | 2.108.637 |
| LINK (Protein, FT) | 0 | 7.276.727 | 14.253.772 |
| LINK (Gene, FT) | 0 | 2.104.222 | 2.112.933 |
| MEANCOUNT (TG) | 0 | 10.027 | 0 |
| MEANCOUNT (OR) | 0 | 106.644 | 0 |
| MOTIF | 0 | 34.553.249 (with Dummies) | 0 |
| OVERLAP | 0 | 6.762.321 | 6.925.609 |
| OVERLAP | 0 | 4.584.738 | 4.754.455 |
| PRODUCT | 0 | 66.953 | 122.614 |
| HAS | 0 | 655 | - |
| IS | 0 | 209 | - |
| Total | 7.329.855 | 93.841.254 | 30.188.818 |
| Total | 7.329.855 | 91.668.831 | 28.101.476 |

#### Total Edges in new DB: 124.030.072
#### Total Edges in new DB: 119.770.307

## Notes

Expand Down

0 comments on commit 5c63213

Please sign in to comment.