-
Notifications
You must be signed in to change notification settings - Fork 0
/
script_ModelNodesOnly.py
75 lines (59 loc) · 3.55 KB
/
script_ModelNodesOnly.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import pickle
from arboreto.algo import grnboost2
from arboreto.utils import load_tf_names
from pyscenic.utils import modules_from_adjacencies
from pyscenic.prune import prune2df, df2regulons
from pyscenic.aucell import aucell, derive_auc_threshold
from ctxcore.rnkdb import FeatherRankingDatabase as RankingDatabase
import pandas as pd
import os
import seaborn as sns
import matplotlib as plt
from datetime import datetime
if __name__ == '__main__':
ex_matrix = pd.read_csv("./data/expression_mat.csv", index_col=0) # load count matrix
nodes = ["en", "ind", "vnd", "ac", "Dr", "gsb", "Wg", "svp", 'Hh','hkb', 'Gro', 'Pan', 'Ptc', 'Smo', 'slp1', 'gcm', 'gcm2', 'Pnt']
all_tf_names = load_tf_names("./data/allTFs_dmel.txt") # Derive list of Transcription Factors(TF) for Drosophila
tf_names = [tf fot tf in all_tf_names if tf in nodes]
db_fnames = "./data/dm6-5kb-upstream-full-tx-11species.mc8nr.genes_vs_motifs.rankings.feather"
dbs = [RankingDatabase(fname=db_fnames, name=os.path.splitext(os.path.basename(db_fnames))[0])]
motif_annotation_file = "./data/motifs-v8-nr.flybase-m0.001-o0.0.tbl"
if not os.path.exists("results/"):
os.mkdir("results")
startTime = datetime.now()
n = 50
all_results = [None] * n
for i in range(0, n):
run_num = i+1
if not os.path.exists(f"results/run_{run_num}"): os.mkdir(f"results/run_{i+1}")
""" phase 1 - GRN inference, generation of co-expression modules """
adjacencies = grnboost2(ex_matrix, tf_names, verbose=True) # adjacencies table of tf, target and importance weight
# modules = list(modules_from_adjacencies(adjacencies, ex_matrix))
modules = list(modules_from_adjacencies(adjacencies, ex_matrix)) # module generation - candidate regulons from TF-target gene interactions
# save to files:
adjacencies.to_csv(f"results/run_{run_num}/adjacencies.csv", index=False, sep='\t')
with open(f"results/run_{run_num}/modules.pkl", 'wb') as f:
pickle.dump(modules, f)
print("finished phase 1")
""" phase 2+3 - Regulon prediction """
df = prune2df(dbs, modules, motif_annotations_fname=motif_annotation_file) # Prune modules for targets with cis regulatory footprints (RcisTarget)
regulons = df2regulons(df) # convert data frame to rergulons
# save to files:
df.to_csv(f"results/run_{run_num}/motifs.csv")
with open(f"results/run_{run_num}/regulons.pkl", 'wb') as f:
pickle.dump(regulons, f)
print("phase 2+3")
""" phase 4 - cellular enrichment """
auc_mtx = aucell(ex_matrix, regulons, num_workers=1) # Calculate enrichment of gene signatures for single cells. # TODO: change num_workers
auc_mtx.to_csv(f"results/run_{run_num}/AUCell_mat.csv")
auc_mtx.to_pickle(f"results/results_{run_num}.pkl")# pickle results to results/ folder for later analysis
# AUCell returns A dataframe with the AUCs (n_cells x n_modules).
types_df = pd.read_csv("./data/cell_type.csv", index_col=0)
# heatmap
lut = dict(zip(types_df.type.unique(), sns.color_palette("hls", len(types_df.type.unique()))))
cell_colors = types_df.type.map(lut)
row_colors = auc_mtx.merge(cell_colors, how='left', left_index=True, right_index=True).type
ax= sns.clustermap(auc_mtx, figsize=(12,12,),yticklabels=True, xticklabels=True, row_colors=row_colors)
ax.savefig(f"results/run_{run_num}/AUCell_heatmap.png")
with open("time.txt", 'w') as file:
file.write(str(datetime.now() - startTime))