Skip to content

Commit

Permalink
from binary to weighed mutation profile, based on observed/expected s…
Browse files Browse the repository at this point in the history
…core of gnomAD
  • Loading branch information
0m1n0 committed Dec 5, 2019
1 parent 705b98a commit bccbf7b
Show file tree
Hide file tree
Showing 4 changed files with 171 additions and 74 deletions.
2 changes: 1 addition & 1 deletion stratipy/biostat_go.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ def merge_2_subgroups(data_folder, ssc_mutation_data, ssc_subgroups, gene_data,

df_enrich1, df_pcount1 = p_df_by_subgroup(
data_folder, ssc_mutation_data, ssc_subgroups, gene_data, ppi_data,
n_components, mut_type, alpha, ngh_max, n_permutations, lambd, O,
n_components, mut_type, alpha, ngh_max, n_permutations, lambd, O,
min_category_size, max_category_size, max_category_depth)

################ SSC 2 ################
Expand Down
2 changes: 1 addition & 1 deletion stratipy/biostat_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ def biostat_individuals_plot(df, data_folder, ssc_mutation_data, gene_data,

# k=20 -> figsize=(5, 9)
fig, ax = plt.subplots(nrows=df_fill.shape[0], ncols=df_fill.shape[1],
sharex=True, sharey=True, figsize=(5, 9))
sharex=True, sharey=True, figsize=(5, 20))
if lambd > 0:
nmf = 'GNMF'
else:
Expand Down
111 changes: 80 additions & 31 deletions stratipy/load_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from stratipy.nbs_class import Ppi
from tqdm import trange
import glob
from statistics import median

# NOTE some variable names changed:
# dataFolder -> data_folder
Expand Down Expand Up @@ -80,9 +81,10 @@ def get_indiv_list(indiv_from_df):
return alist


def mutation_profile_coordinate(df, indiv_list, indiv_type):
def mutation_profile_coordinate_score(df, indiv_list, indiv_type, score_list):
coord_gene = []
coord_indiv = []
weight = []

for i in trange(df.shape[0], desc="mutation profile coordinates"):
# for each row (each gene), we get list of individuals' ID
Expand All @@ -93,8 +95,51 @@ def mutation_profile_coordinate(df, indiv_list, indiv_type):
coord_gene.append(i)
# individual is saved as his/her INDEX (not ID) in indiv_list
coord_indiv.append(j)
# pLI score for each point
weight.append(score_list[i])

return coord_indiv, coord_gene
return coord_indiv, coord_gene, weight


def generate_mp(df, pLI_type):
gene_id = [int(i) for i in df['entrez_id'].tolist()]
indiv_type = 'Admixture_individual' # Admixture European probands
df[indiv_type] = df[indiv_type].apply(eval)
# create individual ID list
indiv = get_indiv_list(df[indiv_type])
if '[' in indiv: indiv.remove('[')
if ']' in indiv: indiv.remove(']')
# pLI_oe score list
pLI_score = df[pLI_type].tolist()

# calculate coordinates genes x individuals -> sparse matrix
coord_indiv, coord_gene, weight = mutation_profile_coordinate_score(
df, indiv, indiv_type, pLI_score)

mutation_profile = sp.coo_matrix((weight, (coord_indiv, coord_gene)),
shape=(len(indiv), len(gene_id)),
dtype=np.float32).tocsr()
return mutation_profile, gene_id, indiv


def merge_lof_mis(df_lof, df_mis, pLI_lof_name, pLI_mis_name):
mp_lof, gene_lof, indiv_lof = generate_mp(df_lof, pLI_lof_name)
mp_mis, gene_mis, indiv_mis = generate_mp(df_mis, pLI_mis_name)

df_lof = pd.DataFrame(mp_lof.toarray(), columns=gene_lof, index=indiv_lof)
df_mis = pd.DataFrame(mp_mis.toarray(), columns=gene_mis, index=indiv_mis)

# merge 2 dataframes and take high value if needed
df = pd.concat([df_lof, df_mis]).groupby(level=0).max()
# fill NA by 0
df = df.fillna(0)

# dataframe to scipy sparse matrix
mutation_profile = sp.csr_matrix(df.values)
indiv = df.index.values.tolist()
gene_id = df.columns.tolist()

return mutation_profile, gene_id, indiv


def load_overall_SSC_mutation_profile(data_folder, ssc_mutation_data):
Expand All @@ -115,29 +160,18 @@ def load_overall_SSC_mutation_profile(data_folder, ssc_mutation_data):
print('overall_{}_mutation_profile file is calculating.....'
.format(ssc_mutation_data))

df = pd.read_csv(data_folder + 'SSC_{}_gene_filtered.csv'
.format(ssc_mutation_data), sep=';')
# create gene ID (EntrezGene ID) list
gene_id = [int(i) for i in df['entrez_id'].tolist()]

# individuals' ID list for each row is transformed in string of list
# we thus reformate to list
# indiv_type = 'individual' # all individuals
indiv_type = 'Admixture_individual' # Admixture European probands
df[indiv_type] = df[indiv_type].apply(eval)
# create individual ID list
indiv = get_indiv_list(df[indiv_type])
if '[' in indiv: indiv.remove('[')
if ']' in indiv: indiv.remove(']')

# calculate coordinates genes x individuals -> sparse matrix
coord_indiv, coord_gene = mutation_profile_coordinate(df, indiv, indiv_type)
# mutation weight = 1
weight = np.ones(len(coord_gene))
# coo matrix then to csr matrix
mutation_profile = sp.coo_matrix((weight, (coord_indiv, coord_gene)),
shape=(len(indiv), len(gene_id)),
dtype=np.float32).tocsr()
df_lof = pd.read_csv(
data_folder + 'SSC_MAF1_LoF_gene_filtered_pLI_oe.csv', sep=';')

if 'mis15' in ssc_mutation_data:
df_mis = pd.read_csv(
data_folder + 'SSC_MAF1_mis15_gene_filtered_pLI_oe.csv', sep=';')
elif 'mis30' in ssc_mutation_data:
df_mis = pd.read_csv(
data_folder + 'SSC_MAF1_mis30_gene_filtered_pLI_oe.csv', sep=';')

mutation_profile, gene_id, indiv = merge_lof_mis(
df_lof, df_mis, 'oe_lof_rescaled', 'oe_mis_rescaled')

savemat(overall_mutation_profile_file,
{'mutation_profile': mutation_profile,
Expand Down Expand Up @@ -168,15 +202,30 @@ def load_specific_SSC_mutation_profile(data_folder, ssc_mutation_data, ssc_subgr
load_overall_SSC_mutation_profile(data_folder, ssc_mutation_data))
print("SSC overall mutation profile matrix\n shape: {}\n stored elements: {}".format(mutation_profile.shape, mutation_profile.nnz))

# if SSC 1 or 2
if ssc_subgroups != "SSC":
print('{}_{}_mutation_profile file is calculating.....'.format(
ssc_mutation_data, ssc_subgroups))
if (ssc_subgroups == 'SSC1') or (ssc_subgroups == 'SSC2'):
print('{}_{}_mutation_profile file is calculating.....'.format(
ssc_mutation_data, ssc_subgroups))

df_ssc = pd.read_csv(data_folder + '{}_phenotype.csv'
.format(ssc_subgroups), sep='\t')
ind_ssc_raw = df_ssc.individual.tolist()
# ind_ssc_raw = df_ssc.individual.apply(eval).tolist()

# 'SSC_all', 'SSC_male', 'SSC_female'
else:
if ssc_subgroups == 'SSC_all':
indiv_file = data_folder + 'admixture_1175_all.txt'
elif ssc_subgroups == 'SSC_male':
indiv_file = data_folder + 'admixture_1175_male.txt'
elif ssc_subgroups == 'SSC_female':
indiv_file = data_folder + 'admixture_1175_female.txt'

with open(indiv_file) as f:
ind_ssc_raw = [line.rstrip('\n') for line in f]

df_ssc = pd.read_csv(data_folder + '{}_phenotype.csv'
.format(ssc_subgroups), sep='\t')
ind_ssc_raw = df_ssc.individual.tolist()
# ind_ssc_raw = df_ssc.individual.apply(eval).tolist()
# looking for corresponding individual ID in overall data (indiv)
# then append their index in a list (ind_ssc)
ind_ssc = []
Expand All @@ -189,7 +238,7 @@ def load_specific_SSC_mutation_profile(data_folder, ssc_mutation_data, ssc_subgr
# slice overall mutation profile by SSC subgroup individuals
mutation_profile = mutation_profile[ind_ssc, :]

if gene_data != 'all':
if gene_data != 'allGenes':
print('genes filtering according to:', gene_data)
df_gene = pd.read_csv(data_folder + 'EntrezGene_{}.csv'
.format(gene_data), sep='\t')
Expand Down
130 changes: 89 additions & 41 deletions stratipy/nbs.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,20 @@
from scipy.io import loadmat, savemat


def initiation(mut_type, alpha, patient_data, data_folder, ssc_mutation_data,
def initiation(mut_type, patient_data, data_folder, ssc_mutation_data,
ssc_subgroups, gene_data, ppi_data, lambd, n_components):
if mut_type == 'raw':
alpha = 0
result_folder = (
data_folder + 'result_' + ssc_mutation_data + '_' +
ssc_subgroups + '_' + gene_data + '/' + mut_type + '/')

if patient_data == 'SSC':
if mut_type == 'raw':
alpha = 0
ppi_data = 'noPPI'

else:
if ppi_data == 'APID':
alpha = 0.6

elif ppi_data == 'STRING':
alpha = 0.581

result_folder = (
data_folder + 'result_' + ssc_mutation_data + '_' +
ssc_subgroups + '_' + gene_data + '_' + ppi_data + '/')
Expand All @@ -33,31 +38,33 @@ def initiation(mut_type, alpha, patient_data, data_folder, ssc_mutation_data,
else:
result_folder = (data_folder + 'result_' + patient_data + '_' +
ppi_data + '/')
if mut_type == 'raw':
alpha = 0

print(result_folder, flush=True)
print("\nIndividuals =", ssc_subgroups, flush=True)
print("mutation type =", mut_type, flush=True)
print("alpha =", alpha, flush=True)
print("lambda =", lambd, flush=True)
print("k =", n_components, flush=True)

return alpha, result_folder
return alpha, result_folder, ppi_data


def preprocessing(data_folder, patient_data, ssc_mutation_data, ssc_subgroups, gene_data,
ppi_data, result_folder, influence_weight, simplification,
compute, overwrite, alpha, tol, ngh_max,
keep_singletons, min_mutation, max_mutation, mut_type):
def preprocess_noRaw(ppi_data, mut_type, ssc_subgroups, data_folder, patient_data, ssc_mutation_data,
gene_data, influence_weight, simplification, compute, overwrite, tol, ngh_max,
keep_singletons, min_mutation, max_mutation, result_folder, alpha):
print("------------ load_data.py ------------ {}"
.format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
flush=True)
if patient_data == 'TCGA_UCEC':
(patient_id, mutation_profile, gene_id_patient, gene_symbol_profile) = (
(individual_id, mutation_profile, gene_id, gene_symbol_profile) = (
load_data.load_TCGA_UCEC_patient_data(data_folder))
elif patient_data == 'Faroe':
mutation_profile, gene_id_patient = (
mutation_profile, gene_id = (
load_data.load_Faroe_Islands_data(data_folder))
elif patient_data == 'SSC':
mutation_profile, gene_id_patient, patient_id = (
mutation_profile, gene_id, individual_id = (
load_data.load_specific_SSC_mutation_profile(
data_folder, ssc_mutation_data, ssc_subgroups, gene_data))

Expand All @@ -73,7 +80,12 @@ def preprocessing(data_folder, patient_data, ssc_mutation_data, ssc_subgroups, g
flush=True)
idx_ppi, idx_ppi_only, ppi_total, mut_total, ppi_filt = (
formatting_data.formatting(
network, mutation_profile, gene_id_ppi, gene_id_patient))
network, mutation_profile, gene_id_ppi, gene_id))

# EntrezGene ID to int
entrez_ppi = [int(i) for i in gene_id_ppi]
# EntrezGene indexes in PPI after formatting
idx_filtred = idx_ppi + idx_ppi_only

print("------------ filtering_diffusion.py ------------ {}"
.format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
Expand All @@ -84,42 +96,78 @@ def preprocessing(data_folder, patient_data, ssc_mutation_data, ssc_subgroups, g
overwrite, alpha, tol, ppi_total, mut_total, ngh_max,
keep_singletons, min_mutation, max_mutation, mut_type))

return gene_id_ppi, idx_ppi, idx_ppi_only
return gene_id, individual_id, entrez_ppi, idx_filtred, mut_propag


def preprocessing(ppi_data, mut_type, ssc_subgroups, data_folder, patient_data,
ssc_mutation_data, gene_data, influence_weight,
simplification, compute, overwrite, tol, ngh_max,
keep_singletons, min_mutation, max_mutation, result_folder, alpha, return_val=False):
if mut_type == 'raw':
mutation_profile, mp_gene, mp_indiv = (
load_data.load_specific_SSC_mutation_profile(
data_folder, ssc_mutation_data, ssc_subgroups, gene_data))
else:
gene_id, mp_indiv, entrez_ppi, idx_filtred, mutation_profile = preprocess_noRaw(
ppi_data, mut_type, ssc_subgroups, data_folder, patient_data, ssc_mutation_data,
gene_data, influence_weight, simplification, compute, overwrite, tol, ngh_max,
keep_singletons, min_mutation, max_mutation, result_folder, alpha)
# Entrez Gene ID in filtered/formatted mutation profile
mp_gene = [entrez_ppi[i] for i in idx_filtred]

if return_val:
return mutation_profile, mp_indiv, mp_gene, entrez_ppi, idx_filtred


def parallel_bootstrap(result_folder, mut_type, influence_weight,
simplification, alpha, tol, keep_singletons,
ngh_max, min_mutation, max_mutation, n_components,
n_permutations, run_bootstrap, lambd, tol_nmf,
compute_gene_clustering, sub_perm):
final_influence_mutation_directory = result_folder + 'final_influence/'
final_influence_mutation_file = (
final_influence_mutation_directory +
'final_influence_mutation_profile_{}_alpha={}_tol={}.mat'.format(
mut_type, alpha, tol))
final_influence_data = loadmat(final_influence_mutation_file)
mut_propag = final_influence_data['mut_propag']

ppi_final_file = (
final_influence_mutation_directory +
'PPI_final_weight={}_simp={}_alpha={}_tol={}_singletons={}_ngh={}.mat'
.format(influence_weight, simplification, alpha, tol, keep_singletons,
ngh_max))
ppi_final_data = loadmat(ppi_final_file)
ppi_final = ppi_final_data['ppi_final']

nmf_bootstrap.bootstrap(
result_folder, mut_type, mut_propag, ppi_final,
influence_weight, simplification, alpha, tol, keep_singletons,
ngh_max, min_mutation, max_mutation, n_components,
n_permutations, run_bootstrap, lambd, tol_nmf,
compute_gene_clustering, sub_perm)
compute_gene_clustering, sub_perm, data_folder, ssc_mutation_data, ssc_subgroups, gene_data):
# if mut_type == 'raw':
# mut_propag, mp_gene, mp_indiv = (
# load_data.load_specific_SSC_mutation_profile(
# data_folder, ssc_mutation_data, ssc_subgroups, gene_data))
# # fake
# ppi_final_file = '../data/result_MAF1_LoF_mis15_SSC_all_allGenes_APID/final_influence/PPI_final_weight=min_simp=True_alpha=0.7_tol=0.01_singletons=False_ngh=11.mat'
#
# else:
# final_influence_mutation_directory = result_folder + 'final_influence/'
# final_influence_mutation_file = (
# final_influence_mutation_directory +
# 'final_influence_mutation_profile_{}_alpha={}_tol={}.mat'.format(
# mut_type, alpha, tol))
# final_influence_data = loadmat(final_influence_mutation_file)
# mut_propag = final_influence_data['mut_propag']
#
# ppi_final_file = (
# final_influence_mutation_directory +
# 'PPI_final_weight={}_simp={}_alpha={}_tol={}_singletons={}_ngh={}.mat'
# .format(influence_weight, simplification, alpha, tol, keep_singletons,
# ngh_max))
# ppi_final_data = loadmat(ppi_final_file)
# ppi_final = ppi_final_data['ppi_final']
#
# nmf_bootstrap.bootstrap(
# result_folder, mut_type, mut_propag, ppi_final,
# influence_weight, simplification, alpha, tol, keep_singletons,
# ngh_max, min_mutation, max_mutation, n_components,
# n_permutations, run_bootstrap, lambd, tol_nmf,
# compute_gene_clustering, sub_perm)

nmf_bootstrap.bootstrap(data_folder, ssc_mutation_data, ssc_subgroups, gene_data,
result_folder, mut_type, influence_weight,
simplification, alpha, tol, keep_singletons, ngh_max,
min_mutation, max_mutation, n_components, n_permutations,
run_bootstrap, lambd, tol_nmf,
compute_gene_clustering, sub_perm)


def post_bootstrap(result_folder, mut_type, influence_weight, simplification,
alpha, tol, keep_singletons, ngh_max, min_mutation,
max_mutation, n_components, n_permutations, lambd, tol_nmf,
compute_gene_clustering, run_consensus, linkage_method,
compute_gene_clustering, run_consensus, run_bootstrap,
linkage_method,
ppi_data, patient_data, data_folder, ssc_subgroups,
ssc_mutation_data, gene_data, p_val_threshold, compute,
overwrite):
Expand Down

0 comments on commit bccbf7b

Please sign in to comment.